C#HTML 转文本及HTML内容提取


//1、HTML直接转文本 //使用方法 HtmlToText convert = new HtmlToText(); textBox2.Text = convert.Convert(textBox1.Text); //代码 /// <summary> /// Converts HTML to plain text. /// </summary> class HtmlToText { // Static data tables protected static Dictionary<string, string> _tags; protected static HashSet<string> _ignoreTags; // Instance variables protected TextBuilder _text; protected string _html; protected int _pos; // Static constructor (one time only) static HtmlToText() { _tags = new Dictionary<string, string>(); _tags.Add("address", "\n"); _tags.Add("blockquote", "\n"); _tags.Add("div", "\n"); _tags.Add("dl", "\n"); _tags.Add("fieldset", "\n"); _tags.Add("form", "\n"); _tags.Add("h1", "\n"); _tags.Add("/h1", "\n"); _tags.Add("h2", "\n"); _tags.Add("/h2", "\n"); _tags.Add("h3", "\n"); _tags.Add("/h3", "\n"); _tags.Add("h4", "\n"); _tags.Add("/h4", "\n"); _tags.Add("h5", "\n"); _tags.Add("/h5", "\n"); _tags.Add("h6", "\n"); _tags.Add("/h6", "\n"); _tags.Add("p", "\n"); _tags.Add("/p", "\n"); _tags.Add("table", "\n"); _tags.Add("/table", "\n"); _tags.Add("ul", "\n"); _tags.Add("/ul", "\n"); _tags.Add("ol", "\n"); _tags.Add("/ol", "\n"); _tags.Add("/li", "\n"); _tags.Add("br", "\n"); _tags.Add("/td", "\t"); _tags.Add("/tr", "\n"); _tags.Add("/pre", "\n"); _ignoreTags = new HashSet<string>(); _ignoreTags.Add("script"); _ignoreTags.Add("noscript"); _ignoreTags.Add("style"); _ignoreTags.Add("object"); } /// <summary> /// Converts the given HTML to plain text and returns the result. /// </summary> /// <param name="html">HTML to be converted</param> /// <returns>Resulting plain text</returns> public string Convert(string html) { // Initialize state variables _text = new TextBuilder(); _html = html; _pos = 0; // Process input while (!EndOfText) { if (Peek() == '<') { // HTML tag bool selfClosing; string tag = ParseTag(out selfClosing); // Handle special tag cases if (tag == "body") { // Discard content before <body> _text.Clear(); } else if (tag == "/body") { // Discard content after </body> _pos = _html.Length; } else if (tag == "pre") { // Enter preformatted mode _text.Preformatted = true; EatWhitespaceToNextLine(); } else if (tag == "/pre") { // Exit preformatted mode _text.Preformatted = false; } string value; if (_tags.TryGetValue(tag, out value)) _text.Write(value); if (_ignoreTags.Contains(tag)) EatInnerContent(tag); } else if (Char.IsWhiteSpace(Peek())) { // Whitespace (treat all as space) _text.Write(_text.Preformatted ? Peek() : ' '); MoveAhead(); } else { // Other text _text.Write(Peek()); MoveAhead(); } } // Return result return HttpUtility.HtmlDecode(_text.ToString()); } // Eats all characters that are part of the current tag // and returns information about that tag protected string ParseTag(out bool selfClosing) { string tag = String.Empty; selfClosing = false; if (Peek() == '<') { MoveAhead(); // Parse tag name EatWhitespace(); int start = _pos; if (Peek() == '/') MoveAhead(); while (!EndOfText && !Char.IsWhiteSpace(Peek()) && Peek() != '/' && Peek() != '>') MoveAhead(); tag = _html.Substring(start, _pos - start).ToLower(); // Parse rest of tag while (!EndOfText && Peek() != '>') { if (Peek() == '"' || Peek() == '\'') EatQuotedValue(); else { if (Peek() == '/') selfClosing = true; MoveAhead(); } } MoveAhead(); } return tag; } // Consumes inner content from the current tag protected void EatInnerContent(string tag) { string endTag = "/" + tag; while (!EndOfText) { if (Peek() == '<') { // Consume a tag bool selfClosing; if (ParseTag(out selfClosing) == endTag) return; // Use recursion to consume nested tags if (!selfClosing && !tag.StartsWith("/")) EatInnerContent(tag); } else MoveAhead(); } } // Returns true if the current position is at the end of // the string protected bool EndOfText { get { return (_pos >= _html.Length); } } // Safely returns the character at the current position protected char Peek() { return (_pos < _html.Length) ? _html[_pos] : (char)0; } // Safely advances to current position to the next character protected void MoveAhead() { _pos = Math.Min(_pos + 1, _html.Length); } // Moves the current position to the next non-whitespace // character. protected void EatWhitespace() { while (Char.IsWhiteSpace(Peek())) MoveAhead(); } // Moves the current position to the next non-whitespace // character or the start of the next line, whichever // comes first protected void EatWhitespaceToNextLine() { while (Char.IsWhiteSpace(Peek())) { char c = Peek(); MoveAhead(); if (c == '\n') break; } } // Moves the current position past a quoted value protected void EatQuotedValue() { char c = Peek(); if (c == '"' || c == '\'') { // Opening quote MoveAhead(); // Find end of value int start = _pos; _pos = _html.IndexOfAny(new char[] { c, '\r', '\n' }, _pos); if (_pos < 0) _pos = _html.Length; else MoveAhead(); // Closing quote } } /// <summary> /// A StringBuilder class that helps eliminate excess whitespace. /// </summary> protected class TextBuilder { private StringBuilder _text; private StringBuilder _currLine; private int _emptyLines; private bool _preformatted; // Construction public TextBuilder() { _text = new StringBuilder(); _currLine = new StringBuilder(); _emptyLines = 0; _preformatted = false; } /// <summary> /// Normally, extra whitespace characters are discarded. /// If this property is set to true, they are passed /// through unchanged. /// </summary> public bool Preformatted { get { return _preformatted; } set { if (value) { // Clear line buffer if changing to // preformatted mode if (_currLine.Length > 0) FlushCurrLine(); _emptyLines = 0; } _preformatted = value; } } /// <summary> /// Clears all current text. /// </summary> public void Clear() { _text.Length = 0; _currLine.Length = 0; _emptyLines = 0; } /// <summary> /// Writes the given string to the output buffer. /// </summary> /// <param name="s"></param> public void Write(string s) { foreach (char c in s) Write(c); } /// <summary> /// Writes the given character to the output buffer. /// </summary> /// <param name="c">Character to write</param> public void Write(char c) { if (_preformatted) { // Write preformatted character _text.Append(c); } else { if (c == '\r') { // Ignore carriage returns. We'll process // '\n' if it comes next } else if (c == '\n') { // Flush current line FlushCurrLine(); } else if (Char.IsWhiteSpace(c)) { // Write single space character int len = _currLine.Length; if (len == 0 || !Char.IsWhiteSpace(_currLine[len - 1])) _currLine.Append(' '); } else { // Add character to current line _currLine.Append(c); } } } // Appends the current line to output buffer protected void FlushCurrLine() { // Get current line string line = _currLine.ToString().Trim(); // Determine if line contains non-space characters string tmp = line.Replace(" ", String.Empty); if (tmp.Length == 0) { // An empty line _emptyLines++; if (_emptyLines < 2 && _text.Length > 0) _text.AppendLine(line); } else { // A non-empty line _emptyLines = 0; _text.AppendLine(line); } // Reset current line _currLine.Length = 0; } /// <summary> /// Returns the current output as a string. /// </summary> public override string ToString() { if (_currLine.Length > 0) FlushCurrLine(); return _text.ToString(); } } } //2、提取html的正文 类 using System; using System.Text; namespace HtmlStrip { class MainClass { public static void Main (string[] args) { string str = "<div>abc</div><span>efg</span><br /><script>888</script><!--<PA>WW</PA-->oo"; //System.IO.StreamReader rd=new System.IO.StreamReader ("/home/lx/test.html"); //str=rd.ReadToEnd (); HtmlParser t = new HtmlParser (str); // t.KeepTag (new string[] { "br" }); //设置br标签不过虑 Console.Write (t.Text ()); } } class HtmlParser { private string[] htmlcode; //把html转为数组形式用于分析 private StringBuilder result = new StringBuilder (); //输出的结果 private int seek; //分析文本时候的指针位置 private string[] keepTag; //用于保存要保留的尖括号内容 private bool _inTag; //标记现在的指针是不是在尖括号内 private bool needContent = true; //是否要提取正文 private string tagName; //当前尖括号的名字 private string[] specialTag = new string[] { "script", "style", "!--" }; //特殊的尖括号内容,一般这些标签的正文是不要的 /// <summary> /// 当指针进入尖括号内,就会触发这个属性。这里主要逻辑是提取尖括号里的标签名字 /// </summary> public bool inTag { get { return _inTag; } set { _inTag = value; if (!value) return; bool ok = true; tagName = ""; while (ok) { string word = read (); if (word != " " && word != ">") { tagName += word; } else if (word == " " && tagName.Length > 0) { ok = false; } else if (word == ">") { ok = false; inTag = false; seek -= 1; } } } } /// <summary> /// 初始化类 /// </summary> /// <param name="html"> /// 要分析的html代码 /// </param> public HtmlParser (string html) { htmlcode = new string[html.Length]; for (int i = 0; i < html.Length; i++) { htmlcode[i] = html[i].ToString (); } KeepTag (new string[] { }); } /// <summary> /// 设置要保存那些标签不要被过滤掉 /// </summary> /// <param name="tags"> /// /// </param> public void KeepTag (string[] tags) { keepTag = tags; } /// <summary> /// /// </summary> /// <returns> /// 输出处理后的文本 /// </returns> public string Text () { int startTag = 0; int endTag = 0; while (seek < htmlcode.Length) { string word = read (); if (word.ToLower () == "<") { startTag = seek; inTag = true; } else if (word.ToLower () == ">") { endTag = seek; inTag = false; if (iskeepTag (tagName.Replace ("/", ""))) { for (int i = startTag - 1; i < endTag; i++) { result.Append (htmlcode[i].ToString ()); } } else if (tagName.StartsWith ("!--")) { bool ok = true; while (ok) { if (read () == "-") { if (read () == "-") { if (read () == ">") { ok = false; } else { seek -= 1; } } } } } else { foreach (string str in specialTag) { if (tagName == str) { needContent = false; break; } else needContent = true; } } } else if (!inTag && needContent) { result.Append (word); } } return result.ToString (); } /// <summary> /// 判断是否要保存这个标签 /// </summary> /// <param name="tag"> /// A <see cref="System.String"/> /// </param> /// <returns> /// A <see cref="System.Boolean"/> /// </returns> private bool iskeepTag (string tag) { foreach (string ta in keepTag) { if (tag.ToLower () == ta.ToLower ()) { return true; } } return false; } private string read () { return htmlcode[seek++]; } } }

  ===========该文转自=========

http://blog.csdn.net/cjh200102/article/details/6824895#

================================

原文地址:https://www.cnblogs.com/zjw520/p/3014848.html