获取Html页面元素属性内容

获取网页HTML元素内容方法
①通过正则表达式匹配获取

View Code

// Absolute URL of the page to download. It is an empty placeholder here and must be
// assigned before the request is created: WebRequest.Create rejects an empty string.
string resquestUrl = string.Empty;

// Matches every <label for="caller">...</label> element (lazy, case-insensitive).
// '.' does not match line breaks, so a label whose content spans several lines is not found.
Regex rxGetInfo = new Regex("<label for=\"caller\">.*?</label>", RegexOptions.IgnoreCase);
// Matches any single HTML tag. It is not applied in this snippet.
Regex rxFilter = new Regex("<.*?>");

// Direct casts (instead of 'as') fail with a clear InvalidCastException when the URL is
// not an HTTP(S) URL, rather than a NullReferenceException on the next line.
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(resquestUrl);

// The using blocks release the response and its stream even if reading throws.
string returnContent;
using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
using (StreamReader sr = new StreamReader(response.GetResponseStream()))
{
    returnContent = sr.ReadToEnd();
}

// All occurrences of the label element found in the downloaded page.
MatchCollection mc = rxGetInfo.Matches(returnContent);

②根据元素的 id 属性,通过 GetElementById 获取
// Sketch of looking an element up by its id attribute; not compilable as written.
// NOTE(review): presumably System.Windows.Forms.HtmlDocument, which is normally obtained
// from WebBrowser.Document rather than constructed directly - confirm which type is meant.
HtmlDocument temphtml = new HtmlDocument();
// NOTE(review): GetElementById is called without an argument here; it presumably needs the
// id of the element to find, e.g. GetElementById("someId") - TODO confirm and fill in.
temphtml.GetElementById();
③过滤html标签

View Code

/// <summary>
/// Converts an HTML fragment to plain text: paragraph boundaries become a blank line,
/// line-break tags become a newline, double quotes become two single quotes, and every
/// remaining tag is removed.
/// </summary>
/// <param name="stringToStrip">The HTML content to convert; may be null or empty.</param>
/// <returns>
/// The text with all tags removed, or an empty string when
/// <paramref name="stringToStrip"/> is null or empty.
/// </returns>
public static string StripHTML(string stringToStrip)
{
    // Nothing to strip; also avoids the ArgumentNullException Regex.Replace throws for null.
    if (string.IsNullOrEmpty(stringToStrip))
    {
        return string.Empty;
    }

    const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Compiled;

    // "</p> <p ...>" marks a paragraph boundary. The opening tag may carry attributes.
    string text = Regex.Replace(stringToStrip, @"</p\s*>\s*<p(?:\s[^>]*)?>", "\n\n", options);

    // Accept <br>, <br/> and <br />; the slash is optional in HTML.
    text = Regex.Replace(text, @"<br\s*/?>", "\n", options);

    // Literal replacement, so no regex is needed: each double quote becomes two single quotes.
    text = text.Replace("\"", "''");

    return StripHtmlXmlTags(text);
}

/// <summary>
/// Removes every HTML/XML tag (any <c>&lt;...&gt;</c> sequence) from the text.
/// </summary>
/// <param name="content">The text to remove tags from; must not be null.</param>
/// <returns>The text with all tags removed; text between tags is kept unchanged.</returns>
private static string StripHtmlXmlTags(string content)
{
    return Regex.Replace(content, "<[^>]+>", "", RegexOptions.Compiled);
}