HtmlAgilityPack基本使用方法

 /// <summary>
 /// Tag-filtering sample: downloads the cnblogs home page and prints the
 /// document's text with the HTML markup removed.
 /// </summary>
 /// <remarks>
 /// <c>InnerText</c> returns the text of the node tree without the tags;
 /// <c>InnerHtml</c> would print the raw markup instead.
 /// </remarks>
 static void InnerText()
 {
     HtmlWeb htmlWeb = new HtmlWeb();
     HtmlDocument doc = htmlWeb.Load("http://www.cnblogs.com/", "GET");
     HtmlNode rootNode = doc.DocumentNode;

     // Print InnerText rather than InnerHtml: the purpose of this sample is
     // to show the page content with the tags filtered out.
     Console.WriteLine(rootNode.InnerText);
 }
10 
/// <summary>
/// Selector sample: downloads the cnblogs home page and, for every
/// <c>h3</c> found under a <c>div</c> with class <c>post_item_body</c>,
/// prints the post title, its link, and the text of the node that follows
/// the heading, followed by a separator line.
/// </summary>
/// <remarks>
/// Prints nothing when the page contains no matching headings. Headings
/// without an <c>a</c> child are skipped; a missing summary node is printed
/// as an empty line.
/// </remarks>
static void GetBlogs()
{
    string url = "http://www.cnblogs.com/";
    HtmlWeb htmlWeb = new HtmlWeb();
    HtmlDocument doc = htmlWeb.Load(url, "GET");
    // A single element can also be looked up by id: doc.GetElementbyId("aa");
    HtmlNode rootNode = doc.DocumentNode;
    HtmlNodeCollection h3Nodes = rootNode.SelectNodes("//div[@class='post_item_body']/h3");

    // SelectNodes returns null (not an empty collection) when nothing
    // matches, so iterating it directly would throw.
    if (h3Nodes == null)
    {
        return;
    }

    foreach (HtmlNode h3Node in h3Nodes)
    {
        // Select the <a> child of the heading.
        HtmlNode aNode = h3Node.SelectSingleNode("a");
        if (aNode == null)
        {
            continue;
        }

        // Two steps forward, as in the original sample: the immediate
        // sibling is presumably the whitespace text node between the tags,
        // and the node after it holds the summary. Either may be absent.
        HtmlNode firstSibling = h3Node.NextSibling;
        HtmlNode pNode = firstSibling != null ? firstSibling.NextSibling : null;

        // Read an attribute, falling back to "" when it is not present.
        string blogLink = aNode.GetAttributeValue("href", "");
        string title = aNode.InnerText;
        string content = pNode != null ? pNode.InnerText : "";

        Console.WriteLine(title);
        Console.WriteLine(blogLink);
        Console.WriteLine(content);
        Console.WriteLine("------------------------------------------------------");
    }
}
34 
/// <summary>
/// XPath sample: loads the local file <c>test.html</c> and prints the text
/// of its heading and of the first two list items, each located with an
/// absolute XPath expression.
/// </summary>
/// <remarks>
/// An expression that matches nothing is skipped instead of throwing.
/// </remarks>
static void XPathTest()
{
    string path = @"test.html";
    HtmlDocument doc = new HtmlDocument();
    doc.Load(path);

    // Read DocumentNode only after Load: loading builds a new document node,
    // so a reference taken before the call points at the empty pre-load tree
    // and every query against it returns null.
    HtmlNode rootNode = doc.DocumentNode;

    string[] xpaths =
    {
        "/html/body/div[1]/h1[1]",        // the h1 heading
        "/html/body/div[2]/ul[1]/li[1]",  // ul > li: name
        "/html/body/div[2]/ul[1]/li[2]",  // ul > li: age
    };

    foreach (string xpath in xpaths)
    {
        // SelectSingleNode returns null when the path matches no node.
        HtmlNode node = rootNode.SelectSingleNode(xpath);
        if (node != null)
        {
            Console.WriteLine(node.InnerText);
        }
    }
}

test.html代码如下:

 1 <html>
   <!-- Sample page loaded by XPathTest(). Its absolute XPath queries depend on this exact layout:
        /html/body/div[1]/h1[1] is the heading; /html/body/div[2]/ul[1]/li[1] and li[2] are the name and age items. -->
 2 <head>
 3 </head>
 4 <body>
 5     <div>   
 6         <h1>欢迎访问这个网页!</h1>
 7     </div>
 8 
 9     <div>
10         <ul class="user_match clear">
11             <li>姓名:张三</li>
12             <li>年龄:18</li>
13         </ul>
14     </div>
15 </body>
16 </html>
原文地址:https://www.cnblogs.com/miaosha5s/p/5453534.html