HtmlAgilityPack教程

解析html教程(重点) http://www.cnblogs.com/kissdodog/archive/2013/02/28/2936950.html

完整的教程 http://www.cnblogs.com/kissdodog/category/453229.html

1 解析html

路径
//div 匹配文档中任意层级的所有 div 节点(“//”表示不限层级),原文称为“平行路径”
/html/body/div/ul 从根节点开始逐级指定的绝对路径,原文称为“xml类型的路径”
//table/tr 两种写法混合使用:先匹配任意层级的 table,再取它的直接子节点 tr
//*[@id='div1'] 可以根据id选择,也可以根据其它的属性
*代表匹配所有类型的标签,也可以换成其它的标签,如div等
如果要选择多个使用:var nodes = doc.DocumentNode.SelectNodes("//*[@class='a']");
按节点的ChildNodes选择
divInfo.ChildNodes[0].ChildNodes[0].Attributes["src"].Value


1 选择网页中的所有的div
doc.DocumentNode.SelectNodes("//div")

2 按绝对路径选择单个节点
doc.DocumentNode.SelectSingleNode("/html/body/div/ul")

3 根据属性id选择节点
HtmlNode node8 = doc.DocumentNode.SelectSingleNode("//*[@id='div1']");
Response.Write(node8.Id);
Response.Write(node8.InnerText);


属性
Name
InnerHtml
InnerText
OuterHtml
ParentNode
XPath

2 Get/Post请求网页

  1 using System;
  2 using System.Collections.Generic;
  3 using System.Linq;
  4 using System.Web;
  5 using System.Net;
  6 using System.Configuration;
  7 using System.IO;
  8 using System.Text;
  9 
 10 namespace MyLibrary.Common
 11 {
 12     public class BaseParser
 13     {
 14         private string _encode = "utf-8"; //默认编码格式
 15 
 16         #region 1.0 下载指定URL的HTML代码(默认编码格式) + string GetHtml(string strUrl)
 17         /// <summary>
 18         /// 下载指定URL的HTML代码
 19         /// </summary>
 20         /// <param name="strUrl">目标页URL</param>
 21         /// <returns>目标URL的HTML代码</returns>
 22         public string GetHtml(string strUrl)
 23         {
 24             HttpWebRequest httpReq;
 25             HttpWebResponse httpResp;
 26 
 27             httpReq = (HttpWebRequest)WebRequest.Create(new Uri(strUrl));
 28             httpReq.AllowAutoRedirect = true;
 29             CookieContainer cc = new CookieContainer();
 30             httpReq.CookieContainer = cc;
 31 
 32             httpResp = (HttpWebResponse)httpReq.GetResponse();
 33             Stream respStream = httpResp.GetResponseStream();
 34             StreamReader respStreamReader = new StreamReader(respStream, Encoding.GetEncoding(_encode));
 35             string html = respStreamReader.ReadToEnd();
 36             respStream.Close();
 37             respStreamReader.Close();
 38 
 39             return html;
 40         }
 41         #endregion
 42 
 43         #region 1.1 下载指定URL的HTML代码(默认编码格式,并加了try catch) + string GetHtml2(string strUrl)
 44         /// <summary>
 45         /// 下载指定URL的HTML代码
 46         /// </summary>
 47         /// <param name="strUrl">目标页URL</param>
 48         /// <returns>目标URL的HTML代码,如果报错,则返回error</returns>
 49         public string GetHtml2(string strUrl)
 50         {
 51             HttpWebRequest httpReq;
 52             HttpWebResponse httpResp;
 53 
 54             httpReq = (HttpWebRequest)WebRequest.Create(new Uri(strUrl));
 55             httpReq.AllowAutoRedirect = true;
 56             CookieContainer cc = new CookieContainer();
 57             httpReq.CookieContainer = cc;
 58             try
 59             {
 60                 httpResp = (HttpWebResponse)httpReq.GetResponse();
 61                 Stream respStream = httpResp.GetResponseStream();
 62                 StreamReader respStreamReader = new StreamReader(respStream, Encoding.GetEncoding(_encode));
 63                 string html = respStreamReader.ReadToEnd();
 64                 respStream.Close();
 65                 respStreamReader.Close();
 66 
 67                 return html;
 68             }
 69             catch
 70             {
 71                 return "error";
 72             }
 73 
 74             
 75         }
 76         #endregion
 77 
 78         #region 2.0 下载指定URL的HTML代码 + string GetHtml(string strUrl, Encoding encode)
 79         /// <summary>
 80         /// 下载指定URL的HTML代码
 81         /// </summary>
 82         /// <param name="strUrl">目标页URL</param>
 83         ///<param name="encode">编码格式</param>
 84         /// <returns>目标URL的HTML代码</returns>
 85         public string GetHtml(string strUrl, Encoding encode)
 86         {
 87             HttpWebRequest httpReq;
 88             HttpWebResponse httpResp;
 89 
 90             httpReq = (HttpWebRequest)WebRequest.Create(new Uri(strUrl));
 91             httpReq.AllowAutoRedirect = true;
 92             CookieContainer cc = new CookieContainer();
 93             httpReq.CookieContainer = cc;
 94 
 95             httpResp = (HttpWebResponse)httpReq.GetResponse();
 96             Stream respStream = httpResp.GetResponseStream();
 97             StreamReader respStreamReader = new StreamReader(respStream, encode);
 98             string html = respStreamReader.ReadToEnd();
 99             respStream.Close();
100             respStreamReader.Close();
101 
102             return html;
103         }
104         #endregion
105 
106         #region 3.0 带Cookie凭据下载有登录限制URL的HTML代码(默认编码格式) + string GetHtml(string strUrl, CookieContainer cc)
107         /// <summary>
108         /// 带Cookie凭据下载有登录限制URL的HTML代码
109         /// </summary>
110         /// <param name="strUrl">目标URL</param>
111         /// <param name="cc">Cookie凭据</param>
112         /// <returns>目标URL的HTML代码</returns>
113         public string GetHtml(string strUrl, CookieContainer cc)
114         {
115             HttpWebRequest httpReq;
116             HttpWebResponse httpResp;
117 
118             httpReq = (HttpWebRequest)WebRequest.Create(new Uri(strUrl));
119             httpReq.AllowAutoRedirect = true;
120             httpReq.CookieContainer = cc;
121 
122             httpResp = (HttpWebResponse)httpReq.GetResponse();
123             Stream respStream = httpResp.GetResponseStream();
124             StreamReader respStreamReader = new StreamReader(respStream, Encoding.GetEncoding(_encode));
125             string html = respStreamReader.ReadToEnd();
126             respStream.Close();
127             respStreamReader.Close();
128 
129             return html;
130         }
131         #endregion
132 
133         #region 4.0 带Cookie凭据下载有登录限制URL的HTML代码 + string GetHtml(string strUrl, CookieContainer cc, Encoding encode)
134         /// <summary>
135         /// 带Cookie凭据下载有登录限制URL的HTML代码
136         /// </summary>
137         /// <param name="strUrl">目标URL</param>
138         /// <param name="cc">Cookie凭据</param>
139         /// <param name="encode">编码格式</param>
140         /// <returns>目标URL的HTML代码</returns>
141         public string GetHtml(string strUrl, CookieContainer cc, Encoding encode)
142         {
143             HttpWebRequest httpReq;
144             HttpWebResponse httpResp;
145 
146             httpReq = (HttpWebRequest)WebRequest.Create(new Uri(strUrl));
147             httpReq.AllowAutoRedirect = true;
148             httpReq.CookieContainer = cc;
149 
150             httpResp = (HttpWebResponse)httpReq.GetResponse();
151             Stream respStream = httpResp.GetResponseStream();
152             StreamReader respStreamReader = new StreamReader(respStream, encode);
153             string html = respStreamReader.ReadToEnd();
154             respStream.Close();
155             respStreamReader.Close();
156 
157             return html;
158         }
159         #endregion
160 
161         #region 5.0 带Cookie凭据模拟发送POST请求(默认编码格式) + string PostWebRequest(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container)
162         /// <summary>
163         /// 带Cookie凭据模拟发送POST请求
164         /// </summary>
165         /// <param name="strUrl">目标URL</param>
166         /// <param name="dicParams">参数列表</param>
167         /// <param name="container">Cookie凭据</param>
168         /// <param name="encode">编码格式</param>
169         /// <returns>请求成功返回目标URL的HTML代码,失败则返回error</returns>
170         public string PostWebRequest(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container)
171         {
172             string postData = string.Empty;
173             if (dicParams != null)
174             {
175                 foreach (string key in dicParams.Keys)
176                 {
177                     postData += string.Format("{0}={1}&", key, dicParams[key]);
178                 }
179                 if (postData != string.Empty) postData = postData.Substring(0, postData.Length - 1);
180             }
181             byte[] byteArray = Encoding.GetEncoding(_encode).GetBytes(postData);
182             HttpWebRequest httpReq = (HttpWebRequest)WebRequest.Create(new Uri(strUrl));
183             httpReq.AllowAutoRedirect = true;
184             //httpReq.Credentials = CredentialCache.DefaultCredentials;
185             httpReq.KeepAlive = true;
186             httpReq.Method = "POST";
187             httpReq.ContentType = "application/x-www-form-urlencoded";
188             httpReq.ContentLength = byteArray.Length;
189 
190             if (container != null) httpReq.CookieContainer = container;
191             else httpReq.CookieContainer = new CookieContainer();
192 
193             Stream reqStream = httpReq.GetRequestStream();
194             reqStream.Write(byteArray, 0, byteArray.Length);    //写入参数
195             reqStream.Close();
196 
197 
198             HttpWebResponse httpResp = (HttpWebResponse)httpReq.GetResponse();
199             httpResp.Cookies = httpReq.CookieContainer.GetCookies(httpReq.RequestUri);
200             int cookies = httpResp.Cookies.Count;
201             if (container == null) container = httpReq.CookieContainer;
202 
203             StreamReader respStream = new StreamReader(httpResp.GetResponseStream(), Encoding.GetEncoding(_encode));
204             string html = respStream.ReadToEnd();
205 
206             respStream.Close();
207             httpReq.Abort();
208             httpResp.Close();
209 
210             if (cookies > 0) return html;
211             else return "error";
212         }
213         #endregion
214 
215         #region 5.1 带Cookie凭据模拟发送POST请求(默认编码格式,即使报错也返回HTML代码) + string PostWebRequest(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container)
216         /// <summary>
217         /// 带Cookie凭据模拟发送POST请求(即使报错也返回HTML代码)
218         /// </summary>
219         /// <param name="strUrl">目标URL</param>
220         /// <param name="dicParams">参数列表</param>
221         /// <param name="container">Cookie凭据</param>
222         /// <param name="encode">编码格式</param>
223         /// <returns>请求成功返回目标URL的HTML代码,失败则返回error和HTML代码(格式:error|HTML代码)</returns>
224         public string PostWebRequest2(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container)
225         {
226             string postData = string.Empty;
227             if (dicParams != null)
228             {
229                 foreach (string key in dicParams.Keys)
230                 {
231                     postData += string.Format("{0}={1}&", key, dicParams[key]);
232                 }
233                 if (postData != string.Empty) postData = postData.Substring(0, postData.Length - 1);
234             }
235             byte[] byteArray = Encoding.GetEncoding(_encode).GetBytes(postData);
236             HttpWebRequest httpReq = (HttpWebRequest)WebRequest.Create(new Uri(strUrl));
237             httpReq.AllowAutoRedirect = true;
238             //httpReq.Credentials = CredentialCache.DefaultCredentials;
239             httpReq.KeepAlive = true;
240             httpReq.Method = "POST";
241             httpReq.ContentType = "application/x-www-form-urlencoded";
242             httpReq.ContentLength = byteArray.Length;
243 
244             if (container != null) httpReq.CookieContainer = container;
245             else httpReq.CookieContainer = new CookieContainer();
246 
247             Stream reqStream = httpReq.GetRequestStream();
248             reqStream.Write(byteArray, 0, byteArray.Length);    //写入参数
249             reqStream.Close();
250 
251 
252             HttpWebResponse httpResp = (HttpWebResponse)httpReq.GetResponse();
253             httpResp.Cookies = httpReq.CookieContainer.GetCookies(httpReq.RequestUri);
254             int cookies = httpResp.Cookies.Count;
255             if (container == null) container = httpReq.CookieContainer;
256 
257             StreamReader respStream = new StreamReader(httpResp.GetResponseStream(), Encoding.GetEncoding(_encode));
258             string html = respStream.ReadToEnd();
259 
260             respStream.Close();
261             httpReq.Abort();
262             httpResp.Close();
263 
264             if (cookies > 0) return html;
265             else return "error|"+html;
266         }
267         #endregion
268 
269         #region 6.0 带Cookie凭据模拟发送POST请求 + string PostWebRequest(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container, Encoding encode)
270         /// <summary>
271         /// 带Cookie凭据模拟发送POST请求
272         /// </summary>
273         /// <param name="strUrl">目标URL</param>
274         /// <param name="dicParams">参数列表</param>
275         /// <param name="container">Cookie凭据</param>
276         /// <param name="encode">编码格式</param>
277         /// <returns>请求成功返回目标URL的HTML代码,失败则返回error</returns>
278         public string PostWebRequest(string strUrl, IDictionary<string, string> dicParams, ref CookieContainer container, Encoding encode)
279         {
280             string postData = string.Empty;
281             if (dicParams != null)
282             {
283                 foreach (string key in dicParams.Keys)
284                 {
285                     postData += string.Format("{0}={1}&", key, dicParams[key]);
286                 }
287                 if (postData != string.Empty) postData = postData.Substring(0, postData.Length - 1);
288             }
289             byte[] byteArray = encode.GetBytes(postData);
290             HttpWebRequest httpReq = (HttpWebRequest)WebRequest.Create(new Uri(strUrl));
291             httpReq.AllowAutoRedirect = true;
292             //httpReq.Credentials = CredentialCache.DefaultCredentials;
293             httpReq.KeepAlive = true;
294             httpReq.Method = "POST";
295             httpReq.ContentType = "application/x-www-form-urlencoded";
296             httpReq.ContentLength = byteArray.Length;
297 
298             if (container != null) httpReq.CookieContainer = container;
299             else httpReq.CookieContainer = new CookieContainer();
300 
301             Stream reqStream = httpReq.GetRequestStream();
302             reqStream.Write(byteArray, 0, byteArray.Length);    //写入参数
303             reqStream.Close();
304 
305 
306             HttpWebResponse httpResp = (HttpWebResponse)httpReq.GetResponse();
307             httpResp.Cookies = httpReq.CookieContainer.GetCookies(httpReq.RequestUri);
308             int cookies = httpResp.Cookies.Count;
309             if (container == null) container = httpReq.CookieContainer;
310 
311             StreamReader respStream = new StreamReader(httpResp.GetResponseStream(), encode);
312             string html = respStream.ReadToEnd();
313 
314             respStream.Close();
315             httpReq.Abort();
316             httpResp.Close();
317 
318             if (cookies > 0) return html;
319             else return "error";
320         }
321         #endregion
322         
332     }
333 }
原文地址:https://www.cnblogs.com/james641/p/4903463.html