使用代理(WebProxy)爬虫

关键代码:

 1 private Hashtable hash;//储存代理ip
 2         private WebProxy currentdaili;
 3         private int dailiExecMaxCount; //每个代理执行最大次数
 4         private int currentDailiExecCount; //当前代理执行次数
 5         public Handler2() //构造函数
 6         {
 7             dailiExecMaxCount = 100;
 8             currentDailiExecCount = 0;
 9             //hash = GetDailiList();
10             currentdaili = GetOneDaili();
11         }
12 
13 
14         //http://www.xici.net.co
15         /// <summary>
16         /// 获取代理ip返回hashtable
17         /// KK 2015-04-22
18         /// </summary>
19         /// <returns></returns>
20         private Hashtable GetDailiList()
21         {
22             Hashtable result = new Hashtable();
23             string strUrl = string.Format("http://www.xici.net.co");
24             string detailContext = GetHtmlByUrl(strUrl);
25             if (!string.IsNullOrEmpty(detailContext))
26             {
27                 HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
28                 string strkeyvalue = string.Empty;
29                 try
30                 {
31                     doc.LoadHtml(detailContext);
32                     HtmlNode node = doc.DocumentNode;
33                     HtmlNodeCollection trlist = node.SelectNodes("//table[@id='ip_list']//tr[@class='odd' or @class='']");
34                     foreach (HtmlNode item in trlist)
35                     {
36                         if (item.SelectNodes("td")[5].InnerText.ToUpper() == "HTTP")
37                         {
38                             strkeyvalue = item.SelectNodes("td")[1].InnerText + ":" + item.SelectNodes("td")[2].InnerText;
39                             result.Add(strkeyvalue, strkeyvalue);
40                         }
41                     }
42                 }
43                 catch (Exception ex)
44                 {
45                     webframework.common.logclass.Debug("======取代理ip出错====GetDaili==" + ex.Message);
46                     result = null;
47                 }
48 
49 
50             }
51             else
52             {
53                 result = null;
54             }
55             return result;
56         }
57 
58         /// <summary>
59         /// 从hashtable代理中取任意ip代理
60         /// </summary>
61         /// <param name="hash"></param>
62         /// <returns></returns>
63         private WebProxy GetOneDaili()
64         {
65             try
66             {
67                 if (hash == null || hash.Count == 0)
68                     hash = GetDailiList();
69                 if (currentdaili != null && hash.Contains(currentdaili.Address.Authority + ":" + currentdaili.Address.Port))
70                 {
71                     hash.Remove(currentdaili.Address.Authority + ":" + currentdaili.Address.Port);
72                 }
73                 System.Collections.IDictionaryEnumerator enumerator = hash.GetEnumerator();
74 
75                 //随机取代理
76                 Random rd = new Random();
77                 int n = rd.Next(hash.Count);
78                 int intCount = 0;
79                 while (enumerator.MoveNext())
80                 {
81                     intCount++;
82                     if (intCount == n)
83                     {
84                         currentdaili = new WebProxy(enumerator.Key.ToString(), true);
85                         break;
86                     }                    
87                 }
88             }
89             catch (Exception ex)
90             {
91                 webframework.common.logclass.Debug("======从hashtable代理中取任意ip代理出错====GetOneDaili==" + ex.Message);
92                 currentdaili = null;
93             }
94             logclass.Debug("======当前代理======" + currentdaili.Address.Authority + ":" + currentdaili.Address.Port );
95             return currentdaili;
96         }
View Code

使用:

 1 /// <summary>
 2         /// 发送get请求
 3         /// </summary>
 4         /// <param name="strUrl"></param>
 5         /// <param name="isRetry"></param>
 6         /// <returns></returns>
 7         private string GetHtmlByUrl(string strUrl, bool isRetry = false, WebProxy daili = null)
 8         {
 9             currentDailiExecCount++;
10             if (currentDailiExecCount > dailiExecMaxCount)
11             {
12                 logclass.Debug("======当前代理======" + currentdaili.Address.Authority + ":" + currentdaili.Address.Port + "==跑的次数超过了设置的最大次数(" + dailiExecMaxCount.ToString()+"");
13             }
14             try
15             {
16                 HttpWebResponse response = new webframework.common.HttpHelper()
17                 {
18                     URL = string.Format("{0}", strUrl),
19                     //Proxy = daili == null ? currentdaili : daili,
20                     //Proxy = new WebProxy("218.204.140.97:8118", true),
21                     Proxy = daili == null ? (currentDailiExecCount > dailiExecMaxCount ? GetOneDaili() : currentdaili) : daili,
22                     Timeout = 5 * 1000,
23                 }.CreateGetHttpResponse();
24 
25                 return response.HttpString(Encoding.UTF8);
26             }
27             catch (Exception)
28             {
29                 //重试请求
30                 if (!isRetry)
31                     return GetHtmlByUrl(strUrl, true, GetOneDaili());
32                 else
33                     throw null;
34             }
35 
36         }
37 
38 
39         /// <summary>
40         /// 发送post请求
41         /// </summary>
42         /// <param name="strUrl"></param>
43         /// <param name="isRetry"></param>
44         /// <returns></returns>
45         private string PostHtmlByUrl(string strUrl, string strPostString, bool isRetry = false, WebProxy daili = null)
46         {
47             currentDailiExecCount++;
48             if (currentDailiExecCount > dailiExecMaxCount)
49             {
50                 logclass.Debug("======当前代理======" + currentdaili.Address.Authority + ":" + currentdaili.Address.Port + "==跑的次数超过了设置的最大次数(" + dailiExecMaxCount.ToString()+"");
51             }
52             try
53             {
54                 HttpWebResponse response = new HttpHelper()
55                 {
56                     URL = strUrl,
57                     PostString = strPostString,
58                     //Proxy = new WebProxy("218.204.140.97:8118", true),
59                     Proxy = daili == null ? (currentDailiExecCount>dailiExecMaxCount?GetOneDaili(): currentdaili) : daili,
60                     //Proxy = daili == null ? currentdaili : daili,
61                     PostEncoding = Encoding.UTF8,
62                     Timeout = 5 * 1000,
63                 }.CreatePostHttpResponse();
64 
65                 return response.HttpString(Encoding.UTF8);
66             }
67             catch (Exception)
68             {
69                 //重试请求
70                 if (!isRetry)
71                     return PostHtmlByUrl(strUrl, strPostString, true, GetOneDaili());
72                 else
73                     throw null;
74             }
75 
76         }
View Code

参考资料:

http://www.haolizi.net/example/view_199.html

原文地址:https://www.cnblogs.com/systemkk/p/4449634.html