Proxy Pool Scraper, Basic Version (Python coroutines) -- Target Site: Xici (will be updated over time)
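
The script below builds a small proxy pool in two steps: it scrapes ip:port pairs from the first 20 pages of the Xici HTTP-proxy list (xicidaili.com/wt/) into proxy_first.txt, then spawns one gevent greenlet per candidate to test it against a real request and writes the proxies that respond into proxy_end.txt.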

# coding: utf-8

__author__ = 'litao'

# Patch blocking I/O (sockets, urllib) before importing it, so gevent can
# switch greenlets while network requests are waiting.
from gevent import monkey
monkey.patch_all()

import gevent
import random
import time
import urllib.request
from bs4 import BeautifulSoup

home = "http://www.xicidaili.com/wt/"   # Xici list of HTTP proxies, paginated
first_proxy_list = []                   # raw ip:port strings scraped from the site
end_proxy_list = []                     # proxies that passed the validation test
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36"
}

def test_proxy(proxy_key):
    """Make one request through the proxy; keep it if the request succeeds."""
    print(proxy_key)
    # /wt/ lists HTTP proxies, so validate against an HTTP URL; with only an
    # "http" mapping the proxy would never be used for an https:// request.
    proxy = {"http": proxy_key}
    url = "http://www.baidu.com/"
    proxy_support = urllib.request.ProxyHandler(proxy)
    # Use a private opener rather than install_opener(): the greenlets run
    # concurrently and would keep overwriting a single global opener.
    opener = urllib.request.build_opener(proxy_support)
    req = urllib.request.Request(url=url, headers=headers)
    try:
        response = opener.open(req, timeout=5)
        if response.getcode() == 200:
            end_proxy_list.append(proxy_key)
    except Exception as e:
        print("error:", e)

def get_proxy_list():
    """Scrape the first 20 pages of the Xici list into first_proxy_list."""
    for page in range(20):
        url = home + str(page + 1)
        print(url)
        req = urllib.request.Request(url=url, headers=headers)
        response = urllib.request.urlopen(req, timeout=20).read().decode()
        soup = BeautifulSoup(response, 'html.parser')
        # The first <tr> of the #ip_list table is the header row, skip it.
        content = soup.find_all("table", attrs={"id": "ip_list"})[0].find_all('tr')[1:]
        for row in content:
            result = row.find_all('td')
            proxy_enum = result[1].text + ":" + result[2].text  # ip:port
            print(proxy_enum)
            first_proxy_list.append(proxy_enum)
        # Pause between pages so the scraper does not get banned.
        time.sleep(random.randint(120, 240))

def join_gevent(first_proxy_list, gevent_list):
    # One greenlet per candidate proxy, so validation runs concurrently.
    for proxy_key in first_proxy_list:
        gevent_list.append(gevent.spawn(test_proxy, proxy_key))

def main():
    gevent_list = []
    get_proxy_list()
    # Save every scraped candidate before validation.
    with open("proxy_first.txt", 'a', encoding='utf-8') as f:
        for item in first_proxy_list:
            f.write(item + '\n')
    join_gevent(first_proxy_list, gevent_list)
    gevent.joinall(gevent_list)
    print(end_proxy_list)
    # Save only the proxies that answered the test request.
    with open("proxy_end.txt", 'a', encoding='utf-8') as f:
        for item in end_proxy_list:
            f.write(item + '\n')

if __name__ == "__main__":
    main()
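
Once proxy_end.txt exists, a validated proxy can be plugged back into urllib for real scraping. The sketch below is not part of the original script: the file name proxy_end.txt comes from the code above, while pick_proxy, fetch_via_proxy, and the example target URL are illustrative assumptions.

# coding: utf-8
# Minimal sketch (not from the original post): reuse a validated proxy.
# Assumes proxy_end.txt was produced by the script above, one ip:port per line;
# pick_proxy and the example URL are hypothetical names for illustration.
import random
import urllib.request

def pick_proxy(path="proxy_end.txt"):
    """Return a random ip:port string from the validated-proxy file."""
    with open(path, encoding="utf-8") as f:
        proxies = [line.strip() for line in f if line.strip()]
    return random.choice(proxies)

def fetch_via_proxy(url):
    proxy = pick_proxy()
    # Route only plain-HTTP traffic through the proxy, matching the pool above.
    opener = urllib.request.build_opener(
        urllib.request.ProxyHandler({"http": proxy}))
    req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    return opener.open(req, timeout=10).read()

if __name__ == "__main__":
    print(len(fetch_via_proxy("http://www.baidu.com/")))
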
Original article: https://www.cnblogs.com/crawer-1/p/7638799.html