Python爬取代理ip

 1 # -*- coding:utf-8 -*-
 2 #author : willowj
 3 import urllib
 4 import urllib2
 5 from bs4 import BeautifulSoup
 6 import re
 7 import bs4
 8 
 9 import sys
10 
11 
# Python 2-only setup: setdefaultencoding is not reachable on the freshly
# imported sys module, so sys is reloaded first and the process-wide default
# string encoding is then switched to UTF-8. The order of these two
# statements matters.
12 reload(sys)  
13 sys.setdefaultencoding('utf8') 
14 
15 
def ip_test(ip, url="https://www.baidu.com", timeout=10):
    """Check whether a proxy can actually be used to fetch ``url``.

    Args:
        ip: Proxy address as ``"host:port"`` (without a scheme).
        url: Page requested through the proxy.
        timeout: Seconds to wait before the proxy is considered dead.

    Returns:
        True if ``url`` was fetched through the proxy, False on any failure
        (connection error, timeout, HTTP error status, ...).
    """
    ip1 = "http://" + ip
    # The mapping key must be the bare scheme name ("http", not "http:"),
    # otherwise the proxy is silently ignored and every address "passes".
    # The default test URL is https, so the proxy is registered for both.
    proxy_handler = urllib2.ProxyHandler({'http': ip1, 'https': ip1})
    opener = urllib2.build_opener(proxy_handler)
    try:
        response = opener.open(url, timeout=timeout)
        try:
            response.read()
        finally:
            response.close()
    except Exception:
        # Best-effort probe: any failure simply means "proxy not usable".
        print("failed")
        return False
    print("ok %s" % ip1)
    return True
27     
28 
def get_iphtml_inyoudaili(url='http://www.youdaili.net', timeout=10):
    """Return the link to the newest HTTP proxy list found on the index page.

    The page is searched for an ``href="..."`` attribute that is followed,
    on the same line, by the Chinese phrase used in the titles of the
    "latest HTTP proxy server IP addresses" posts.

    Args:
        url: Index page to scan.
        timeout: Seconds to wait for the page before giving up.

    Returns:
        The captured href of the first match, as a byte string.

    Raises:
        IndexError: If the page contains no matching link.
    """
    # Build the pattern as explicit UTF-8 bytes so it is matched against the
    # raw (undecoded) page content.
    link_pattern = re.compile(
        u'href="(.*?)" .*?最新代理http服务器ip地址'.encode('utf-8'))

    response = urllib2.urlopen(url, timeout=timeout)
    try:
        page = response.read()
    finally:
        response.close()

    links = link_pattern.findall(page)
    if not links:
        # Same exception type as indexing an empty list, but with a message
        # that says what went wrong.
        raise IndexError("no proxy-list link found on %s" % url)
    print(links[0])
    return links[0]
40     
41 
def getIps(url, timeout=10):
    """Scrape proxy addresses from ``url`` and keep the ones that work.

    Every address found on the page is printed and probed with ``ip_test``.
    Addresses that pass are written to ``ips.txt`` (one per line, the file
    is overwritten) as soon as they are confirmed, and are also returned.

    Args:
        url: Page listing the proxies.
        timeout: Seconds to wait for the listing page before giving up.

    Returns:
        List of the addresses that passed ``ip_test``, in page order.
    """
    response = urllib2.urlopen(url, timeout=timeout)
    try:
        page = response.read()
    finally:
        response.close()

    # Capture the text in front of each "@HTTP#" marker: 1-3 digits, a
    # literal dot, then the rest of the non-whitespace run (presumably
    # "a.b.c.d:port" -- verify against the live page format).
    ip_pattern = re.compile(r'([1-9][0-9]{0,2}\.\S*?)@HTTP#')
    candidates = ip_pattern.findall(page)

    ips = []
    # The context manager guarantees the file is closed even if a probe
    # raises something unexpected half-way through the list.
    with open('ips.txt', 'w') as out:
        for candidate in candidates:
            print(candidate)
            if ip_test(candidate):
                ips.append(candidate)
                out.write(candidate + '\n')
    return ips
62 
63 
def saveIps(list, file='ips.txt'):
    """Write proxy addresses to a text file, one address per line.

    The target file is overwritten. The parameter name ``list`` shadows the
    builtin but is kept so existing keyword callers continue to work.

    Args:
        list: Iterable of address strings.
        file: Path of the file to write.
    """
    # "with" closes the file even if an element is not a string and the
    # concatenation below raises.
    with open(file, 'w') as out:
        out.writelines(ip + '\n' for ip in list)
69 
70 
def read_ips(file='ips.txt'):
    """Read proxy addresses from a text file and return them as a list.

    Each line of the file becomes one list element with its newline
    removed. The resulting list is also printed.

    Args:
        file: Path of the file to read.

    Returns:
        List of the lines in ``file``, in file order.
    """
    # "with" makes sure the handle is released once the lines are read.
    with open(file) as source:
        ips = [line.strip("\n") for line in source]
    print(ips)
    return ips
81  
82  
if __name__ == "__main__":
    # Script entry point: look up the newest proxy-list page, probe every
    # address on it, then save the ones that passed.
    latest_list_url = get_iphtml_inyoudaili()
    working_ips = getIps(latest_list_url)
    saveIps(working_ips)
原文地址:https://www.cnblogs.com/willowj/p/6246640.html