学习爬虫的day02 (用线程去爬虫 提高速度)

通过 lxml 解析抓取到的页面数据，并将结果写入 file 目录下对应的 html 文件中

代码如下

# 用线程去爬虫
from urllib.request import Request from urllib.request import urlopen from time import sleep,ctime from lxml import etree import _thread; ii=0 headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'} def spilder(page): global ii; url = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%%E5%%8C%%97%%E4%%BA%%AC&kw=python&sm=0&p=%i"%(page); req = Request(url=url, headers=headers); req_timeout = 5; f = urlopen(req, None, req_timeout); s = f.read(); s=s.decode("UTF-8"); s=str(s) selector = etree.HTML(s); links = selector.xpath('//tr/td[@class="zwmc"]/div/a/@href|//tr/td[@class="zwmc"]/div/a/text()'); f=open("file/%i.html" %page,'w') for link in links: f.write("%s<br>"%link); print(link); ii+=1; print(ii) f.close(); def main(): global ii; for i in range(1,11): _thread.start_new_thread(spilder,(i,)) for kk in range(15): if(ii>9): break; else : sleep(2) main()

运行时可能会报 socket.timeout: timed out——多半是请求过于频繁被网站限流/封禁，或超时时间设得太短。
可以适当增大超时时间、加入失败重试、降低并发数，或者使用代理 IP 池来缓解。
原文地址:https://www.cnblogs.com/qieyu/p/7800918.html