3.21

Reposting a nursing-home crawler project (laying the data groundwork for my graduation project):

import requests
from lxml import etree
import csv
import threading
from queue import Queue, Empty
import time

parse_count = 1        # running count of pages handled by the parser threads
crawl_fail_list = []   # page numbers whose download failed
parse_fail_list = []   # page numbers whose parsing failed
# listing pages look like: http://www.yanglao.com.cn/resthome_2
class crawl_thread(threading.Thread):

    def __init__(self, name, page_queue, data_queue):
        super().__init__()
        self.name = name
        self.page_queue = page_queue
        self.data_queue = data_queue

    def run(self):
        print("*********%s started************" % self.name)
        while 1:
            # take a page number; stop this thread once the page queue is drained
            try:
                page = self.page_queue.get(block=False)
            except Empty:
                break
            # build the URL for this page and fetch it
            try:
                url = 'http://www.yanglao.com.cn/resthome_' + str(page)
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
                }
                # send the request and get the response
                r = requests.get(url, headers=headers)
                # hand the response body to the parser threads
                self.data_queue.put(r.text)
                print('%s: page %s crawled' % (self.name, page))
                time.sleep(0.3)
            except Exception as e:
                print(e)
                crawl_fail_list.append(page)
        print("*********%s finished************" % self.name)

class parse_thread(threading.Thread):

    def __init__(self, name, data_queue, suo, writer):
        super().__init__()
        self.name = name
        self.data_queue = data_queue
        self.suo = suo          # shared lock ("suo" = lock) guarding the CSV writer
        self.writer = writer

    def run(self):
        global parse_count
        print("*********%s started************" % self.name)
        while 1:
            # pull one page of HTML; give up after 15 s with no new data
            try:
                content = self.data_queue.get(True, 15)
            except Empty:
                break
            # parse the page
            try:
                tree = etree.HTML(content)
                li_list = tree.xpath('//li[@class="rest-item"]')
                for li in li_list:
                    name = li.xpath('.//h4/a/text()')[0]
                    location = li.xpath('.//ul/li[1]/text()')[0].replace('地址:', '')
                    beds = li.xpath('.//ul/li[2]/text()')[0].replace('床位数:', '').replace('张', '')
                    money = li.xpath('.//ul/li[3]/text()')[0].replace('收费区间:', '')
                    lt = [name, location, beds, money]
                    # take the lock before writing a CSV row
                    self.suo.acquire()
                    self.writer.writerow(lt)
                    self.suo.release()
                print("%s: page %s written" % (self.name, parse_count))
            # on a parse error, record the page and keep looping
            except Exception as e:
                print(e)
                parse_fail_list.append(parse_count)
            # rough progress counter; incremented without the lock, so only approximate
            parse_count += 1
        print("*********%s finished************" % self.name)

##################################################################
def create_queue():
    # queue of page numbers to crawl
    page_queue = Queue()
    # 1675 listing pages in total
    for page in range(1, 1676):
        page_queue.put(page)
    # queue for the raw HTML handed from crawlers to parsers
    data_queue = Queue()
    return page_queue, data_queue

def create_crawl_list(page_queue, data_queue):
    crawl_list = []
    name_list = ['Crawler-1', 'Crawler-2']
    for name in name_list:
        crawl = crawl_thread(name, page_queue, data_queue)
        crawl_list.append(crawl)
    return crawl_list

def create_parse_list(data_queue, suo, writer):
    parse_list = []
    name_list = ['Parser-1', 'Parser-2']
    for name in name_list:
        parse = parse_thread(name, data_queue, suo, writer)
        parse_list.append(parse)
    return parse_list

###################################################
def main():
    # create the queues
    page_queue, data_queue = create_queue()
    # create the lock shared by the parser threads
    suo = threading.Lock()
    # open the output file and create the CSV writer
    f = open('养老院数据_全.csv', 'a', encoding='utf8', newline='')  # "nursing home data (full)"
    writer = csv.writer(f)
    # create the crawler and parser thread lists
    crawl_list = create_crawl_list(page_queue, data_queue)
    parse_list = create_parse_list(data_queue, suo, writer)
    print(crawl_list, parse_list)  # debug: show the thread objects
    # start the crawlers, then the parsers
    for crawl in crawl_list:
        crawl.start()
    for parse in parse_list:
        parse.start()
    # make sure the main thread exits last
    for crawl in crawl_list:
        crawl.join()
    for parse in parse_list:
        parse.join()
    # wrap up
    f.close()
    print('All threads closed, program finished!!!')
    print(crawl_fail_list)
    print(parse_fail_list)


if __name__ == '__main__':
    main()
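
Both failure lists are only printed at the end of main(). A natural extension, sketched below and not part of the original post, is a slower single-threaded retry pass over crawl_fail_list that feeds recovered pages back into data_queue for the same parsing path; the 10-second timeout and one-second delay are assumptions.

def retry_failed_pages(fail_list, data_queue):
    # Hypothetical helper: re-fetch the pages that failed during the threaded
    # crawl, one at a time and with a longer pause between requests.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    }
    still_failing = []
    for page in fail_list:
        url = 'http://www.yanglao.com.cn/resthome_' + str(page)
        try:
            r = requests.get(url, headers=headers, timeout=10)
            r.raise_for_status()
            data_queue.put(r.text)  # a parser thread can pick this up as before
        except Exception as e:
            print('retry of page %s failed: %s' % (page, e))
            still_failing.append(page)
        time.sleep(1)  # be gentler on the second pass
    return still_failing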

Original post: https://www.cnblogs.com/maxin123/p/12536922.html