爬虫_斗图啦(队列,多线程)

 1 import threading
 2 import requests
 3 from lxml import etree
 4 from urllib import request
 5 import os
 6 import re
 7 from queue import Queue
 8 
 9 
class Producer(threading.Thread):
    """Worker thread that turns listing-page URLs into image download jobs.

    Each producer repeatedly takes a page URL from ``page_queue``, fetches
    and parses that page, and puts one ``(img_url, img_name)`` tuple on
    ``img_queue`` for every non-GIF image it finds.
    """

    # Browser-like User-Agent sent with every page request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
    }
    # Seconds to wait for a page response before giving up on that page.
    request_timeout = 10
    # Characters stripped from the alt text before it becomes a file name:
    # the punctuation removed originally, plus full-width variants, the
    # backslash and the characters Windows rejects in file names.
    _unsafe_chars = re.compile(r'[,。??,,/\\·:*"<>|]')

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        """Store the shared queues; extra arguments go to ``threading.Thread``.

        Args:
            page_queue: queue of listing-page URLs to crawl.
            img_queue: queue receiving ``(img_url, img_name)`` tuples.
        """
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Parse pages until ``page_queue`` has no more URLs."""
        # Local import: the file itself only imports ``Queue``.
        from queue import Empty

        while True:
            # A separate empty() check followed by a blocking get() can hang
            # forever when another producer grabs the last URL in between,
            # so take the item and detect exhaustion in one atomic step.
            try:
                url = self.page_queue.get_nowait()
            except Empty:
                break
            self.parse_page(url)

    def parse_page(self, url):
        """Fetch one listing page and queue every non-GIF image on it.

        Pages that cannot be fetched or parsed are reported and skipped so a
        single bad page does not kill the worker thread.
        """
        try:
            response = requests.get(
                url, headers=self.headers, timeout=self.request_timeout)
            response.raise_for_status()
        except requests.RequestException as exc:
            print('%s  failed: %s' % (url, exc))
            return

        html = etree.HTML(response.text)
        if html is None:
            # lxml returns None for an empty document.
            return

        imgs = html.xpath("//div[@class='page-content text-center']//a//img")
        for img in imgs:
            if img.get('class') == 'gif':
                continue
            img_url = img.get('data-original')
            if not img_url:
                # Nothing to download without a source URL.
                continue
            img_name = self._build_name(img.get('alt'), img_url)
            self.img_queue.put((img_url, img_name))

    @classmethod
    def _build_name(cls, alt, img_url):
        """Return a file name built from the alt text and the URL's extension.

        Falls back to the URL's own base name when the alt text is missing or
        contains nothing but stripped characters.
        """
        suffix = os.path.splitext(img_url)[1]
        stem = cls._unsafe_chars.sub('', alt or '')
        if not stem.strip():
            stem = os.path.splitext(os.path.basename(img_url))[0]
        return stem + suffix
42 
43 
class Consumer(threading.Thread):
    """Worker thread that downloads queued images into the ``images`` folder.

    Each consumer takes ``(url, filename)`` tuples from ``img_queue`` and
    saves the image as ``images/<filename>``. It stops once ``img_queue``
    has stayed empty for ``idle_timeout`` seconds while ``page_queue`` is
    empty as well.
    """

    # Seconds to wait on an empty image queue before re-checking whether the
    # page queue has been drained too.
    idle_timeout = 3

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        """Store the shared queues; extra arguments go to ``threading.Thread``.

        Args:
            page_queue: queue of listing-page URLs still to be crawled.
            img_queue: queue of ``(url, filename)`` tuples to download.
        """
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        """Download images until both queues have run dry."""
        # Local import: the file itself only imports ``Queue``.
        from queue import Empty

        # urlretrieve does not create missing directories.
        os.makedirs('images', exist_ok=True)

        while True:
            # A bounded wait replaces the empty()-then-blocking-get() pair,
            # which could hang forever when a sibling consumer took the last
            # item between the check and the get.
            try:
                url, filename = self.img_queue.get(timeout=self.idle_timeout)
            except Empty:
                if self.page_queue.empty():
                    # NOTE: a page that is still being parsed after the page
                    # queue drained, and that takes longer than idle_timeout,
                    # can still be missed; raise idle_timeout for slow sites.
                    return
                continue

            try:
                request.urlretrieve(url, 'images/' + filename)
            except (OSError, ValueError) as exc:
                # URLError is an OSError; ValueError covers malformed URLs.
                # One bad image must not kill the worker.
                print(filename + '  failed: ' + str(exc))
                continue
            print(filename+'  下载')
60 
61 
def main():
    """Crawl listing pages 1-100 with five producer and five consumer threads.

    Fills the page queue with the listing URLs, makes sure the ``images``
    output directory exists, starts the workers and waits for all of them
    to finish.
    """
    page_queue = Queue(100)
    img_queue = Queue(500)

    # Consumers write to images/<name>; urlretrieve fails if the directory
    # is missing, so create it before any download starts.
    os.makedirs('images', exist_ok=True)

    for x in range(1, 101):
        url = "http://www.doutula.com/photo/list/?page=%d" % x
        page_queue.put(url)

    workers = [Producer(page_queue, img_queue) for _ in range(5)]
    workers += [Consumer(page_queue, img_queue) for _ in range(5)]
    for worker in workers:
        worker.start()
    # Keep the handles and join so main() returns only when the crawl is done.
    for worker in workers:
        worker.join()
75 
76 
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

下载是相当快啊

原文地址:https://www.cnblogs.com/MC-Curry/p/9459640.html