# Python 爬虫多线程编程

#使用了线程库
import threading
from queue import  Queue
from bs4 import BeautifulSoup
import  json
import requests
class ThreadCrawl(threading.Thread):
    """Worker thread that downloads listing pages and queues the responses.

    Page numbers are taken from ``pageQueue``; each one is turned into a
    qiushibaike.com listing URL, fetched with ``requests``, and the resulting
    response object is put on ``dataQueue``. The thread keeps looping until
    the module-level ``CRAWL_EXIT`` flag becomes true.

    Args:
        threadNmae: Display name printed when the thread starts and stops
            (the spelling is kept because callers use it).
        pageQueue: Queue of page numbers still to be fetched.
        dataQueue: Queue that receives one response object per fetched page.
    """

    # Seconds to wait on the network before giving up on one page, so that a
    # stalled connection cannot keep the thread (and join()) blocked forever.
    REQUEST_TIMEOUT = 10
    # Seconds to wait for a page number before re-checking CRAWL_EXIT.
    POLL_INTERVAL = 0.1

    def __init__(self, threadNmae, pageQueue, dataQueue):
        super(ThreadCrawl, self).__init__()
        self.threadNmae = threadNmae
        self.pageQueue = pageQueue
        self.dataQueue = dataQueue
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"}

    def run(self):
        """Fetch pages from ``pageQueue`` until ``CRAWL_EXIT`` is set."""
        # Local import: the file's import block only brings in ``Queue``.
        from queue import Empty

        print("启动"+self.threadNmae)
        while not CRAWL_EXIT:
            try:
                # Wait briefly instead of polling non-blockingly: an empty
                # queue no longer burns CPU, and the exit flag is still
                # re-checked every POLL_INTERVAL seconds.
                page = self.pageQueue.get(timeout=self.POLL_INTERVAL)
            except Empty:
                continue

            url = "https://www.qiushibaike.com/8hr/page/" + str(page) + "/"
            try:
                content = requests.get(
                    url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException as err:
                # Stay best-effort (skip the page) but report the failure
                # instead of swallowing it silently.
                print("请求失败 " + url + ": " + str(err))
                continue
            self.dataQueue.put(content)
        print("结束"+self.threadNmae)
# Stop flag for the crawl threads: ThreadCrawl.run() loops while this is False,
# and main() sets it to True once the page queue has been emptied.
CRAWL_EXIT=False
# NOTE(review): not referenced anywhere in the visible code — presumably the
# matching stop flag for a parsing stage; confirm before relying on it.
PARSE_EXIT=False
def main():
    """Queue pages 1-10, crawl them with three worker threads, then shut down.

    The page queue is filled completely before any worker is started. Once the
    queue has been drained, the module-level ``CRAWL_EXIT`` flag is set so the
    workers leave their loops, and every worker is joined.
    """
    # Local import: only needed for the polling sleep below.
    import time

    global CRAWL_EXIT

    # The page queue holds at most 10 entries; fill it with pages 1-10 (FIFO).
    pageQueue = Queue(10)
    for i in range(1, 11):
        pageQueue.put(i)

    # Everything below must run once, after the queue is full. Running it
    # inside the fill loop sets CRAWL_EXIT after the first page, so later
    # workers exit immediately and the wait loop never sees an empty queue.

    # Unbounded queue that collects the fetched responses.
    dataQueue = Queue()
    # Display names of the three crawl threads.
    crawList = ["采集线程1号", "采集线程2号", "采集线程3号"]
    # Started crawl threads, kept so they can be joined later.
    threadcrawl = []
    for threadNmae in crawList:
        thread = ThreadCrawl(threadNmae, pageQueue, dataQueue)
        thread.start()
        threadcrawl.append(thread)

    # Wait until the workers have taken every page; sleep between checks so
    # the main thread does not spin at full CPU.
    while not pageQueue.empty():
        time.sleep(0.05)

    # Workers finish the request they are on, then see the flag and stop.
    CRAWL_EXIT = True
    print("Queue为空")
    for thread in threadcrawl:
        thread.join()
        print("joining...............")
if __name__=="__main__":
    main()

  

# 原文地址: https://www.cnblogs.com/c-x-a/p/8027281.html