多线程爬取豆瓣音乐

该模块先抓取 4 个榜单列表页(每页 25 条),再逐个打开列表页中链接的共约 100 个详情页,并从每个详情页中提取所需的信息。

如果要改造后用于其他网站,使用方式是:先创建一个 urls 列表(供下方的 get_url() 函数逐个取用),例如:

 urls = ['https://music.douban.com/top250?start={}'.format(str(i)) for i in range(0,100,25)]

然后修改两处:一是从列表页中提取详情页链接的方式(对应下方的 get_url_music() 函数),二是详情页中需要解析的数据(对应下方的 get_music_info() 函数)。

import threading
import datetime
import requests
from bs4 import BeautifulSoup
import re
import time

# Record the start time; the script's main section subtracts it from the end
# time to report how long the crawl took.
starttime = datetime.datetime.now()
# Lock shared by all worker threads; get_url() holds it while it reads and
# modifies the global urls list.
lock = threading.Lock()
# Take the next URL from the shared list; access is serialized by `lock`.
def get_url():
    """Remove and return the next pending URL from the global ``urls`` list.

    The module-level ``lock`` is held for the whole check-and-remove step so
    that two threads can never receive the same URL.

    Returns:
        The first URL in ``urls`` (which is removed from the list), or the
        empty string ``""`` when the list is empty.
    """
    # The with-statement releases the lock on every exit path, including an
    # exception, which the manual acquire()/release() pairs did not guarantee.
    with lock:
        if not urls:
            return ""
        # pop(0) returns the head and deletes it in one step, so the next
        # caller sees the following URL.
        return urls.pop(0)



# Print the current Unix timestamp once, when this module-level code runs.
print(time.time())
# HTTP headers passed to every requests.get() call in this file.
headers = {
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
    # Placeholder text: replace it with the cookie string from your own
    # Douban session before running.
    # NOTE(review): the placeholder is non-ASCII; header values that cannot be
    # encoded as Latin-1 are likely to fail when the request is sent - confirm.
    'cookie':'豆瓣需要自己获取cookie'
}
def get_url_music(url, thread_name, timeout=10):
    """Fetch one list page and process every detail link found on it.

    The page is downloaded with the module-level ``headers``, parsed with
    BeautifulSoup, and each ``<a class="nbg">`` link is handed to
    get_music_info().

    Args:
        url: Address of the list page to download.
        thread_name: Label of the calling thread; forwarded unchanged to
            get_music_info(), which prints it next to each result.
        timeout: Seconds passed to ``requests.get`` as its timeout.
            Defaults to 10.
    """
    # Without a timeout, requests waits indefinitely on a stalled connection,
    # which would block this worker thread and the join() waiting on it.
    html = requests.get(url, headers=headers, timeout=timeout)

    soup = BeautifulSoup(html.text, 'lxml')

    for aTag in soup.find_all("a", attrs={"class": "nbg"}):
        # .get() avoids a KeyError on an anchor that carries the class but no
        # href; such anchors are skipped instead of aborting the whole page.
        href = aTag.get('href')
        if href:
            get_music_info(href, thread_name)


def _first_field(pattern, text, default='未知'):
    """Return the stripped first regex capture of *pattern* in *text*, or *default* if none."""
    found = re.findall(pattern, text, re.S)
    return found[0].strip() if found else default


def get_music_info(url, thread_name, timeout=10):
    """Download one detail page, extract its fields and print them.

    The title and author are read from the parsed HTML tree; genre, release
    date and publisher are pulled from the raw HTML with regular expressions.
    Any of those three that is not found is reported as '未知'.

    Args:
        url: Address of the detail page.
        thread_name: Label of the calling thread, printed before the result.
        timeout: Seconds passed to ``requests.get`` as its timeout.
            Defaults to 10.

    Raises:
        AttributeError: If the page lacks the ``id="wrapper"`` heading or the
            ``id="info"`` link that the title and author are read from.
    """
    # A timeout keeps a stalled connection from hanging the worker forever.
    html = requests.get(url, headers=headers, timeout=timeout)
    # Read the decoded body once and reuse it for the parser and the regexes.
    page = html.text
    soup = BeautifulSoup(page, 'lxml')
    name = soup.find(attrs={'id': 'wrapper'}).h1.span.text
    author = soup.find(attrs={'id': 'info'}).find('a').text

    style = _first_field('<span class="pl">流派:</span>&nbsp;(.*?)<br />', page)
    # Previously indexed [0] unconditionally and raised IndexError when the
    # field was missing; it now falls back like genre and publisher. The local
    # is no longer called `time`, which shadowed the imported module here.
    release_time = _first_field('发行时间:</span>&nbsp;(.*?)<br />', page)
    publisher = _first_field('<span class="pl">出版者:</span>&nbsp;(.*?)<br />', page)

    info = {
        'name': name,
        'author': author,
        'style': style,
        'time': release_time,
        'publisher': publisher,
    }
    # Prefix the record with the thread label so interleaved output from
    # several workers can be told apart.
    print(thread_name, info)
# Worker thread class used by the crawler.
class SpiderThread(threading.Thread):
    """Thread that repeatedly takes a URL from get_url() and crawls it.

    The thread ends once get_url() returns the empty string.
    """

    def __init__(self, name):
        super().__init__()
        # `name` is the thread's label; run() passes it on so that printed
        # results show which worker produced them.
        self.name = name

    def run(self):
        # iter(callable, sentinel) calls get_url() again and again and stops
        # as soon as it returns "", i.e. when no URL is left.
        for page_url in iter(get_url, ""):
            get_url_music(page_url, self.name)


if __name__ == '__main__':
    # Shared work list consumed by get_url(): offsets 0, 25, 50 and 75 give
    # four list-page URLs.
    urls = ['https://music.douban.com/top250?start={}'.format(str(i)) for i in range(0,100,25)]
    print(len(urls))

    # Create four workers named thread1..thread4; the name is only a label
    # that appears in the printed results.
    threads = [SpiderThread('thread{}'.format(n)) for n in range(1, 5)]

    # Start every worker before joining any, so they run concurrently.
    for worker in threads:
        worker.start()
    # Wait for all workers to finish before reporting.
    for worker in threads:
        worker.join()

    print("退出爬虫")
    endtime = datetime.datetime.now()
    # total_seconds() covers the whole interval; timedelta.seconds holds only
    # the seconds component and silently drops whole days.
    print('需要时间:', int((endtime - starttime).total_seconds()), '秒')


原文地址:https://www.cnblogs.com/wkhzwmr/p/15693789.html