ximalaya-spider

import json
import os
import re

import parsel
import requests

# Download every audio track listed on a Ximalaya album page into `path`.
#
# For each listing page the script:
#   1. fetches the page HTML and selects the track <li> elements,
#   2. takes the track id from the last path segment of each track link,
#   3. asks the play API for the track's audio source URL,
#   4. downloads the audio bytes and writes them to "<path><title>.mp3".

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36',
}
path = "./video/"

# Every request passes `proxies`, but the name was never defined, so the first
# request raised NameError. None lets requests use its default behaviour;
# replace it with a {"http": ..., "https": ...} mapping to route via a proxy.
proxies = None

# Seconds to wait on a request before giving up, so a stalled connection
# cannot hang the script indefinitely.
timeout = 30

# Characters that must not appear in a file name: the Windows-reserved set
# plus line breaks. Compiled once here instead of on every track.
illegal_chars = re.compile(r'[\\/:*?"<>|\r\n]+')

# open(..., "wb") does not create missing directories.
os.makedirs(path, exist_ok=True)

# NOTE(review): range(1) only ever requests ".../p0/"; confirm that this is the
# intended page index and widen the range to fetch more pages.
for i in range(1):
    url = 'https://www.ximalaya.com/youshengshu/4256765/p%d/' % i
    response = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
    # Fail loudly on an HTTP error instead of parsing an error page.
    response.raise_for_status()
    html_data = response.text

    selector = parsel.Selector(html_data)
    lis = selector.xpath('//*[@id="anchor_sound_list"]/div[2]/ul/li')

    for li in lis:
        title = li.xpath('.//a/@title').get()
        href = li.xpath('.//a/@href').get()
        if not title or not href:
            # .get() returns None when the <li> has no matching attribute;
            # without both values there is nothing to download or name.
            continue

        # The track id is the last path segment of the link.
        m4a_id = href.rstrip('/').split('/')[-1]
        video_url = f'https://www.ximalaya.com/revision/play/v1/audio?id={m4a_id}&ptype=1'
        print("开始下载音频数据:%s" % title)

        info_response = requests.get(url=video_url, headers=headers, proxies=proxies, timeout=timeout)
        info_response.raise_for_status()
        m4a_dict = json.loads(info_response.text)

        # Skip the track rather than crash when the API returns no source URL.
        m4a_url = (m4a_dict.get('data') or {}).get('src')
        if not m4a_url:
            print("未获取到音频地址,已跳过:%s" % title)
            continue

        audio_response = requests.get(m4a_url, headers=headers, proxies=proxies, timeout=timeout)
        audio_response.raise_for_status()
        m4a_data = audio_response.content

        # sub() returns the title unchanged when nothing matches, so a single
        # code path covers both clean and unsafe titles.
        new_title = illegal_chars.sub('_', title)
        # NOTE(review): the ".mp3" suffix is kept from the original script even
        # though the variable names suggest the payload is m4a audio.
        with open(os.path.join(path, new_title + '.mp3'), "wb") as w:
            w.write(m4a_data)
        print("%s音频数据保存完毕" % title)

  

原文地址:https://www.cnblogs.com/hello-python2020/p/14187350.html