# 爬取喜马拉雅免费有声小说 (Scrape free audiobooks from Ximalaya)

import requests
import re
from bs4 import BeautifulSoup
import json
import math

'''
写文件

'''


def json_sanalyzes(legal):
    """Flatten pages of track records into ``{'name': ..., 'src': ...}`` dicts.

    Args:
        legal: Iterable of pages. Each page is an iterable of mappings that
            contain at least the keys ``'trackName'`` and ``'src'``.

    Yields:
        One new dict per track, with ``'name'`` taken from ``'trackName'``
        and ``'src'`` copied as-is.

    Raises:
        KeyError: If a track record lacks ``'trackName'`` or ``'src'``.
    """
    for page in legal:
        for track in page:
            # Yield a fresh dict every time. Re-using one shared dict means
            # that a caller who collects the results (e.g. with ``list``)
            # ends up with N references to the last track only.
            yield {'name': track['trackName'], 'src': track['src']}



# return contents
#
#


def _get_ok(url, headers, attempts=3, timeout=30):
    """GET *url* and return the response once it answers with HTTP 200.

    The request is re-issued up to *attempts* times with a short pause in
    between. Raises RuntimeError if no attempt returns status 200.
    """
    import time  # local import: only needed for the retry pause

    last_status = None
    for attempt in range(attempts):
        if attempt:
            time.sleep(1)  # brief pause before asking the server again
        reply = requests.get(url, headers=headers, timeout=timeout)
        if reply.status_code == 200:
            return reply
        last_status = reply.status_code
    raise RuntimeError("GET %s failed (last HTTP status: %s)" % (url, last_status))


def dump_load(url, id):
    """Yield the track list of every page of an album.

    The album landing page is fetched first to read the total number of
    tracks; the paginated JSON endpoint is then queried 30 tracks at a time.

    Args:
        url: URL of the album's landing page.
        id: Album id, substituted into the paging endpoint's ``albumId``
            query parameter.

    Yields:
        For each page, the value found at ``['data']['tracksAudioPlay']`` in
        the decoded JSON response.

    Raises:
        RuntimeError: If a request never returns HTTP 200.
        ValueError: If the track count cannot be found in the landing page.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36'
    }
    page_size = 30

    # A non-200 answer used to spin forever in ``while ...: continue``
    # because the request was never repeated; _get_ok retries, then raises.
    landing = _get_ok(url, header)
    soup = BeautifulSoup(landing.text, "html.parser")
    heading = soup.find("h2", class_="rC5T")

    # Expects markup of the form
    #   <h2 class="rC5T">专辑里的声音(<!-- -->123<!-- -->)</h2>
    # The parentheses are literal text, so they must be escaped, and the
    # count is captured with \d+.
    match = re.search(
        r'<h2 class="rC5T">专辑里的声音\(<!-- -->(\d+)<!-- -->\)',
        str(heading),
    )
    if match is None:
        raise ValueError("could not find the track count in %s" % url)
    total_tracks = int(match.group(1))

    page_count = math.ceil(total_tracks / page_size)
    for page_num in range(1, page_count + 1):
        # "pageSize=" previously contained a stray space ("pageSize =30"),
        # which sent a differently named query parameter.
        page_url = (
            "https://www.ximalaya.com/revision/play/album"
            "?albumId=%s&pageNum=%d&sort=-1&pageSize=%d" % (id, page_num, page_size)
        )
        payload = json.loads(_get_ok(page_url, header).content.decode())
        yield payload['data']['tracksAudioPlay']

if __name__ == "__main__":
    import os

    album_id = 12642314  # id of the album (novel) to download
    # Landing page of the album; dump_load reads the track count from it.
    url = "https://www.ximalaya.com/youshengshu/%d/" % album_id
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36'
    }

    p = "D:/untitled2/venv/theand/小说/"  # directory the audio files are saved to
    # Create the directory once, including any missing parents.
    os.makedirs(p, exist_ok=True)

    # Each item is a dict with the track's display name and audio URL.
    for i in json_sanalyzes(dump_load(url, album_id)):
        dump_cont = requests.get(i['src'], headers=header, timeout=60)
        if dump_cont.status_code != 200:
            # Skip this track instead of looping forever on a bad status.
            print("skipped %s (HTTP %d)" % (i['name'], dump_cont.status_code))
            continue

        # Drop double quotes and spaces (as before), plus characters that
        # are not allowed in Windows file names or would act as path
        # separators.
        b_name = re.sub(r'[\\/:*?"<>| ]', '', i['name'])
        path_p = p + b_name + ".mp3"
        with open(path_p, "wb") as f:
            # Write the body already downloaded above; a second GET of the
            # same URL is unnecessary.
            f.write(dump_cont.content)

# 原文地址 (original article): https://www.cnblogs.com/wxc1/p/10237354.html