爬取三寸人间

#coding=gbk
import requests
from fake_useragent import UserAgent
from lxml import etree

# Page that is scraped for chapter links (presumably the book's chapter
# index -- confirm against the live site).
url = 'https://www.81zw.com/book/32934/'
headers = {
    # Random User-Agent string supplied by fake_useragent.
    'User-Agent': UserAgent().random
}

# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops early instead of parsing an HTTP error page.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
e = etree.HTML(response.text)
# Collect every href attribute found beneath <div id="list">.
txt_urls = e.xpath('//div[@id="list"]//@href')
# Drop the first character of each href and prefix the site root to build
# an absolute URL (assumes each href starts with "/").
txt_urls = ['https://www.81zw.com/' + txt_url[1:] for txt_url in txt_urls]

# Fetch every URL in txt_urls in order and append its title and body text to
# one output file. The file is opened once inside a ``with`` block so it is
# closed even if a request or a parse step raises part-way through.
with open('三寸人间.txt', 'a', encoding='utf-8-sig') as file:
    for num, txt_url in enumerate(txt_urls, start=1):
        # Time out rather than hang on a stalled connection, and fail loudly
        # on an HTTP error instead of writing an error page into the file.
        response = requests.get(txt_url, headers=headers, timeout=30)
        response.raise_for_status()
        # Decode the raw bytes explicitly as UTF-8 instead of relying on the
        # encoding requests would guess for response.text.
        e = etree.HTML(response.content.decode('utf-8'))
        # First <h1> text node is used as the title; raises IndexError if the
        # page has no <h1> text.
        txt_title = e.xpath('//h1/text()')[0]
        # Direct text nodes of <div id="content">, one list item per node.
        txt_content = e.xpath('//div[@id="content"]/text()')
        file.write(str(txt_title) + '\n')
        file.writelines(line + '\n' for line in txt_content)
        # NOTE: there is no delay between requests; add one here if the
        # server needs throttling.
        print("第 {} 章下载完毕".format(num))

2020-07-15

原文地址:https://www.cnblogs.com/hany-postq473111315/p/13306001.html