Python爬虫爬取中国古诗词网上的名句

运行截图：

from urllib import request
from urllib.parse import urljoin

import requests
from lxml import etree

 5 # 全局变量（请求头+文件IO对象）
 6 headers = {
 7     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.44'}
 8 file=open('./古诗名句.txt','w',encoding='utf-8')
 9 
10 
# Download the listing pages and hand each page's HTML to the parser
def index(first_page=1, last_page=11, timeout=10):
    """Fetch every listing page in ``first_page..last_page`` and parse it.

    Each page is requested with the module-level ``headers`` and its HTML
    text is passed to ``clean``. A page that cannot be fetched (connection
    error, timeout, or an HTTP error status) is reported and skipped so
    that the remaining pages are still processed.

    Args:
        first_page: Number of the first page to request (inclusive).
        last_page: Number of the last page to request (inclusive).
        timeout: Seconds to wait for the server before giving up on a page.
    """
    for num in range(first_page, last_page + 1):
        # The ``p`` query parameter selects the page number.
        base_url = 'https://so.gushiwen.cn/mingju/default.aspx?p={}&c=&t='.format(num)
        print('正在写入', base_url, '中的数据信息...')
        try:
            # Without an explicit timeout a stalled connection would block
            # the whole run indefinitely.
            response = requests.get(base_url, headers=headers, timeout=timeout)
            # Do not parse an error page as if it were a listing page.
            response.raise_for_status()
        except requests.RequestException as exc:
            print('请求失败，已跳过：', base_url, exc)
            continue
        response.encoding = 'utf-8'  # decode the body as UTF-8
        clean(response.text)


# Parse one listing page and extract the quote records
def clean(html):
    """Extract quote links, quote texts and poem titles from a page.

    Every ``<div class="cont">`` is inspected on its own: its first ``<a>``
    child supplies the quote's href and text, its second ``<a>`` child (when
    present) supplies the poem title. Extracting per container keeps the
    three lists aligned; querying the whole document three times and
    zipping the results would pair quotes with the wrong titles as soon as
    one container lacks a second link.

    The aligned lists are passed on to ``sto``.

    Args:
        html: HTML source of one listing page.
    """
    htmls = etree.HTML(html)  # parse into an element tree
    if htmls is None:
        # NOTE(review): lxml appears to return None for an empty document;
        # there is nothing to extract in that case.
        return

    Mingjus_urls = []  # relative links to the quote pages
    Mingjus = []       # quote texts
    Poem_titles = []   # titles of the poems the quotes come from
    for container in htmls.xpath('//div[@class="cont"]'):
        anchors = container.xpath('./a')
        if not anchors:
            continue
        href = anchors[0].get('href')
        quote = ''.join(anchors[0].xpath('text()'))
        if href is None or not quote:
            # Not a usable quote entry: skip it instead of shifting the lists.
            continue
        title = ''.join(anchors[1].xpath('text()')) if len(anchors) > 1 else ''
        Mingjus_urls.append(href)
        Mingjus.append(quote)
        Poem_titles.append(title)

    sto(Mingjus_urls, Mingjus, Poem_titles)

# Write the extracted records to the output file
def sto(Mingjus_urls, Mingjus, Poem_titles, out=None):
    """Write one two-line record per quote.

    Each record has the form::

        <quote>\\t<poem title>
        名句网址：<absolute quote URL>

    The three sequences are consumed in lockstep; iteration stops at the
    shortest one.

    Args:
        Mingjus_urls: Hrefs of the quote pages, as found in the page source.
        Mingjus: Quote texts.
        Poem_titles: Titles of the poems the quotes belong to.
        out: Writable text stream to receive the records. Defaults to the
            module-level ``file`` handle.
    """
    if out is None:
        out = file
    for href, quote, title in zip(Mingjus_urls, Mingjus, Poem_titles):
        # urljoin resolves the href against the site root, so a leading
        # slash in the href does not produce a doubled "//" in the URL.
        quote_url = urljoin('https://so.gushiwen.cn/', href)
        record = quote + '\t' + title + '\n' + '名句网址：' + quote_url
        out.write(record + '\n')

if __name__ == '__main__':
    # Close the output file even when the crawl aborts with an exception,
    # so that everything written so far is flushed to disk.
    try:
        index()
    finally:
        file.close()
View Code