爬虫之 段子网

爬虫之 段子网

目标网址:https://ishuo.cn/{i},其中 i 依次取 ['duanzi','yulu','joke','xiaozhishi','duanyu'] 中的每个值。

import requests
import re
# Page slugs appended to https://ishuo.cn/ to build each URL that is scraped.
CATEGORIES = ('duanzi', 'yulu', 'joke', 'xiaozhishi', 'duanyu')

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10

# `.` matches any character except a newline and `*?` is a non-greedy
# "zero or more", so each capture group stops at the first closing tag.
CONTENT_PATTERN = re.compile(r'<div class="content">(.*?)</div>')
TITLE_PATTERN = re.compile(r'<a href="/subject/.*?">(.*?)</a>')


def extract_entries(html, skip=10, limit=50):
    """Return a ``{title: content}`` mapping parsed from one page of HTML.

    Titles are the link texts of ``<a href="/subject/...">`` anchors and
    contents are the bodies of ``<div class="content">`` elements. The
    n-th kept title is paired with the n-th content block.

    Args:
        html: Raw HTML text of the page.
        skip: Number of leading title matches to discard before pairing.
            NOTE(review): the default of 10 presumably skips links that are
            not entry titles on this site -- confirm against the live page.
        limit: Maximum number of titles kept after skipping.

    Returns:
        A dict in page order. Pairing stops at the shorter of the two
        lists, so a page with fewer content blocks than titles yields
        fewer entries instead of raising ``IndexError``. Repeated titles
        collapse into one key holding the last matching content.
    """
    contents = CONTENT_PATTERN.findall(html)
    titles = TITLE_PATTERN.findall(html)[skip:skip + limit]
    return dict(zip(titles, contents))


def scrape(categories=CATEGORIES):
    """Fetch every category page and print one ``title | content`` row per entry.

    Args:
        categories: Iterable of page slugs appended to ``https://ishuo.cn/``.
    """
    for category in categories:
        # Plain GET of the category page; the timeout bounds a stalled server.
        response = requests.get(f'https://ishuo.cn/{category}',
                                timeout=REQUEST_TIMEOUT)
        for title, content in extract_entries(response.text).items():
            print(f'{title:<40} | {content:<1000}')


if __name__ == '__main__':
    scrape()
原文地址:https://www.cnblogs.com/dadazunzhe/p/11232539.html