# 百度贴吧爬虫代码

import requests
from lxml import etree

# Fetch one Baidu Tieba thread page and print the text content of a reply post.
url = "https://tieba.baidu.com/p/6585139804"
# Send a desktop-browser User-Agent with the request.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"}
# A timeout keeps the script from hanging forever if the server never answers.
page_text = requests.get(url=url, headers=headers, timeout=10).text
tree = etree.HTML(page_text)
# Select the post containers: the class attribute must equal this exact
# string, including the two trailing spaces.
div_list = tree.xpath('//div[@class="l_post l_post_bright j_l_post clearfix  "]')
all_data_list = list()
# Only the second matched post is processed (slice [1:2]); an empty or
# single-element div_list simply yields an empty result.
for div in div_list[1:2]:
    # The leading "." anchors the query at this post's div. A path starting
    # with a bare "//" is evaluated from the document root, which would
    # collect the text of every matching node on the page instead.
    desc = div.xpath('.//div[2]/div[1]/cc/div[2]/text()')
    # Join the text fragments into one string and trim surrounding whitespace.
    all_data_list.append("".join(desc).strip())
print(all_data_list)

 

# 原文地址: https://www.cnblogs.com/groundcontrol/p/12608258.html