利用XPath解析豆瓣电影排行榜页面,提取每部电影对应的信息

from lxml import etree
import requests

# Page to scrape: the Douban movie chart.
url = "https://movie.douban.com/chart"

# Send a desktop-browser User-Agent with the request.
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"}

# Without a timeout, requests waits indefinitely on a stalled connection.
response = requests.get(url, headers=headers, timeout=10)
# Raise on 4xx/5xx instead of silently parsing an error page.
response.raise_for_status()
html_str = response.content.decode()

# Turn the HTML text into an lxml element tree that can be queried with XPath.
html = etree.HTML(html_str)
print(html)

#1.获取所有电影的url
# url_list = html.xpath("//div[@class = 'indent']/div/table//div[@class='pl2']/a/@href")
#利用xpath获取table下class为pl2的div中a标签的href属性值
#print(url_list)

#2.获取所有图片的地址
# img_list = html.xpath("//div[@class = 'indent']/div/table/tr/td/a/img/@src")
#print(img_list)

# img = html.xpath("//div[@class = 'indent']/div/table//a[@class='nbg']/img/@src")
#print(img)
#上面两种写法效果相同:只要目标节点带有class属性,就可以用//加上class条件直接定位,而不必写出完整的层级路径。

#3.需要每部电影里对应的参数
def _first(values, default=None):
    """Return the first element of *values*, or *default* if it is empty."""
    return values[0] if values else default


# 3. Collect the fields of each movie; the loop treats every matched <table>
#    as one movie entry.
ret1 = html.xpath("//div[@class = 'indent']/div/table")
print(ret1)
for table in ret1:
    item = {}

    # Title: first text node of the link, with "/" removed and surrounding
    # whitespace stripped. None when the node is missing.
    title = _first(table.xpath(".//div[@class='pl2']/a/text()"))
    item["title"] = title.replace("/","").strip() if title is not None else None

    # Every xpath() call returns a list of matches; each field keeps only the
    # first match (or None) so all values in the dict are scalars.

    # Movie page address.
    item["url"] = _first(table.xpath(".//a[@class='nbg']/@href"))
    # Poster image address.
    item["img"] = _first(table.xpath(".//a[@class='nbg']/img/@src"))
    # Rating text.
    item["grade"] = _first(table.xpath(".//div[@class='star clearfix']/span[@class = 'rating_nums']/text()"))
    # Text of the "pl" span next to the rating.
    item["comment"] = _first(table.xpath(".//div[@class = 'star clearfix']/span[@class = 'pl']/text()"))

    print(item)

这样就能解析出电影的信息了。

原文地址:https://www.cnblogs.com/zengsf/p/8552218.html