豆瓣最佳影评-星级转换

总的来说,爬取豆瓣信息并不算难,网上的教程一抓一大把;但自己写的代码终究和别人的不一样,尤其是那种一行一行想出来、一行一行敲出来的酸爽感觉。

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from lxml import etree
import csv

# Output CSV shared by the whole script. It is opened in append mode so rows
# written by earlier runs are kept.
# NOTE (original author): this approach is slower than using `with open`.
file = open('douban1.csv', 'a', newline='', encoding='utf-8')
writer = csv.writer(file)
# In append mode the stream is positioned at end-of-file, so a position of 0
# means the file is new or empty. Only then is the header written; otherwise
# every run would add another header line in the middle of the data.
if file.tell() == 0:
    writer.writerow(['星级', '内容'])

def id(url):
    """Return the review ids listed on the page at ``url``.

    The page is downloaded with ``requests`` and parsed with lxml; the ids
    are the ``data-rid`` attribute values of the ``div`` children of every
    ``div`` whose class is ``main-bd``.

    Args:
        url: Address of the review listing page.

    Returns:
        A list of attribute values (strings), in document order.
    """
    response = requests.get(url)
    tree = etree.HTML(response.text)
    return tree.xpath('//div[@class="main-bd"]/div/@data-rid')


def next_url(url):
    """Find the "next page" link on ``url`` and continue scraping there.

    The page is downloaded and the ``href`` of the link inside
    ``div.paginator > span.next`` is resolved against ``url``; the resulting
    address is handed to ``sel``. When the page has no such link (for example
    on the last page) the function returns without doing anything, instead of
    failing with ``IndexError``.

    Args:
        url: Address of the page that has just been processed.

    Returns:
        None.
    """
    from urllib.parse import urljoin

    rsp = requests.get(url)
    html = etree.HTML(rsp.text)
    hrefs = html.xpath('//div[@class="paginator"]/span[@class="next"]/a/@href')
    if not hrefs:
        # No "next" link: nothing left to crawl.
        return
    # urljoin gives the same result as prefixing the site root when the href
    # is an absolute path, and also copes with relative or query-only hrefs.
    sel(urljoin(url, hrefs[0]))

def star(url,i):
    """Return the rating CSS class(es) of one review on the page at ``url``.

    The page is downloaded and parsed, then the ``class`` attribute of every
    ``span`` directly under the review's ``header.main-hd`` is collected. The
    review is located through the ``div`` whose ``data-cid`` equals ``i``.

    Args:
        url: Address of the review listing page.
        i: Review id as a string (it is concatenated into the XPath).

    Returns:
        A list of class attribute strings; empty when nothing matches.
    """
    query = (
        '//div[@data-cid="' + i + '"]'
        + '/div[@class="main review-item"]'
        + '/header[@class="main-hd"]/span/@class'
    )
    page = requests.get(url)
    return etree.HTML(page.text).xpath(query)


def translate(star):
    """Convert a rating CSS class into its Chinese star label.

    Args:
        star: List of class attribute strings as returned by ``star()``;
            only the first entry is inspected, e.g.
            ``['allstar50 main-title-rating']``.

    Returns:
        '五星' / '四星' / '三星' / '二星' / '一星' when the first entry
        contains the matching ``allstarNN`` class token. '默认好评' is
        returned when the review has no rating (the first entry is
        ``'main-meta'``), and also when the list is empty or the class is
        not recognised, so the caller always receives a string.
    """
    labels = {
        'allstar50': '五星',
        'allstar40': '四星',
        'allstar30': '三星',
        'allstar20': '二星',
        'allstar10': '一星',
    }
    fallback = '默认好评'  # some reviews carry no star rating
    if not star:
        return fallback
    # Look at individual class tokens so the result does not depend on the
    # exact order or spacing of the class attribute.
    for token in star[0].split():
        if token in labels:
            return labels[token]
    return fallback

def _clean_review_text(text):
    """Strip surrounding whitespace and drop newlines, tabs, CRs and NBSPs."""
    text = text.strip()
    for unwanted in ('\n', '\t', '\xa0', '\r'):
        text = text.replace(unwanted, '')
    return text


def sel(url, driver_path=r"D:\Python\Scripts\chromedriver.exe"):
    """Scrape one listing page: expand each review and save rating + text.

    A Chrome window is opened on ``url``. For every review id returned by
    ``id(url)`` the review's "expand" link is scrolled into view and clicked,
    the rendered page is re-parsed, and one ``(star label, review text)`` row
    is printed and written through the module-level ``writer``. The browser
    is always shut down afterwards, then ``next_url(url)`` is called to move
    on to the following page.

    Args:
        url: Address of the review listing page to process.
        driver_path: Location of the chromedriver executable.
            NOTE(review): the path separators in the original literal appear
            to have been lost; the default is a reconstruction - confirm.

    Returns:
        None.
    """
    # Dynamic page: drive a real Chrome instance so the "expand" links work.
    brow = webdriver.Chrome(driver_path)
    try:
        brow.get(url)
        for rid in id(url):
            toggle = brow.find_element_by_xpath(
                '//div[@class="short-content"]/a[@id="toggle-' + rid + '-copy"]')
            # Bring the link to the middle of the viewport before clicking;
            # otherwise an already expanded review can cover it and the click
            # fails with "element is not clickable at point".
            brow.execute_script(
                "arguments[0].scrollIntoView({block: 'center'});", toggle)
            toggle.click()
            # Wait for the expanded text to load; without the pause the page
            # source still holds the collapsed markup.
            time.sleep(2)
            soup = BeautifulSoup(brow.page_source, "lxml")
            blocks = soup.find_all('div', class_='review-content clearfix')
            if not blocks:
                print('no expanded content found for review ' + rid)
                continue
            # Prefer the block whose data-url names this review; fall back to
            # the last block on the page, which is the one just expanded when
            # reviews are processed in document order.
            # NOTE(review): assumes data-url ends in /review/<id>/ - confirm.
            wanted = '/review/' + rid + '/'
            matching = [b for b in blocks if wanted in (b.get('data-url') or '')]
            block = matching[0] if matching else blocks[-1]
            content = _clean_review_text(block.get_text())
            # Raw rating class, e.g. ['allstar50 main-title-rating'].
            rating = translate(star(url, rid))
            # The whole text is written; it is no longer cut at the first '*'.
            params = (rating, content)
            print(params)
            writer.writerow(params)
    finally:
        # quit() ends the driver session as well as the window, so no browser
        # or chromedriver process is left behind, even after an error.
        brow.quit()
    next_url(url)  # continue with the following page


if __name__=='__main__':
    # Entry point: start from the "best reviews" listing page; sel() moves on
    # to the following pages by itself through next_url().
    url = 'https://movie.douban.com/review/best/'
    try:
        sel(url)
    finally:
        # Close the module-level CSV file so buffered rows reach the disk
        # even when scraping stops with an error.
        file.close()

'''
selenium.common.exceptions.WebDriverException: Message: unknown error: Element <a href="javascript:;" id="toggle-9590829-copy" class="unfold" title="...">展开</a> is not clickable at point (120, 586). Other element would receive the click: <div class="review-content clearfix" data-author="夜第七章" data-url="https://movie.douban.com/review/9592082/" data-original="1">...</div>
  (Session info: chrome=54.0.2840.99)
  (Driver info: chromedriver=2.27.440174 (e97a722caafc2d3a8b807ee115bfb307f7d2cfd9),platform=Windows NT 10.0.14393 x86_64)
  错误原因:所选元素不是 input,无法获得焦点,点击会落到遮挡它的其他元素上;解决办法:先用 sleep 等待,再用 window.scrollTo(0,x) 把元素滚动到可点击的位置。
'''

  

原文地址:https://www.cnblogs.com/fodalaoyao/p/10474958.html