总的来说，爬取豆瓣信息不算难，因为网上的教程一抓一大把；但自己写的代码终究和别人的不一样，尤其是那种自己一点一点想出来、一行一行敲出来的酸爽感觉。
import csv
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from lxml import etree
from selenium import webdriver

# Scraper for Douban movie reviews: for every review on a listing page it
# expands the full text in a Selenium-driven Chrome window and appends one
# (star label, review text) row to a CSV file, then follows the "next" link.

CSV_PATH = 'douban1.csv'
# NOTE(review): this path looks like it lost its backslashes somewhere
# (presumably D:\Python\Scripts\chromedriver.exe) -- confirm and correct.
CHROMEDRIVER_PATH = r"D:PythonScriptschromedriver.exe"
REQUEST_TIMEOUT = 30      # seconds; requests.get() waits forever without one
EXPAND_WAIT_SECONDS = 2   # the expanded text is loaded asynchronously after the click

# Label used when a review carries no recognisable star rating.
_UNRATED_LABEL = '默认好评'

# CSS class of the rating <span>  ->  label written to the CSV.
_STAR_LABELS = {
    'allstar50 main-title-rating': '五星',
    'allstar40 main-title-rating': '四星',
    'allstar30 main-title-rating': '三星',
    'allstar20 main-title-rating': '二星',
    'allstar10 main-title-rating': '一星',
}

# The output file stays a module-level object because the scraping functions
# write through the global ``writer``; it is closed by the __main__ guard.
file = open(CSV_PATH, 'a', newline='', encoding='utf-8')
writer = csv.writer(file)
if file.tell() == 0:
    # Append mode positions the stream at the end of the file, so offset 0
    # means the file is empty. Only then is the header needed; otherwise
    # every run would add another header row in the middle of the data.
    writer.writerow(['星级', '内容'])


def _fetch_tree(url: str):
    """Download *url* with requests and return it parsed as an lxml HTML tree.

    Raises:
        requests.HTTPError: if the server answers with an error status, so a
            blocked or missing page fails loudly instead of yielding an
            empty result further down.
    """
    rsp = requests.get(url, timeout=REQUEST_TIMEOUT)
    rsp.raise_for_status()
    return etree.HTML(rsp.text)


def _review_ids(tree) -> list:
    """Return the ``data-rid`` attribute of every review found in *tree*."""
    return tree.xpath('//div[@class="main-bd"]/div/@data-rid')


def _star_classes(tree, review_id: str) -> list:
    """Return the class attributes of the header spans of one review."""
    return tree.xpath(
        '//div[@data-cid="' + review_id + '"]/div[@class="main review-item"]'
        '/header[@class="main-hd"]/span/@class'
    )


def _next_page_url(tree, current_url: str):
    """Return the absolute URL of the next listing page, or None on the last page."""
    hrefs = tree.xpath('//div[@class="paginator"]/span[@class="next"]/a/@href')
    if not hrefs:
        return None
    # urljoin resolves root-relative, query-only and absolute hrefs alike.
    return urljoin(current_url, hrefs[0])


def _clean_text(text: str) -> str:
    """Remove every whitespace character (spaces, newlines, tabs, NBSP) from *text*."""
    # str.split() with no argument splits on any Unicode whitespace, including
    # '\xa0', so joining the pieces back drops all of it in a single pass.
    return "".join(text.split())


def id(url):
    """Return the ids (``data-rid`` values) of all reviews listed on *url*.

    The name shadows the ``id`` builtin; it is kept so existing callers
    continue to work.
    """
    return _review_ids(_fetch_tree(url))


def next_url(url):
    """Scrape the page that follows *url*, and every page after it.

    Does nothing when *url* has no "next" link (i.e. it is the last page).
    """
    following = _next_page_url(_fetch_tree(url), url)
    if following is not None:
        sel(following)


def star(url, i):
    """Return the header span classes of review *i* on page *url*.

    Args:
        url: address of the listing page.
        i: review id, as returned by :func:`id`.

    Returns:
        A list of class strings, e.g. ``['allstar50 main-title-rating', ...]``;
        :func:`translate` turns it into a readable label.
    """
    return _star_classes(_fetch_tree(url), i)


def translate(star):
    """Convert the class list returned by :func:`star` into a star label.

    Only the first class is inspected. An empty list or any class that is not
    a known rating (for instance ``'main-meta'``, which comes first when the
    reviewer gave no rating) maps to the unrated label instead of ``None``.
    """
    if not star:
        return _UNRATED_LABEL
    return _STAR_LABELS.get(star[0], _UNRATED_LABEL)


def _scrape_page(url: str, tree) -> None:
    """Expand every review of one listing page in Chrome and write its row(s).

    Args:
        url: address of the listing page to open in the browser.
        tree: lxml tree of the same page, used for the ids and ratings so the
            page is not downloaded again for every single review.
    """
    brow = webdriver.Chrome(CHROMEDRIVER_PATH)
    try:
        brow.get(url)
        for review_id in _review_ids(tree):
            toggle = brow.find_element_by_xpath(
                '//div[@class="short-content"]/a[@id="toggle-' + review_id + '-copy"]'
            )
            # Bring the link into the viewport before clicking; otherwise
            # Chrome reports "element is not clickable at point (x, y)".
            brow.execute_script(
                "arguments[0].scrollIntoView({block: 'center'});", toggle
            )
            toggle.click()
            # Without this pause page_source still holds the collapsed text.
            time.sleep(EXPAND_WAIT_SECONDS)

            label = translate(_star_classes(tree, review_id))
            soup = BeautifulSoup(brow.page_source, "lxml")
            # Restrict the search to this review so that reviews expanded in
            # earlier iterations are not written again under the wrong rating.
            # NOTE(review): assumes the expanded text is rendered inside the
            # same div[@data-cid] container the rating XPath relies on --
            # confirm against the live page.
            container = soup.find('div', attrs={'data-cid': review_id})
            if container is None:
                continue
            for block in container.find_all('div', class_='review-content clearfix'):
                row = (label, _clean_text(block.get_text()))
                print(row)
                writer.writerow(row)
    finally:
        # quit() ends the whole WebDriver session even when a step above
        # raised, so no browser or driver process is left behind.
        brow.quit()


def sel(url):
    """Scrape *url* and all following listing pages into the CSV file.

    Pages are processed in a loop rather than through mutual recursion with
    :func:`next_url`, so the call stack does not grow with the page count.
    """
    while url is not None:
        tree = _fetch_tree(url)
        _scrape_page(url, tree)
        url = _next_page_url(tree, url)


if __name__ == '__main__':
    try:
        sel('https://movie.douban.com/review/best/')
    finally:
        file.close()

# Error originally hit while developing the click step:
#
#   selenium.common.exceptions.WebDriverException: Message: unknown error:
#   Element <a href="javascript:;" id="toggle-9590829-copy" class="unfold"
#   title="...">展开</a> is not clickable at point (120, 586). Other element
#   would receive the click: <div class="review-content clearfix"
#   data-author="夜第七章" data-url="https://movie.douban.com/review/9592082/"
#   data-original="1">...</div>
#   (Session info: chrome=54.0.2840.99)
#   (Driver info: chromedriver=2.27.440174
#   (e97a722caafc2d3a8b807ee115bfb307f7d2cfd9),platform=Windows NT 10.0.14393 x86_64)
#
# Cause noted at the time: the selected element is not an <input>, so it
# cannot take focus; the workaround was a sleep plus window.scrollTo(0, x).
# The code above scrolls the link itself into view instead.