Team Project - Scraping the Douban Top 250 Movies - Development Notes

Target URL: https://movie.douban.com/top250

What we built:

We chose Python for this project: the requests library fetches the pages and BeautifulSoup parses the returned HTML. The script walks the ten index pages of the Top 250 list (25 movies per page), collects every movie's detail-page URL, then parses each detail page and appends the extracted fields to a local text file.
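The list is paginated through the start query parameter, 25 movies per page. A quick standalone sketch (using the same URL template as the script below) shows the ten index-page URLs the crawler visits:

# Sketch: the ten index-page URLs of the Top 250 list
URL = "https://movie.douban.com/top250?start={}"
for page in range(10):
    print(URL.format(page * 25))
# Prints https://movie.douban.com/top250?start=0, ?start=25, ... up to ?start=225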

Code:

import os
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException


"""

Author:

Damon

功能:

爬取豆瓣网Top250电影信息保存到本地

"""


# Index-page URL template; start selects the page (0, 25, ..., 225)
URL = "https://movie.douban.com/top250?start={}"
# Detail-page URL of every movie, in crawl order
entity_url = []

def save_data(result):
    """
    Append one movie's details to the local text file.

    :param result: dict of fields collected by analysis_page
    :return: None
    """
    with open('movie.txt', "a", encoding="utf8") as f:
        f.write("========================================================================================================\n")
        f.write("Rank: " + result['top'] + "\n")
        f.write("Rating: " + result['grade'] + "\n")
        f.write("Title: " + result['name'] + "\n")
        f.write("Director: " + result['director'] + "\n")
        f.write("Screenwriter: " + result['scriptwriter'] + "\n")
        f.write("Cast: " + result['protagonist'] + "\n")
        f.write("Synopsis: " + result['synopsis'] + "\n")
        f.write("Reviews:\n")
        f.write("    " + result['film_review']['first_user'] + ": " + result['film_review']['first_discuss'] + "\n")
        f.write("    " + result['film_review']['second_user'] + ": " + result['film_review']['second_discuss'] + "\n")
        f.write("    " + result['film_review']['thirdly_user'] + ": " + result['film_review']['thirdly_discuss'] + "\n")
        f.write("    " + result['film_review']['fourthly_user'] + ": " + result['film_review']['fourthly_discuss'] + "\n")
        f.write("    " + result['film_review']['fifth_user'] + ": " + result['film_review']['fifth_discuss'] + "\n")
        f.write("URL: " + result['url'] + "\n")

    print("Processed: " + result['name'] + " " + result['top'])


def analysis_page(num, url):
    """
    Parse one movie's detail page and extract the wanted fields.

    :param num: the movie's Top 250 rank
    :param url: detail-page URL
    :return: -1 if the page no longer exists, otherwise None
    """
    # The movie's overall details
    result = {}
    # The hot-review details
    film_review = {}
    try:
        headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
        }
        res = requests.get(url, headers=headers)
        res.encoding = "utf-8"
    except RequestException as e:
        print("Detail-page request failed:", repr(e))
        print("URL:", url)
        # [TODO] Handle an unreachable target site properly. Workaround for now: exit the program.
        os._exit(0)

    soup = BeautifulSoup(res.text, "html.parser")

    # If the page no longer exists, log the failure and skip to the next movie.
    # Douban titles such pages "页面不存在" ("page not found").
    title = soup.select("title")[0].text
    if title == "页面不存在":
        with open('movie.txt', "a", encoding="utf8") as f:
            f.write("========================================================================================================\n")
            f.write("Rank: Top" + str(num) + "\n")
            f.write("ERROR: page not found\n")
            f.write("URL: " + url + "\n")
        return -1

    try:
        # Rank
        result['top'] = "Top" + str(num)
        # Rating
        result['grade'] = soup.select("#interest_sectl > div.rating_wrap.clearbox > div.rating_self.clearfix > strong")[0].text
        # Title
        result['name'] = soup.select("#content > h1")[0].text.replace(" ", "")
        # Director
        result['director'] = soup.select("#info > span > span.attrs")[0].text
        try:
            # Screenwriter
            result['scriptwriter'] = soup.select("#info > span > span.attrs")[1].text
            # Cast
            result['protagonist'] = soup.select("#info > span.actor > span.attrs")[0].text
        except IndexError:
            # Some entries list no screenwriter or cast
            result['scriptwriter'] = ""
            result['protagonist'] = ""
        try:
            # Synopsis (short version)
            result['synopsis'] = soup.select("#link-report > span.short > span")[0].text.replace("\n", "").replace(" ", "")
        except IndexError:
            # Fall back to the full synopsis when no short version exists
            result['synopsis'] = soup.select("#link-report > span")[0].text.replace("\n", "").replace(" ", "")
        # The five hottest reviews: user name and comment text for each
        users = soup.select("#hot-comments > div > div > h3 > span.comment-info > a")
        discusses = soup.select("#hot-comments > div > div > p")
        film_review['first_user'] = users[0].text
        film_review['first_discuss'] = discusses[0].text
        film_review['second_user'] = users[1].text
        film_review['second_discuss'] = discusses[1].text
        film_review['thirdly_user'] = users[2].text
        film_review['thirdly_discuss'] = discusses[2].text
        film_review['fourthly_user'] = users[3].text
        film_review['fourthly_discuss'] = discusses[3].text
        film_review['fifth_user'] = users[4].text
        film_review['fifth_discuss'] = discusses[4].text
        # Reviews
        result['film_review'] = film_review
        # URL
        result['url'] = url
    except Exception:
        print("Failed to parse:", url, "------------------------------------")
        # [TODO] Handle a broken detail page properly. Workaround for now: exit the program.
        os._exit(0)

    # Save the extracted data to the local txt file
    save_data(result)


def get_entity_url(url):
    """
    Collect the detail-page URL of every movie on one index page.

    :param url: index-page URL
    :return: None
    """
    try:
        headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
        }
        res = requests.get(url, headers=headers)
        res.encoding = "utf-8"
    except RequestException as e:
        print("Index-page request failed:", repr(e))
        print("URL:", url)
        # [TODO] Handle an unreachable target site properly. Workaround for now: exit the program.
        os._exit(0)

    soup = BeautifulSoup(res.text, "html.parser")

    # Each <a> in a list entry's header links to that movie's detail page
    entity = soup.select("#content > div > div.article > ol > li > div > div.info > div.hd > a")
    for item in entity:
        entity_url.append(item['href'])


def make_url(num):
    """
    Build the URL of one index page and harvest its movie links.

    :param num: zero-based page index (0-9)
    :return: None
    """
    url = URL.format(num * 25)
    get_entity_url(url)


if __name__ == '__main__':
    # Collect the detail-page URL of every movie
    for i in range(10):
        make_url(i)
    print("Collected all movie URLs!")

    # Parse each detail page and save the extracted data locally
    for i in range(len(entity_url)):
        state = analysis_page(i + 1, entity_url[i])
        if state == -1:
            # The page no longer exists; skip to the next movie
            continue
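A judgment call worth flagging: as written, the script fires roughly 260 requests back to back, and Douban throttles aggressive clients. A small pause between detail-page requests lowers the risk of being blocked. Below is a minimal sketch of the main loop with a delay; the one-second value is an assumption, not something the original tunes:

import time

for i in range(len(entity_url)):
    state = analysis_page(i + 1, entity_url[i])
    time.sleep(1)  # assumed 1-second pause between detail-page requests; adjust as needed

Note also that every CSS selector above is tied to Douban's page layout at the time of writing; if the markup changes, the selectors (and the "页面不存在" title check) will need updating.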

Original post: https://www.cnblogs.com/ccccryst/p/7780920.html