# 第3课-电影天堂爬虫实战 (Lesson 3 - Movie Heaven crawler practice)

#电影天堂电影爬虫

import requests
from lxml import etree
import time

import warnings

warnings.filterwarnings('ignore')
DOMAIN = "https://dytt8.net"

HEADERS = {
"Referer": "https://dytt8.net/html/gndy/dyzz/index.html",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",

}

#获取元素对象
def get_page_info(url, flag=True):
    """Fetch *url* and return it parsed as an lxml HTML element.

    Args:
        url: page URL to fetch.
        flag: when True, decode the body with requests' auto-detected
              encoding (`response.text`); when False, decode the raw
              bytes as GBK (the dytt8.net detail pages are GBK-encoded).

    Returns:
        The lxml element produced by ``etree.HTML``.

    Raises:
        RuntimeError: if the server keeps answering with a non-200 status
            after all retries are exhausted.
    """
    time.sleep(1)  # be polite: throttle every request by one second
    # Bounded retry with a linearly growing back-off.  The original code
    # looped forever on persistent non-200 responses; cap the attempts so a
    # dead page cannot hang the whole crawl.
    for attempt in range(1, 11):
        # verify=False: the site serves a broken TLS certificate; the
        # resulting InsecureRequestWarning is silenced at module level.
        # timeout keeps a stalled connection from blocking indefinitely.
        response = requests.get(url=url, headers=HEADERS, verify=False,
                                timeout=30)
        if response.status_code == 200:
            text = response.text if flag else response.content.decode("gbk")
            return etree.HTML(text)
        time.sleep(attempt)  # back off a little longer each retry
    raise RuntimeError("failed to fetch {} after repeated retries".format(url))

#获取页数
def get_pages():
    """Return the total number of listing pages.

    Reads the last ``<option>`` of the page-selector ``<select name='sldd'>``
    on the first listing page, which holds the highest page number.
    """
    index_url = DOMAIN + "/html/gndy/dyzz/index.html"
    page_html = get_page_info(index_url)
    last_option = page_html.xpath(
        "//select[@name='sldd']/option[last()]/text()")
    return int(last_option[0])

#获取电影信息
#获取电影信息
def get_movie_info(detail_url):
    """Fetch one movie detail page and print its metadata fields.

    Parses the ``#Zoom`` description block, prints every recognized
    "◎field" line, the actor list, the synopsis, and the Thunder
    (迅雷) download link if present.

    Args:
        detail_url: absolute URL of the movie detail page.
    """
    # Detail pages are GBK-encoded, hence flag=False.
    html = get_page_info(detail_url, False)
    infos = html.xpath("//div[@id='Zoom']//p/text()")
    # (prefix, printed label) pairs for the simple one-line fields.
    # The label is the prefix without the leading "◎" marker.
    simple_fields = (
        ("◎片 名", "片 名"),
        ("◎年 代", "年 代"),
        ("◎产 地", "产 地"),
        ("◎类 别", "类 别"),
        ("◎语 言", "语 言"),
        ("◎字 幕", "字 幕"),
        ("◎上映日期", "上映日期"),
        # BUGFIX: the original printed "◎IMDb评分:{}", keeping the ◎
        # marker unlike every other field; label normalized here.
        ("◎IMDb评分", "IMDb评分"),
        ("◎豆瓣评分", "豆瓣评分"),
        ("◎文件格式", "文件格式"),
        ("◎视频尺寸", "视频尺寸"),
        ("◎文件大小", "文件大小"),
        ("◎片 长", "片 长"),
        ("◎导 演", "导 演"),
        ("◎编 剧", "编 剧"),
        ("◎标 签", "标 签"),
    )
    index = 0
    for info in infos:
        index = index + 1  # index now points at the element AFTER info
        if info == '':
            continue
        # BUGFIX: the original replaced the literal text "u3000"; the
        # intent is to turn the ideographic space U+3000 into an ASCII one.
        info = str(info).replace(u'\u3000', u' ').strip()
        if info.startswith("◎译 名"):
            translated = info.replace("◎译 名 ", "").strip()
            print(" ======================================译 名:{}===============================".format(translated))
        elif info.startswith("◎主 演"):
            # First actor is on the "◎主 演" line itself; the rest follow
            # on their own lines until the next "◎" field begins.
            actors = [info.replace("◎主 演", "").strip()]
            for follower in infos[index:]:
                follower = follower.strip()
                if follower.startswith("◎"):
                    break
                actors.append(follower)
            print("主演:{}".format(actors))
        elif info.startswith("◎简 介"):
            # The synopsis text is the NEXT text node after the header.
            # BUGFIX: guard the lookahead — the original raised IndexError
            # when "◎简 介" was the last text node.
            if index < len(infos):
                print("简 介:{}".format(infos[index].strip()))
        else:
            for prefix, label in simple_fields:
                if info.startswith(prefix):
                    print("{}:{}".format(label, info.replace(prefix, "").strip()))
                    break
    # The download link lives in a table cell with this distinctive bgcolor.
    download_url = html.xpath("//table//td[@bgcolor='#fdfddf']/a/@href")
    if len(download_url) > 0:
        print("迅雷下载地址:{}".format(download_url[0]))
def get_detail_url():
    """Walk every listing page, collect movie detail links, print each movie.

    Iterates listing pages 1..get_pages(), extracts each movie's relative
    detail URL from the page table, and hands the absolute URL to
    get_movie_info().
    """
    total_pages = get_pages()
    for page in range(1, total_pages + 1):
        page_url = "{}/html/gndy/dyzz/list_23_{}.html".format(DOMAIN, page)
        print(page_url)
        page_html = get_page_info(page_url)
        links = page_html.xpath(
            "//table[@class='tbspan']//a[@class='ulink']/@href")
        for link in links:
            get_movie_info(DOMAIN + link)


if __name__ == '__main__':
    # Entry point: crawl every listing page and print each movie's details.
    get_detail_url()


# 原文地址 (original article): https://www.cnblogs.com/win0211/p/11991185.html