爬取学习 屠戮盗版天堂

今天学习了“屠戮盗版天堂”这个爬虫案例:用 requests 配合正则表达式,从电影天堂(dy2018.com)首页的“2021必看热片”栏目中取出各影片的详情页链接,再到详情页里提取片名和下载地址:

代码

import requests
import re
# Scrape the "2021必看热片" section of the dy2018 index page: collect the
# detail-page link of every listed film, then print each film's title and
# download link taken from its detail page.

url = "https://www.dy2018.com/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36'
}

# Patterns are compiled once, before any loop. re.S lets "." span newlines,
# which the lazy ".*?" gaps rely on to cross multi-line HTML.
# obj  -> the <ul> body that follows the "2021必看热片" heading
# obj2 -> every single-quoted href inside that <ul>
# obj3 -> the film title ("name") and the first download link ("xia")
obj = re.compile(r"2021必看热片.*?<ul>(?P<ul>.*?)</ul>", re.S)
obj2 = re.compile(r"<a href='(?P<href>.*?)'", re.S)
obj3 = re.compile(r'◎片  名(?P<name>.*?)<br />.*?<td style="WORD-WRAP: break-word" bgcolor="#fdfddf"><a href="(?P<xia>.*?)">', re.S)

# NOTE: verify=False disables TLS certificate verification for the request.
resp = requests.get(url, verify=False, headers=headers)
# Decode the body as gb2312 instead of whatever requests guessed.
resp.encoding = 'gb2312'

# Step 1: gather the absolute URL of every detail page linked in the section.
result = obj.finditer(resp.text)
child_href_list = []
for it in result:
    ul = it.group('ul')
    result2 = obj2.finditer(ul)
    for itt in result2:
        # `url` already ends with "/", so the surrounding slashes of the
        # captured href are stripped before joining.
        child_href = url + itt.group('href').strip('/')
        child_href_list.append(child_href)

# Step 2: visit each detail page and print its title and download link.
for href in child_href_list:
    # Send the same headers as the index request so both requests look alike
    # to the server.
    resp2 = requests.get(href, verify=False, headers=headers)
    resp2.encoding = 'gb2312'
    result3 = obj3.search(resp2.text)
    if result3 is None:
        # The page did not match the expected layout; report it and move on
        # instead of crashing on None.group(...).
        print(f"no match, skipped: {href}")
        continue
    print(result3.group("name"))
    print(result3.group("xia"))








原文地址:https://www.cnblogs.com/092e/p/14955017.html