Note: you can read Cui Qingcai's blog or buy his book; the basic framework here comes from his book.
Link: https://github.com/Python3WebSpider/MaoYan/blob/master/spider.py
Basic approach:
Fetch the page, parse it, save the results, and repeat for each page (sketched below).
Methods: regular expressions, BeautifulSoup, and pyquery.
I prefer pyquery.
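Here is an outline of that fetch-parse-save loop, using the same function names and URL as the full scripts below; the bodies are left blank here.

import time

def get_one_page(url):      # fetch: download one page of the board
    ...

def parse_one_page(html):   # parse: yield one dict per movie
    ...

def write_to_file(item):    # save: append one record to result.txt
    ...

def main(offset):
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    for item in parse_one_page(get_one_page(url)):
        write_to_file(item)

if __name__ == '__main__':
    for i in range(10):     # the TOP100 board shows 10 movies per page
        main(offset=i * 10)
        time.sleep(1)       # pause between requests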
1. Regular expressions
This version is copied directly from Cui Qingcai's code, without any modification.
import json
import requests
from requests.exceptions import RequestException
import re
import time


def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    pattern = re.compile(
        '<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        + '.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        + '.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            'actor': item[3].strip()[3:],   # drop the "主演:" (starring) prefix
            'time': item[4].strip()[5:],    # drop the "上映时间:" (release time) prefix
            'score': item[5] + item[6]      # integer part + fraction part
        }


def write_to_file(content):
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(offset):
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    for i in range(10):
        main(offset=i * 10)
        time.sleep(1)
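To see what that pattern actually captures, here is a minimal, self-contained demo. The HTML fragment below is made up to mimic one <dd> entry of the board; it is an assumption about the page structure, not the real page.

import re

# Made-up fragment mimicking one <dd> entry of the TOP100 board (assumption, not the real page).
html = '''<dd>
    <i class="board-index board-index-1">1</i>
    <a href="/films/1203"><img data-src="https://example.com/poster.jpg" class="board-img"></a>
    <p class="name"><a href="/films/1203">霸王别姬</a></p>
    <p class="star">主演:张国荣,张丰毅,巩俐</p>
    <p class="releasetime">上映时间:1993-07-26</p>
    <p class="score"><i class="integer">9.</i><i class="fraction">5</i></p>
</dd>'''

pattern = re.compile(
    '<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
    + '.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
    + '.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)

for item in re.findall(pattern, html):
    print(item)
# ('1', 'https://example.com/poster.jpg', '霸王别姬', '主演:张国荣,张丰毅,巩俐', '上映时间:1993-07-26', '9.', '5')

Each group of the tuple is then sliced and relabelled in parse_one_page to build the final dict.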
2. BeautifulSoup
Only the parsing part is changed here.
from bs4 import BeautifulSoup
import json
import requests
from requests.exceptions import RequestException
import re
import time


def get_one_page(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers)
        time.sleep(2)
        if response.status_code == 200:
            return response.text
        else:
            return None
    except RequestException:
        return None


def parse_one_page(html):
    soup = BeautifulSoup(html, 'lxml')  # use the lxml parser
    ranking = soup.select('#app > div > div > div.main > dl > dd > i')
    imgsrc = soup.select('#app > div > div > div.main > dl > dd > a > img.board-img')  # child selectors copied from devtools
    title = soup.select('#app > div > div > div.main > dl > dd > div > div > div.movie-item-info > p.name > a')
    star = soup.select('#app > div > div > div.main > dl > dd > div > div > div.movie-item-info > p.star')
    releasetime = soup.select('#app > div > div > div.main > dl > dd > div > div > div.movie-item-info > p.releasetime')
    integer = soup.select('#app > div > div > div.main > dl > dd > div > div > div.movie-item-number.score-num > p > i.integer')
    fraction = soup.select('#app > div > div > div.main > dl > dd > div > div > div.movie-item-number.score-num > p > i.fraction')
    for i in range(10):  # 10 movies per page
        yield {
            'index': ranking[i].text,
            'image': imgsrc[i]['data-src'],
            'title': title[i].text,
            'actor': star[i].text.strip()[3:],          # drop the "主演:" (starring) prefix
            'date': releasetime[i].text.strip()[5:15],  # drop "上映时间:" and keep only the date
            'score': integer[i].text + fraction[i].text
        }


def write_to_file(items):
    for item in items:
        print(item)
        with open('result.txt', 'a', encoding='utf-8') as f:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')


def main(offset):
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    items = parse_one_page(html)
    write_to_file(items)


if __name__ == '__main__':
    for i in range(10):
        main(i * 10)
        time.sleep(1)
How to work out what to pass to soup.select, using the movie title as an example:
Copy selector: #app > div > div > div.main > dl > dd:nth-child(1) > div > div > div.movie-item-info > p.name > a
soup.select: #app > div > div > div.main > dl > dd > div > div > div.movie-item-info > p.name > a
The other fields work the same way: take the selector copied from the browser's devtools, delete the :nth-child(1) part so it matches every dd instead of only the first one, and pass the result to soup.select. A small helper for this tweak is sketched below.
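If you want to do the tweak programmatically, something like the following works; generalize_selector is a hypothetical name of my own, not part of BeautifulSoup.

import re

def generalize_selector(copied):
    # Remove the :nth-child(...) pseudo-class so the selector matches every <dd>,
    # not only the single element you right-clicked in devtools (hypothetical helper).
    return re.sub(r':nth-child\(\d+\)', '', copied)

copied = '#app > div > div > div.main > dl > dd:nth-child(1) > div > div > div.movie-item-info > p.name > a'
print(generalize_selector(copied))
# #app > div > div > div.main > dl > dd > div > div > div.movie-item-info > p.name > a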
3. pyquery
Again, only the parsing part is changed.
import json
import requests
from requests.exceptions import RequestException
import time
from pyquery import PyQuery as pq


def get_one_page(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers)
        time.sleep(2)
        if response.status_code == 200:
            return response.text
        else:
            return None
    except RequestException:
        return None


def parse_one_page(page):
    for data in page.items():                 # iterate over every <dd> node
        item = {}
        ranking = data.children('i').text()
        imgsrc = data.find('.board-img').attr('data-src')
        title = data.find('.name').text().strip()
        star = data.find('.star').text().strip()
        releasetime = data.find('.releasetime').text()
        score = data.find('.score').text()
        item['index'] = ranking
        item['image'] = imgsrc
        item['title'] = title
        item['star'] = star[3:]               # drop the "主演:" (starring) prefix
        item['date'] = releasetime[5:15]      # drop "上映时间:" and keep only the date
        item['score'] = score
        yield item


def write_to_file(items):
    for item in items:
        print(item)
        with open('result.txt', 'a', encoding='utf-8') as f:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')


def main(offset):
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    doc = pq(html)
    page = doc.find('dd')
    items = parse_one_page(page)
    write_to_file(items)


if __name__ == '__main__':
    for i in range(10):
        main(i * 10)
        time.sleep(1)
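As with the regex version, a minimal fragment makes the pyquery calls easier to follow. The HTML below is made up to mirror one <dd> entry of the board (an assumption, not the real page); .items(), .children(), .find(), .attr() and .text() are the only pyquery methods used.

from pyquery import PyQuery as pq

# Made-up fragment mirroring one <dd> entry (assumption, not the real page).
html = '''<dl class="board-wrapper">
  <dd>
    <i class="board-index">1</i>
    <img class="board-img" data-src="https://example.com/poster.jpg">
    <div class="movie-item-info">
      <p class="name"><a href="/films/1203">霸王别姬</a></p>
      <p class="star">主演:张国荣,张丰毅,巩俐</p>
      <p class="releasetime">上映时间:1993-07-26</p>
    </div>
    <p class="score"><i class="integer">9.</i><i class="fraction">5</i></p>
  </dd>
</dl>'''

doc = pq(html)
for dd in doc('dd').items():                       # each <dd> as its own PyQuery object
    print(dd.children('i').text())                 # 1
    print(dd.find('.board-img').attr('data-src'))  # https://example.com/poster.jpg
    print(dd.find('.name').text())                 # 霸王别姬
    print(dd.find('.star').text()[3:])             # 张国荣,张丰毅,巩俐
    print(dd.find('.releasetime').text()[5:15])    # 1993-07-26
    print(dd.find('.score').text())                # combined integer + fraction text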