缓存网页

实现每个链接都能独立缓存：如果缓存文件已存在就直接读取；如果不存在，则先获取网页，序列化后保存到本地。

目录功能比较简单：后续可能会改进

#!/usr/bin/env python
#coding:utf-8
#Created by Andy @ 2017/6/28


import os
import hashlib
import urllib.request
import random
import time
import gzip
import pickle

# Simple counter-measure against anti-crawler checks: a small pool of
# browser User-Agent strings from which one request header is picked at random.
_USER_AGENTS = (
	'Mozilla/5.0 (Windows NT 6.3 WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.8.1000 Chrome/30.0.1599.101 Safari/537.36',
	'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:45.0) Gecko/20100101 Firefox/45.0',
	'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 10.0; WOW64; Trident/7.0)',
)
headers = [{'User-Agent': agent} for agent in _USER_AGENTS]


def _decode_html(raw):
	"""Return the page bytes *raw* as UTF-8 text, gunzipping first when needed."""
	import zlib  # local import: only needed to name the decompression error

	# Some sites send gzip-compressed bodies; those start with the gzip magic number.
	if raw[:2] == b'\x1f\x8b':
		try:
			raw = gzip.decompress(raw)
		except (OSError, EOFError, zlib.error):
			# Looked like gzip but is not valid gzip: treat it as plain bytes,
			# which is the fallback the code has always used.
			pass
	return raw.decode('utf8')


def cache_html(url, header):
	"""Return the page at *url* as text, using a one-file-per-URL disk cache.

	The cache file is named after the MD5 hex digest of the URL and lives in
	the module-level ``base_path`` directory, which must be assigned before
	this function is called. A non-empty cache file is unpickled and returned
	without any network access. Otherwise the page is downloaded, the raw
	response bytes are pickled into the cache, and the text is returned.

	Args:
		url: Address of the page to fetch.
		header: Dict of HTTP request headers handed to ``urllib.request.Request``.

	Returns:
		The page body decoded as UTF-8 (gunzipped first if it was compressed).
		An empty string when the server returned an empty body.

	Raises:
		urllib.error.URLError: If the download fails.
		UnicodeDecodeError: If the body is not valid UTF-8.
	"""
	# A distinct hash per URL guarantees one cache file per link.
	file_name = hashlib.md5(url.encode(encoding='utf8')).hexdigest()
	path = os.path.join(base_path, file_name)

	# Serve from the cache when a non-empty cache file already exists.
	if os.path.exists(path) and os.path.getsize(path):
		print("Cache file already exist!")
		# NOTE: pickle.load is only safe on files written by this function;
		# never point base_path at a directory holding untrusted data.
		with open(path, 'rb') as read_f:
			raw = pickle.load(read_f)
		return _decode_html(raw)

	req = urllib.request.Request(url, headers=header)
	# Close the HTTP response deterministically instead of leaking it.
	with urllib.request.urlopen(req) as resp:
		raw = resp.read()

	if not raw:
		print("Connection failed...")
	else:
		# Random pause after a real download, to space out requests.
		time.sleep(random.randint(1, 3))

		# Write to the same location the lookup above checks (base_path),
		# not to the current working directory.
		with open(path, 'wb') as write_f:
			pickle.dump(raw, write_f)

	return _decode_html(raw)

if __name__ == '__main__':
	# Demo run: cache files are stored next to this script, and the request
	# goes out with one randomly chosen User-Agent header.
	header = random.choice(headers)
	base_path = os.path.dirname(os.path.abspath(__file__))

	print(cache_html('http://www.python.org', header))

下面是将上面的缓存网页代码当作一个模块来调用，实现爬取豆瓣电影排行 Top 250 的爬虫。因为豆瓣采取了一定的反爬虫策略，

所以这里的缓存的作用就在于，只要成功爬取一次网页，后面就可以从本地读取缓存，而不用对豆瓣发起请求：

#!/usr/bin/env python
#coding:utf-8
#Created by Andy @ 2017/6/28


import os
import hashlib
import urllib.request
import random
import time
import gzip
import pickle

# Simple counter-measure against anti-crawler checks: a small pool of
# browser User-Agent headers.
headers = [
	{'User-Agent': 'Mozilla/5.0 (Windows NT 6.3 WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.8.1000 Chrome/30.0.1599.101 Safari/537.36'},
	{'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:45.0) Gecko/20100101 Firefox/45.0'},
	{'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 10.0; WOW64; Trident/7.0)'},
]

# One header is drawn at import time and reused as the default for every request.
header = random.choice(headers)

# Cache files live in a "cache" directory next to this module.
base_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "cache")
# exist_ok avoids the check-then-create race of testing for the directory first.
os.makedirs(base_path, exist_ok=True)

def _decode_html(raw):
	"""Return the page bytes *raw* as UTF-8 text, gunzipping first when needed."""
	import zlib  # local import: only needed to name the decompression error

	# Some sites send gzip-compressed bodies; those start with the gzip magic number.
	if raw[:2] == b'\x1f\x8b':
		try:
			raw = gzip.decompress(raw)
		except (OSError, EOFError, zlib.error):
			# Looked like gzip but is not valid gzip: treat it as plain bytes,
			# which is the fallback the code has always used.
			pass
	return raw.decode('utf8')


def cache_html(url, header=header):
	"""Return the page at *url* as text, using a one-file-per-URL disk cache.

	The cache file is named after the MD5 hex digest of the URL and lives in
	the module-level ``base_path`` directory. A non-empty cache file is
	unpickled and returned without any network access. Otherwise the page is
	downloaded, the raw response bytes are pickled into the cache, and the
	text is returned.

	Args:
		url: Address of the page to fetch.
		header: Dict of HTTP request headers handed to
			``urllib.request.Request``. Defaults to the module-level
			``header`` as it was when this function was defined.

	Returns:
		The page body decoded as UTF-8 (gunzipped first if it was compressed).
		An empty string when the server returned an empty body.

	Raises:
		urllib.error.URLError: If the download fails.
		UnicodeDecodeError: If the body is not valid UTF-8.
	"""
	# A distinct hash per URL guarantees one cache file per link.
	file_name = hashlib.md5(url.encode(encoding='utf8')).hexdigest()
	path = os.path.join(base_path, file_name)

	# Serve from the cache when a non-empty cache file already exists.
	if os.path.exists(path) and os.path.getsize(path):
		print("Cache file already exist!")
		# NOTE: pickle.load is only safe on files written by this function;
		# never point base_path at a directory holding untrusted data.
		with open(path, 'rb') as read_f:
			raw = pickle.load(read_f)
		return _decode_html(raw)

	req = urllib.request.Request(url, headers=header)
	# Close the HTTP response deterministically instead of leaking it.
	with urllib.request.urlopen(req) as resp:
		raw = resp.read()
	# Random pause after every real download, to space out requests.
	time.sleep(random.randint(2, 5))

	if not raw:
		print("Connection failed...")
	else:
		with open(path, 'wb') as write_f:
			pickle.dump(raw, write_f)

	return _decode_html(raw)

#!/usr/bin/env python
#coding:utf-8
#Created by Andy @ 2017/6/27


import urllib.request
import re
import random
import pickle
import time
import sys
import io
from cache_html import cache_html


# One dict per film, accumulated across all result pages.
movie = []
# Re-wrap stdout with an explicit UTF-8 encoding (presumably so the Chinese
# titles print on consoles whose default encoding cannot represent them).
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')

# Compiled once, outside the page loop. Capture groups, in order:
# rank, Chinese title, credits paragraph, rating, number of ratings.
# The digit and CJK-range escapes need their backslashes: without them the
# pattern looks for the literal letters "d" and "u" and never matches.
p = re.compile(
	r'<div class="item">.*?<em class="">(\d{1,3})</em>'
	r'.*?<span class="title">([\u4e00-\u9fa5]{0,})</span>'
	r'.*?<p class="">(.+?)</p>'
	r'.*?<span.*?>(\d\.\d)</span>'
	r'.*?<span>(\d{1,7}).+?</span>',
	re.DOTALL)

# The list is paginated 25 films per page: start=0, 25, ..., 225.
for i in range(10):
	page = i * 25
	url = "https://movie.douban.com/top250?start=%s&filter=" % page

	# Goes through the disk cache, so each page is requested from the site at most once.
	html = cache_html(url)

	for index, name, director, grade, estimate in p.findall(html):
		movie.append({
			'index': index,
			'name': name,
			# Strip entities, the trailing "...<br>" marker and all spaces
			# from the credits paragraph.
			# NOTE(review): the published listing showed a plain space in the
			# first replace() and a raw line break inside the second literal;
			# both look like HTML-rendering damage to '&nbsp;' and '\n'.
			# Confirm against the original script.
			'director': director.replace('&nbsp;', '').replace('...<br>\n', '').replace(' ', '').strip(),
			'grade': grade,
			'estimate': estimate,
		})

# Persist the complete list once, instead of rewriting the file after every page.
with open('movie.pkl', 'wb') as f:
	pickle.dump(movie, f)

# Films with more than 500000 ratings
print([m['name'] for m in movie if int(m['estimate']) > 500000])