A Highly Disguised Browser Wrapper Class

1. The highly disguised wrapper class HeadersHelper.py

import urllib.request
import urllib.parse
import http.cookiejar

class HeadersHelper:
    def __init__(self, url, path=None):
        # Percent-encode the URL (keeping '/', ':', '?', '=') so non-ASCII characters do not break urlopen
        self.url = urllib.request.quote(url, safe='/:?=', encoding='utf-8')
        self.path = path

    # Set request headers so the request closely mimics a real browser
    def set_Headers(self):
        # Note: advertising only "gb2312, utf-8" in Accept-Encoding keeps the server from
        # returning a gzip-compressed body, so the later .decode() calls do not see garbled bytes.
        # The Host header is derived from the target URL rather than hardcoded.
        host = urllib.parse.urlsplit(self.url).netloc
        headers = {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                   "Accept-Encoding": "gb2312, utf-8",
                   "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
                   "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
                   "Connection": "keep-alive",
                   "Host": host}
        # A CookieJar lets the opener carry cookies across requests, like a browser session
        cjar = http.cookiejar.CookieJar()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cjar))
        opener.addheaders = list(headers.items())
        urllib.request.install_opener(opener)

    # Fetch the page and return its content as a string
    def feedbak_info(self):
        self.set_Headers()
        # Some sites are utf-8 encoded, others gbk
        # (http://fjrs168.blog.hexun.com, for example, needs gbk);
        # a charset-detection alternative is sketched after this listing
        try:
            info = urllib.request.urlopen(self.url).read().decode('utf-8')
        except UnicodeDecodeError:
            info = urllib.request.urlopen(self.url).read().decode('gbk')
        return info

37     # 信息存档
38     def save_InFile(self):
39         self.set_Headers()
40         info = urllib.request.urlopen(self.url).read()
41         file = open(self.path, 'wb')
42         file.write(info)
43         file.close()
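
The utf-8/gbk fallback above covers the two most common Chinese page encodings, but the response usually declares its charset itself. Below is a minimal sketch of reading that declaration from the response headers instead of guessing; the helper name detect_and_decode is illustrative and not part of the original class, and it assumes the server sends a usable Content-Type header.

import urllib.request

def detect_and_decode(url):
    # Hypothetical helper: prefer the charset declared in the HTTP response,
    # then fall back to utf-8 and gbk if nothing usable is announced.
    response = urllib.request.urlopen(url)
    raw = response.read()
    charset = response.headers.get_content_charset()  # e.g. 'utf-8', 'gbk', or None
    if charset:
        return raw.decode(charset, errors='replace')
    for encoding in ('utf-8', 'gbk'):
        try:
            return raw.decode(encoding)
        except UnicodeDecodeError:
            continue
    # Last resort: decode as utf-8 and substitute undecodable bytes
    return raw.decode('utf-8', errors='replace')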

2. Test script headershelper_test.py

from HeadersHelper import HeadersHelper

url = "https://www.zhibo8.cc"
#==============================
#hh = HeadersHelper(url)
#print(hh.feedbak_info())
#==============================
path = "E:/workspace/PyCharm/codeSpace/books/python_web_crawler_book/chapter6/demo5/headershelper.html"
hh = HeadersHelper(url, path=path)
hh.save_InFile()
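
One design note: install_opener() swaps in the disguised opener globally, so every later urllib.request.urlopen() call in the process inherits these headers and the shared cookie jar. If that side effect is unwanted, the same disguise can be kept local by calling the opener directly. A minimal sketch under that assumption (the function name fetch_disguised is illustrative, not part of HeadersHelper):

import http.cookiejar
import urllib.request

def fetch_disguised(url):
    # Build a cookie-carrying opener with browser-like headers,
    # but use it directly instead of installing it globally.
    cjar = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cjar))
    opener.addheaders = [
        ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0"),
        ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"),
        ("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3"),
    ]
    return opener.open(url).read()
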
Original post: https://www.cnblogs.com/xiaomingzaixian/p/7134738.html