python爬虫之趟雷

python爬虫之趟雷整理

雷一:URLError

  问题具体描述:urllib.error.URLError: &lt;urlopen error [Errno 11004] getaddrinfo failed&gt;

 1 import urllib.request
 2 
 3 
 4 def load_message():
 5     url = 'http://www.baidu.com'
 6 
 7     request = urllib.request.Request(url)
 8     response = urllib.request.urlopen(request)
 9     response_str = response.read().decode('utf-8')
10 
11     return response.headers, request.headers, response_str
12 
13 
14 response_header, request_header, response_data = load_message()
15 print(request_header)
16 print('----------------------------------------')
17 print(response_header)
18 print('----------------------------------------')
19 print(response_data)
View Code

  分析:报错原因为URLError,产生原因为URL,简单来说,就是URL资源无法访问或者访问不了。具体问题出在三个方向,URL本身,客户端,服务器。

  解决办法:第一点,检查URL书写是否正确;第二点,检查客户端网络连接状态;第三点,使用URL在浏览器地址栏访问验证服务器是否存在。

  问题具体描述:urllib.error.URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1045)>

 1 #!/usr/bin/env python
 2 # -*- coding=utf-8 -*-
 3 # Author: Snow
 4 
 5 
 6 import urllib.request
 7 
 8 
 9 def create_cookie():
10     url = 'https://www.yaozh.com/member/'
11     headers = {
12         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko
13 Chrome/69.0.3497.92 Safari/537.36',
14         'Cookie': 'think_language=zh-CN; _ga=GA1.2.179792116.1550119571; _gat=1; acw_tc=2f624a2115501195808648935e4f2de7e89205315a7c9e8934c938389d8999; _gid=GA1.2.111857803.1550119581; yaozh_logintime=1550119751; yaozh_user=692948%09snown_1; yaozh_userId=692948; yaozh_uidhas=1; acw_tc=2f624a2115501195808648935e4f2de7e89205315a7c9e8934c938389d8999; MEIQIA_VISIT_ID=1H9g97Ef1WpjYsWf4b7UlGe3wel; PHPSESSID=5itl5rejqnekb07bfrtmuvr3l6; yaozh_mylogin=1550196658; MEIQIA_VISIT_ID=1HCCOYdyjR0FalzMfFm4vYsqevT; Hm_lvt_65968db3ac154c3089d7f9a4cbb98c94=1550119570%2C1550119584%2C1550119751%2C1550196659; Hm_lpvt_65968db3ac154c3089d7f9a4cbb98c94=1550196663'
15     }
16 
17     request = urllib.request.Request(url, headers=headers)
18     response = urllib.request.urlopen(request)
19     response_data = response.read().decode('utf-8')
20 
21     return response_data
22 
23 
24 result = create_cookie()
25 with open('cookies.html', 'w', encoding='utf-8') as f:
26     f.write(result)
View Code

  分析:问题产生原因python使用urllib.request,urlopen()打开https链接时,需要验证SSL证书,如果网站使用自签名的证书会抛出异常。

  解决办法:第一点,使用SSL创建context验证上下文,传入urlopen()中context上下文参数;第二点,取消证书验证。

 1 #!/usr/bin/env python
 2 # -*- coding=utf-8 -*-
 3 # Author: Snow
 4 
 5 
 6 import urllib.request
 7 import ssl    #导入ssl模块
 8 
 9 
10 def create_cookie():
11     url = 'https://www.yaozh.com/member/'
12     headers = {
13         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko
14 Chrome/69.0.3497.92 Safari/537.36',
15         'Cookie': 'think_language=zh-CN; _ga=GA1.2.179792116.1550119571; _gat=1; acw_tc=2f624a2115501195808648935e4f2de7e89205315a7c9e8934c938389d8999; _gid=GA1.2.111857803.1550119581; yaozh_logintime=1550119751; yaozh_user=692948%09snown_1; yaozh_userId=692948; yaozh_uidhas=1; acw_tc=2f624a2115501195808648935e4f2de7e89205315a7c9e8934c938389d8999; MEIQIA_VISIT_ID=1H9g97Ef1WpjYsWf4b7UlGe3wel; PHPSESSID=5itl5rejqnekb07bfrtmuvr3l6; yaozh_mylogin=1550196658; MEIQIA_VISIT_ID=1HCCOYdyjR0FalzMfFm4vYsqevT; Hm_lvt_65968db3ac154c3089d7f9a4cbb98c94=1550119570%2C1550119584%2C1550119751%2C1550196659; Hm_lpvt_65968db3ac154c3089d7f9a4cbb98c94=1550196663'
16     }
17 
18     context = ssl._create_unverified_context()  # 创建验证SSL上下文
19 
20     request = urllib.request.Request(url, headers=headers)
21     response = urllib.request.urlopen(request, context=context)  # 传入context参数
22     response_data = response.read().decode('utf-8')
23 
24     return response_data
25 
26 
27 result = create_cookie()
28 with open('cookies.html', 'w', encoding='utf-8') as f:
29     f.write(result)
View Code
 1 #!/usr/bin/env python
 2 # -*- coding=utf-8 -*-
 3 # Author: Snow
 4 
 5 
 6 import urllib.request
 7 import ssl
 8 
 9 
10 def create_cookie():
11     url = 'https://www.yaozh.com/member/'
12     headers = {
13         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko
14 Chrome/69.0.3497.92 Safari/537.36',
15         'Cookie': 'think_language=zh-CN; _ga=GA1.2.179792116.1550119571; _gat=1; acw_tc=2f624a2115501195808648935e4f2de7e89205315a7c9e8934c938389d8999; _gid=GA1.2.111857803.1550119581; yaozh_logintime=1550119751; yaozh_user=692948%09snown_1; yaozh_userId=692948; yaozh_uidhas=1; acw_tc=2f624a2115501195808648935e4f2de7e89205315a7c9e8934c938389d8999; MEIQIA_VISIT_ID=1H9g97Ef1WpjYsWf4b7UlGe3wel; PHPSESSID=5itl5rejqnekb07bfrtmuvr3l6; yaozh_mylogin=1550196658; MEIQIA_VISIT_ID=1HCCOYdyjR0FalzMfFm4vYsqevT; Hm_lvt_65968db3ac154c3089d7f9a4cbb98c94=1550119570%2C1550119584%2C1550119751%2C1550196659; Hm_lpvt_65968db3ac154c3089d7f9a4cbb98c94=1550196663'
16     }
17 
18     ssl._create_default_https_context = ssl._create_unverified_context  # 缺省context参数不做验证,取消验证ssl证书
19 
20     request = urllib.request.Request(url, headers=headers)
21     response = urllib.request.urlopen(request)
22     response_data = response.read().decode('utf-8')
23 
24     return response_data
25 
26 
27 result = create_cookie()
28 with open('cookies.html', 'w', encoding='utf-8') as f:
29     f.write(result)
View Code

雷二:HTTPError

  问题具体描述:urllib.error.HTTPError: HTTP Error 503: Service Temporarily Unavailable

 1 #!/usr/bin/env python
 2 # -*- coding=utf-8 -*-
 3 # Author: Snow
 4 
 5 import urllib.request
 6 
 7 
 8 def fee_proxy():
 9     url = 'https://www.xicidaili.com/nn/'
10 
11     # 付费代理IP第一种方式
12     # proxy_1 = {
13     #     'http': 'user_name:passswor@121.61.1.222:9999'
14     # }
15 
16     # 付费代理IP第二种方式
17     user_name = 'admin'
18     password = '123456'
19     proxy_ip = '121.61.1.222:9999'
20     proxy_manage = urllib.request.HTTPPasswordMgrWithDefaultRealm()  # 密码管理器
21     proxy_manage.add_password(None, proxy_ip, user_name, password)
22 
23     # proxy_handler = urllib.request.ProxyHandler(proxy_1)
24     proxy_handler = urllib.request.ProxyBasicAuthHandler(proxy_manage)  # 代理IP验证处理器
25     proxy_openner = urllib.request.build_opener(proxy_handler)
26 
27     response = proxy_openner.open(url)
28     response_str = response.read().decode('utf-8')
29 
30     return response_str
31 
32 
33 data = fee_proxy()
34 print(data)
View Code

  分析:HTTP Error 503 表示服务器(或代理)暂时无法处理请求。使用付费代理时,常见原因是代理 IP 已失效、代理认证信息错误,或目标站点对该代理/请求频率做了限制;此外,上面的代码只注册了 ProxyBasicAuthHandler 而没有注册 ProxyHandler,请求根本不会经过代理发出。

  解决办法:第一点,确认代理 IP 仍然可用且用户名、密码正确;第二点,用 ProxyHandler 和 ProxyBasicAuthHandler 一起 build_opener,让请求真正走代理并完成认证;第三点,降低请求频率或更换代理 IP 后重试。

原文地址:https://www.cnblogs.com/snow-lanuage/p/10361844.html