Python crawler: logging in to the Zhihu web page with an account, password and captcha

The code comes first; the problems that came up are analysed below:

#coding:utf-8
from bs4 import BeautifulSoup
import gzip
import json
import urllib.request
import urllib.parse
import http.cookiejar
import ssl
import time

def get_opener(heads):
    # Build an opener that keeps cookies across requests and
    # sends the custom headers with every request.
    cj = http.cookiejar.CookieJar()
    pro = urllib.request.HTTPCookieProcessor(cj)
    opener = urllib.request.build_opener(pro)
    header = []
    for key, value in heads.items():
        header.append((key, value))
    opener.addheaders = header
    return opener

def ungzip(data):
    # The server may return a gzip-compressed body; decompress it if so.
    try:
        print("Decompressing....")
        data = gzip.decompress(data)
        print("Decompression finished")
    except OSError:
        print("No decompression needed")
    return data

if __name__ == "__main__":
    ssl._create_default_https_context = ssl._create_unverified_context
    heads = {
            "Accept": "text/html, application/xhtml+xml, */*",
            "Accept-Language": "zh-CN",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0",
            "Accept-Encoding": "gzip, deflate",
            "Host": "www.zhihu.com",
            "DNT": "1",
            "Connection": "Keep-Alive"
            }
    opener = get_opener(heads)
    url = "https://www.zhihu.com/"
    op = opener.open(url)
    data1 = op.read()
    data1 = ungzip(data1).decode('utf-8')
    # Pull the _xsrf token out of the hidden input on the page
    # (it could also be extracted with a regex, but BeautifulSoup is cleaner).
    soup = BeautifulSoup(data1, "html.parser")
    _xsrf = soup.find("input", {'type': 'hidden'}).get("value")
    password = "your_password"        # fill in your own account's password
    phone_num = "your_phone_number"   # the phone number used to log in
    # captcha_type = "cn"  # the login post must NOT include this field
    # Download the captcha with the same opener (same cookie session),
    # save it locally, and type it in by hand.
    captcha_url = "https://www.zhihu.com/captcha.gif?r=%d&type=login" % (time.time() * 1000)
    captchadata = opener.open(captcha_url).read()
    with open("1.gif", 'wb') as file:
        file.write(captchadata)
    yanzhengma = input("captcha:")
    postdata = {
        "_xsrf": _xsrf,
        "password": password,
        # "captcha_type": captcha_type,  # must not be sent
        "phone_num": phone_num,
        "captcha": yanzhengma
        }
    postdata = urllib.parse.urlencode(postdata).encode()
    login_url = "https://www.zhihu.com/login/phone_num"
    op2 = opener.open(login_url, postdata)
    login_data = op2.read()
    data = ungzip(login_data).decode("utf-8")
    print(data)
    result = json.loads(data)  # the response is JSON; json.loads is safer than eval
    if result["r"] == 0:
        print("Login succeeded")
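A possible follow-up, not part of the original post: the CookieJar above only lives in memory, so the login is lost when the script exits. A minimal sketch with http.cookiejar.LWPCookieJar (the file name zhihu_cookies.txt is my own choice) keeps the session across runs:

import http.cookiejar
import urllib.request

# Sketch: back the cookie jar with a file so the logged-in session
# can be reused on the next run without logging in again.
cj = http.cookiejar.LWPCookieJar("zhihu_cookies.txt")
try:
    cj.load(ignore_discard=True, ignore_expires=True)  # reuse saved cookies if any
except FileNotFoundError:
    pass
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
# ... log in with this opener as above, then persist the cookies:
cj.save(ignore_discard=True, ignore_expires=True)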

1. The error "SSLError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:581)":

Since Python 2.7.9 (and the matching Python 3 releases), urllib verifies the SSL certificate whenever it opens an https URL. If the target site uses a self-signed certificate, the request fails with:

urllib.URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:581)>

Workaround:

import ssl
ssl._create_default_https_context = ssl._create_unverified_context
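
This patch turns off certificate verification for the whole process. A minimal alternative sketch, assuming Python 3.4.3 or later (where urllib.request.HTTPSHandler accepts a context argument), limits the unverified context to a single opener:

import ssl
import http.cookiejar
import urllib.request

# Disable certificate verification only for this opener,
# leaving the process-wide default untouched.
ctx = ssl._create_unverified_context()
opener = urllib.request.build_opener(
    urllib.request.HTTPSHandler(context=ctx),
    urllib.request.HTTPCookieProcessor(http.cookiejar.CookieJar()),
)

opener.addheaders can then be set exactly as in get_opener above.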

2. A captcha error comes back from the login request: { "r": 1, "errcode": 1991829, "data": {"captcha":"验证码回话无效 :(","name":"ERR_VERIFY_CAPTCHA_SESSION_INVALID"}, "msg": "验证码回话无效 :(" } (the msg means "the captcha session is invalid"). Two causes and their fixes:

  1. The post data sent to the server does not carry the "captcha" field. Fix: include it in postdata:
            postdata = {
                "_xsrf": _xsrf,
                "password": password,
                # "captcha_type": captcha_type,  # must not be sent
                "phone_num": phone_num,
                "captcha": yanzhengma
                }
  2. The captcha has expired. Fix: first download the image from "https://www.zhihu.com/captcha.gif?r=%d&type=login" % (time.time() * 1000) with the same opener (so it belongs to the same cookie session), save it locally, then read it by eye and type it in manually; an optional sketch for displaying the image follows the snippet below.

captcha_url = "https://www.zhihu.com/captcha.gif?r=%d&type=login" % (time.time() * 1000)
captchadata = opener.open(captcha_url).read()
with open("1.gif", 'wb') as file:
    file.write(captchadata)
yanzhengma = input("captcha:")
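
To skip opening the image by hand, here is a small optional sketch; it assumes the Pillow package is installed, and show_captcha is just an illustrative helper name, not part of the original script:

from PIL import Image  # optional dependency: Pillow

def show_captcha(path="1.gif"):
    # Open the downloaded captcha in the system's default image viewer
    # so it can be read and typed in at the prompt.
    Image.open(path).show()

show_captcha()
yanzhengma = input("captcha:")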

 

Original article: https://www.cnblogs.com/yizhenfeng168/p/6972876.html