基于session以及验证码处理登录网站 -- 爬虫

  1 import http.client, mimetypes, urllib, json, time, requests
  2 
  3 
  4 class YDMHttp:
  5     apiurl = 'http://api.yundama.com/api.php'
  6     username = ''
  7     password = ''
  8     appid = ''
  9     appkey = ''
 10 
 11     def __init__(self, username, password, appid, appkey):
 12         self.username = username
 13         self.password = password
 14         self.appid = str(appid)
 15         self.appkey = appkey
 16 
 17     def request(self, fields, files=[]):
 18         response = self.post_url(self.apiurl, fields, files)
 19         response = json.loads(response)
 20         return response
 21 
 22     def balance(self):
 23         data = {'method': 'balance', 'username': self.username, 'password': self.password, 'appid': self.appid,
 24                 'appkey': self.appkey}
 25         response = self.request(data)
 26         if (response):
 27             if (response['ret'] and response['ret'] < 0):
 28                 return response['ret']
 29             else:
 30                 return response['balance']
 31         else:
 32             return -9001
 33 
 34     def login(self):
 35         data = {'method': 'login', 'username': self.username, 'password': self.password, 'appid': self.appid,
 36                 'appkey': self.appkey}
 37         response = self.request(data)
 38         if (response):
 39             if (response['ret'] and response['ret'] < 0):
 40                 return response['ret']
 41             else:
 42                 return response['uid']
 43         else:
 44             return -9001
 45 
 46     def upload(self, filename, codetype, timeout):
 47         data = {'method': 'upload', 'username': self.username, 'password': self.password, 'appid': self.appid,
 48                 'appkey': self.appkey, 'codetype': str(codetype), 'timeout': str(timeout)}
 49         file = {'file': filename}
 50         response = self.request(data, file)
 51         if (response):
 52             if (response['ret'] and response['ret'] < 0):
 53                 return response['ret']
 54             else:
 55                 return response['cid']
 56         else:
 57             return -9001
 58 
 59     def result(self, cid):
 60         data = {'method': 'result', 'username': self.username, 'password': self.password, 'appid': self.appid,
 61                 'appkey': self.appkey, 'cid': str(cid)}
 62         response = self.request(data)
 63         return response and response['text'] or ''
 64 
 65     def decode(self, filename, codetype, timeout):
 66         cid = self.upload(filename, codetype, timeout)
 67         if (cid > 0):
 68             for i in range(0, timeout):
 69                 result = self.result(cid)
 70                 if (result != ''):
 71                     return cid, result
 72                 else:
 73                     time.sleep(1)
 74             return -3003, ''
 75         else:
 76             return cid, ''
 77 
 78     def report(self, cid):
 79         data = {'method': 'report', 'username': self.username, 'password': self.password, 'appid': self.appid,
 80                 'appkey': self.appkey, 'cid': str(cid), 'flag': '0'}
 81         response = self.request(data)
 82         if (response):
 83             return response['ret']
 84         else:
 85             return -9001
 86 
 87     def post_url(self, url, fields, files=[]):
 88         for key in files:
 89             files[key] = open(files[key], 'rb')
 90         res = requests.post(url, files=files, data=fields)
 91         return res.text
 92 
 93 
 94 def parse_code_img(username, password, appid, appkey, filename, codetype):
 95     # 用户名
 96     username = username
 97     # 密码
 98     password = password
 99     # 软件ID,开发者分成必要参数。登录开发者后台【我的软件】获得!
100     appid = appid
101     # 软件密钥,开发者分成必要参数。登录开发者后台【我的软件】获得!
102     appkey = appkey
103     # 图片文件
104     filename = filename
105     # 验证码类型,# 例:1004表示4位字母数字,不同类型收费不同。请准确填写,否则影响识别率。在此查询所有类型 http://www.yundama.com/price.html
106     codetype = codetype
107     # 超时时间,秒
108     timeout = 30
109 
110     # 检查
111     if (username == ''):
112         print('请设置好相关参数再测试')
113     else:
114         # 初始化
115         yundama = YDMHttp(username, password, appid, appkey)
116         # 登陆云打码
117         uid = yundama.login()
118         print('uid: %s' % uid)
119         # 查询余额
120         balance = yundama.balance()
121         print('balance: %s' % balance)
122         # 开始识别,图片路径,验证码类型ID,超时时间(秒),识别结果
123         cid, result = yundama.decode(filename, codetype, timeout)
124         print('cid: %s, result: %s' % (cid, result))
125         return result
验证码处理(以上代码需保存为 YDMHTTPDemo.py,供下面的登录脚本导入 parse_code_img)
 1 from YDMHTTPDemo import parse_code_img
 2 from lxml import etree
 3 import requests
 4 
 5 # 创建一个session对象:会自动保存cookie
 6 session = requests.session()
 7 # 获取验证码的文本数据
 8 url = 'http://www.renren.com/'
 9 login_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2018113926761'
10 headers = {
11     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
12 }
13 data = {
14     "email": "13120303891",
15     "icode": "",
16     "origURL": "http://www.renren.com/home",
17     "domain": "renren.com",
18     "key_id": "1",
19     "captcha_type": "web_login",
20     "password": "0fa20889fa1bf409dda8d0d684a8b168aa1850283dd96ebd37462b843610d1cb",
21     "rkey": "fa647ac3744264c8eece684183c36fe5",
22     "f": "http%3A%2F%2Fzhibo.renren.com%2Ftop"
23 }
24 page_text = requests.get(url=url, headers=headers).text
25 # 解析验证码图片,保存本地
26 tree = etree.HTML(page_text)
27 code_img_url = tree.xpath('//img[@id="verifyPic_login"]/@src')[0]
28 
29 
def personal_info(url, headers, data):
    """Log in through the shared session and save the profile page.

    POSTs ``data`` to ``url`` with the module-level ``session`` so the
    cookies from the login response are stored in it, then downloads the
    profile page with the same session and writes its text to
    ``./renren.html`` as UTF-8.
    """
    # The login response itself is not needed; only the session cookies are.
    session.post(url=url, headers=headers, data=data)

    profile_response = session.get(
        url='http://www.renren.com/969091665/profile',
        headers=headers,
    )
    html = profile_response.text

    with open('./renren.html', 'w', encoding='utf-8') as out:
        out.write(html)
38 
39 
# Some sites only show a captcha after several failed login attempts.
if code_img_url:
    # Download the captcha through the session: presumably the server ties the
    # captcha to the session cookie, so it must be fetched with the same
    # cookies that the login request will send (TODO confirm).
    code_img_data = session.get(url=code_img_url, headers=headers).content
    code_path = './code.jpg'
    with open(code_path, 'wb') as f:
        f.write(code_img_data)
    # Recognise the captcha text.
    code_text = parse_code_img(filename=code_path, username='bobo328410948', password='bobo328410948', appid=6003,
                               appkey='1f4b564483ae5c907a1d34f8e2f2776c', codetype=2004)
    print(code_text)
    data["icode"] = code_text
# Log in (only needed to obtain the cookies) and fetch the profile page; when
# no captcha was shown, "icode" keeps the value already in data.
personal_info(url=login_url, headers=headers, data=data)
原文地址:https://www.cnblogs.com/wj12312/p/10108650.html