模拟登录爬取数据、线程池的使用

云打码平台

  • 注册:普通用户和开发者用户
  • 登录:
  • 登录开发者用户
  • 创建一个软件:我的软件 -> 创建软件
  • 下载示例代码:开发者中心 -> 下载最新的DLL -> pythonHttp示例代码下载

1、模拟人人网登录

代码示例

  1 import http.client, mimetypes, urllib, json, time, requests
  2 
  3 ######################################################################
  4 
  5 class YDMHttp:
  6 
  7     apiurl = 'http://api.yundama.com/api.php'
  8     username = ''
  9     password = ''
 10     appid = ''
 11     appkey = ''
 12 
 13     def __init__(self, username, password, appid, appkey):
 14         self.username = username  
 15         self.password = password
 16         self.appid = str(appid)
 17         self.appkey = appkey
 18 
 19     def request(self, fields, files=[]):
 20         response = self.post_url(self.apiurl, fields, files)
 21         response = json.loads(response)
 22         return response
 23     
 24     def balance(self):
 25         data = {'method': 'balance', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey}
 26         response = self.request(data)
 27         if (response):
 28             if (response['ret'] and response['ret'] < 0):
 29                 return response['ret']
 30             else:
 31                 return response['balance']
 32         else:
 33             return -9001
 34     
 35     def login(self):
 36         data = {'method': 'login', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey}
 37         response = self.request(data)
 38         if (response):
 39             if (response['ret'] and response['ret'] < 0):
 40                 return response['ret']
 41             else:
 42                 return response['uid']
 43         else:
 44             return -9001
 45 
 46     def upload(self, filename, codetype, timeout):
 47         data = {'method': 'upload', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'codetype': str(codetype), 'timeout': str(timeout)}
 48         file = {'file': filename}
 49         response = self.request(data, file)
 50         if (response):
 51             if (response['ret'] and response['ret'] < 0):
 52                 return response['ret']
 53             else:
 54                 return response['cid']
 55         else:
 56             return -9001
 57 
 58     def result(self, cid):
 59         data = {'method': 'result', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'cid': str(cid)}
 60         response = self.request(data)
 61         return response and response['text'] or ''
 62 
 63     def decode(self, filename, codetype, timeout):
 64         cid = self.upload(filename, codetype, timeout)
 65         if (cid > 0):
 66             for i in range(0, timeout):
 67                 result = self.result(cid)
 68                 if (result != ''):
 69                     return cid, result
 70                 else:
 71                     time.sleep(1)
 72             return -3003, ''
 73         else:
 74             return cid, ''
 75 
 76     def report(self, cid):
 77         data = {'method': 'report', 'username': self.username, 'password': self.password, 'appid': self.appid, 'appkey': self.appkey, 'cid': str(cid), 'flag': '0'}
 78         response = self.request(data)
 79         if (response):
 80             return response['ret']
 81         else:
 82             return -9001
 83 
 84     def post_url(self, url, fields, files=[]):
 85         for key in files:
 86             files[key] = open(files[key], 'rb');
 87         res = requests.post(url, files=files, data=fields)
 88         return res.text
 89 
 90 ######################################################################
 91 
 92 # 用户名(普通用户)
 93 username    = 'bobo328410948'
 94 
 95 # 密码
 96 password    = 'bobo328410948'                            
 97 
 98 # 软件ID,开发者分成必要参数。登录开发者后台【我的软件】获得!
 99 appid       = 6003                                    
100 
101 # 软件密钥,开发者分成必要参数。登录开发者后台【我的软件】获得!
102 appkey      = '1f4b564483ae5c907a1d34f8e2f2776c'    
103 
104 # 图片文件
105 filename    = 'getimage.jpg'                        
106 
107 # 验证码类型,# 例:1004表示4位字母数字,不同类型收费不同。请准确填写,否则影响识别率。在此查询所有类型 http://www.yundama.com/price.html
108 codetype    = 1004
109 
110 # 超时时间,秒
111 timeout     = 10                                    
112 
113 # 检查
114 if (username == 'username'):
115     print('请设置好相关参数再测试')
116 else:
117     # 初始化
118     yundama = YDMHttp(username, password, appid, appkey)
119 
120     # 登陆云打码
121     uid = yundama.login();
122     print('uid: %s' % uid)
123 
124     # 查询余额
125     balance = yundama.balance();
126     print('balance: %s' % balance)
127 
128     # 开始识别,图片路径,验证码类型ID,超时时间(秒),识别结果
129     cid, result = yundama.decode(filename, codetype, timeout);
130     print('cid: %s, result: %s' % (cid, result))
131 
132 ######################################################################
View Code

解析验证码

 1 def getCodeDate(userName,pwd,codePath,codeType):
 2     # 用户名(普通用户)
 3     username    = userName
 4 
 5     # 密码
 6     password    = pwd                            
 7 
 8     # 软件ID,开发者分成必要参数。登录开发者后台【我的软件】获得!
 9     appid       = 6003                                    
10 
11     # 软件密钥,开发者分成必要参数。登录开发者后台【我的软件】获得!
12     appkey      = '1f4b564483ae5c907a1d34f8e2f2776c'    
13 
14     # 图片文件
15     filename    = codePath                       
16 
17     # 验证码类型,# 例:1004表示4位字母数字,不同类型收费不同。请准确填写,否则影响识别率。在此查询所有类型 http://www.yundama.com/price.html
18     codetype    = codeType
19 
20     # 超时时间,秒
21     timeout     = 2                                   
22     result = None
23     # 检查
24     if (username == 'username'):
25         print('请设置好相关参数再测试')
26     else:
27         # 初始化
28         yundama = YDMHttp(username, password, appid, appkey)
29 
30         # 登陆云打码
31         uid = yundama.login();
32         #print('uid: %s' % uid)
33 
34         # 查询余额
35         balance = yundama.balance();
36         #print('balance: %s' % balance)
37 
38         # 开始识别,图片路径,验证码类型ID,超时时间(秒),识别结果
39         cid, result = yundama.decode(filename, codetype, timeout);
40         #print('cid: %s, result: %s' % (cid, result))
41     return result
View Code

利用抓包工具获取请求的url和参数,这里发送的是post请求

模拟人人网登录

 1 import requests
 2 import urllib
 3 from lxml import etree
 4 #获取session对象
 5 session = requests.Session()
 6 #将验证码图片进行下载
 7 headers = {
 8     'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
 9 }
10 url = 'http://www.renren.com/'
11 page_text = requests.get(url=url,headers=headers).text
12 
13 tree = etree.HTML(page_text)
14 code_img_url = tree.xpath('//*[@id="verifyPic_login"]/@src')[0]
15 urllib.request.urlretrieve(url=code_img_url,filename='code.jpg')
16 
17 #识别验证码图片中的数据值
18 code_data = getCodeDate('bobo328410948','bobo328410948','./code.jpg',2004)
19 
20 #模拟登录
21 login_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=201914927558'
22 data = {
23     "email":"www.zhangbowudi@qq.com",
24     "icode":code_data,
25     "origURL":"http://www.renren.com/home",
26     "domain":"renren.com",
27     "key_id":"1",
28     "captcha_type":"web_login",
29     "password":"4f0350f09aeffeef86307747218b214b0960bdf35e30811c0d611fe39db96ec1",
30     "rkey":"9e75e8dc3457b14c55a74627fa64fb43",
31     "f":"http%3A%2F%2Fwww.renren.com%2F289676607",
32 }
33 #该次请求产生的cookie会被自动存储到session对象中
34 session.post(url=login_url,data=data,headers=headers)
35 
36 url = ""  # 这里是登录之后才能访问的页面的url
37 page_text = session.get(url=url,headers=headers).text
38 
39 with open('renren.html','w',encoding='utf-8') as fp:
40     fp.write(page_text)
View Code

2、利用线程池爬取数据

import requests
import re
from lxml import etree
from multiprocessing.dummy import Pool
import random
 1 #实例化一个线程池对象
 2 pool = Pool(5)
 3 url = 'https://www.pearvideo.com/category_1'
 4 headers = {
 5     'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
 6 }
 7 page_text = requests.get(url=url,headers=headers).text
 8 tree = etree.HTML(page_text)
 9 li_list = tree.xpath('//div[@id="listvideoList"]/ul/li')
10 
11 video_url_list = []
12 for li in li_list:
13     detail_url = 'https://www.pearvideo.com/'+li.xpath('./div/a/@href')[0]
14     detail_page = requests.get(url=detail_url,headers=headers).text
15     video_url = re.findall('srcUrl="(.*?)",vdoUrl',detail_page,re.S)[0]
16     video_url_list.append(video_url)
17     
18 video_data_list = pool.map(getVideoData,video_url_list)
19 
20 pool.map(saveVideo,video_data_list)

由于我们要获取的视频链接不在标签里,而是在 js 代码中,因此只能通过正则表达式来获取

通过传给线程池 map 方法的函数来下载和保存列表里的视频(这两个函数需要定义在调用 pool.map 之前)

def getVideoData(url):
    """Fetch ``url`` with the module-level ``headers`` and return the response content."""
    response = requests.get(url=url, headers=headers)
    return response.content


def saveVideo(data):
    """Write ``data`` to a new ``<0-5000>.mp4`` file in the current directory.

    The name is drawn at random from only 5001 candidates, so collisions are
    likely when many videos are saved. The file is therefore opened in
    exclusive-creation mode and a new name is drawn whenever the chosen one
    already exists, instead of overwriting an earlier download.

    Raises:
        FileExistsError: if no unused name is found after 1000 draws.
    """
    for _ in range(1000):
        fileName = str(random.randint(0, 5000)) + '.mp4'
        try:
            # 'x' fails if the file exists, which also keeps two pool
            # workers from writing to the same name.
            fp = open(fileName, 'xb')
        except FileExistsError:
            continue
        with fp:
            fp.write(data)
        return
    raise FileExistsError('no unused <0-5000>.mp4 file name found')
原文地址:https://www.cnblogs.com/liaopeng123/p/10452827.html