python爬虫：登录百度账户，并上传文件到百度云盘

/**
 * Created by resolvewang on 2017/4/15.
 */
/**
 * Build a random, upper-cased identifier from a UUID-v4-like template.
 * Every "x" becomes a random hex digit; the "y" becomes one of 8, 9, a or b.
 */
function getGid() {
    var template = "xxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx";
    var gid = template.replace(/[xy]/g, function (placeholder) {
        var nibble = Math.floor(16 * Math.random());
        var digit = placeholder === "x" ? nibble : (nibble & 3) | 8;
        return digit.toString(16);
    });
    return gid.toUpperCase();
}

/**
 * Build a JSONP callback name: "bd__cbs__" followed by a random
 * base-36 number below 2^31.
 */
function getCallback() {
    var suffix = Math.floor(2147483648 * Math.random()).toString(36);
    return "bd__cbs__" + suffix;
}

Python实现代码：

#-*- coding:utf-8 -*-
__author__ = 'Administrator'

import time
import json
import re
import requests
import execjs
import base64
from urllib.parse import urlencode
from requests_toolbelt import MultipartEncoder
from Crypto.Cipher import PKCS1_v1_5
from Crypto.PublicKey import RSA
from hashlib import md5
from zlib import crc32

# Silence urllib3 warnings (several requests below pass verify=False).
# Some requests builds may not expose ``requests.packages``; that is not
# fatal, so any ordinary error here is ignored -- but KeyboardInterrupt and
# SystemExit are no longer swallowed.
try:
    requests.packages.urllib3.disable_warnings()
except Exception:
    pass

# Browser-like default headers shared by the passport requests.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
           }

# Module-wide session; the first GET is presumably there to pick up the
# initial cookies from pan.baidu.com.
session = requests.session()
session.get('https://pan.baidu.com', headers=headers)

class BufferReader(MultipartEncoder):
    """Streaming multipart/form-data encoder that reports upload progress.

    Every ``read`` forwards to ``MultipartEncoder.read`` and then invokes the
    optional callback with the keyword arguments ``size`` (total encoded
    length) and ``progress`` (bytes handed out so far), in addition to
    ``cb_args`` / ``cb_kwargs``.
    """

    def __init__(self, fields, boundary=None, callback=None, cb_args=(), cb_kwargs=None):
        self._callback = callback
        self._progress = 0
        self._cb_args = cb_args
        # Copy so that read() never mutates the dict owned by the caller.
        self._cb_kwargs = dict(cb_kwargs) if cb_kwargs else {}
        super(BufferReader, self).__init__(fields, boundary)

    def read(self, size=None):
        """Return the next chunk of the encoded body and report progress."""
        chunk = super(BufferReader, self).read(size)
        self._progress += len(chunk)
        if self._callback:
            # ``len`` is the public accessor; the private ``_len`` cache may
            # still be unset if nobody asked for the length yet.
            self._cb_kwargs.update(size=self.len, progress=self._progress)
            try:
                self._callback(*self._cb_args, **self._cb_kwargs)
            except Exception:
                # Progress reporting is best-effort: a failing callback must
                # not abort the upload.
                pass
        return chunk

def _get_runntime(path='login.js'):
    """Compile the helper JavaScript and return the PyExecJS context.

    :param path: path of the JavaScript file to compile (defaults to
        ``login.js`` in the working directory).  Keep the script free of
        Chinese text; PyExecJS reportedly has trouble with it.
    :return: the compiled context; call functions with ``.call(name)``.
    """
    # execjs.get() returns the default runtime.  Per the original note,
    # PhantomJS must be discoverable through the environment (or be given
    # by explicit path).
    runtime = execjs.get()
    with open(path, 'r') as f:
        source = f.read()
    return runtime.compile(source)

def get_gid():
    """Return a fresh gid by running ``getGid`` from the compiled JS."""
    js_context = _get_runntime()
    return js_context.call('getGid')

def get_callback():
    """Return a fresh callback name by running ``getCallback`` from the compiled JS."""
    js_context = _get_runntime()
    return js_context.call('getCallback')

def _get_curtime():
    return int(time.time()*1000)

# Packet captures are not fully reliable here: "?getapi" has to follow
# https://passport.baidu.com/v2/api/ directly, otherwise the request is
# routed to the wrong handler.
def get_token(gid, callback):
    """Request a login token from the passport ``getapi`` endpoint.

    :param gid: client identifier, as produced by ``get_gid``.
    :param callback: callback name, as produced by ``get_callback``.
    :return: the token string, or ``None`` when the request fails or the
        response carries no token.
    """
    get_data = {
        'tpl': 'netdisk',
        'subpro': 'netdisk_web',
        'apiver': 'v3',
        'tt': _get_curtime(),
        'class': 'login',
        'gid': gid,
        'logintype': 'basicLogin',
        'callback': callback
    }
    # NOTE: this updates the shared module-level ``headers`` dict in place.
    headers.update(dict(Referer='http://pan.baidu.com/', Accept='*/*', Connection='keep-alive', Host='passport.baidu.com'))
    resp = session.get(url='https://passport.baidu.com/v2/api/?getapi', params=get_data, headers=headers)
    if resp.status_code == 200 and callback in resp.text:
        # The body is expected to look like ``callback({...})``; capture the
        # text between the outer parentheses.  The parentheses must be
        # escaped -- unescaped they are just groups and match everything.
        payload = re.search(r'.*?\((.*)\)', resp.text)
        if payload:
            # json.loads rejects single-quoted strings, so normalise quotes.
            data = json.loads(payload.group(1).replace("'", '"'))
            return (data.get('data') or {}).get('token')
    print('获取token失败')
    return None

def get_rsa_key(token, gid, callback):
    """Fetch the RSA public key used to encrypt the password.

    :param token: token returned by ``get_token``.
    :param gid: client identifier, as produced by ``get_gid``.
    :param callback: callback name, as produced by ``get_callback``.
    :return: ``(pubkey, key)`` taken from the response; ``(None, None)`` on
        failure, so callers that unpack two values do not crash.
    """
    get_data = {
        'token': token,
        'tpl': 'netdisk',
        'subpro': 'netdisk_web',
        'apiver': 'v3',
        'tt': _get_curtime(),
        'gid': gid,
        'callback': callback,
    }
    resp = session.get(url='https://passport.baidu.com/v2/getpublickey', headers=headers, params=get_data)
    if resp.status_code == 200 and callback in resp.text:
        # Capture the text between the outer parentheses of
        # ``callback({...})``; the parentheses must be escaped.
        payload = re.search(r'.*?\((.*)\)', resp.text)
        if payload:
            # json.loads rejects single-quoted strings, so normalise quotes.
            data = json.loads(payload.group(1).replace("'", '"'))
            return data.get('pubkey'), data.get('key')
    print('获取rsa key失败')
    return None, None

def encript_password(password, pubkey):
    """Encrypt *password* with the RSA public key and base64-encode it.

    Uses PyCrypto's PKCS#1 v1.5 cipher.  The same result shape can be had
    with the ``rsa`` package::

        pub = rsa.PublicKey.load_pkcs1_openssl_pem(pubkey.encode('utf-8'))
        return base64.b64encode(rsa.encrypt(password.encode('utf-8'), pub)).decode('utf-8')

    :param password: plain-text password (str).
    :param pubkey: public key text (str).
    :return: base64 text of the ciphertext (str).
    """
    # Both the key material and the message must be bytes.
    rsa_key = RSA.importKey(pubkey.encode('utf-8'))
    cipher = PKCS1_v1_5.new(rsa_key)
    ciphertext = cipher.encrypt(password.encode('utf-8'))
    encoded = base64.b64encode(ciphertext)
    return encoded.decode('utf-8')

def login(token, gid, callback, rsakey, username, password):
    """Submit the login form to passport and report the outcome.

    :param token: token returned by ``get_token``.
    :param gid: client identifier, as produced by ``get_gid``.
    :param callback: callback name; sent prefixed with ``parent.``.
    :param rsakey: key returned together with the public key.
    :param username: account name.
    :param password: password already encrypted by ``encript_password``.
    :return: ``True`` when the response contains ``err_no=0``, else ``False``
        (previously nothing was returned, so callers could not tell).
    """
    post_data = {
        'staticpage': 'http://pan.baidu.com/res/static/thirdparty/pass_v3_jump.html',
        'charset': 'utf-8',
        'token': token,
        'tpl': 'netdisk',
        'subpro': 'netdisk_web',
        'apiver': 'v3',
        'tt': _get_curtime(),
        'codestring': '',
        'safeflg': 0,
        'u': 'http://pan.baidu.com/disk/home',
        'isPhone': '',
        'detect': 1,
        'gid': gid,
        'quick_user': 0,
        'logintype': 'basicLogin',
        'logLoginType': 'pc_loginBasic',
        'idc': '',
        'loginmerge': 'true',
        'foreignusername': '',
        'username': username,
        'password': password,
        'mem_pass': 'on',
        # the key returned by the public-key request
        'rsakey': rsakey,
        'crypttype': 12,
        'ppui_logintime': 33554,
        'countrycode': '',
        'callback': 'parent.'+callback
    }
    resp = session.post(url='https://passport.baidu.com/v2/api/?login', data=post_data, headers=headers)
    succeeded = 'err_no=0' in resp.text
    print('登录成功' if succeeded else '登录失败')
    return succeeded

def upload(dest_path,file_handle,token):
    """Upload a file to the PCS file endpoint as a streamed multipart POST.

    :param dest_path: remote destination path.
    :param file_handle: open binary file object to send.
    :param token: sent as ``bdstoken``.
    :return: requests.Response
    """
    query = {
        'method': 'upload',
        'app_id': "250528",
        'BDUSS': session.cookies['BDUSS'],
        't': str(int(time.time())),
        'bdstoken': token,
        'path': dest_path,
        'ondup': "newcopy"
    }
    # The multipart part is named after the current timestamp.
    part_name = str(int(time.time()))
    endpoint = 'https://{0}/rest/2.0/pcs/file'.format('pcs.baidu.com')
    api = '%s?%s' % (endpoint, urlencode(query))
    body = BufferReader({'file': (part_name, file_handle)})
    request_headers = {
        "Referer": "http://pan.baidu.com/disk/home",
        "User-Agent": "netdisk;4.6.2.0;PC;PC-Windows;10.0.10240;WindowsBaiduYunGuanJia",
        "Content-Type": body.content_type,
    }
    return session.post(api, data=body, verify=False, headers=request_headers)

def rapidupload(dest_path,file_handler,token):
    """Rapid-upload a file: send only its length and checksums.

    :param file_handler: file handle, e.g. open('file','rb')
    :type file_handler: file

    :param dest_path: destination path on the server, including file name
    :type dest_path: str

    :return: requests.Response
        .. note::
            * content already on the server, nothing uploaded; sample reply
            {
                "path" : "/apps/album/1.jpg",
                "size" : 372121,
                "ctime" : 1234567890,
                "mtime" : 1234567890,
                "md5" : "cb123afcc12453543ef",
                "fs_id" : 12345,
                "isdir" : 0,
                "request_id" : 12314124
            }
            * content unknown, a real upload is needed
            {"errno":404,"info":[],"request_id":XXX}
            * file smaller than 256 KB (slice-md5 == content-md5)
            {"errno":2,"info":[],"request_id":XXX}
            * remote file already exists
            {"errno":-8,"info":[],"request_id":XXX}
    """
    block_size = 2 ** 20

    # Seek to the end to learn the length, then rewind for hashing.
    file_handler.seek(0, 2)
    content_length = file_handler.tell()
    file_handler.seek(0)

    # The verification slice is the first 256 KB; it also seeds the
    # whole-file md5 and crc32.
    head = file_handler.read(256 * 1024)
    slice_md5 = md5(head).hexdigest()
    running_crc = crc32(head)
    whole_md5 = md5(head)

    # Fold the remainder of the file into both checksums, 1 MB at a time.
    block = file_handler.read(block_size)
    while block:
        running_crc = crc32(block, running_crc)
        whole_md5.update(block)
        block = file_handler.read(block_size)

    query = {
        'method': 'rapidupload',
        'app_id': "250528",
        'BDUSS': session.cookies['BDUSS'],
        't': str(int(time.time())),
        'bdstoken': token,
        'path': dest_path,
        'ondup': "newcopy"
    }
    form = {
        'content-length': content_length,
        'content-md5': whole_md5.hexdigest(),
        'slice-md5': slice_md5,
        'content-crc32': str(running_crc & 0xFFFFFFFF)
    }
    request_headers = {
        "Referer": "http://pan.baidu.com/disk/home",
        "User-Agent": "netdisk;4.6.2.0;PC;PC-Windows;10.0.10240;WindowsBaiduYunGuanJia",
    }
    endpoint = 'https://{0}/rest/2.0/pcs/file'.format('pcs.baidu.com')
    api = '%s?%s' % (endpoint, urlencode(query))
    return session.post(api, data=form, verify=False, headers=request_headers)

if __name__ == '__main__':
    user = 'xxx'  # account name
    password = 'xxx'  # account password

    cur_gid = get_gid()
    cur_callback = get_callback()
    cur_token = get_token(cur_gid, cur_callback)
    # get_rsa_key has already printed the failure; stop here instead of
    # crashing while unpacking a missing key.
    key_info = get_rsa_key(cur_token, cur_gid, cur_callback)
    if not key_info or key_info[0] is None:
        raise SystemExit(1)
    cur_pubkey, cur_key = key_info
    # Separate name so that the encript_password function is not shadowed.
    encrypted_password = encript_password(password, cur_pubkey)
    login(cur_token, cur_gid, cur_callback, cur_key, user, encrypted_password)
    # print("cookies:%s" %(session.cookies['BDUSS']))

    # Regular upload, for reference:
    # with open("temp.txt", 'rb') as fh:
    #     res = upload("/hello/temp.txt", fh, cur_token)
    # print(res.content.decode('utf-8'))

    # The with-block closes the file once the request is done.
    with open("words.txt", 'rb') as fh:
        res = rapidupload("/hello/words.txt", fh, cur_token)
    print(res.content.decode('utf-8'))

  1 #-*- coding:utf-8 -*-
  2 __author__ = 'Administrator'
  3 
  4 import time
  5 import json
  6 import re
  7 import requests
  8 import execjs
  9 import base64
 10 from urllib.parse import urlencode
 11 from requests_toolbelt import MultipartEncoder
 12 from Crypto.Cipher import PKCS1_v1_5
 13 from Crypto.PublicKey import RSA
 14 from hashlib import md5
 15 from zlib import crc32
 16 # import progressbar
 17 import sys
 18 from contextlib import closing
 19 import time
 20 import os
 21 
 22 try:
 23     requests.packages.urllib3.disable_warnings()
 24 except:
 25     pass
 26 
 27 headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 '
 28                          '(KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
 29            }
 30 
 31 # 全局的session
 32 session = requests.session()
 33 session.get('https://pan.baidu.com', headers=headers)
 34 
 35 
 36 class BufferReader(MultipartEncoder):
 37     """将multipart-formdata转化为stream形式的Proxy类
 38     """
 39 
 40     def __init__(self, fields, boundary=None, callback=None, cb_args=(), cb_kwargs=None):
 41         self._callback = callback
 42         self._progress = 0
 43         self._cb_args = cb_args
 44         self._cb_kwargs = cb_kwargs or {}
 45         super(BufferReader, self).__init__(fields, boundary)
 46 
 47     def read(self, size=None):
 48         chunk = super(BufferReader, self).read(size)
 49         self._progress += int(len(chunk))
 50         self._cb_kwargs.update({
 51             'size': self._len,
 52             'progress': self._progress
 53         })
 54         if self._callback:
 55             try:
 56                 self._callback(*self._cb_args, **self._cb_kwargs)
 57             except:  # catches exception from the callback
 58                 # raise CancelledError('The upload was cancelled.')
 59                 pass
 60         return chunk
 61 
 62 class ProgressBar():
 63     """
 64     import progressbar
 65     使用第三方库显示上传进度
 66 
 67     """
 68     def __init__(self):
 69         self.first_call = True
 70     def __call__(self, *args, **kwargs):
 71         if self.first_call:
 72             self.widgets = [progressbar.Percentage(), ' ', progressbar.Bar(marker=progressbar.RotatingMarker('>')),
 73                             ' ', progressbar.FileTransferSpeed()]
 74             self.pbar = progressbar.ProgressBar(widgets=self.widgets, maxval=kwargs['size']).start()
 75             self.first_call = False
 76 
 77         if kwargs['size'] <= kwargs['progress']:
 78             self.pbar.finish()
 79         else:
 80             self.pbar.update(kwargs['progress'])
 81 
 82 
 83 def _get_runntime():
 84     """
 85     :param path: 加密js的路径,注意js中不要使用中文！估计是pyexecjs处理中文还有一些问题
 86     :return: 编译后的js环境，不清楚pyexecjs这个库的用法的请在github上查看相关文档
 87     """
 88     phantom = execjs.get()  # 这里必须为phantomjs设置环境变量，否则可以写phantomjs的具体路径
 89     with open('login.js', 'r') as f:
 90         source = f.read()
 91     return phantom.compile(source)
 92 
 93 def get_gid():
 94     return _get_runntime().call('getGid')
 95 
 96 def get_callback():
 97     return _get_runntime().call('getCallback')
 98 
 99 def _get_curtime():
100     return int(time.time()*1000)
101 
# Packet captures are not fully reliable here: "?getapi" has to follow
# https://passport.baidu.com/v2/api/ directly, otherwise the request is
# routed to the wrong handler.
def get_token(gid, callback):
    """Request a login token from the passport ``getapi`` endpoint.

    :param gid: client identifier, as produced by ``get_gid``.
    :param callback: callback name, as produced by ``get_callback``.
    :return: the token string, or ``None`` when the request fails or the
        response carries no token.
    """
    get_data = {
        'tpl': 'netdisk',
        'subpro': 'netdisk_web',
        'apiver': 'v3',
        'tt': _get_curtime(),
        'class': 'login',
        'gid': gid,
        'logintype': 'basicLogin',
        'callback': callback
    }
    # NOTE: this updates the shared module-level ``headers`` dict in place.
    headers.update(dict(Referer='http://pan.baidu.com/', Accept='*/*', Connection='keep-alive', Host='passport.baidu.com'))
    resp = session.get(url='https://passport.baidu.com/v2/api/?getapi', params=get_data, headers=headers)
    if resp.status_code == 200 and callback in resp.text:
        # The body is expected to look like ``callback({...})``; capture the
        # text between the outer parentheses.  The parentheses must be
        # escaped -- unescaped they are just groups and match everything.
        payload = re.search(r'.*?\((.*)\)', resp.text)
        if payload:
            # json.loads rejects single-quoted strings, so normalise quotes.
            data = json.loads(payload.group(1).replace("'", '"'))
            return (data.get('data') or {}).get('token')
    print('获取token失败')
    return None
125 
def get_rsa_key(token, gid, callback):
    """Fetch the RSA public key used to encrypt the password.

    :param token: token returned by ``get_token``.
    :param gid: client identifier, as produced by ``get_gid``.
    :param callback: callback name, as produced by ``get_callback``.
    :return: ``(pubkey, key)`` taken from the response; ``(None, None)`` on
        failure, so callers that unpack two values do not crash.
    """
    get_data = {
        'token': token,
        'tpl': 'netdisk',
        'subpro': 'netdisk_web',
        'apiver': 'v3',
        'tt': _get_curtime(),
        'gid': gid,
        'callback': callback,
    }
    resp = session.get(url='https://passport.baidu.com/v2/getpublickey', headers=headers, params=get_data)
    if resp.status_code == 200 and callback in resp.text:
        # Capture the text between the outer parentheses of
        # ``callback({...})``; the parentheses must be escaped.
        payload = re.search(r'.*?\((.*)\)', resp.text)
        if payload:
            # json.loads rejects single-quoted strings, so normalise quotes.
            data = json.loads(payload.group(1).replace("'", '"'))
            return data.get('pubkey'), data.get('key')
    print('获取rsa key失败')
    return None, None
144 
def encript_password(password, pubkey):
    """Encrypt *password* with the RSA public key and base64-encode it.

    Uses PyCrypto's PKCS#1 v1.5 cipher.  The same result shape can be had
    with the ``rsa`` package::

        pub = rsa.PublicKey.load_pkcs1_openssl_pem(pubkey.encode('utf-8'))
        return base64.b64encode(rsa.encrypt(password.encode('utf-8'), pub)).decode('utf-8')

    :param password: plain-text password (str).
    :param pubkey: public key text (str).
    :return: base64 text of the ciphertext (str).
    """
    # The key material must be bytes.
    pub = RSA.importKey(pubkey.encode('utf-8'))
    # Build the "encryptor".
    encryptor = PKCS1_v1_5.new(pub)
    # The message must be bytes as well.
    encript_passwd = encryptor.encrypt(password.encode('utf-8'))
    return base64.b64encode(encript_passwd).decode('utf-8')
161 
def login(token, gid, callback, rsakey, username, password):
    """Submit the login form to passport and report the outcome.

    :param token: token returned by ``get_token``.
    :param gid: client identifier, as produced by ``get_gid``.
    :param callback: callback name; sent prefixed with ``parent.``.
    :param rsakey: key returned together with the public key.
    :param username: account name.
    :param password: password already encrypted by ``encript_password``.
    :return: ``True`` when the response contains ``err_no=0``, else ``False``
        (previously nothing was returned, so callers could not tell).
    """
    post_data = {
        'staticpage': 'http://pan.baidu.com/res/static/thirdparty/pass_v3_jump.html',
        'charset': 'utf-8',
        'token': token,
        'tpl': 'netdisk',
        'subpro': 'netdisk_web',
        'apiver': 'v3',
        'tt': _get_curtime(),
        'codestring': '',
        'safeflg': 0,
        'u': 'http://pan.baidu.com/disk/home',
        'isPhone': '',
        'detect': 1,
        'gid': gid,
        'quick_user': 0,
        'logintype': 'basicLogin',
        'logLoginType': 'pc_loginBasic',
        'idc': '',
        'loginmerge': 'true',
        'foreignusername': '',
        'username': username,
        'password': password,
        'mem_pass': 'on',
        # the key returned by the public-key request
        'rsakey': rsakey,
        'crypttype': 12,
        'ppui_logintime': 33554,
        'countrycode': '',
        'callback': 'parent.'+callback
    }
    resp = session.post(url='https://passport.baidu.com/v2/api/?login', data=post_data, headers=headers)
    succeeded = 'err_no=0' in resp.text
    print('登录成功' if succeeded else '登录失败')
    return succeeded
def progressbar(size=None, progress=None,progress_title="已完成",finish_title="全部完成"):
    """Print a one-line percentage report for a transfer.

    :param size: total number of bytes to transfer.
    :param progress: number of bytes transferred so far; anything at or
        beyond *size* is reported as 100 %.
    :param progress_title: label used while the transfer is running.
    :param finish_title: label used once the transfer is complete.
    """
    # NOTE(review): the line terminator was a string literal broken across
    # two source lines; it is restored here as "\n" -- confirm it was not
    # meant to be "\r".
    if size and progress < size:
        percent = int((progress / size) * 100)
        sys.stdout.write(progress_title + "： " + str(percent) + ' % ' + "\n")
    else:
        # Complete (or nothing to transfer): always 100 %, which also avoids
        # dividing by a zero size.
        sys.stdout.write(finish_title + "： " + str(100) + ' % ' + "\n")
    sys.stdout.flush()
207 
def upload(dest_path,file_handle,token,callback=None):
    """Upload a file, trying rapid upload first and a real upload second.

    :param dest_path: remote destination path.
    :param file_handle: open, seekable binary file object.
    :param token: sent as ``bdstoken``.
    :param callback: optional progress callback for the real upload; it is
        called with the keyword arguments ``size`` and ``progress``.
    :return: requests.Response of whichever request produced the result.
    """
    res = rapidupload(dest_path, file_handle, token)
    result = json.loads(res.content.decode('utf-8'))
    # error_code 31079 is the only reply that triggers a real upload
    # (presumably "content not known to the server" -- TODO confirm).
    if result.get("error_code", -1) != 31079:
        print("using rapidupload....")
        return res

    print("using upload....")
    # rapidupload() read the handle to the end while hashing; without this
    # rewind the multipart body would contain no file data.
    file_handle.seek(0)
    params = {
        'method': 'upload',
        'app_id': "250528",
        'BDUSS': session.cookies['BDUSS'],
        't': str(int(time.time())),
        'bdstoken': token,
        'path': dest_path,
        'ondup': "newcopy"
    }
    # The multipart part is named after the current timestamp.
    files = {'file': (str(int(time.time())), file_handle)}
    url = 'https://{0}/rest/2.0/pcs/file'.format('pcs.baidu.com')
    api = '%s?%s' % (url, urlencode(params))
    body = BufferReader(files, callback=callback)
    header = {
        "Referer": "http://pan.baidu.com/disk/home",
        "User-Agent": "netdisk;4.6.2.0;PC;PC-Windows;10.0.10240;WindowsBaiduYunGuanJia",
        "Content-Type": body.content_type,
    }
    return session.post(api, data=body, verify=False, headers=header)
240 
def rapidupload(dest_path,file_handler,token,callback=None):
    """Rapid-upload a file: send only its length and checksums.

    :param file_handler: file handle, e.g. open('file','rb')
    :type file_handler: file

    :param dest_path: destination path on the server, including file name
    :type dest_path: str

    :param token: sent as ``bdstoken``.
    :param callback: optional progress callback for the hashing pass; it is
        called as ``callback(size=total_bytes, progress=bytes_hashed)``.

    :return: requests.Response
        .. note::
            * content already on the server, nothing uploaded; sample reply
            {
                "path" : "/apps/album/1.jpg",
                "size" : 372121,
                "ctime" : 1234567890,
                "mtime" : 1234567890,
                "md5" : "cb123afcc12453543ef",
                "fs_id" : 12345,
                "isdir" : 0,
                "request_id" : 12314124
            }
            * content unknown, a real upload is needed
            {"errno":404,"info":[],"request_id":XXX}
            * file smaller than 256 KB (slice-md5 == content-md5)
            {"errno":2,"info":[],"request_id":XXX}
            * remote file already exists
            {"errno":-8,"info":[],"request_id":XXX}
    """
    params = {
        'method': 'rapidupload',
        'app_id': "250528",
        'BDUSS': session.cookies['BDUSS'],
        't': str(int(time.time())),
        'bdstoken': token,
        'path': dest_path,
        'ondup': "newcopy"
    }
    header = {
        "Referer": "http://pan.baidu.com/disk/home",
        "User-Agent": "netdisk;4.6.2.0;PC;PC-Windows;10.0.10240;WindowsBaiduYunGuanJia",
    }
    url = 'https://{0}/rest/2.0/pcs/file'.format('pcs.baidu.com')
    api = '%s?%s' % (url, urlencode(params))

    _BLOCK_SIZE = 2 ** 20  # 1 MB
    # Seek to the end to learn the length, then rewind for hashing.
    file_handler.seek(0, 2)
    content_length = file_handler.tell()
    file_handler.seek(0)

    # The verification slice is the first 256 KB; it also seeds the
    # whole-file md5 and crc32.
    head = file_handler.read(256 * 1024)
    slice_md5 = md5(head).hexdigest()
    content_crc32 = crc32(head)
    content_md5 = md5(head)

    # Report the bytes actually hashed, not a block count times the block
    # size, which overshot and ignored the slice above.
    hashed = len(head)
    if callback:
        callback(size=content_length, progress=hashed)
    while True:
        block = file_handler.read(_BLOCK_SIZE)
        if not block:
            break
        content_crc32 = crc32(block, content_crc32)
        content_md5.update(block)
        hashed += len(block)
        if callback:
            callback(size=content_length, progress=hashed)

    data = {
        'content-length': content_length,
        'content-md5': content_md5.hexdigest(),
        'slice-md5': slice_md5,
        'content-crc32': '%d' % (content_crc32 & 0xFFFFFFFF)
    }
    return session.post(api, data=data, verify=False, headers=header)
327 
def download(remote_path,file_path,token):
    """Stream a single netdisk file into *file_path*, printing progress.

    The download endpoint honours the standard HTTP ``Range`` header
    (e.g. ``Range: bytes=0-99`` returns the first 100 bytes), so a resumable
    download could be built by requesting successive ranges and appending
    them to the file; this function fetches the whole file in one streamed
    request.

    :param remote_path: path of the file on the netdisk, including the file
                        name.  Must start with ``/``.
                        .. warning::
                            * at most 1000 characters;
                            * must not contain any of ``\\ ? | " > < : *``;
                            * names must not start or end with ``.`` or
                              whitespace.
    :param file_path: local file to write (overwritten).
    :param token: sent as ``bdstoken``.
    :return: None
    """
    params = {
        'method': 'download',
        'app_id': "250528",
        'BDUSS': session.cookies['BDUSS'],
        't': str(int(time.time())),
        'bdstoken': token,
        'path': remote_path
    }
    # pcs.baidu.com is still accepted; per the original note d.pcs.baidu.com
    # is the newer, faster download host.
    url = 'https://{0}/rest/2.0/pcs/file'.format('d.pcs.baidu.com')
    header = {
        "Referer": "http://pan.baidu.com/disk/home",
        "User-Agent": "netdisk;4.6.2.0;PC;PC-Windows;10.0.10240;WindowsBaiduYunGuanJia",
    }
    with closing(session.get(url, params=params, verify=False, headers=header, stream=True)) as response:
        chunk_size = 1024
        # The header may be absent; progress is then simply not reported.
        total_size = int(response.headers.get('content-length', 0))
        written = 0
        with open(file_path, 'wb') as file:
            for data in response.iter_content(chunk_size=chunk_size):
                file.write(data)
                # Count real bytes so the report never overshoots.
                written += len(data)
                if total_size:
                    progressbar(size=total_size, progress=written,
                                progress_title="正在下载", finish_title="下载完成")
394 
def get_filesize(rote_path,token):
    """Request the metadata of one remote path (PCS ``meta`` method).

    :param rote_path: remote file path, e.g. '/aaa.txt'
    :param token: sent as ``bdstoken``.
    :return: requests.Response
    """
    params = {
        'method': 'meta',
        'app_id': "250528",
        'BDUSS': session.cookies['BDUSS'],
        't': str(int(time.time())),
        'bdstoken': token,
        'path': rote_path
    }
    url = 'https://{0}/rest/2.0/pcs/file'.format('pcs.baidu.com')
    header = {
        "Referer": "http://pan.baidu.com/disk/home",
        "User-Agent": "netdisk;4.6.2.0;PC;PC-Windows;10.0.10240;WindowsBaiduYunGuanJia",
    }
    return session.get(url, params=params, verify=False, headers=header)
416 
def meta(file_list,token):
    """Request the meta information of one or more remote files.

    :param file_list: list of remote paths, e.g. ['/aaa.txt']; a single
        path is wrapped into a list.
    :type file_list: list
    :param token: sent as ``bdstoken``.

    :return: requests.Response
        .. note ::
        Sample replies

        * file does not exist

        {"errno":12,"info":[{"errno":-9}],"request_id":3294861771}

        * file exists (abridged)
        {
            "errno": 0,
            "info": [
                {
                    "fs_id": 12345,
                    "path": "/dir/file.rar",
                    "server_filename": "file.rar",
                    "size": 8292134,
                    "server_mtime": 1391274570,
                    "server_ctime": 1391274570,
                    "isdir": 0,
                    "block_list": ["76b469302a02b42fd0a548f1a50dd8ac"],
                    "md5": "76b469302a02b42fd0a548f1a50dd8ac",
                    "errno": 0
                }
            ],
            "request_id": 2964868977
        }
    """
    if not isinstance(file_list, list):
        file_list = [file_list]
    data = {'target': json.dumps(file_list)}
    params = {
        'method': 'filemetas',
        'app_id': "250528",
        'BDUSS': session.cookies['BDUSS'],
        't': str(int(time.time())),
        'bdstoken': token
    }
    header = {
        "Referer": "http://pan.baidu.com/disk/home",
        "User-Agent": "netdisk;4.6.2.0;PC;PC-Windows;10.0.10240;WindowsBaiduYunGuanJia",
    }
    # The base URL already carries a query string, so the extra parameters
    # are appended with "&".  The debug prints (which echoed the token) are
    # gone.
    url = 'http://pan.baidu.com/api/{0}'.format('filemetas?blocks=0&dlink=1')
    api = "%s&%s" % (url, urlencode(params))
    return session.post(api, data=data, verify=False, headers=header)
508 
if __name__ == '__main__':
    user = 'xxx'
    password = 'xxx'

    cur_gid = get_gid()
    cur_callback = get_callback()
    cur_token = get_token(cur_gid, cur_callback)
    # get_rsa_key has already printed the failure; stop here instead of
    # crashing while unpacking a missing key.
    key_info = get_rsa_key(cur_token, cur_gid, cur_callback)
    if not key_info or key_info[0] is None:
        raise SystemExit(1)
    cur_pubkey, cur_key = key_info
    # Separate name so that the encript_password function is not shadowed.
    encrypted_password = encript_password(password, cur_pubkey)
    login(cur_token, cur_gid, cur_callback, cur_key, user, encrypted_password)
    # print("cookies:%s" %(session.cookies['BDUSS']))

    # The with-block closes the file once the upload is done.
    with open("test_BaiduPan.py", 'rb') as fh:
        res = upload("/hello/word.py", fh, cur_token, callback=progressbar)
    print(res.content.decode('utf-8'))

    # Other calls, for reference:
    # res=rapidupload("/hello/traindata.js",open("login.js",'rb'),cur_token,callback=progressbar)
    # print(json.loads(res.content.decode('utf-8')))

    # download("/hello/words.txt","word.txt",cur_token)

    # res=get_filesize("/hello/words",cur_token)
    # print(res.content.decode('utf-8'))

    # res=meta("/hello/words.txt",cur_token)
    # print(res.content)

  1 #-*- coding:utf-8 -*-
  2 __author__ = 'Administrator'
  3 
  4 import time
  5 import json
  6 import re
  7 import requests
  8 import execjs
  9 import base64
 10 from urllib.parse import urlencode
 11 from requests_toolbelt import MultipartEncoder
 12 from Crypto.Cipher import PKCS1_v1_5
 13 from Crypto.PublicKey import RSA
 14 from hashlib import md5
 15 from zlib import crc32
 16 # import progressbar
 17 import sys
 18 from contextlib import closing
 19 import time
 20 import os
 21 from io import BytesIO
 22 
 23 try:
 24     requests.packages.urllib3.disable_warnings()
 25 except:
 26     pass
 27 
 28 # class BufferReader(MultipartEncoder):
 29 #     """将multipart-formdata转化为stream形式的Proxy类
 30 #     """
 31 #     def __init__(self, fields, boundary=None, callback=None, cb_args=(), cb_kwargs=None):
 32 #         self._callback = callback
 33 #         self._progress = 0
 34 #         self._cb_args = cb_args
 35 #         self._cb_kwargs = cb_kwargs or {}
 36 #         super(BufferReader, self).__init__(fields, boundary)
 37 #
 38 #     def read(self, size=None):
 39 #         chunk = super(BufferReader, self).read(size)
 40 #         self._progress += int(len(chunk))
 41 #         self._cb_kwargs.update({
 42 #             'size': self._len,
 43 #             'progress': self._progress
 44 #         })
 45 #         if self._callback:
 46 #             try:
 47 #                 self._callback(*self._cb_args, **self._cb_kwargs)
 48 #             except:  # catches exception from the callback
 49 #                 # raise CancelledError('The upload was cancelled.')
 50 #                 pass
 51 #         return chunk
 52 
 53 class BufferReader(BytesIO):
 54     """
 55     """
 56     def __init__(self, filebytes, callback=None):
 57         self._callback = callback
 58         self._progress = 0
 59         self._size =len(filebytes)
 60         super(BufferReader, self).__init__(filebytes)
 61 
 62     def read(self, size=-1):
 63         chunk_size=8192
 64         chunk = BytesIO.read(self，chunk_size)
 65         self._progress += int(len(chunk))
 66         if self._callback:
 67             self._callback(self._size,self._progress)
 68         return chunk
 69 
 70 class PCSBase():
 71     def __init__(self,username,password):
 72         self.session=requests.session()
 73         self.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 '
 74                          '(KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
 75            }
 76         self.session.get('https://pan.baidu.com', headers=self.headers)
 77         self.username=username
 78         self.password=password
 79         self.user={}
 80         self.cur_gid=self.get_gid()
 81         self.cur_callback=self.get_callback()
 82         self.cur_time=self._get_curtime()
 83         self._initiate()#登录成功，并获取session.cookies
 84 
 85     def _initiate(self):
 86         self.user['token']= self.get_token()
 87         # print("token:%s" %(self.get_token()))
 88         self.login()
 89         # print("cookies:%s" %(session.cookies['BDUSS']))
 90     def _get_runntime(self):
 91         """
 92         :param path: 加密js的路径,注意js中不要使用中文！估计是pyexecjs处理中文还有一些问题
 93         :return: 编译后的js环境，不清楚pyexecjs这个库的用法的请在github上查看相关文档
 94         """
 95         phantom = execjs.get()  # 这里必须为phantomjs设置环境变量，否则可以写phantomjs的具体路径
 96         with open('login.js', 'r') as f:
 97             source = f.read()
 98         return phantom.compile(source)
 99 
100     def get_gid(self):
101         return self._get_runntime().call('getGid')
102 
103     def get_callback(self):
104         return self._get_runntime().call('getCallback')
105 
106     def _get_curtime(self):
107         return int(time.time()*1000)
108         # 抓包也不是百分百可靠啊,这里?getapi一定要挨着https://passport.baidu.com/v2/api/写，才会到正确的路由
109     def get_token(self):
110         get_data = {
111             'tpl': 'netdisk',
112             'subpro': 'netdisk_web',
113             'apiver': 'v3',
114             'tt':self.cur_time,
115             'class': 'login',
116             'gid': self.cur_gid,
117             'logintype': 'basicLogin',
118             'callback': self.cur_callback
119         }
120         self.headers.update(dict(Referer='http://pan.baidu.com/', Accept='*/*', Connection='keep-alive', Host='passport.baidu.com'))
121         resp = self.session.get(url='https://passport.baidu.com/v2/api/?getapi', params=get_data, headers=self.headers)
122         if resp.status_code == 200 and self.cur_callback in resp.text:
123             # 如果json字符串中带有单引号，会解析出错，只有统一成双引号才可以正确的解析
124             #data = eval(re.search(r'.*?((.*))', resp.text).group(1))
125             data = json.loads(re.search(r'.*?((.*))', resp.text).group(1).replace("'", '"'))
126             return data.get('data').get('token')
127         else:
128             print('获取token失败')
129             return None
130 
131     def get_rsa_key(self):
132         get_data = {
133             'token': self.user['token'],
134             'tpl': 'netdisk',
135             'subpro': 'netdisk_web',
136             'apiver': 'v3',
137             'tt': self.cur_time,
138             'gid': self.cur_gid,
139             'callback': self.cur_callback
140         }
141         resp = self.session.get(url='https://passport.baidu.com/v2/getpublickey', headers=self.headers, params=get_data)
142         if resp.status_code == 200 and self.cur_callback in resp.text:
143             data = json.loads(re.search(r'.*?((.*))', resp.text).group(1).replace("'", '"'))
144             return data.get('pubkey'), data.get('key')
145         else:
146             print('获取rsa key失败')
147             return None
148 
149     def encript_password(self,pubkey):
150         """
151         import rsa
152         使用rsa库加密（法一）
153         pub = rsa.PublicKey.load_pkcs1_openssl_pem(pubkey.encode('utf-8'))
154         encript_passwd = rsa.encrypt(password.encode('utf-8'), pub)
155         return base64.b64encode(encript_passwd).decode('utf-8')
156 
157         """
158         # pubkey必须为bytes类型
159         pub=RSA.importKey(pubkey.encode('utf-8'))
160         #构造“加密器”
161         encryptor=PKCS1_v1_5.new(pub)
162         #加密的内容必须为bytes类型
163         encript_passwd =encryptor.encrypt(self.password.encode('utf-8'))
164         return base64.b64encode(encript_passwd).decode('utf-8')
165 
166     def login(self):
167         cur_pubkey, cur_key = self.get_rsa_key()
168         encript_password =self.encript_password(cur_pubkey)
169         post_data = {
170             'staticpage': 'http://pan.baidu.com/res/static/thirdparty/pass_v3_jump.html',
171             'charset': 'utf-8',
172             'token': self.user['token'],
173             'tpl': 'netdisk',
174             'subpro': 'netdisk_web',
175             'apiver': 'v3',
176             'tt': self.cur_time,
177             'codestring': '',
178             'safeflg': 0,
179             'u': 'http://pan.baidu.com/disk/home',
180             'isPhone': '',
181             'detect': 1,
182             'gid': self.cur_gid,
183             'quick_user': 0,
184             'logintype': 'basicLogin',
185             'logLoginType': 'pc_loginBasic',
186             'idc': '',
187             'loginmerge': 'true',
188             'foreignusername': '',
189             'username': self.username,
190             'password': encript_password,
191             'mem_pass': 'on',
192             # 返回的key
193             'rsakey': cur_key,
194             'crypttype': 12,
195             'ppui_logintime': 33554,
196             'countrycode': '',
197             'callback': 'parent.'+self.cur_callback
198         }
199         resp = self.session.post(url='https://passport.baidu.com/v2/api/?login', data=post_data, headers=self.headers)
200         if 'err_no=0' in resp.text:
201             print('登录成功')
202             self.user['BDUSS'] = self.session.cookies['BDUSS']
203         else:
204             print('登录失败')
205             self.user['BDUSS']=None
206 
207     def _request(self,url,data=None,files=None,extra_params=None,callback=None):
208         params={
209             'app_id': "250528",
210             'BDUSS': self.user['BDUSS'],
211             't': str(int(time.time())),
212             'bdstoken': self.user['token']
213         }
214         if extra_params:
215             params.update(extra_params)
216         # print("params:%s" %params)
217         baibupan_header = {"Referer": "http://pan.baidu.com/disk/home",
218                     "User-Agent": "netdisk;4.6.2.0;PC;PC-Windows;10.0.10240;WindowsBaiduYunGuanJia"}
219         header= dict(baibupan_header.items())
220         if data or files:
221             api = '%s?%s' % (url, urlencode(params))
222             # print("api:%s" %api)
223             if data:
224                 res=self.session.post(api,data=data,verify=False, headers=header)
225                 return res
226             else:
227                 # print(callback==None)
228                 (filedata,contenttype)=requests.packages.urllib3.filepost.encode_multipart_formdata(files)
                    body=BufferReader(filedata,callback=callback)
229                 # print("body:%s" %type(body))
230                 header.update({
231                     "Content-Type": contenttype
232                 })
233                 # print("header:%s" %header)
234                 res=self.session.post(api,data=body,verify=False, headers=header)
235                 return res
236         else:
237             res=self.session.get(url,params=params,verify=False, headers=header,stream=True)
238             return  res
239 
class PCS(PCSBase):
    """File operations (upload, rapid upload, download) on Baidu PCS.

    :param username: Baidu account name.
    :param password: plain-text account password.
    """

    def __init__(self, username, password):
        self.username = username
        self.password = password
        super(PCS, self).__init__(self.username, self.password)

    def upload(self, remote_path, file_handler, callback=None):
        """Upload content to ``remote_path`` as a multipart POST.

        :param remote_path: destination path on the net disk.
        :param file_handler: the content for the multipart ``file`` field; the
            ``__main__`` example in this file passes raw ``bytes``.
        :param callback: progress callback, called positionally as
            ``callback(total_size, bytes_sent)``.
        :return: the ``requests`` response object.
        """
        params = {
            'method': 'upload',
            'path': remote_path,
            'ondup': "newcopy"
        }
        # The multipart filename is just the current Unix timestamp.
        files = {'file': (str(int(time.time())), file_handler)}
        url = 'https://{0}/rest/2.0/pcs/file'.format('pcs.baidu.com')
        response = self._request(url, files=files, extra_params=params, callback=callback)
        return response

    def rapid_upload(self, remote_path, file_handler, callback=None):
        """Send a "rapidupload" request built from the file's checksums.

        Only the length, MD5, CRC32 and the MD5 of the first 256 KB are sent;
        the file content itself is not.

        :param remote_path: destination path on the net disk.
        :param file_handler: seekable binary file object.
        :param callback: optional progress callback, called as
            ``callback(size=total_bytes, progress=bytes_hashed)`` while the
            file is being hashed.
        :return: the ``requests`` response object.
        """
        params = {
            'method': "rapidupload",
            'path': remote_path,
            'ondup': "newcopy"
        }
        url = 'https://{0}/rest/2.0/pcs/file'.format('pcs.baidu.com')
        _BLOCK_SIZE = 2 ** 20  # hash the remainder in 1 MB blocks

        # Determine the total length, then rewind.
        file_handler.seek(0, 2)
        content_length = file_handler.tell()
        file_handler.seek(0)

        # The verification slice is the first 256 KB.
        first_256bytes = file_handler.read(256 * 1024)
        slice_md5 = md5(first_256bytes).hexdigest()
        content_crc32 = crc32(first_256bytes)
        content_md5 = md5(first_256bytes)

        # Report the number of bytes actually hashed, so progress never
        # overshoots the file size and no update fires for the empty EOF read.
        hashed = len(first_256bytes)
        if callback:
            callback(size=content_length, progress=hashed)
        while True:
            block = file_handler.read(_BLOCK_SIZE)
            if not block:
                break
            # fold this block into the running crc32 and md5
            content_crc32 = crc32(block, content_crc32)
            content_md5.update(block)
            hashed += len(block)
            if callback:
                callback(size=content_length, progress=hashed)

        data = {
            'content-length': content_length,
            'content-md5': content_md5.hexdigest(),
            'slice-md5': slice_md5,
            'content-crc32': '%d' % (content_crc32 & 0xFFFFFFFF)
        }
        response = self._request(url, data=data, extra_params=params, callback=callback)
        return response

    def download(self, remote_path, local_path, callback=None):
        """Stream ``remote_path`` from the net disk into ``local_path``.

        :param remote_path: source path on the net disk.
        :param local_path: local file to (over)write.
        :param callback: optional progress callback, called as
            ``callback(size=total_bytes, progress=bytes_written)``. When
            omitted, ``self.progressbar`` is used with download captions.
        """
        params = {
            'method': "download",
            'path': remote_path
        }

        def report(total, done):
            if callback is not None:
                callback(size=total, progress=done)
            else:
                self.progressbar(size=total, progress=done, progress_title="正在下载", finish_title="下载完成")

        # The legacy host pcs.baidu.com still works; per the original author's
        # note the newer d.pcs.baidu.com host is faster and more stable.
        url = 'https://{0}/rest/2.0/pcs/file'.format('d.pcs.baidu.com')
        with closing(self._request(url, extra_params=params)) as response:
            chunk_size = 1024  # bytes pulled from the response per iteration
            # Content-Length may be absent; 0 then means "size unknown".
            total_size = int(response.headers.get('content-length', 0))
            written = 0
            with open(local_path, 'wb') as file:
                for data in response.iter_content(chunk_size=chunk_size):
                    file.write(data)
                    written += len(data)
                    if total_size:
                        report(total_size, written)
            if not total_size:
                # Unknown size: emit a single completion update at the end.
                report(written, written)

    def progressbar(self, size=None, progress=None, progress_title="正在上传", finish_title="上传完成"):
        """Write a one-line percentage report to stdout.

        :param size: total number of bytes.
        :param progress: number of bytes transferred so far.
        :param progress_title: caption used while ``progress < size``.
        :param finish_title: caption used once the transfer is complete.
        """
        if size and progress < size:
            percent = int((progress / size) * 100)
            sys.stdout.write(progress_title + "： " + str(percent) + ' % ' + "\n")
            sys.stdout.flush()
        else:
            # Complete (or nothing to transfer): always 100 %, which also
            # avoids dividing by zero for an empty file.
            sys.stdout.write(finish_title + "： " + str(100) + ' % ' + "\n")
324 
325 
if __name__ == '__main__':
    # Demo: upload login.js, repeat it as a rapid upload, then download it.
    username = "xxx"
    password = "xxx"
    pcs = PCS(username, password)
    # Context managers make sure both file handles are closed again.
    with open("login.js", 'rb') as source:
        res = pcs.upload("/hello/word.js", source.read(), callback=pcs.progressbar)
    print(res.content.decode('utf-8'))
    with open("login.js", 'rb') as source:
        res = pcs.rapid_upload("/hello/word.js", source, callback=pcs.progressbar)
    print(res.content.decode('utf-8'))
    pcs.download("/hello/word.js", "temp.js")