2019-03-15: Using a Requests POST request to fetch PDF files from 中加基金 and download them locally

import requests
import time
base_url='http://www.bobbns.com/common-web/cms/content!getContentsIncludeSubCategoryOrderByHitCountDesc?noCache=1552542874867'
# User-Agent is required; include as many of the other headers as you can
headers = {
    'Host': 'www.bobbns.com',
    'Referer': 'http://www.bobbns.com/byfy/zhuanxiang/index.html',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER',
    'X-Requested-With': 'XMLHttpRequest'
}

def get_page(pageNumber):
    # pageNumber is the pagination parameter
    data = {
        'siteId': 'f44c6a2aee134f608a82af7561debf57',
        'categoryId': '7f6b707209f5482984438df86ec64ecd',
        'pageNumber': pageNumber,
        'pageSize': '10'
    }
    try:
        response = requests.post(url=base_url, data=data, headers=headers)
        if response.status_code == 200:
            return response.json()
    except Exception as e:
        print(e)

def parse_page(json):
    # Pull the title and absolute PDF URL out of each entry in 'contents'
    if json:
        items = json.get('contents')
        for item in items:
            results = {}
            results['title'] = item.get('title')
            results['url'] = 'http://www.bobbns.com' + item.get('url')
            yield results

if __name__ == '__main__':
    for pageNumber in range(1, 9):
        json = get_page(pageNumber)
        results = parse_page(json)
        for result in results:
            # headers must be passed as a keyword argument; passed positionally
            # it would be treated as query params, not request headers
            rep = requests.get(result['url'], headers=headers)
            time.sleep(6)  # throttle downloads to be polite to the server
            with open(r'./PDF/{}.pdf'.format(result['title']), 'wb') as f:
                f.write(rep.content)
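Two practical caveats with the saving step: the titles returned by the API may contain characters that are not valid in file names (for example '/' or ':'), and the './PDF' directory has to exist before the files are written. Below is a minimal sketch of a save helper that handles both, assuming the same result dict shape ('title', 'url') produced by parse_page above; the sanitize and save_pdf names are my own, not part of the original post.

import os
import re
import requests

def sanitize(title):
    # Replace characters that are illegal in Windows/Unix file names
    return re.sub(r'[\\/:*?"<>|]', '_', title).strip()

def save_pdf(result, headers, out_dir='./PDF'):
    # Create the output directory on first use
    os.makedirs(out_dir, exist_ok=True)
    rep = requests.get(result['url'], headers=headers, timeout=30)
    rep.raise_for_status()
    path = os.path.join(out_dir, '{}.pdf'.format(sanitize(result['title'])))
    with open(path, 'wb') as f:
        f.write(rep.content)
    return path

Calling save_pdf(result, headers) inside the main loop would replace the open(...) block while keeping the time.sleep(6) throttle unchanged.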
Original post: https://www.cnblogs.com/theDataDigger/p/10536879.html