爬虫小实战-药监总局

 找到动态加载的请求路径和几个关键的 data 参数，先批量获取进入详情页所需的 id 号，

再将 id 号作为请求参数，逐个获取所有企业的详情信息。

import requests
import json
if __name__ == "__main__":
    # Scrape the NMPA (China drug administration) cosmetics-licence listing:
    # step 1 collects every company's detail-page ID, step 2 fetches each
    # detail record, step 3 dumps everything to ./allData.json.
    id_list = []        # IDs of every company detail page
    all_data_list = []  # accumulated detail records for all companies
    url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'
    }
    # 369 pages in total — range() is exclusive at the top, so the upper
    # bound must be 370 (the original range(1, 369) silently skipped page 369).
    for page in range(1, 370):
        data = {
            "on": "true",
            "page": str(page),
            "pageSize": "15",
            "productName": "",
            "conditionType": "1",
            "applyname": "",
            "applysn": ""
        }
        json_ids = requests.post(url=url, headers=headers, data=data).json()
        for dic in json_ids['list']:
            id_list.append(dic['ID'])
        print(id_list)

    # Fetch the full detail record for each collected ID.
    # Example detail pages:
    # http://scxk.nmpa.gov.cn:81/xk/itownet/portal/dzpz.jsp?id=c3ae6ecabbcf4ba68038321b80819753
    # http://scxk.nmpa.gov.cn:81/xk/itownet/portal/dzpz.jsp?id=c3854166c00f46b5b29fe2a55d3df929
    post_url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById'
    for company_id in id_list:  # renamed from `id` to avoid shadowing the builtin
        data = {
            'id': company_id
        }
        detail_json = requests.post(url=post_url, headers=headers, data=data).json()
        all_data_list.append(detail_json)

    # Persist everything; `with` guarantees the file is flushed and closed
    # (the original opened the file and never called fp.close()).
    with open('./allData.json', 'w', encoding='utf-8') as fp:
        json.dump(all_data_list, fp=fp, ensure_ascii=False)
    print('over!!!')
原文地址:https://www.cnblogs.com/wulianwangaxing/p/14396813.html