Python learning diary — mini exercise: scraping SoYoung (新氧) data

# Scrape SoYoung item data
import requests
import json
import xlwt
from bs4 import BeautifulSoup

proxies={"http": "http://49.70.64.155:9999", "https": "http://59.57.148.70:9999", }
# 初始化表格行数
row=0
def get_shuhouhuli(url_diclist):
    # Scrape the post-operative care ("shuhouhuli") section for each item URL.
    # Note: this helper is not called below; scra_data() repeats the same logic inline.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'
    }
    global row
    # Create the workbook once, outside the loop; creating and saving it per
    # iteration would overwrite xinyanginfo.xls and lose the earlier rows.
    workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = workbook.add_sheet('doctorinfo', cell_overwrite_ok=True)
    for url_dic in url_diclist:
        for k, v in url_dic.items():
            response = requests.get(v, headers=headers)
            soup = BeautifulSoup(response.text, 'lxml')
            shuhouhulilist = soup.select("#surgery_after > div > div")
            cols = 0
            for shuhouhuli in shuhouhulilist:
                print(shuhouhuli.text)
                sheet.write(row, cols, shuhouhuli.text)
                cols = cols + 1
            row = row + 1
    workbook.save("xinyanginfo.xls")

def get_finalurl(preurl):
    # Fetch the itemList JSON and build a list of {item name: detail-page URL} dicts.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'
    }
    finalurl = []
    try:
        response = json.loads(requests.get(preurl, headers=headers).text)
        for info in response:
            try:
                # Each entry carries a pinyin slug under "seo", which forms the detail URL.
                pinyin = info["seo"]["pinyin"]
                finalurl.append({info["name"]: "https://www.soyoung.com/itemk/" + pinyin + "/"})
            except (KeyError, TypeError):
                print(info)
    except Exception:
        print(preurl + " is unavailable")
    return finalurl

def scra_data():
    global row
    workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = workbook.add_sheet('xinyanginfo', cell_overwrite_ok=True)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
    }
    url = ""
    try:
        for i in range(20155, 20244):
            # First-level URL: the item-list JSON endpoint for this menu_id
            url = "https://www.soyoung.com/items/itemList?_json=1&menu_id=" + str(i)
            # From the first-level URL, get a list of {name: detail URL} dicts
            finalurldic = get_finalurl(url)
            # Scrape the post-operative care info from each second-level URL
            for url_dic in finalurldic:
                # Each dict holds a single name -> URL pair
                for k, v in url_dic.items():
                    response = requests.get(v, headers=headers)
                    soup = BeautifulSoup(response.text, 'lxml')
                    shuhouhulilist = soup.select("#surgery_after > div > div")
                    cols = 2
                    sheet.write(row, 0, k)  # item name
                    sheet.write(row, 1, v)  # item detail URL
                    for shuhouhuli in shuhouhulilist:
                        sheet.write(row, cols, shuhouhuli.text)
                        cols = cols + 1
                    row = row + 1
    except Exception as e:
        # Save whatever has been collected so far and report where scraping stopped.
        workbook.save("xinyanginfo.xls")
        print(url, e)
    workbook.save("xinyanginfo.xls")

scra_data()

Recording the scraping code here. Because of SoYoung's anti-scraping policy, the proxies need to be swapped frequently; it takes roughly four runs to collect the full data set.
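
If direct requests start failing, one way to avoid hand-editing the proxies dict on every run is to rotate through a small pool automatically. This is only a minimal sketch of that idea, not part of the script above: the proxy addresses are placeholders you would replace with live ones, and get_with_proxy is a hypothetical helper name.

# Minimal sketch: rotate through a pool of proxies, assuming you maintain the pool yourself.
import itertools
import requests

PROXY_POOL = [
    {"http": "http://49.70.64.155:9999", "https": "http://59.57.148.70:9999"},
    # ...add more proxies here as the old ones get blocked
]
proxy_cycle = itertools.cycle(PROXY_POOL)

def get_with_proxy(url, headers, retries=3):
    # Try the request through successive proxies, giving up after `retries` attempts.
    for _ in range(retries):
        proxy = next(proxy_cycle)
        try:
            return requests.get(url, headers=headers, proxies=proxy, timeout=10)
        except requests.RequestException:
            continue  # switch to the next proxy and retry
    return None

Replacing the requests.get calls above with get_with_proxy(v, headers) would then retry through the pool before giving up on a URL.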

Original post: https://www.cnblogs.com/ftxy/p/11831306.html