爬取药智网中的方剂信息。

为了防止爬取过快被限制 IP,每次爬取完一个页面就休眠 6 秒。

初学爬虫,写得有点简单。

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import csv
import time

# Output CSV path that every scraped record is appended to.
ExcelName = "F:/大学/毕业设计/资料文档/方剂.csv"
# Write the CSV header row (uncomment and run once before the first crawl):
# with open(ExcelName, 'w', encoding='utf-8', newline='') as csvfile:
#     writer = csv.writer(csvfile)
#     writer.writerow(["方名","出处","功用大类" ,"功用小类","处方","炮制","功用","主治","附方"])

def get_contents(ulist, url):
    """Scrape one formula (方剂) detail page and append its fields to the CSV.

    Parses the page's <tr> rows, extracting the value for each known field
    label, then appends one row to the CSV at ``ExcelName``.

    Args:
        ulist: unused; kept for backward compatibility with existing callers
            (results are written straight to the CSV, not accumulated here).
        url: URL of the formula detail page to fetch.
    """
    headers = {
        # Pretend to be a regular browser.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
        # Session cookie copied from a logged-in browser session.
        # NOTE(review): this will expire — refresh it when requests start
        # returning login pages. (The original had a raw newline inside this
        # string literal, which was a SyntaxError; it is joined here.)
        'Cookie': 'kztoken=nJail6zJp6iXaJqWmGpnZmlwYZyZ; his=a%3A10%3A%7Bi%3A0%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqYZya%22%3Bi%3A1%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqYpeX%22%3Bi%3A2%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqYpmU%22%3Bi%3A3%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqYpqS%22%3Bi%3A4%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqYpuU%22%3Bi%3A5%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqY5aV%22%3Bi%3A6%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqY5aa%22%3Bi%3A7%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqY5mX%22%3Bi%3A8%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlvaZaU%22%3Bi%3A9%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlwYZyZ%22%3B%7D; bigdata_use_tips=1; PHPSESSID=iiiqpops4jemgoh33rbrkqhue5; yaozh_logintime=1615682156; yaozh_user=1026728%09%E4%B8%80%E5%BE%80%E6%97%A0%E5%89%8Dgy; yaozh_jobstatus=kptta67UcJieW6zKnFSe2JyYnoaSZ5drnJadg26qb21rg66flM6bh5%2BscZJsbIVJGuFJIuEd%2FNVK7fLIrFlwq2uac1OfwqnZw62gzp1Unti163E4711aE449B15f37E26dF531cDF2DckpSeg2ibZpmdlpVpaGpabNRzZW2Dqs7Rnlmcq2yUmJyDlZqSbJttl5Wammhqalps3g%3D%3D0fc4e597aa9b7a0a8b55788b6dfd7894; _ga=GA1.2.2493188.1609388760; _gid=GA1.2.1909203093.1615682102; kztoken=nJail6zJp6iXaJqWmGpnZmlsZJuU; his=a%3A10%3A%7Bi%3A0%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqaZyU%22%3Bi%3A1%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqaZyb%22%3Bi%3A2%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqapSU%22%3Bi%3A3%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqapSa%22%3Bi%3A4%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqapWX%22%3Bi%3A5%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqapib%22%3Bi%3A6%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlqapqY%22%3Bi%3A7%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlrYZmb%22%3Bi%3A8%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlrZJaS%22%3Bi%3A9%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGpnZmlsZJuU%22%3B%7D; zhuce_show=true; acw_tc=2f624a1716156878327574920e31b8726ca5960ab6c9d6b0f869dc5e312a44; think_language=zh-CN; _ga=GA1.3.165986868.1609388536; _gid=GA1.3.1909203093.1615682102; Hm_lvt_65968db3ac154c3089d7f9a4cbb98c94=1615467255,1615682102,1615682160,1615683332; Hm_lpvt_65968db3ac154c3089d7f9a4cbb98c94=1615688097',
    }

    # timeout so a stalled connection cannot hang the whole crawl.
    response = requests.get(url, headers=headers, timeout=30)
    response.encoding = 'UTF-8'
    soup = BeautifulSoup(response.text, 'lxml')

    # One entry per field label, in CSV column order (dicts preserve
    # insertion order). This replaces nine copy-pasted if-branches that all
    # performed the identical extraction.
    record = {label: "" for label in (
        "方名", "出处", "功用大类", "功用小类",
        "处方", "炮制", "功用", "主治", "附方",
    )}

    for tr in soup.find_all('tr'):
        for td in tr:
            label = td.string
            if label not in record:
                continue
            span = tr.find('span')
            if span is None:
                continue
            # The span text looks like "{...}value"; keep the part after
            # the first '}'. Guard against pages where the marker is absent
            # (the original raised IndexError there).
            parts = span.get_text().split('}')
            if len(parts) > 1:
                record[label] = parts[1]
            if label == '方名':
                print(record[label])

    # Append this page's record as one CSV row.
    with open(ExcelName, 'a', encoding='utf-8', newline='') as csvfile:
        csv.writer(csvfile).writerow(list(record.values()))


def main():
    """Crawl formula detail pages id 10001600–10001999 and save each to CSV.

    Sleeps 6 seconds between pages to avoid triggering an IP ban. A failure
    on one page is logged and skipped instead of aborting the whole crawl
    (the original let any network error kill the remaining pages).
    """
    results = []
    for i in range(1600, 2000):
        url = f"https://db.yaozh.com/fangji/{i + 10000000}.html"
        print("开始爬取")
        try:
            get_contents(results, url)
            print("开始保存")
        except Exception as exc:  # boundary: log and continue with next page
            print(f"爬取失败 {url}: {exc}")
        # Throttle even after a failure, so retries don't hammer the server.
        time.sleep(6)

# Guard the entry point so importing this module does not start a crawl.
if __name__ == "__main__":
    main()
原文地址:https://www.cnblogs.com/1gaoyu/p/14533797.html