[Python] Scraping all novel coronavirus data from Feb 2 to the present — python, 2020.2.13

Page scraped: http://hu.yixue99.com/2020/kszx_0205/27792.html. The script works in three steps: it scrapes the overview table from this index page, collects the per-day detail links, then visits each link and scrapes that day's per-province table.
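Before running the full script, a quick check that the page is reachable and decodes correctly can save debugging time. This is a minimal sketch of my own (not part of the original post); it only assumes the same URL and User-Agent header used below:

import requests

# Minimal sanity check (my addition, not in the original script):
# fetch the index page and confirm it decodes to readable Chinese.
kv = {'user-agent': 'Mozilla/5.0'}
r = requests.get("http://hu.yixue99.com/2020/kszx_0205/27792.html", headers=kv)
print(r.status_code)              # expect 200
r.encoding = r.apparent_encoding  # guess the charset, as the script below does
print(r.text[:300])               # the page <title> should be readable here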

The code is as follows:

import requests
from bs4 import BeautifulSoup

url = "http://hu.yixue99.com/2020/kszx_0205/27792.html"
kv = {'user-agent': 'Mozilla/5.0'}


# Scrape the overview table on the index page
def content():
    r = requests.get(url, headers=kv)
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, "html.parser")
    print("开始")
    num = 0
    texts = ""
    for s in soup.find_all("span", {"style": "font-size:14px;"}):
        # Strip the header labels so that only data cells remain
        text = str(s.string).replace("时间(北京时间)", "").replace("确诊", "").replace("疑似", "").replace("死亡", "").replace("治愈", "").replace("疫情详情", "").replace("点击查看", "")
        if text != "":
            num += 1
            # Every five cells form one row: date, confirmed, suspected, deaths, cured
            if num % 5 != 0:
                texts += text + " "
            else:
                texts += text
                print(texts)
                write_content(texts + "\n")
                texts = ""


# Scrape the per-day detail links from the index page
def href():
    r = requests.get(url, headers=kv)
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, "html.parser")
    print("开始")
    for s in soup.find_all("span", {"style": "font-size:14px;"}):
        if s.find("a") is not None:
            link = str(s.find("a").attrs["href"])
            print(link)
            write_href(link + "\n")


# Scrape the per-province table on one day's detail page
def content_day(url):
    r = requests.get(url, headers=kv)
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, "html.parser")
    print(url)
    print("开始")
    num = 0
    texts = ""
    first = True
    td_style = "PADDING-BOTTOM: 0px; PADDING-TOP: 0px; PADDING-LEFT: 0px; MARGIN: 0px; PADDING-RIGHT: 0px"
    # The first matching cell is the table title; strip it down to the date
    day = str(soup.find("td", {"style": td_style}).string).replace("各省疫情动态(截止至", "").replace(" 10:00)", "").replace(" 11:00)", "")
    print(day)
    for s in soup.find_all("td", {"style": td_style}):
        text = str(s.string).replace("确诊", "").replace("疑似", "").replace("死亡", "").replace("治愈", "").replace(" ", "").replace("省份", "")
        if first:
            first = False  # skip the title cell
        elif text != "":
            num += 1
            # Every five cells form one row: province, confirmed, suspected, deaths, cured
            if num % 5 != 0:
                texts += text + " "
            else:
                texts += text
                print(day + texts)
                write_content_day(day + " " + texts + "\n")
                texts = ""


# Append one overview row to disk
def write_content(contents):
    with open("E:/bingducsv/bingdusum.txt", "a+", encoding="utf-8") as f:
        f.write(contents)


# Append one day's link to disk
def write_href(contents):
    with open("E:/bingducsv/bingduhref.txt", "a+", encoding="utf-8") as f:
        f.write(contents)


# Read the saved links back and scrape each day's page
def read():
    with open("E:/bingducsv/bingduhref.txt", "r", encoding="utf-8") as f:
        for line in f:
            content_day(line.rstrip("\n"))


# Append one province-level daily row to disk
def write_content_day(contents):
    with open("E:/bingducsv/bingduday.txt", "a+", encoding="utf-8") as f:
        f.write(contents)


if __name__ == "__main__":
    content()  # step 1: overview table
    href()     # step 2: collect the per-day links
    read()     # step 3: scrape each day's detail page
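The output folder is named bingducsv, yet the script writes plain space-separated text. As a follow-up, here is a minimal sketch of my own (assuming each line of bingduday.txt carries the six space-separated fields written by write_content_day above: date, province, confirmed, suspected, deaths, cured) that converts the daily file into a real CSV:

import csv

# Convert the space-separated daily file into CSV.
# Assumption: each line is "date province confirmed suspected deaths cured".
with open("E:/bingducsv/bingduday.txt", encoding="utf-8") as src, \
     open("E:/bingducsv/bingduday.csv", "w", newline="", encoding="utf-8") as dst:
    writer = csv.writer(dst)
    writer.writerow(["date", "province", "confirmed", "suspected", "deaths", "cured"])
    for line in src:
        fields = line.split()
        if len(fields) == 6:  # skip any malformed lines
            writer.writerow(fields)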
Original post: https://www.cnblogs.com/zlc364624/p/12304119.html