新型冠状病毒 疫情 数据爬取(Java 含源码)

代码:

  1 package 疫情;
  2 
  3 
  4 import java.text.SimpleDateFormat;
  5 import java.util.ArrayList;
  6 import java.util.Date;
  7 import java.util.List;
  8 
  9 import com.dao.InfoDao;
 10 import com.dao.YiDao;
 11 import org.jsoup.Jsoup;
 12 import org.jsoup.nodes.Document;
 13 import util.StringHandle;
 14 import us.codecraft.webmagic.Page;
 15 import us.codecraft.webmagic.Site;
 16 import us.codecraft.webmagic.Spider;
 17 import us.codecraft.webmagic.processor.PageProcessor;
 18 
 19 public class Info implements PageProcessor {
 20     static String regEx="[
`'' " , ,]";
 21     static String aa="";
 22     // 抓取网站的相关配置,包括编码、抓取间隔、重试次数等
 23     private Site site = Site.me().setRetryTimes(3).setSleepTime(100);
 24     private static int count =0;
 25 
 26     @Override
 27     public Site getSite() {
 28         return site;
 29     }
 30     @Override
 31     public void process(Page page) {
 32         Date format = new Date();
 33         SimpleDateFormat ft = new SimpleDateFormat ("yyyy-MM-dd hh:mm:ss");
 34         String date=ft.format(format);
 35         System.out.println("当前时间为: " + ft.format(format));
 36         //System.out.println(page.getHtml());
 37         StringHandle sh=new StringHandle();
 38         String test=page.getHtml().xpath("//script [@id='getAreaStat']").get();
 39         System.out.println(test);
 40 
 41 
 42 
 43         List<String> Provinces=sh.getExpString(""provinceName":"(.*?)","provinceShortName":"(.*?)","confirmedCount":(.*?),"suspectedCount":(.*?),"curedCount":(.*?),"deadCount":(.*?),"comment":(.*?)"","locationId":(.*?),"", test);
 44 
 45         for(String Province:Provinces)
 46         {
 47             String Province_names=sh.getExpString(""provinceName":"(.*?)"", Province).get(0).replaceAll(""provinceName":", "").replaceAll(regEx, aa);
 48 
 49             String Province_num_confirmed=sh.getExpString(""confirmedCount":(.*?),", Province).get(0).replaceAll(""confirmedCount":", "").replaceAll(regEx, aa);
 50 
 51 
 52             String Province_num_cured=sh.getExpString(""curedCount":(.*?),", Province).get(0).replaceAll(""curedCount":", "").replaceAll(regEx, aa);
 53 
 54             String Province_num_dead=sh.getExpString(""deadCount":(.*?),", Province).get(0).replaceAll(""deadCount":", "").replaceAll(regEx, aa);
 55 
 56             String Province_num_locationId=sh.getExpString(""locationId":(.*?),", Province).get(0).replaceAll(""locationId":", "").replaceAll(regEx, aa);
 57 
 58 
 59             YiDao.add("info",date,Province_names,"",Province_num_confirmed,"",Province_num_cured,Province_num_dead,Province_num_locationId);
 60         }
 61 
 62 
 63 
 64 
 65         List<String> citys=sh.getExpString(""cityName":"(.*?)","confirmedCount":(.*?),"suspectedCount":(.*?),"curedCount":(.*?),"deadCount":(.*?),"locationId":(.*?)}", test);
 66         System.out.println(citys.get(5));
 67         for(String city:citys)
 68         {
 69 
 70           //  String Province_names=sh.getExpString(""provinceName":"(.*?)"", city).get(0).replaceAll(""provinceName":", "").replaceAll(regEx, aa);
 71 
 72             String City_names=sh.getExpString(""cityName":"(.*?)"", city).get(0).replaceAll(""cityName":", "").replaceAll(regEx, aa);
 73 
 74             String City_num_confirmed=sh.getExpString(""confirmedCount":(.*?),", city).get(0).replaceAll(""confirmedCount":", "").replaceAll(regEx, aa);
 75 
 76             String City_num_cured=sh.getExpString(""curedCount":(.*?),", city).get(0).replaceAll(""curedCount":", "").replaceAll(regEx, aa);
 77 
 78             String City_num_dead=sh.getExpString(""deadCount":(.*?),", city).get(0).replaceAll(""deadCount":", "").replaceAll(regEx, aa);
 79 
 80             String City_num_locationId=sh.getExpString(""locationId":(.*?)}", city).get(0).replaceAll(""locationId":", "").replaceAll(regEx, aa);
 81 
 82             System.out.println(City_names+City_num_confirmed+""+City_num_cured+City_num_dead+City_num_locationId);
 83             YiDao.add("info",date,"",City_names,City_num_confirmed,"",City_num_cured,City_num_dead,City_num_locationId);
 84         }
 85 
 86 
 87         System.out.println("AAAA");
 88         System.out.println(citys.get(0));
 89 
 90 
 91         count ++;
 92     }
 93 
 94     public static void main(String[] args) {
 95         long startTime, endTime;
 96         System.out.println("开始爬取...");
 97         InfoDao.delete("info");
 98         startTime = System.currentTimeMillis();
 99         Spider.create(new Info()).addUrl("https://ncov.dxy.cn/ncovh5/view/pneumonia_peopleapp?from=timeline&isappinstalled=0").thread(5).run();
100         endTime = System.currentTimeMillis();
101         System.out.println("爬取结束,耗时约" + ((endTime - startTime) / 1000) + "秒,抓取了"+count+"条记录");
102     }
103 
104 
105 }

效果图:

原文地址:https://www.cnblogs.com/smartisn/p/12283472.html