利用Jsoup爬取新冠疫情数据并存至数据库

  需要用到的jar包(用来爬取的jsoup,htmlunit-2.37.0-bin以及连接数据库中的mysql.jar)

  链接:https://pan.baidu.com/s/1VlylWmlhjd8Ka8VTruQEnA     提取码:dxeq

  爬取的原网站为:https://wp.m.163.com/163/page/news/virus_report/index.html?_nw_=1&_anw_=1

Paqu.java

package control;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlInput;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlSubmitInput;

import Dao.AddService;

 
public class Paqu {

    public static void main(String args[]) {
        // TODO Auto-generated method stub
        String sheng="";
        String xinzeng="";
        String leiji="";
        String zhiyu="";
        String siwang="";
         String url = "https://wp.m.163.com/163/page/news/virus_report/index.html?_nw_=1&_anw_=1";
        
        int i=0;
        
        try {
            //构造一个webClient 模拟Chrome 浏览器
            WebClient webClient = new WebClient(BrowserVersion.CHROME);
            //支持JavaScript
            webClient.getOptions().setJavaScriptEnabled(true);
            webClient.getOptions().setCssEnabled(false);
            webClient.getOptions().setActiveXNative(false);
            webClient.getOptions().setCssEnabled(false);
            webClient.getOptions().setThrowExceptionOnScriptError(false);
            webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
            webClient.getOptions().setTimeout(8000);
            HtmlPage rootPage = webClient.getPage(url);
            //设置一个运行JavaScript的时间
            webClient.waitForBackgroundJavaScript(6000);
            String html = rootPage.asXml();
            Document doc = Jsoup.parse(html);
            //System.out.println(doc);
            Element listdiv1 = doc.select(".wrap").first();
            Elements listdiv2 = listdiv1.select(".province");
            for(Element s:listdiv2) {
                Elements span = s.getElementsByTag("span");
                Elements real_name=span.select(".item_name");
                Elements real_newconfirm=span.select(".item_newconfirm");
                Elements real_confirm=span.select(".item_confirm");
                Elements real_dead=span.select(".item_dead");
                Elements real_heal=span.select(".item_heal");
                sheng=real_name.text();
                xinzeng=real_newconfirm.text();
                leiji=real_confirm.text();
                zhiyu=real_heal.text();
                siwang=real_dead.text();
                //System.out.println(sheng+" 新增确诊:"+xinzeng+" 累计确诊:"+leiji+" 累计治愈:"+zhiyu+" 累计死亡:"+siwang);
                Date currentTime=new Date();
                SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm");
                String time = formatter.format(currentTime);//获取当前时间
                AddService dao=new AddService();
                dao.add("myinfo", sheng, xinzeng, leiji, zhiyu, siwang,time);//将爬取到的数据添加至数据库
            }
            
            
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
            System.out.println("爬取失败");
        }
    }
    
}

   

  AddService.java:

package Dao;

import java.sql.Connection;
import java.sql.Statement;

import utils.DBUtils;

public class AddService {
    public void add(String table,String sheng,String xinzeng,String leiji,String zhiyu,String dead,String time) {
        String sql = "insert into "+table+" (Province,Newconfirmed_num ,Confirmed_num,Cured_num,Dead_num,Time) values('" + sheng + "','" + xinzeng +"','" + leiji +"','" + zhiyu + "','" + dead+ "','" + time+ "')";
        System.out.println(sql);
        Connection conn = DBUtils.getConn();
        Statement state = null;
        int a = 0;
        try {
            state = conn.createStatement();
            a=state.executeUpdate(sql);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            DBUtils.close(state, conn);
        }        
    }
}

数据库建表如下:

 遇到的问题

  一开始的数据是动态加载的,无法获取确定的数据,最后在代码中添加了一段js内容来获取动态数据。

  其中还尝试过爬取其他的网站上的数据,但doc并不能很好的输出,只能输出网站的大框架,无法获取具体到内容。

原文地址:https://www.cnblogs.com/xhj1074376195/p/12469256.html