Week 4 -- Learning Web Crawlers

I used Java to implement a simple web crawler.

It scrapes epidemic statistics from 丁香医生 (DXY / Dingxiang Doctor).

Source code:

package com.fin.collect;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.sql.Connection;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.net.ssl.HttpsURLConnection;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;

import com.fin.util.BaseConnection;

public class CollectDataClass {
    public static void main(String[] args) throws IOException {
        getAreaStat();
    }

    // Perform an HTTPS GET request and return the response body as a UTF-8 string
    private static String httpRequest(String requesturl) throws IOException {
        StringBuffer buffer = new StringBuffer();
        BufferedReader bufferedReader = null;
        InputStreamReader inputStreamReader = null;
        InputStream inputStream = null;
        HttpsURLConnection httpsURLConnection = null;
        try {
            URL url = new URL(requesturl);
            httpsURLConnection = (HttpsURLConnection) url.openConnection();
            httpsURLConnection.setDoInput(true);
            httpsURLConnection.setRequestMethod("GET");
            inputStream = httpsURLConnection.getInputStream();
            inputStreamReader = new InputStreamReader(inputStream, "utf-8");
            bufferedReader = new BufferedReader(inputStreamReader);
            String str = null;
            while ((str = bufferedReader.readLine()) != null) {
                buffer.append(str);
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } finally {
            // Release the stream and connection even if the request fails
            if (bufferedReader != null) bufferedReader.close();
            if (httpsURLConnection != null) httpsURLConnection.disconnect();
        }

        return buffer.toString();
    }

    /**
     * Fetch the confirmed, current, dead, and cured counts for every province and city in China
     *
     * @return the extracted JSON string
     */
    public static String getAreaStat() {
        String url = "https://ncov.dxy.cn/ncovh5/view/pneumonia";
        String htmlResult = "";
        try {
            htmlResult = httpRequest(url);
        } catch (IOException e) {
            e.printStackTrace();
        }
        // System.out.println(htmlResult);

        // Extract the data with a regular expression:
        // the payload embedded in the HTML looks like JSON, so we capture it as JSON
        String reg = "window.getAreaStat = (.*?)\\}(?=catch)";
        Pattern totalPattern = Pattern.compile(reg);
        Matcher totalMatcher = totalPattern.matcher(htmlResult);

        String result = "";
        if (totalMatcher.find()) {
            result = totalMatcher.group(1);
            System.out.println(result);
            // The provinces come back as a JSON array; to save them to the database, iterate over it as in the demo below
            JSONArray array = JSONArray.parseArray(result);
        
            try {
                Connection conn = BaseConnection.getConnection();
                Statement stmt = conn.createStatement();

                for (int i = 0; i < array.size(); i++) {

                    JSONObject jsonObject = array.getJSONObject(i);
                    String provinceName = jsonObject.getString("provinceName");
                    String current = jsonObject.getString("currentConfirmedCount");
                    String confirmed = jsonObject.getString("confirmedCount");
                    String cured = jsonObject.getString("curedCount");
                    String dead = jsonObject.getString("deadCount");
                    String suspect = jsonObject.getString("suspectedCount");

                    // NOW() (MySQL) fills the time column; the original insert listed 7 columns but supplied only 6 values
                    stmt.executeUpdate("insert into province(name,confirm,suspect,heal,dead,current,time) values('"
                            + provinceName + "','" + confirmed + "','" + suspect + "','" + cured + "','" + dead
                            + "','" + current + "',NOW())");
                    stmt.executeUpdate("update province set confirm='" + confirmed + "',suspect='" + suspect
                            + "',heal='" + cured + "',dead='" + dead + "',current='" + current
                            + "' where name='" + provinceName + "'");

                    JSONArray array2 = jsonObject.getJSONArray("cities");
                    for (int j = 0; j < array2.size(); j++) {
                        JSONObject jsonObject2 = array2.getJSONObject(j);
                        String cityname = jsonObject2.getString("cityName");
                        String current2 = jsonObject2.getString("currentConfirmedCount");
                        String confirmed2 = jsonObject2.getString("confirmedCount");
                        String cured2 = jsonObject2.getString("curedCount");
                        String dead2 = jsonObject2.getString("deadCount");
                        String suspect2 = jsonObject2.getString("suspectedCount");
                        stmt.executeUpdate("update city set confirm='" + confirmed2 + "',suspect='" + suspect2
                                + "',heal='" + cured2 + "',dead='" + dead2 + "',current='" + current2
                                + "',province='" + provinceName + "' where name='" + cityname + "'");
                    }
                }
                stmt.close();
                conn.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
        return result;
    }
}
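
The code imports com.fin.util.BaseConnection, a helper class that is not shown in the post. A minimal sketch of what it might look like, assuming a local MySQL database reached through JDBC (the database name, user, and password below are placeholders, not from the original):

package com.fin.util;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;

public class BaseConnection {
    // Placeholder connection settings -- adjust to your own MySQL instance
    private static final String URL = "jdbc:mysql://localhost:3306/ncov?useUnicode=true&characterEncoding=utf8";
    private static final String USER = "root";
    private static final String PASSWORD = "root";

    public static Connection getConnection() throws SQLException {
        // Assumes MySQL Connector/J is on the classpath
        return DriverManager.getConnection(URL, USER, PASSWORD);
    }
}

The insert and update statements also assume province and city tables whose columns match the SQL above (name, confirm, suspect, heal, dead, current, plus time for province and province for city).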
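One caveat about the SQL above: building statements by string concatenation breaks on names containing quotes and is open to SQL injection. A safer sketch of the province insert using PreparedStatement (insertProvince is a hypothetical helper, same assumed schema):

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;

// Hypothetical helper: inserts one province row with parameter binding instead of concatenation
static void insertProvince(Connection conn, String name, String confirmed, String suspect,
        String cured, String dead, String current) throws SQLException {
    String sql = "insert into province(name,confirm,suspect,heal,dead,current,time) values(?,?,?,?,?,?,NOW())";
    try (PreparedStatement ps = conn.prepareStatement(sql)) {
        ps.setString(1, name);
        ps.setString(2, confirmed);
        ps.setString(3, suspect);
        ps.setString(4, cured);
        ps.setString(5, dead);
        ps.setString(6, current);
        ps.executeUpdate();
    }
}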

To build a crawler like this, the first step is to open the Dingxiang Doctor page at https://ncov.dxy.cn/ncovh5/view/pneumonia in a browser.

Then press F12 to open the developer tools and locate the data to scrape.
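
In the page source, the statistics sit inside a script block that assigns window.getAreaStat inside a try/catch, which is why the regex captures everything between "window.getAreaStat = " and the closing brace that precedes "catch". A standalone sketch of that extraction step against a mock fragment (the mock string is illustrative, not the real page source):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class RegexDemo {
    public static void main(String[] args) {
        // Mock of the embedded script block (illustrative only)
        String html = "try { window.getAreaStat = [{\"provinceName\":\"湖北省\",\"confirmedCount\":67802,\"cities\":[]}]}catch(e){}";
        Pattern p = Pattern.compile("window.getAreaStat = (.*?)\\}(?=catch)");
        Matcher m = p.matcher(html);
        if (m.find()) {
            // Prints the captured JSON array
            System.out.println(m.group(1));
        }
    }
}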

Original post: https://www.cnblogs.com/ljpljm/p/12555728.html