我使用的是Java代码实现简单的爬虫
爬取丁香医生网站上的疫情信息。
源代码:
package com.fin.collect;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.net.ssl.HttpsURLConnection;

import org.jsoup.Jsoup;

import com.alibaba.fastjson.JSONArray;
import com.fin.util.BaseConnection;

import net.sf.json.JSON;
import net.sf.json.JSONObject;

/**
 * Simple crawler that scrapes the DXY ("丁香医生") COVID-19 page, extracts the
 * per-province / per-city case statistics embedded in the page as a JSON
 * literal, and persists them to the {@code province} and {@code city} tables.
 */
public class CollectDataClass {

    // The page embeds the data as a JS assignment "window.getAreaStat = [...]}catch...";
    // group(1) captures the JSON array literal. Compiled once, reused on every call.
    // NOTE: the original used the invalid Java escape "\}" which does not compile.
    private static final Pattern AREA_STAT_PATTERN =
            Pattern.compile("window.getAreaStat = (.*?)\\}(?=catch)");

    public static void main(String[] args) throws IOException {
        getAreaStat();
    }

    /**
     * Performs an HTTPS GET against {@code requesturl} and returns the response
     * body decoded as UTF-8, with line terminators stripped.
     *
     * @param requesturl absolute https URL to fetch
     * @return response body as a single string
     * @throws IOException if the connection or read fails
     */
    private static String httpRequset(String requesturl) throws IOException {
        StringBuilder buffer = new StringBuilder();
        HttpsURLConnection connection = null;
        try {
            URL url = new URL(requesturl);
            connection = (HttpsURLConnection) url.openConnection();
            connection.setDoInput(true);
            connection.setRequestMethod("GET");
            // try-with-resources closes reader -> stream even on failure
            // (the original leaked all three and could NPE on buffer.toString()).
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    buffer.append(line);
                }
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
        return buffer.toString();
    }

    /**
     * Fetches the confirmed / suspected / cured / dead counts for every province
     * and its cities from the DXY epidemic page and writes them to the database.
     *
     * @return the raw JSON array string extracted from the page, or "" if the
     *         page could not be fetched or the pattern did not match
     */
    public static String getAreaStat() {
        String url = "https://ncov.dxy.cn/ncovh5/view/pneumonia";
        String htmlResult = "";
        try {
            htmlResult = httpRequset(url);
        } catch (IOException e) {
            e.printStackTrace();
        }

        Matcher totalMatcher = AREA_STAT_PATTERN.matcher(htmlResult);
        String result = "";
        if (!totalMatcher.find()) {
            return result;
        }
        result = totalMatcher.group(1);
        System.out.println(result);

        JSONArray array = JSONArray.parseArray(result);
        // PreparedStatement with ? placeholders: the original concatenated the
        // scraped values straight into SQL (injection risk + quoting bugs).
        // NOTE(review): the original INSERT listed 7 columns (…,time) but only
        // 6 values, so it always failed; the time column is dropped here —
        // confirm the intended schema.
        try (Connection conn = BaseConnection.getConnection();
             PreparedStatement insertProvince = conn.prepareStatement(
                     "insert into province(name,confirm,suspect,heal,dead,current) values(?,?,?,?,?,?)");
             PreparedStatement updateProvince = conn.prepareStatement(
                     "update province set confirm=?,suspect=?,heal=?,dead=?,current=? where name=?");
             PreparedStatement updateCity = conn.prepareStatement(
                     "update city set confirm=?,suspect=?,heal=?,dead=?,current=?,province=? where name=?")) {

            // Iterate over the actual array size (the original hard-coded 0..30,
            // which breaks when the province list changes length).
            for (int i = 0; i < array.size(); i++) {
                com.alibaba.fastjson.JSONObject province = array.getJSONObject(i);
                String provinceName = province.getString("provinceName");
                String current = province.getString("currentConfirmedCount");
                String confirmed = province.getString("confirmedCount");
                String cured = province.getString("curedCount");
                String dead = province.getString("deadCount");
                String suspect = province.getString("suspectedCount");

                insertProvince.setString(1, provinceName);
                insertProvince.setString(2, confirmed);
                insertProvince.setString(3, suspect);
                insertProvince.setString(4, cured);
                insertProvince.setString(5, dead);
                insertProvince.setString(6, current);
                insertProvince.executeUpdate();

                updateProvince.setString(1, confirmed);
                updateProvince.setString(2, suspect);
                updateProvince.setString(3, cured);
                updateProvince.setString(4, dead);
                updateProvince.setString(5, current);
                updateProvince.setString(6, provinceName);
                updateProvince.executeUpdate();

                JSONArray cities = province.getJSONArray("cities");
                for (int j = 0; j < cities.size(); j++) {
                    com.alibaba.fastjson.JSONObject city = cities.getJSONObject(j);
                    updateCity.setString(1, city.getString("confirmedCount"));
                    updateCity.setString(2, city.getString("suspectedCount"));
                    updateCity.setString(3, city.getString("curedCount"));
                    updateCity.setString(4, city.getString("deadCount"));
                    updateCity.setString(5, city.getString("currentConfirmedCount"));
                    updateCity.setString(6, provinceName);
                    updateCity.setString(7, city.getString("cityName"));
                    updateCity.executeUpdate();
                }
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return result;
    }
}
做爬虫之前,我们首先要访问丁香医生的页面 https://ncov.dxy.cn/ncovh5/view/pneumonia
然后按 F12 打开浏览器开发者工具,在页面源码中定位需要爬取的数据。