JAVA爬虫抓取页面的URL数据




在互联网发达的今天,程序员在开发过程中往往需要一些稳定的网站数据。这时有些接口数据是收费的,为了方便开发,程序员会使用爬虫技术来抓取数据。爬虫大致分为几种:根据网页URL抓取、
根据接口抓取等等。下面介绍的是根据URL抓取相应数据的方法。
附录常用免费天气接口:
http://api.weatherdt.com/common/?area=101090601&type=forecast[24h_5d{001,002}]&key=3c801494e96ea41ae2c77634b0960977

http://www.weather.com.cn/data/cityinfo/101090601.html

 http://api.k780.com/?app=weather.future&weaid=langfang&&appkey=10003&sign=b59bc3ef6191eb9f747dd4e83c99f2a4&format=json

http://api.weatherdt.com/common/?area=101160901&type=observe&key=fd034bf8fe70289698ec4ea79876feaa
{
    "observe": {
        "101160901": {
            "1001002": {
                "006": "0.0",
                "000": "17:15",
                "005": "53",
                "004": "2",
                "003": "2",
                "007": "879",
                "002": "25"
            }
        }
    }
}
http://www.weather.com.cn/data/sk/101160901.html

{
    "weatherinfo": {
        "city": "天水",
        "cityid": "101160901",
        "temp": "20.5",
        "WD": "北风",
        "WS": "小于3级",
        "SD": "40%",
        "AP": "883.8hPa",
        "njd": "暂无实况",
        "WSE": "<3",
        "time": "17:00",
        "sm": "1.3",
        "isRadar": "1",
        "Radar": "JC_RADAR_AZ9938_JB"
    }
}




1
//抓取森林防火最新页面的URL 2 public void getSlhz(){ 3 String strURL="http://wwww.forestry.gov.cn/Common/index/3563.html"; 4 URL url; 5 6 try{ 7 url = new URL(strURL); 8 HttpURLConnection httpConn=(HttpURLConnection)url.openConnection(); 9 InputStreamReader input=new InputStreamReader(httpConn.getInputStream(),"utf-8"); 10 11 BufferedReader buf= new BufferedReader(input); 12 13 String line=""; 14 StringBuilder conf=new StringBuilder(); 15 while((line=buf.readLine()))!=null){ 16 conf.append(line); 17 } 18 String buf=conf.toString(); 19 int beginIx=buf.indexOf("<ul> <li class="cl"><a href="">); 20 int endIx=buf.indexOf("/" title="""); 21 String result=buf.substring(beginIx,endIx); 22 String resl="http://www.forestry.gov.cn"+result.split("href="")[1]; 23 24 System.out.println(resl); 25 }catch(Exception e){ 26 e.printStackTrace(); 27 28 } 29 30 }

 天气接口爬虫

 
 4 import org.apache.logging.log4j.core.util.JsonUtils;
 5 import org.jsoup.Jsoup;
 6 import org.jsoup.nodes.Document;
 7 import org.jsoup.nodes.Element;
 8 import org.jsoup.select.Elements;
 9 
10 import net.sf.json.JSONArray;
11 import net.sf.json.JSONObject;
12 import java.util.List;
13 
14 
15 public class weth {
16 
17 public static void main(String[] args) {
18   String[] typeStr=new String[]{"tomorrow","third","fourth","fifth","sixth","seventh"};
19   JSONArray ja=new JSONArray();
20   for(String str:typeStr){
21    Document weatherDoc = WeatherDataCatch("http://tianqi.2345.com/"+str+"-54515.htm");
22    JSONObject jobject = new JSONObject();
23    Elements weatherData = ((Element) weatherDoc).getElementsByClass("tbody");  //获取数据块
24    Elements infoF = weatherData.select("[class = phrase]");
25    String info = infoF.get(0).text();
26    if(info!=null&&!"".equals(info)){                                  //天气情况
27     jobject.put("info", info);  
28    }
29    Elements wdDom = weatherData.select("[class = temperature]");
30    String zgwd = wdDom.get(0).text();
31    if(zgwd!=null&&!"".equals(zgwd)){                                  //最高温度
32     jobject.put("zgwd", zgwd);  
33    }
34    
35    String zdwd = wdDom.get(1).text();
36    if(zdwd!=null&&!"".equals(zdwd)){                                  //最低温度
37     jobject.put("zdwd", zdwd);  
38    }
39    
40    Elements parameter = ((Element) weatherDoc).getElementsByClass("parameter"); //获取数据块
41    Elements degree = parameter.select("li");
42    String kqzl = degree.get(0).select("i").text();
43    if(kqzl!=null&&!"".equals(kqzl)){                                 
44     jobject.put("kqzl", kqzl);       //空气质量
45    }
46    if(str.equals("tomorrow")){//
47     String fxfs = degree.get(1).select("i").text();
48     if(fxfs!=null&&!"".equals(fxfs)){                                 
49      jobject.put("fxfs", fxfs);         //风向风速
50     }
51    }else{
52     String fxfs = degree.get(1).select("i").text();
53     fxfs+= degree.get(2).select("i").text();
54     if(fxfs!=null&&!"".equals(fxfs)){                                 
55      jobject.put("fxfs", fxfs);         //风向风速
56     }
57    }
58    
59    ja.add(jobject);
60   }
61   System.out.println(ja.toString());
62  }
63 
64 
65 
66 public static Document WeatherDataCatch(String url){
67       String result="";
68        Document doc = null;
69       try {
70         doc =  Jsoup.connect(url).timeout(100000).get();
71 //        Element body = doc.body();
72 //        result = body.text();
73       } catch (Exception e) {
74        // TODO Auto-generated catch block
75        e.printStackTrace();
76       }
77       return doc;
78      }
79 }
80 
[{"info":"阴","zgwd":"最高:27℃","zdwd":"最低:19℃","kqzl":"良","fxfs":"西南风2级"},{"info":"小雨","zgwd":"最高:25℃","zdwd":"最低:18℃","kqzl":"良","fxfs":"西北风2级"},{"info":"晴","zgwd":"最高:28℃","zdwd":"最低:16℃","kqzl":"良","fxfs":"东北风3级"},{"info":"多云","zgwd":"最高:28℃","zdwd":"最低:16℃","kqzl":"良","fxfs":"西南风3级"},{"info":"多云","zgwd":"最高:27℃","zdwd":"最低:16℃","kqzl":"良","fxfs":"东南风3级"},{"info":"小雨","zgwd":"最高:25℃","zdwd":"最低:16℃","kqzl":"良","fxfs":"东南风2级"}]

pom.xml配置

<dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.7.2</version>
        </dependency>

天气接口工具类:

WeatherUtil.java

package com.gsafety.langfang.screendisplay.utils;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.GregorianCalendar;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.annotation.Resource;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.ResponseBody;

import com.google.gson.Gson;
import com.gsafety.cloudframework.common.base.page.PageResult;
import com.gsafety.cloudframework.config.util.ConfigCacheUtil;
import com.gsafety.langfang.screendisplay.vo.Returnmsg;

import net.sf.json.JSONObject;

public class WeatherUtil {
    
    // Class-wide logger used by every request method for progress and failure messages.
    private static Logger logger = Logger.getLogger(WeatherUtil.class);
        // Request URL prefix; the request methods append the area code directly to it.
        private static String wUrl;
        // Area code appended to wUrl.
        private static String area;
        // Data-type values for the 2-day and 5-day forecasts as configured.
        // They are loaded here but not referenced by any method of this class.
        private static String type2day;
        private static String type5day;
        // API key appended to the forecast requests.
        private static String key;
        // Loads the settings from the "weatherUrl" configuration entry, whose value is parsed as a JSON object.
        // When the value is empty all of the fields above stay null.
        static {
            String weatherUrl = ConfigCacheUtil.getConf("weatherUrl").getValue();
            if(StringUtils.isNotEmpty(weatherUrl)){
                JSONObject jsonObject = JSONObject.fromObject(weatherUrl);
                wUrl = jsonObject.getString("url"); // request URL prefix
                area = jsonObject.getString("langfangAreaCode"); // area code
                type2day= jsonObject.getString("type2d");// data type for 2-day data
                type5day= jsonObject.getString("type5d");// data type for "7-day" data (original wording; the key itself says 5d)
                key= jsonObject.getString("key");// API key
            }
        }
        // Timestamp format used in log messages.
        // NOTE(review): SimpleDateFormat is not thread-safe and this instance is shared by static methods; confirm callers are single-threaded.
        private static SimpleDateFormat SDF = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        
        /**
         * Requests the current weather observation ({@code type=observe}) for the
         * configured area; used by the small weather window.
         *
         * <p>When the HTTP call fails or returns an empty body, a built-in sample
         * observation is parsed instead, so callers normally still get a value.
         *
         * @return the parsed response, or {@code null} when building the request or
         *         parsing the body failed
         */
        public static JSONObject getDataJson() {
            JSONObject json = null;
            GetMethod get = null;
            try {
                logger.info("*************" + SDF.format(new Date()) + "气象接口--请求开始**********************");
                // e.g. http://api.weatherdt.com/common/?area=101160901&type=observe&key=fd034bf8fe70289698ec4ea79876feaa
                String url = wUrl + area + "&type=" + "observe" + "&key=fd034bf8fe70289698ec4ea79876feaa";
                get = new GetMethod(url);
                HttpClient client = new HttpClient();
                logger.info("*************气象接口--地址:" + url + "**********************");

                String result = "";
                try {
                    client.executeMethod(get);
                    result = get.getResponseBodyAsString();
                } catch (IOException e) {
                    // Fall through to the sample payload below.
                    logger.error("*************气象接口--请求失败**********************", e);
                }
                if (StringUtils.isEmpty(result)) {
                    result = "{'observe':{'101160901':{'1001002':{'006': '0.0','000':'17:15','005':'53','004':'2','003':'2','007':'879','002':'25'}}}}";
                }
                json = JSONObject.fromObject(result);
                logger.info("*************气象接口--返回值:" + result + "**********************");
                // Take a fresh timestamp; re-formatting the start time would log the same instant twice.
                logger.info("*************" + SDF.format(new Date()) + "气象接口--请求结束**********************");
            } catch (Exception e) {
                logger.error("*************气象接口--请求失败**********************", e);
            } finally {
                // The connection must be handed back after the response was consumed, not before the call.
                if (get != null) {
                    get.releaseConnection();
                }
            }
            return json;
        }
     
     
     /**
      * Reads the live observation for station 101160901 from www.weather.com.cn.
      *
      * <p>According to the original author this alternative source is currently
      * unused and kept for later, because the primary interface does not deliver
      * everything the large screen needs.
      *
      * @return the parsed response body, or {@code null} when the status is not 200
      *         or an I/O error occurred
      */
     public static JSONObject getDataofJson() {
         JSONObject json = null;
         HttpURLConnection conn = null;
         try {
             logger.info("*************气象接口--请求开始**********************");

             URL url = new URL("http://www.weather.com.cn/data/sk/101160901.html");
             logger.info("*************气象接口--地址:http://www.weather.com.cn/data/sk/101160901.html**********************");

             conn = (HttpURLConnection) url.openConnection();
             // Plain GET: only input is needed, no request body is written.
             conn.setDoInput(true);
             conn.setUseCaches(false);
             conn.setRequestMethod("GET");
             conn.setRequestProperty("Connection", "Keep-Alive");
             conn.setRequestProperty("Charset", "UTF-8");
             conn.setRequestProperty("contentType", "application/json");
             conn.connect();

             int status = conn.getResponseCode();
             logger.info("*************气象接口--状态:" + status + "**********************");
             if (status == 200) {
                 // readBig drains and closes the stream, so no further read may follow it.
                 byte[] data1 = readBig(conn.getInputStream());
                 String responseStr = new String(data1, "utf-8");
                 json = JSONObject.fromObject(responseStr);
             } else {
                 logger.info("*************气象接口--请求失败**********************");
             }
             logger.info("*************气象接口--请求结束**********************");
         } catch (IOException e) {
             logger.error("*************气象接口--请求失败**********************", e);
         } finally {
             if (conn != null) {
                 conn.disconnect();
             }
         }

         return json;
     }
     
     
     
     
     
     
     
     
     
        /**
         * Requests the two-day forecast ({@code forecast[24h_2d{001,002}]}) for the
         * configured area; used by the small weather window.
         *
         * <p>When the HTTP call fails or returns an empty body, a built-in sample
         * forecast is parsed instead.
         *
         * @return the parsed response
         * @throws java.net.UnknownHostException declared for backward compatibility;
         *         I/O failures of the request itself are logged, not propagated
         */
        public static JSONObject getData2dayJson() throws java.net.UnknownHostException {
            logger.info("*************" + SDF.format(new Date()) + "气象接口--请求开始**********************");

            String url = null;
            try {
                // Brackets and braces are not valid in a query string and must be percent-encoded.
                url = wUrl + area + "&type=forecast" + URLEncoder.encode("[", "UTF-8") + "24h_2d"
                        + URLEncoder.encode("{", "UTF-8") + "001,002" + URLEncoder.encode("}]", "UTF-8")
                        + "&key=" + key;
            } catch (UnsupportedEncodingException e1) {
                logger.error("*************气象接口--请求失败**********************", e1);
            }

            GetMethod get = new GetMethod(url);
            String result = "";
            try {
                HttpClient client = new HttpClient();
                logger.info("*************气象接口--地址:" + url + "**********************");
                client.executeMethod(get);
                result = get.getResponseBodyAsString();
            } catch (IOException e) {
                // Fall through to the sample payload below.
                logger.error("*************气象接口--请求失败**********************", e);
            } finally {
                // The connection must be handed back after the response was consumed, not before the call.
                get.releaseConnection();
            }

            if (StringUtils.isEmpty(result)) {
                result = "{'forecast':{'24h':{'101090601':{'1001001':[{'003':'35','004':'21','001':'00','002':'00'},{'003':'35','004':'21','001':'00','002':'00'}]}}}}";
            }
            JSONObject json = JSONObject.fromObject(result);
            logger.info("*************气象接口--返回值:" + result + "**********************");

            // Take a fresh timestamp; re-formatting the start time would log the same instant twice.
            logger.info("*************" + SDF.format(new Date()) + "气象接口--请求结束**********************");
            return json;
        }
     
     /**
      * Requests the multi-day forecast for the configured area; used by the large
      * weather window.
      *
      * <p>The original description speaks of seven days, while the request itself
      * asks for {@code forecast[24h_5d{001,002}]}. When the HTTP call fails or
      * returns an empty body, a built-in seven-entry sample forecast is parsed instead.
      *
      * @return the parsed response, or {@code null} when the request URL could not be built
      */
     public static JSONObject getData7dayJson() {
         JSONObject json = null;
         GetMethod get = null;
         try {
             logger.info("*************" + SDF.format(new Date()) + "气象接口--请求开始**********************");
             // Brackets and braces are not valid in a query string and must be percent-encoded.
             String url = wUrl + area + "&type=forecast" + URLEncoder.encode("[", "UTF-8") + "24h_5d"
                     + URLEncoder.encode("{", "UTF-8") + "001,002" + URLEncoder.encode("}]", "UTF-8")
                     + "&key=" + key;
             logger.info("*************气象接口--地址:" + url + "   **********************");

             get = new GetMethod(url);
             HttpClient client = new HttpClient();
             String result = "";
             try {
                 client.executeMethod(get);
                 result = get.getResponseBodyAsString();
             } catch (IOException e) {
                 // Fall through to the sample payload below.
                 logger.error("*************气象接口--请求失败**********************", e);
             }
             if (StringUtils.isEmpty(result)) {
                 result = "{'forecast':{'24h':{'101090601':{'1001001':[{'003':'35','004':'21','001':'00','002':'00'},{'003':'35','004':'21','001':'00','002':'00'},{'003':'35','004':'21','001':'00','002':'00'},{'003':'38','004':'25','001':'01','002':'01'},{'003':'36','004':'25','001':'01','002':'01'},{'003':'33','004':'23','001':'01','002':'01'},{'003':'33','004':'22','001':'01','002':'02'}]}}}}";
             }
             json = JSONObject.fromObject(result);
             logger.info("*************气象接口--结果:" + result + "**********************");
             // Take a fresh timestamp; re-formatting the start time would log the same instant twice.
             logger.info("*************" + SDF.format(new Date()) + "气象接口--请求结束**********************");
         } catch (IOException e) {
             logger.error("*************气象接口--请求失败**********************", e);
         } finally {
             // The connection must be handed back after the response was consumed, not before the call.
             if (get != null) {
                 get.releaseConnection();
             }
         }

         return json;
     }
    /**
     * 气象 wukaihua
     * 
     * "observe": {//实况
     * "101010100": {//站号
     * "1001002": {//数据大类
     * "006": "0",//当前降水量(单位是毫米)
     * "007": "1004",//当前气压(单位百帕)
     * "003": "1",//当前风力(单位是级,不用转码)
     * "004": "2",//当前风向编号
     * "000": "10:25",//实况发布时间
     * "005": "79",//当前湿度(单位%)
     * "002": "7"//当前温度(单位摄氏度)
     * 
     * @return
     */
  
     /** Weather phenomenon codes paired with their display names. */
     private static final String[][] WEATHER_NAMES = {
         {"00", "晴"}, {"01", "多云"}, {"02", "阴"},
         // showers and mixed precipitation
         {"03", "阵雨"}, {"04", "雷阵雨"}, {"05", "雷阵雨伴有冰雹"}, {"06", "雨夹雪"},
         // rain by intensity
         {"07", "小雨"},
         {"08", "中雨"}, {"21", "小到中雨"},
         {"09", "大雨"}, {"22", "中到大雨"},
         {"10", "暴雨"}, {"11", "大暴雨"}, {"12", "特大暴雨"}, {"19", "冻雨"},
         {"23", "大到暴雨"}, {"24", "暴雨到大暴雨"}, {"25", "大暴雨到特大暴雨"},
         {"301", "雨"},
         // snow
         {"13", "阵雪"}, {"14", "小雪"}, {"15", "中雪"}, {"16", "大雪"}, {"17", "暴雪"},
         {"26", "小到中雪"}, {"27", "中到大雪"}, {"28", "大到暴雪"}, {"302", "雪"},
         // fog
         {"18", "雾"}, {"32", "浓雾"}, {"49", "强浓雾"}, {"57", "大雾"}, {"58", "特强浓雾"},
         // dust and sand
         {"20", "沙尘暴"}, {"29", "浮尘"}, {"30", "扬沙"}, {"31", "强沙尘暴"},
         // haze
         {"53", "霾"}, {"54", "中度霾"}, {"55", "重度霾"}, {"56", "严重霾"},
         // none
         {"99", "无"},
     };

     /**
      * Translates a weather phenomenon code into its display name.
      *
      * @param str the code as delivered by the interface, e.g. {@code "00"}
      * @return the matching name, or an empty string for a null, empty or unknown code
      */
     public static String getWeatherStr(String str) {
         if (StringUtils.isEmpty(str)) {
             return "";
         }
         for (String[] entry : WEATHER_NAMES) {
             if (entry[0].equals(str)) {
                 return entry[1];
             }
         }
         return "";
     }
    /** Wind direction names indexed by their single-digit code. */
    private static final String[] WIND_DIRECTIONS = {
        "无持续风向", "东北风", "东风", "东南风", "南风", "西南风", "西风", "西北风", "北风", "旋转风"
    };

    /**
     * Translates a wind direction code into its display name.
     *
     * <p>Code {@code "2"} now maps to "东风"; it was previously compared against an
     * integer literal and therefore never matched.
     *
     * @param str the direction code, a single digit {@code "0"} to {@code "9"}
     * @return the matching name, or an empty string for a null, empty or unknown code
     */
    public String getWindStr(String str) {
        // Only exact one-character codes are valid; "02" or " 2" do not match.
        if (str == null || str.length() != 1) {
            return "";
        }
        int code = str.charAt(0) - '0';
        if (code < 0 || code >= WIND_DIRECTIONS.length) {
            return "";
        }
        return WIND_DIRECTIONS[code];
    }
   /**
    * Maps an air quality index to its level name.
    *
    * <p>Ranges: 0-50 "优", 51-100 "良", 101-200 "轻度污染", 201-300 "中度污染",
    * above 300 "重度污染". The range checks were previously joined with
    * {@code ||}, which made every value fall into the first range.
    *
    * @param str the air quality index
    * @return the level name, or an empty string for a negative index
    */
   public String getAirLevelStr(int str) {
       if (str < 0) {
           return "";
       }
       if (str <= 50) {
           return "优";
       }
       if (str <= 100) {
           return "良";
       }
       if (str <= 200) {
           return "轻度污染";
       }
       if (str <= 300) {
           return "中度污染";
       }
       return "重度污染";
   }
    
    
   /**
    * Returns the Chinese weekday name of a date, evaluated in the default time zone.
    *
    * @param date the date to inspect
    * @return "星期日" for Sunday through "星期六" for Saturday
    */
   public static String getWeekOfDate(Date date) {
       // Calendar.DAY_OF_WEEK runs from SUNDAY (1) to SATURDAY (7), so the table
       // has to start with Sunday; starting with Monday shifted every day by one.
       String[] weekDays = { "星期日", "星期一", "星期二", "星期三", "星期四", "星期五", "星期六" };
       Calendar cal = Calendar.getInstance();
       cal.setTime(date);
       return weekDays[cal.get(Calendar.DAY_OF_WEEK) - Calendar.SUNDAY];
   }
    
    /**
     * Reads a stream to its end and closes it.
     *
     * @param in the stream to drain; it is closed before this method returns,
     *           also when reading fails
     * @return all bytes that were read, possibly an empty array
     * @throws IOException if reading from the stream fails
     */
    private static byte[] readBig(InputStream in) throws IOException {
        BufferedInputStream bis = new BufferedInputStream(in);
        try {
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            byte[] chunk = new byte[4096];
            int count;
            while ((count = bis.read(chunk)) != -1) {
                baos.write(chunk, 0, count);
            }
            return baos.toByteArray();
        } finally {
            // Closing the wrapper also closes the underlying stream.
            bis.close();
        }
    }
}
原文地址:https://www.cnblogs.com/ComputerVip/p/11577214.html