jsoup爬虫,项目实战,欢迎收看

import com.mongodb.BasicDBObject
import com.mongodb.DBCollection
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import org.jsoup.select.Elements

public class ZhongYuan {
    public static final DBCollection test = MongoUtils.getCollectionByName("name", "table", 
"port")
    public static final DBCollection html = MongoUtils.getCollectionByName("name", "table", 
"port")

    public static void main(String[] args){
//        循环遍历页面进行数据爬去
        for(int i = 500 ; i<598 ;i++) {
            String url = "http://sh.centanet.com/xiaoqu/g"+i+"/";
            String result = RequestUtil.doGet(url, "GBK");
            Document doc = Jsoup.parse(result);
            //页面加载完成后对document进行处理,获取自己有用的数据
            parseList(doc);
            System.out.println("page=====>"+i);
        }
    }
    private static void parseList(Document doc){

        Elements elements = doc.select("div.house-listBox>div");
        int j = 0;
        for(Element element : elements){

            String name = element.select(".house-title a").first().text();
            html.save(new BasicDBObject("name",name).append("html",element.toString()))
            String regionstr = element.select("div>div>p").first().text().replace(' ','-');
            String region = regionstr.split("-")[0];
            String address = null;
            if(regionstr.split("-").length>1) {
                address = regionstr.split("-")[1] + regionstr.split("-")[2];
            } else {
                address = regionstr.split("-")[1];
            }

            String price = element.select("div>div").last().select("p").first().text();
            test.insert(new BasicDBObject("city","上海").append("region",region).append("name",name)
                    .append("avg_price",price));
            System.out.println(name);
            j++;
        }
        System.out.println(j);
    }
    private static void parseList1(Document doc) {
        Elements elements = doc.select("div.section>ul>li");
        String name = null;
        String region = null;
        String price = null;
        for (Element element : elements) {
            if (element.toString().contains("room-img")) {
                name = element.select("h5.room-name a").first().text();
                Elements datas = element.select("p");
                int i = 0;
                for (Element data : datas) {
                    i++;
                    if (i == 2) {
                        price = data.text();
                    }
                    if (i == 4) {
                        region = data.text();
                    }
                }
                System.out.println(name + price + region);
                test.insert(new BasicDBObject("city","上海").append("region",region).append("name",name)
                        .append("avg_price",price));
            }
        }
    }
}

上面代码中的 String result = RequestUtil.doGet(url, "GBK"); 调用的是我自己封装的 HTTP 工具类 RequestUtil,相关的 doGet 方法如下,可供参考;这里也可以直接使用 jsoup 自带的请求方式,例如 Jsoup.connect(url).get()。

/**
     * Sends a GET request using UTF-8, no extra headers and without URL encoding.
     *
     * @param url the URL to request
     * @return whatever the four-argument {@code doGet} returns for these settings
     */
    public static String doGet(String url) {
        final String charset = "UTF-8";
        final boolean encodeUrl = false;
        return doGet(url, null, charset, encodeUrl);
    }

    /**
     * Sends a GET request using UTF-8 and no extra headers.
     *
     * @param url       the URL to request
     * @param encodeUrl forwarded unchanged to the four-argument {@code doGet}
     * @return whatever the four-argument {@code doGet} returns for these settings
     */
    public static String doGet(String url, boolean encodeUrl) {
        final String charset = "UTF-8";
        return doGet(url, null, charset, encodeUrl);
    }

    /**
     * Sends a GET request with the given charset, no extra headers and URL encoding enabled.
     *
     * @param url     the URL to request
     * @param charset forwarded unchanged to the four-argument {@code doGet}
     * @return whatever the four-argument {@code doGet} returns for these settings
     */
    public static String doGet(String url, String charset) {
        final boolean encodeUrl = true;
        return doGet(url, null, charset, encodeUrl);
    }

    /**
     * Sends a GET request with the given headers, using UTF-8 and URL encoding enabled.
     *
     * @param url     the URL to request
     * @param headers forwarded unchanged to the four-argument {@code doGet}
     * @return whatever the four-argument {@code doGet} returns for these settings
     */
    public static String doGet(String url, Map<String, String> headers) {
        final String charset = "UTF-8";
        final boolean encodeUrl = true;
        return doGet(url, headers, charset, encodeUrl);
    }

    /**
     * Sends a GET request and returns the response body as a string.
     *
     * <p>A new HTTP client (user agent {@code USERAGENT_CHROME}) is built for every call and
     * closed before returning. I/O failures are printed to standard error and reported to the
     * caller as a {@code null} result rather than as an exception.
     *
     * @param url       the URL to request
     * @param headers   request headers to add, or {@code null} for none
     * @param charset   charset passed to {@code encodingUrl} (when {@code encodeUrl} is set) and
     *                  used to decode the response entity
     * @param encodeUrl when {@code true}, the URL is passed through {@code encodingUrl} first
     * @return the response body when the status code is 200; {@code null} for any other status
     *         or when an {@link IOException} occurs
     */
    public static String doGet(final String url, Map<String, String> headers, String charset, boolean encodeUrl) {
        String result = null;
        // try-with-resources closes response and client in the right order, and a failing
        // close() is reported by the catch below instead of being silently dropped.
        try (CloseableHttpClient client = HttpClients
                .custom()
                .setUserAgent(USERAGENT_CHROME)
                .build()) {
            String requestUrl = encodeUrl ? encodingUrl(url, charset) : url;
            HttpGet httpGet = new HttpGet(requestUrl);
            // NOTE(review): no socket/connect timeout is configured on this request.
            if (headers != null) {
                for (Map.Entry<String, String> entry : headers.entrySet()) {
                    httpGet.addHeader(entry.getKey(), entry.getValue());
                }
            }
            try (CloseableHttpResponse response = client.execute(httpGet)) {
                if (response.getStatusLine().getStatusCode() == 200) {
                    result = EntityUtils.toString(response.getEntity(), charset);
                }
            }
        } catch (IOException e) {
            // ClientProtocolException is an IOException, so one handler covers both.
            e.printStackTrace();
        }
        return result;
    }
原文地址:https://www.cnblogs.com/asd529735325/p/10216040.html