【Apache】Simple web page scraping with HttpClient

package com.lw.httpclient.test;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class HttpClientTest {
    public static void main(String[] args) throws Exception {
        get1();
        get2();
    }

    /**
     * Fetch the page at the given URL (basic version).
     * @throws Exception
     */
    public static void get1() throws Exception {
        // The pre-4.3 style, HttpClient hc = new DefaultHttpClient(), is deprecated.
        String url = "http://www.budejie.com";
        url = "http://www.btba.com.cn"; // This site blocks crawlers, so a plain request like this no longer works.
        try (CloseableHttpClient client = HttpClients.createDefault()) {
            HttpGet get = new HttpGet(url);
            try (CloseableHttpResponse response = client.execute(get)) {
                // Read the response body as a UTF-8 string and print it.
                System.out.println(EntityUtils.toString(response.getEntity(), "UTF-8"));
            }
        }
    }

    /**
     * Fetch the page while simulating a browser.
     * @throws Exception
     */
    public static void get2() throws Exception {
        String url = "http://www.btba.com.cn";
        HttpGet httpGet = new HttpGet(url);
        // Set a User-Agent request header to mimic a real browser.
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0");
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(httpGet)) {
            System.out.println(EntityUtils.toString(response.getEntity(), "UTF-8"));
        }
    }
}
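In real use you would normally check the HTTP status code before consuming the body, which the code above skips. Here is a minimal sketch of that check, using the same HttpClient 4.x API and the same illustrative URL and User-Agent as above:

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class StatusCheckSketch {
    public static void main(String[] args) throws Exception {
        HttpGet get = new HttpGet("http://www.btba.com.cn");
        get.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0");
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(get)) {
            int status = response.getStatusLine().getStatusCode();
            if (status == 200) {
                System.out.println(EntityUtils.toString(response.getEntity(), "UTF-8"));
            } else {
                // Anything other than 200 OK: report the status instead of printing the body.
                System.out.println("Request failed with HTTP status " + status);
            }
        }
    }
}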

To be continued.

A follow-up post will show how to parse the fetched content and extract the parts you actually want; a rough preview follows below.
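As that preview, here is a minimal sketch of what the parsing step could look like, assuming the Jsoup library is added as a dependency (Jsoup is not mentioned in the original post, and the selector is illustrative). It takes the HTML string returned by EntityUtils.toString above and extracts the page title and links:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class ParseSketch {
    // html is the string returned by EntityUtils.toString(...) in get2() above.
    public static void printTitleAndLinks(String html) {
        Document doc = Jsoup.parse(html);            // parse the raw HTML into a DOM
        System.out.println(doc.title());             // the page's <title> text
        for (Element link : doc.select("a[href]")) { // every anchor that has an href attribute
            System.out.println(link.attr("href") + " -> " + link.text());
        }
    }
}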

Original post: https://www.cnblogs.com/oldwei/p/8620387.html