2020年寒假假期总结0114

  WebMagic的学习基础:Jsoup的学习(Http基础API和Jsoup基础API)

  在学习WebMagic之前,我们需要简单了解关于Jsoup的知识,WebMagic是基于Jsoup的爬虫工具。

  下面我会列出Jsoup的API的使用方法。首先列出HttpClient的一些基本操作:

  所需要的依赖(注意:下文的HttpClient示例还需要额外引入org.apache.httpcomponents的httpclient依赖,此处未列出):

<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.2</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/junit/junit -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>
        <!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.4</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.7</version>
        </dependency>

  HTTPGet的使用:

/**
 * Minimal HttpClient GET example: requests a page and prints the length of its body.
 */
public class HttpGetTest {

    /**
     * Sends a GET request to http://www.itcast.cn and, when the status code is 200,
     * prints the number of characters in the response body decoded as "utf8".
     * I/O failures are printed to stderr; the response and client are always closed.
     *
     * @param arge command-line arguments (unused)
     */
    public static void main(String[] arge){
        // Create the HttpClient instance
        CloseableHttpClient httpClient= HttpClients.createDefault();

        // Create the HttpGet request and set the URL to visit
        HttpGet httpGet=new HttpGet("http://www.itcast.cn");
        CloseableHttpResponse response =null;
        try {
            // Send the request and obtain the response
            response =httpClient.execute(httpGet);

            // Parse the response
            if(response.getStatusLine().getStatusCode()==200){
                String content=EntityUtils.toString(response.getEntity(),"utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }finally {
            // response is still null when execute() failed, so guard before closing;
            // otherwise a NullPointerException here would skip closing the client.
            if(response!=null){
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

    }
}

  带参数的Get请求:

/**
 * HttpClient GET example with a query parameter built through URIBuilder.
 */
public class HttpGetParmTest {

    /**
     * Builds http://yun.itheima.com/search with the query parameter keys=Java,
     * sends it as a GET request and, when the status code is 200, prints the
     * number of characters in the response body decoded as "utf8".
     * URI and I/O failures are printed to stderr; the response and client are
     * always closed.
     *
     * @param arge command-line arguments (unused)
     */
    public static void main(String[] arge) {
        // Create the HttpClient instance
        CloseableHttpClient httpClient = HttpClients.createDefault();
        CloseableHttpResponse response = null;
        try {
            // Build the URI and its query parameter inside the try block so that a
            // URISyntaxException skips the request instead of leaving null
            // references to be dereferenced afterwards.
            URIBuilder uriBuilder = new URIBuilder("http://yun.itheima.com/search");
            uriBuilder.setParameter("keys", "Java");

            // Create the HttpGet request from the built URI
            HttpGet httpGet = new HttpGet(uriBuilder.build());
            System.out.println("发送的请求地址:" + httpGet);

            // Send the request and obtain the response
            response = httpClient.execute(httpGet);

            // Parse the response
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        } catch (URISyntaxException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // response is still null when the request was never sent or failed
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

    }
}

  Post的请求:

  与Get请求相似,只需要将请求类由HttpGet改成HttpPost即可。

  Post中带参数请求:(添加的参数为表单信息)

/**
 * HttpClient POST example that submits form parameters.
 */
public class HttpPostParmTest {

    /**
     * Posts the form field keys=Java to http://itcast.cn and, when the status code
     * is 200, prints the number of characters in the response body decoded as "utf8".
     * I/O failures during the request are printed to stderr; the response (if any)
     * and the client are closed afterwards.
     *
     * @param arge command-line arguments (unused)
     * @throws UnsupportedEncodingException if the form entity cannot be encoded
     *                                      with the requested charset
     */
    public static void main(String[] arge) throws UnsupportedEncodingException {
        // Create the HttpClient instance
        CloseableHttpClient httpClient= HttpClients.createDefault();

        // Create the HttpPost request and set the URL to visit
        HttpPost httpPost=new HttpPost("http://itcast.cn");
        // Collect the form parameters in a list
        List<NameValuePair> pairs=new ArrayList<NameValuePair>();
        pairs.add(new BasicNameValuePair("keys","Java"));
        // Create the form entity from the parameters
        UrlEncodedFormEntity formEntity=new UrlEncodedFormEntity(pairs,"utf8");
        // Attach the form entity to the POST request
        httpPost.setEntity(formEntity);
        System.out.println("发送的请求为:"+httpPost);
        CloseableHttpResponse response =null;
        try {
            // Send the request and obtain the response
            response =httpClient.execute(httpPost);

            // Parse the response
            if(response.getStatusLine().getStatusCode()==200){
                String content=EntityUtils.toString(response.getEntity(),"utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }finally {
            // response is still null when execute() failed, so guard before closing;
            // otherwise a NullPointerException here would skip closing the client.
            if(response!=null){
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

    }
}

  连接池的使用:

/**
 * Example of issuing requests through a shared pooling connection manager.
 */
public class HttpClientPool {

    /**
     * Configures a pooling connection manager and performs two GET requests with it.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        PoolingHttpClientConnectionManager pool = new PoolingHttpClientConnectionManager();
        // Maximum total number of connections
        pool.setMaxTotal(100);
        // Maximum number of connections per host
        pool.setDefaultMaxPerRoute(10);

        // Issue the same request twice through the manager
        for (int attempt = 0; attempt < 2; attempt++) {
            doGet(pool);
        }
    }

    /**
     * Performs one GET request to http://www.itcast.cn with a client built on the
     * given connection manager and prints the body length when the status is 200.
     *
     * @param pool connection manager the client is built on
     */
    private static void doGet(PoolingHttpClientConnectionManager pool) {
        // Build a client backed by the pool
        CloseableHttpClient client = HttpClients.custom()
                .setConnectionManager(pool)
                .build();
        HttpGet request = new HttpGet("http://www.itcast.cn");
        CloseableHttpResponse response = null;

        try {
            response = client.execute(request);
            int status = response.getStatusLine().getStatusCode();
            if (status == 200) {
                String body = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(body.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Only the response is closed here; the client itself is deliberately
            // left open because it is managed by the connection pool.
            if (null != response) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}

   Get请求携带配置信息:

// Configure the request settings
        RequestConfig config=RequestConfig.custom().setConnectTimeout(1000)// maximum time allowed to establish the connection
                .setConnectionRequestTimeout(500)// maximum time allowed to obtain a connection
                .setSocketTimeout(10*1000)// maximum time allowed for data transfer
                .build();
        // Apply the configured settings to the GET request
        httpGet.setConfig(config);

  Jsoup基础API使用:(注释很详细,就不做解释了)

public class JsoupFirsttest {


    @Test
    public void testUrl() throws Exception {
        //解析Url地址,第一个参数是访问额url,第二个参数是访问时候的超时时间
        Document doc = Jsoup.parse(new URL("http://www.itcast.cn"), 1000);
        //使用标签选择器
        String title = doc.getElementsByTag("title").first().text();
        //打印
        System.out.println(title);
    }

    @Test
    public void testString() throws Exception {
        //读取文件,获取字符串
        String content = FileUtils.readFileToString(new File("C:\Users\SuperMan\Desktop\test.html"), "utf8");
        //解析字符串
        Document doc = Jsoup.parse(content);
        String title = doc.getElementsByTag("title").first().text();
        System.out.println(title);
    }

    @Test
    public void testFile() throws Exception {
        //解析文件
        Document doc = Jsoup.parse(new File("C:\Users\SuperMan\Desktop\test.html"), "utf8");
        String title = doc.getElementsByTag("title").first().text();
        System.out.println(title);
    }

    @Test
    public void TestDom() throws Exception {
        Document doc = Jsoup.parse(new File("C:\Users\SuperMan\Desktop\test.html"), "utf8");

        //获取元素
        //通过ID获取元素内容
        Element element = doc.getElementById("city_bj");
        //通过标签获取元素
        Element element1 = doc.getElementsByTag("span").get(12);
        //通过class获取元素
        Element element2 = doc.getElementsByClass("fdnav").first();
        //通过属性获取
        Element element3 = doc.getElementsByAttribute("abc").first();
        Element element4 = doc.getElementsByAttributeValue("href", "http://yun.itheima.com/").first();
        //打印
        System.out.println(element.text());
        System.out.println(element1.text());
        System.out.println(element2.text());
        System.out.println(element3.text());
        System.out.println(element4.text());
    }

    @Test
    public void testData()throws Exception{
        Document doc = Jsoup.parse(new File("C:\Users\SuperMan\Desktop\test.html"), "utf8");

        //获取元素
        Element element=doc.getElementById("cy");
        String str="";
        //从元素中获取ID
        str=element.id();
        System.out.println(str);
        //获取ClassName
        Set<String> s =element.classNames();
        for (String string:s
             ) {
            System.out.println(string);
        }
        //获取attr
        str=element.attr("id");
        //获取所有属性
        Attributes attribute=element.attributes();
        System.out.println(attribute.toString());
        //获取文本内容
        str=element.text();
        System.out.println(str);
    }
}

  

  

  

原文地址:https://www.cnblogs.com/heiyang/p/12199107.html