爬虫任务一:使用HttpClient爬取百度新闻首页的新闻标题和URL,页面编码为UTF-8

第一个入手的爬虫小任务:

Maven工程的pom.xml配置:

<!-- Maven project descriptor for the crawler example: declares the project coordinates and its three library dependencies. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <!-- Project coordinates (groupId / artifactId / version). -->
    <groupId>com.zhaowu</groupId>
    <artifactId>pachong01</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <dependencies>
        <!-- Apache HttpClient: the org.apache.http.* classes used to issue the GET request. -->
        <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.3</version>
        </dependency>

        <!-- jsoup: the org.jsoup.* classes used to parse the fetched HTML and select elements. -->
        <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.2</version>
        </dependency>

        <!-- Apache Commons IO. NOTE(review): not imported by the News class shown with this POM; confirm whether it is still needed. -->
        <!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.6</version>
        </dependency>


    </dependencies>
</project>

代码实现:

package com.zhaowu.renwu1;

import java.io.IOException;

import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Minimal crawler example: downloads the Baidu News front page with Apache
 * HttpClient, parses the HTML with jsoup, and prints the text and {@code href}
 * of every anchor element that has an {@code href} attribute.
 */
public class News {
    /**
     * Fetches {@code https://news.baidu.com/}, decodes the body as UTF-8 and
     * prints one title/address pair per {@code <a href>} element to standard output.
     *
     * @param args unused
     * @throws ClientProtocolException if the HTTP exchange violates the protocol
     * @throws IOException if the request fails, times out, or the response carries no body
     */
    public static void main(String[] args) throws ClientProtocolException, IOException {
        // Build the GET request.
        HttpGet httpGet = new HttpGet("https://news.baidu.com/");

        RequestConfig config = RequestConfig.custom()
                .setConnectTimeout(10000) // connect timeout: 10 seconds, in milliseconds
                .setSocketTimeout(10000)  // read (socket) timeout: 10 seconds, in milliseconds
                .build();
        httpGet.setConfig(config);
        // Send a browser-like User-Agent. The previous literal contained a non-ASCII
        // ellipsis (a truncation artifact), which is not a valid HTTP header character.
        httpGet.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0");

        String content;
        // try/finally (rather than try-with-resources) keeps the code compilable at
        // older source levels; both the client and the response must be closed so the
        // underlying connection and pool are released.
        CloseableHttpClient httpClient = HttpClients.createDefault();
        try {
            // Execute the GET request.
            CloseableHttpResponse response = httpClient.execute(httpGet);
            try {
                // A response is not required to carry an entity; fail with a clear message.
                HttpEntity entity = response.getEntity();
                if (entity == null) {
                    throw new IOException("No response body returned by " + httpGet.getURI());
                }
                // Read the whole body, decoding it as UTF-8.
                content = EntityUtils.toString(entity, "utf-8");
            } finally {
                response.close();
            }
        } finally {
            httpClient.close();
        }

        // Parse the page into a jsoup document.
        Document doc = Jsoup.parse(content);

        // Select every <a> element that has an href attribute.
        Elements hrefElements = doc.select("a[href]");
        for (Element e : hrefElements) {
            System.out.println("新闻标题:" + e.text());
            System.out.println("新闻地址:" + e.attr("href"));
            System.out.println("------------------------");
        }
    }
}
原文地址:https://www.cnblogs.com/sutao/p/9012393.html