Crawling Douban Group Posts with HttpClient and Jsoup in Java and Storing Them in MySQL

First, pick the group to crawl; this post uses Douban's "five" group as the example. Since this is my first crawler in Java, I went with a simple brute-force loop and will improve it when I have time. (Though I may well switch to Python instead.)

In addition, this project is my first attempt at developing with Spring Boot.

1. Crawling the links of all posts

1.1 Analyzing the page

Start from the group home page (screenshot omitted).

Open "更多讨论" (more discussions), then change the start parameter in the address bar to 0.

From there, page through to the end and you have every post in the group. All we need to do is extract the links on each page, bump the start parameter in the URL, and iterate to the last page to collect the link of every post.
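Concretely, each listing page shows 30 posts, so the page URLs step through start in increments of 30 (692739 is the id of the group crawled below):

https://www.douban.com/group/692739/discussion?start=0
https://www.douban.com/group/692739/discussion?start=30
https://www.douban.com/group/692739/discussion?start=60
...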

1.2 Creating the database

CREATE DATABASE `douban`;

USE `douban`;


# Table storing each post's title and link
DROP TABLE IF EXISTS `985posts`;

CREATE TABLE `985posts` (
	id INT NOT NULL AUTO_INCREMENT COMMENT 'post id',
	title VARCHAR(150) NOT NULL COMMENT 'post title',
	author VARCHAR(50) NOT NULL COMMENT 'post author',
	post_href VARCHAR(100) NOT NULL COMMENT 'URL of the post',
	KEY id(id)
) ENGINE=INNODB DEFAULT CHARSET=utf8mb4;

The utf8mb4 charset is used because posts contain emoji, and an emoji takes 4 bytes in UTF-8 (for example, 😀 encodes to the four bytes F0 9F 98 80), while MySQL's legacy utf8 charset stores at most 3 bytes per character.
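A quick way to confirm emoji survive the round trip, run from a client connected with utf8mb4 (a sketch against the table above):

INSERT INTO `985posts` (title, author, post_href)
VALUES ('emoji test 😀', 'tester', 'https://example.com/');

SELECT title FROM `985posts` ORDER BY id DESC LIMIT 1;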

1.3 Coding

1.3.1 Adding the dependencies

<!--HttpClient-->
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.12</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.13.1</version>
</dependency>

These are the two main dependencies, httpclient and jsoup; add others as needed, such as mybatis and lombok (a sketch follows below).
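If MyBatis, the MySQL driver, and Lombok are pulled in the same way, the extra coordinates look roughly like this (a sketch; the version numbers are assumptions, pick ones matching your Spring Boot version):

<!-- MyBatis integration for Spring Boot -->
<dependency>
    <groupId>org.mybatis.spring.boot</groupId>
    <artifactId>mybatis-spring-boot-starter</artifactId>
    <version>2.1.3</version>
</dependency>

<!-- MySQL JDBC driver -->
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>8.0.21</version>
</dependency>

<!-- Lombok, used for @Data on the entity below -->
<dependency>
    <groupId>org.projectlombok</groupId>
    <artifactId>lombok</artifactId>
    <version>1.18.12</version>
</dependency>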

1.3.2 Database access with MyBatis

The entity class

package com.fan.pojo;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

@Data
@AllArgsConstructor
@NoArgsConstructor
public class DoubanPost {
    private int id;
    private String title;
    private String author;
    private String postHref;
}

The mapper interface

package com.fan.mapper;

import com.fan.pojo.DoubanPost;
import org.apache.ibatis.annotations.Mapper;
import org.springframework.stereotype.Repository;

import java.util.List;

/**
 * Query, insert, and delete operations on the 985posts table
 * @author 
 * @date 2020/8/23 - 2:18
 */

// @Mapper marks this interface as a MyBatis mapper
@Mapper
@Repository
public interface DoubanPostMapper {

    List<DoubanPost> queryDoubanPostList();

    DoubanPost queryDoubanPostById(int id);

    int addDoubanPost(DoubanPost doubanPost);

    int deleteDoubanPost(int id);

}

The mapper XML file

<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper
        PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
        "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.fan.mapper.DoubanPostMapper">

    <select id="queryDoubanPostList" resultType="DoubanPost">
        select * from douban.985posts;
    </select>

    <select id="queryDoubanPostById" resultType="DoubanPost">
        select * from douban.985posts where id=#{id};
    </select>

    <insert id="addDoubanPost" parameterType="DoubanPost">
        insert into douban.985posts (title, author, post_href) VALUES (#{title}, #{author}, #{postHref});
    </insert>

    <delete id="deleteDoubanPost" parameterType="int">
        delete from douban.985posts where id=#{id};
    </delete>

</mapper>
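For resultType="DoubanPost" to resolve, and for the snake_case column post_href to land in the camelCase field postHref, MyBatis needs a little configuration. A minimal application.properties sketch (the datasource values are placeholders, and the mapper location is an assumption about this project's layout):

# Datasource (placeholder credentials)
spring.datasource.url=jdbc:mysql://localhost:3306/douban?useUnicode=true&characterEncoding=utf8&serverTimezone=UTC
spring.datasource.username=root
spring.datasource.password=your_password
spring.datasource.driver-class-name=com.mysql.cj.jdbc.Driver

# Resolve the short alias "DoubanPost" to com.fan.pojo.DoubanPost
mybatis.type-aliases-package=com.fan.pojo
# Where the mapper XML lives (assumed path)
mybatis.mapper-locations=classpath:mapper/*.xml
# Map post_href -> postHref automatically
mybatis.configuration.map-underscore-to-camel-case=true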

1.3.3 A utility class for fetching page HTML

package com.fan.util;

import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.net.URISyntaxException;
import java.util.Map;

/**
 * @author 
 * @date 2020/8/23 - 1:28
 */
public class HttpUtils {

    // Connection pool manager, shared across requests.
    // (The original code created this in a constructor that was never called,
    // so the pool went unused; a static initializer fixes that.)
    private static final PoolingHttpClientConnectionManager cm;

    // An optional proxy (kept for reference, unused)
    // http://120.79.209.11:3128
    // 61.160.245.88 46779
    // 171.35.215.90 9999
    // private static HttpHost proxy = new HttpHost("61.178.118.86", 8080);

    // A single client backed by the pool, reused by every request
    private static final CloseableHttpClient httpClient;

    static {
        cm = new PoolingHttpClientConnectionManager();
        // Maximum total connections in the pool
        cm.setMaxTotal(100);
        // Maximum connections per route (per target host)
        cm.setDefaultMaxPerRoute(10);
        httpClient = HttpClients.custom().setConnectionManager(cm).build();
    }

    // Build the per-request configuration
    private static RequestConfig getConfig() {
        return RequestConfig.custom()
                .setConnectTimeout(100 * 1000) // max time to establish a connection, in ms
                .setConnectionRequestTimeout(100 * 1000) // max time to obtain a connection from the pool, in ms
                .setSocketTimeout(100 * 1000) // max time between data packets, in ms
                .build();
    }


    /**
     * Download the page at the given address.
     * @param url request URL
     * @param map query parameters
     * @param mapTitle request headers
     * @return the page HTML, or "" on failure
     * @throws URISyntaxException
     */
    public static String doGetHtml(String url, Map<String, String> map, Map<String, String> mapTitle) throws URISyntaxException {
        // Build the request URI
        URIBuilder uriBuilder = new URIBuilder(url);

        // Set the query parameters
        if (!map.isEmpty()) {
            for (String key : map.keySet()) {
                uriBuilder.setParameter(key, map.get(key));
            }
        }

        // Create the HttpGet with the assembled URI
        HttpGet httpGet = new HttpGet(uriBuilder.build());

        // Set the request headers
        if (!mapTitle.isEmpty()) {
            for (String key : mapTitle.keySet()) {
                httpGet.addHeader(key, mapTitle.get(key));
            }
        }

        // Apply the request configuration
        httpGet.setConfig(getConfig());
        System.out.println("Sending request: " + httpGet);

        // Execute the request and read the response
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            // Only read the body when the response entity is present
            if (response.getEntity() != null) {
                return EntityUtils.toString(response.getEntity(), "utf8");
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Closing the response releases the connection back to the pool;
            // guard against the null response left behind when execute() throws
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }

        System.out.println("Fetch failed");
        return "";
    }

}
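Before wiring the utility into the crawler, it can be smoke-tested on its own. A minimal sketch (the hypothetical main method below can live in any class; the target URL is just an example):

public static void main(String[] args) throws Exception {
    Map<String, String> params = new HashMap<>();   // no query parameters
    Map<String, String> headers = new HashMap<>();
    headers.put("User-Agent", "Mozilla/5.0");       // a browser-like UA helps avoid being blocked
    String html = HttpUtils.doGetHtml("https://www.douban.com", params, headers);
    System.out.println("fetched " + html.length() + " chars");
}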

1.3.4 The test class

package com.fan;

import com.fan.mapper.DoubanPostMapper;
import com.fan.pojo.DoubanPost;
import com.fan.util.HttpUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;

import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;

@SpringBootTest
class CrawdoubangroupsApplicationTests {

    @Autowired
    private DoubanPostMapper doubanPostMapper;

    @Test
    void contextLoads() throws URISyntaxException {
        // Request address
        // https://www.douban.com/group/692739/discussion?start=0
        String url = "https://www.douban.com/group/692739/discussion";
        Map<String, String> map = new HashMap<>();
        Map<String, String> mapTitle = new HashMap<>();
        // Request parameters
        // map.put("start", "0");
        // Request headers: a browser User-Agent plus a logged-in session cookie
        mapTitle.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36 Edg/84.0.522.61");
        mapTitle.put("Cookie", "ll='118171'; bid=s9mINaPcmtA; __utmz=30149280.1592482199.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmv=30149280.21555; douban-fav-remind=1; push_doumail_num=0; push_noty_num=0; __utmc=30149280; _pk_ses.100001.8cb4=*; __utma=30149280.1877381396.1592482199.1598182478.1598185708.18; ck=PnCH; __utmt=1; ap_v=0,6.0; _pk_id.100001.8cb4=9fc580371d86dd7a.1592482197.19.1598187695.1598182755.; __utmb=30149280.5.10.1598185708");


        DoubanPost doubanPost = new DoubanPost();

        // Walk every listing page and store the links; the bound roughly matches the group's post count
        for (int k = 1; k < 14062;) {
            // Update the start parameter; each page lists 30 posts, so k advances by 30 per pass
            map.put("start", k + "");
            // Fetch the page HTML with the utility class
            String html01 = HttpUtils.doGetHtml(url, map, mapTitle);
            System.out.println("html ============================================================================================> debug");
            System.out.println(html01);
            System.out.println("html ============================================================================================> debug");
            // Parse the HTML with Jsoup
            Document document = Jsoup.parse(html01);
            // Grab the #content element
            Element content = document.getElementById("content");
            // All <tr> rows under it: one header row plus one row per post
            Elements trs = content.getElementsByTag("tr");
            // Iterate over the rows, skipping the header row at index 0
            for (int i = 1; i < trs.size(); i++) {
                // The i-th <tr>
                Element element = trs.get(i);
                // Its <td> children
                Elements children = element.children();
                // First cell: the title and the post link
                Element child = children.get(0).child(0);
                Attributes attributes = child.attributes();
                String href = attributes.get("href");
                // Fill in the entity
                doubanPost.setPostHref(href);
                String title = attributes.get("title");
                doubanPost.setTitle(title);

                // Some rows nest the link one element deeper; fall back to the second child
                if (title.equals("")) {
                    System.out.println("Fallback branch taken ====>");

                    child = children.get(0).child(1);
                    attributes = child.attributes();
                    href = attributes.get("href");
                    doubanPost.setPostHref(href);
                    title = attributes.get("title");
                    doubanPost.setTitle(title);
                }


                // Second cell: the author's profile link (stored in the author column)
                Element child1 = children.get(1).child(0);
                Attributes attributes1 = child1.attributes();
                String href1 = attributes1.get("href");
                doubanPost.setAuthor(href1);

                // Print and save the entity
                System.out.println(doubanPost);
                doubanPostMapper.addDoubanPost(doubanPost);

                System.out.println(i+1 + "===============");
            }

            System.out.println("Page done ====> start = " + k);
            // trs.size() - 1 posts were processed on this page
            k += trs.size() - 1;

        }

    }

}

So yes, pure brute force. In the end it pulled down a little over fourteen thousand records.

The results in the database (screenshot omitted).

2. Fetching each post's HTML page via the links collected in step 1

The database layer here closely mirrors step 1, so it is omitted; only the final test class is shown, after the sketch below.
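For reference, here is a minimal sketch of the omitted pieces, reconstructed from the setters and the addPost call used below (field types and the mapper shape are assumptions):

// --- Post.java (sketch) ---
package com.fan.pojo;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

@Data
@AllArgsConstructor
@NoArgsConstructor
public class Post {
    private int postId;        // numeric topic id taken from the post URL
    private String title;
    private String author;     // the author's display name
    private String authorHref; // the author's profile URL
    private String postHref;   // the post URL
    private String content;    // main post body, text only
}

// --- PostMapper.java (sketch) ---
package com.fan.mapper;

import com.fan.pojo.Post;
import org.apache.ibatis.annotations.Mapper;
import org.springframework.stereotype.Repository;

@Mapper
@Repository
public interface PostMapper {
    int addPost(Post post);
}

The test class itself: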

package com.fan;

import com.fan.mapper.DoubanPostMapper;
import com.fan.mapper.PostMapper;
import com.fan.pojo.DoubanPost;
import com.fan.pojo.Post;
import com.fan.util.HttpUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;

import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;

/**
 * Crawl the body of each main Douban post
 * @author 
 * @date 2020/8/24 - 14:08
 */

@SpringBootTest
public class CrawDoubanMainPosts {

    @Autowired
    private DoubanPostMapper doubanPostMapper;

    @Autowired
    private PostMapper postMapper;


    // Fetch each post's HTML in bulk and store the main post content in the database
    @Test
    public void test01() throws URISyntaxException {
        // The request URL differs per post and is looked up inside the loop
        Map<String, String> map = new HashMap<>();
        Map<String, String> mapTitle = new HashMap<>();
        // Request headers
        mapTitle.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36 Edg/84.0.522.61");
        mapTitle.put("Cookie", "ll='118171'; bid=s9mINaPcmtA; __utmv=30149280.21555; douban-fav-remind=1; push_doumail_num=0; push_noty_num=0; ct=y; douban-profile-remind=1; ck=PnCH; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1598334460%2C%22https%3A%2F%2Faccounts.douban.com%2Fpassport%2Flogin%3Fredir%3Dhttps%253A%252F%252Fwww.douban.com%252Fgroup%252Ftopic%252F175432568%252F%22%5D; _pk_id.100001.8cb4=9fc580371d86dd7a.1592482197.27.1598334460.1598257938.; _pk_ses.100001.8cb4=*; __utma=30149280.1877381396.1592482199.1598255406.1598334460.26; __utmc=30149280; __utmz=30149280.1598334460.26.3.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/passport/login; __utmt=1; __utmb=30149280.4.8.1598334460");

        DoubanPost doubanPost = null; // the link row fetched on each iteration
        Post post = new Post();

        for (int i = 11; i < 14068; i++) {
            doubanPost = doubanPostMapper.queryDoubanPostById(i);
            // Fill in the title
            post.setTitle(doubanPost.getTitle());
            // Fill in authorHref (the author column from step 1 holds the profile URL)
            post.setAuthorHref(doubanPost.getAuthor());
            // Fill in postHref
            post.setPostHref(doubanPost.getPostHref());

            // Extract the numeric post id: the URL looks like
            // https://www.douban.com/group/topic/175432568/, and the prefix
            // "https://www.douban.com/group/topic/" is exactly 35 characters
            String postHref = doubanPost.getPostHref(); // the page is fetched from this URL below
            String s = postHref.substring(35, postHref.length() - 1);
            int postId = Integer.parseInt(s);
            // Fill in postId
            post.setPostId(postId);

            // What remains is the author name and the content
            String html = HttpUtils.doGetHtml(postHref, map, mapTitle);
            Document document = Jsoup.parse(html);
            Element content = document.getElementById("content");
            // Guard against a NullPointerException when the post has been deleted
            if (content == null) {
                System.out.println("Post no longer exists");
                post.setContent("内容不存在");
                // Fill in a placeholder nickname
                String username = "此人已删帖";
                post.setAuthor(username);
                // Save to the database
                postMapper.addPost(post);
                System.out.println("Inserted record " + i + " successfully!");
                continue;
            }
            Element elementById = content.getElementById("link-report");
            Elements p = elementById.getElementsByTag("p");
            String article = "";
            for (Element element : p) {
                String html02 = element.html();
                html02 += "\n";
                article += html02;
            }
            // Fill in the post content
            post.setContent(article);
            // Fill in the nickname by walking down the DOM to the author's name element
            String username = content.child(0).child(0).child(2).child(1).child(0).child(0).child(0).html();
            post.setAuthor(username);

            // Save to the database
            postMapper.addPost(post);
            System.out.println("Inserted record " + i + " successfully!");
            // Images and links are discarded; only the text part is kept for now
        }
        System.out.println("Finished fetching the posts");
    }

}

The results in the database (screenshot omitted).

This only crawls the body of each main post, not the comments. By the time it finished, Douban had rate-limited my IP (it took that much crawling before Douban blocked me, so Douban is remarkably friendly to beginners; of course, Douban has plenty of capable engineers, and serious anti-crawling measures would be easy for them). Java also feels a bit slow for this kind of work: I have not used multithreading yet, and even with it the code would remain cumbersome, with visibly more code than the Python equivalent. I plan to use Python for future group crawls and keep Java for building websites.

Original post: https://www.cnblogs.com/fanlumaster/p/13574848.html