毕业设计进度8

开始修改爬取的核心代码:调整正则表达式,以扩大可爬取的帖子范围;同时通过修改变量类型,尽可能地避免出现 null 或爬取结果输出到控制台为空的问题。

核心代码:

public class CnblogsSpider implements PageProcessor {

    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000)
            .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36");

    public static final String URL_LIST = "https://www.cnblogs.com/mvc/AggSite/PostList.aspx";

    public static int pageNum = 1;

    public void process(Page page) {

        if (page.getUrl().regex("^https://www\.cnblogs\.com$").match()) {
            try {
                page.addTargetRequests(page.getHtml().xpath("//*[@id="post_list"]/div/div[@class='post_item_body']/h3/a/@href").all());
                pageNum++;
             //模拟post请求
                Request req = new Request();
                req.setMethod(HttpConstant.Method.POST);
                req.setUrl("https://www.cnblogs.com/mvc/AggSite/PostList.aspx");
                req.setRequestBody(HttpRequestBody.json("{CategoryType: 'SiteHome', ParentCategoryId: 0, CategoryId: 808, PageIndex: " + pageNum
                        + ", TotalPostCount: 4000,ItemListActionName:'PostList'}", "utf-8"));
                page.addTargetRequest(req);
            } catch (Exception e) {
                e.printStackTrace();
            }
        } else if (page.getUrl().regex(URL_LIST).match() && pageNum <= 200) {
            try {           
                Thread.sleep(5000);
                List<String> urls = page.getHtml().xpath("//*[@class='post_item']//div[@class='post_item_body']/h3/a/@href").all();
                page.addTargetRequests(urls);
                //模拟post请求
                Request req = new Request();
                req.setMethod(HttpConstant.Method.POST);
                req.setUrl("https://www.cnblogs.com/mvc/AggSite/PostList.aspx");
                req.setRequestBody(HttpRequestBody.json("{CategoryType: 'SiteHome', ParentCategoryId: 0, CategoryId: 808, PageIndex: " + ++pageNum
                        + ", TotalPostCount: 4000,ItemListActionName:'PostList'}", "utf-8"));
                page.addTargetRequest(req);
                System.out.println("CurrPage:" + pageNum + "#######################################");

            } catch (Exception e) {
                e.printStackTrace();
            }
        } else {
            // 获取页面需要的内容,这里只取了标题,其他信息同理。
            System.out.println("抓取的内容:" + page.getHtml().xpath("//a[@id='cb_post_title_url']/text()").get());
        }
    }

    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        Spider.create(new CnblogsSpider()).addUrl("https://www.cnblogs.com").thread(3).run();
    }
}
原文地址:https://www.cnblogs.com/z245894546/p/12311905.html