毕业设计进度2

用 WebMagic 爬取 CSDN

pom.xml 的配置与前面相同,主要代码如下:

 1 public class CsdnBlogProcessor implements PageProcessor {
 2  
 3     private static String username = "yixiao1874";// 设置csdn用户名
 4     private static int size = 0;// 共抓取到的文章数量
 5  
 6     // 抓取网站的相关配置,包括:编码、抓取间隔、重试次数等
 7     private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);
 8  
 9     @Override
10     public void process(Page page) {
11         if (!page.getUrl().regex("http://blog.csdn.net/" + username + "/article/details/\d+").match()) {
12             //获取当前页码
13             String number = page.getHtml().xpath("//li[@class='page-item active']//a[@class='page-link']/text()").toString();
14             //匹配当前页码+1的页码也就是下一页,加入爬取列表中
15             String targetUrls = page.getHtml().links()
16                     .regex("http://blog.csdn.net/"+username+"/article/list/"+(Integer.parseInt(number)+1)).get();
17             page.addTargetRequest(targetUrls);
18  
19             List<String> detailUrls = page.getHtml().xpath("//li[@class='blog-unit']//a/@href").all();
20             for(String list :detailUrls){
21                 System.out.println(list);
22             }
23             page.addTargetRequests(detailUrls);
24         }else {
25             size++;// 文章数量加1
26             CsdnBlog csdnBlog = new CsdnBlog();
27             String path = page.getUrl().get();
28             int id = Integer.parseInt(path.substring(path.lastIndexOf("/")+1));
29             String title = page.getHtml().xpath("//h1[@class='csdn_top']/text()").get();
30             String date = page.getHtml().xpath("//div[@class='artical_tag']//span[@class='time']/text()").get();
31             String copyright = page.getHtml().xpath("//div[@class='artical_tag']//span[@class='original']/text()").get();
32             int view = Integer.parseInt(page.getHtml().xpath("//button[@class='btn-noborder']//span[@class='txt']/text()").get());
33             csdnBlog.id(id).title(title).date(date).copyright(copyright).view(view);
34             System.out.println(csdnBlog);
35         }
36     }
37  
38     public Site getSite() {
39         return site;
40     }
41  
42     public static void main(String[] args) {
43         // 从用户博客首页开始抓,开启5个线程,启动爬虫
44         Spider.create(new CsdnBlogProcessor())
45                 .addUrl("http://blog.csdn.net/" + username)
46                 .thread(5).run();
47         System.out.println("文章总数为"+size);
48     }

爬取后的控制台输出:

原文地址:https://www.cnblogs.com/z245894546/p/12257377.html