Java 爬虫 Gecco

主要代码:

  1 Gecco(matchUrl="https://github.com/{user}/{project}", pipelines="consolePipeline")
  2 public class MyGithub implements HtmlBean {
  3  
  4     private static final long serialVersionUID = -7127412585200687225L;
  5      
  6     @Request
  7     private HttpRequest request;
  8      
  9     @RequestParameter("user")
 10     private String user;
 11      
 12     @RequestParameter("project")
 13     private String project;
 14      
 15     @Text
 16     @HtmlField(cssPath=".repository-meta-content")
 17     private String title;
 18      
 19     @Text
 20     @HtmlField(cssPath=".pagehead-actions li:nth-child(2) .social-count")
 21     private int star;
 22      
 23     @Text
 24     @HtmlField(cssPath=".pagehead-actions li:nth-child(3) .social-count")
 25     private int fork;
 26  
 27     @Href(click=false)
 28     @HtmlField(cssPath="ul.numbers-summary > li:nth-child(4) > a")
 29     private String contributors;
 30      
 31     @HtmlField(cssPath=".entry-content")
 32     private String readme;
 33  
 34     public HttpRequest getRequest() {
 35         return request;
 36     }
 37  
 38     public void setRequest(HttpRequest request) {
 39         this.request = request;
 40     }
 41  
 42     public String getReadme() {
 43         return readme;
 44     }
 45  
 46     public void setReadme(String readme) {
 47         this.readme = readme;
 48     }
 49  
 50     public String getUser() {
 51         return user;
 52     }
 53  
 54     public void setUser(String user) {
 55         this.user = user;
 56     }
 57  
 58     public String getProject() {
 59         return project;
 60     }
 61  
 62     public void setProject(String project) {
 63         this.project = project;
 64     }
 65  
 66     public String getTitle() {
 67         return title;
 68     }
 69  
 70     public void setTitle(String title) {
 71         this.title = title;
 72     }
 73  
 74     public int getStar() {
 75         return star;
 76     }
 77  
 78     public void setStar(int star) {
 79         this.star = star;
 80     }
 81  
 82     public int getFork() {
 83         return fork;
 84     }
 85  
 86     public void setFork(int fork) {
 87         this.fork = fork;
 88     }
 89      
 90     public String getContributors() {
 91         return contributors;
 92     }
 93  
 94     public void setContributors(String contributors) {
 95         this.contributors = contributors;
 96     }
 97  
 98     public static void main(String[] args) {
 99         GeccoEngine.create()
100         .classpath("com.geccocrawler.gecco.demo")
101         //开始抓取的页面地址
102         .start("https://github.com/xtuhcy/gecco")
103         //开启几个爬虫线程,线程数量最好不要大于start request数量
104         .thread(2)
105         //单个爬虫每次抓取完一个请求后的间隔时间
106         .interval(2000)
107         .run();
108     }
109  
110 }
原文地址:https://www.cnblogs.com/lr393993507/p/5629380.html