Learning Java -- Web Crawler

Contents

  1. The jar package: jsoup

  2. Parsing steps (crawling 51job listings)

1. The jar package: jsoup

  jsoup is an open-source HTML parsing library.

  jsoup download link: http://www.mvnjar.com/org.jsoup/jsoup/1.11.3/detail.html
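  If you manage dependencies with Maven instead of downloading the jar by hand, the same version is available under the coordinates org.jsoup:jsoup:1.11.3.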

2. Parsing steps (crawling 51job listings)

  1. Open the page to be crawled.

  2. Fetch a Document object from the connection.

  3. Parse the Document with jsoup's select().

  4. Create an entity class with a field for each piece of information to extract.

  5. Copy the values found by select() into entity objects and collect those objects in a list.

PS: when parsing HTML with jsoup, the selectors you write must follow the structure of the HTML being parsed; a minimal sketch of these steps follows.
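Before the full classes, here is a minimal, self-contained sketch of steps 1 through 5 (the class name JsoupSketch is invented for this example, and the URL is abbreviated; the real URL and selectors appear in the code further down):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class JsoupSketch {
    public static void main(String[] args) throws Exception {
        // Steps 1-2: open the target page and fetch it as a Document
        Document doc = Jsoup.connect("https://search.51job.com/list/...")
                .timeout(5000)
                .get();
        // Step 3: select() takes a CSS-style selector and returns matching elements
        for (Element row : doc.select(".dw_table .el:gt(2)")) {
            // Steps 4-5 are done below with an entity class and a list;
            // here we just print one extracted field per row
            System.out.println(row.select(".t1 span a").text());
        }
    }
}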

  

package com.work.crawler;
/**
 * Job listing info
 * @author Hu YS
 *
 * September 1, 2018
 */
public class Work implements Comparable<Work>{
    private String position; // job title
    private String company;  // company name
    private String place;    // work location
    private String salary;   // salary
    private String date;     // publish date, in "MM-dd" form
    public String getPosition() {
        return position;
    }
    public void setPosition(String position) {
        this.position = position;
    }
    public String getCompany() {
        return company;
    }
    public void setCompany(String company) {
        this.company = company;
    }
    public String getPlace() {
        return place;
    }
    public void setPlace(String place) {
        this.place = place;
    }
    public String getSalary() {
        return salary;
    }
    public void setSalary(String salary) {
        this.salary = salary;
    }
    public String getDate() {
        return date;
    }
    public void setDate(String date) {
        this.date = date;
    }
    @Override
    public String toString() {
        return "Work [position=" + position + ", company=" + company + ", place=" + place + ", salary=" + salary
                + ", date=" + date + "]";
    }
    /**
     * Sort order: by publish date, newest first.
     * Assumes the date string is in "MM-dd" form, e.g. "09-01".
     */
    @Override
    public int compareTo(Work o) {
        int thisMonth = Integer.parseInt(this.getDate().substring(0, 2));
        int thisDay = Integer.parseInt(this.getDate().substring(3, 5));
        int otherMonth = Integer.parseInt(o.getDate().substring(0, 2));
        int otherDay = Integer.parseInt(o.getDate().substring(3, 5));

        // Compare months first; fall back to the day when the months match.
        if (thisMonth != otherMonth) {
            return otherMonth - thisMonth;
        }
        return otherDay - thisDay;
    }
}
The job-listing entity class
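Work implements Comparable, but the Main class further down never actually sorts the results before saving them. If you want the saved JSON ordered newest-first, one option (an extra step, not part of the original flow) is to sort just before saving:

// Hypothetical extra step before saving: order the results by publish date
java.util.Collections.sort(list); // uses Work.compareTo, newest first
Crawler.save(list);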
package com.work.crawler;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import net.sf.json.JSONArray;
/**
 * Page-crawling task: fetches one result page and extracts its listing rows
 * @author Administrator
 *
 */
public class Crawler implements Runnable {
    // URL of the page to fetch
    private String url;
    // shared list that collects the crawled results
    private List<Work> list;
    public Crawler(String url,List<Work> list) {
        this.list=list;
        this.url=url;
    }
    @Override
    public void run() {
        try {
            // connect to the page and fetch it as a Document
            Document doc = Jsoup.connect(url).timeout(5000).get();
            // select the listing rows, skipping the table's header rows
            Elements eles = doc.select(".dw_table .el:gt(2)");
            for (Element element : eles) {
                Work work = new Work();
                String position = element.select(".t1 span a").text();
                String company =element.select(".t2 a").text();
                String place = element.select(".t3").text();
                String salary = element.select(".t4").text();
                String date = element.select(".t5").text();
                work.setCompany(company);
                work.setDate(date);
                work.setPlace(place);
                work.setSalary(salary);
                work.setPosition(position);
                System.out.println(work);
                list.add(work);
                
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    /**
     * Convert the crawled results to JSON and append them to a local file
     * @param list    the list to convert
     */
    public static void save(List<Work> list) {
        BufferedWriter bw = null;
        try {
            // open the JSON file in append mode (backslashes in the path must be escaped)
            bw = new BufferedWriter(new FileWriter("E:\\用户\\Desktop\\目标\\1.json", true));
            // convert the whole list to a JSON array and write it on one line
            JSONArray fromObject = JSONArray.fromObject(list);
            bw.write(fromObject.toString());
            bw.newLine();
            bw.flush();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (bw != null) {
                try {
                    bw.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
The crawler thread
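One dependency note: the JSONArray used in save() is not part of the JDK. The import net.sf.json.JSONArray comes from the json-lib project (Maven coordinates net.sf.json-lib:json-lib, usually with the jdk15 classifier), so that jar and its transitive dependencies must be on the classpath alongside jsoup.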
package com.work.crawler;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
/**
 * Crawl the result pages with a thread pool
 * @author Administrator
 *
 */
public class Main {
    // crawler threads add to this concurrently, so wrap it in a synchronized list
    static List<Work> list = Collections.synchronizedList(new ArrayList<>());
    public static void main(String[] args) throws InterruptedException {
        long s1 = System.currentTimeMillis();
        int count = 1;
        // create the thread pool
        ExecutorService es = Executors.newCachedThreadPool();
        while (true) {
            // stop once pages 1 through 149 have been queued
            if (count == 150) {
                break;
            }
            // URL of the page this task will crawl (the page number is the "count" segment)
            String url = "https://search.51job.com/list/010000,000000,0000,00,9,99,Java%2B%25E5%25BC%2580%25E5%258F%2591%25E5%25B7%25A5%25E7%25A8%258B%25E5%25B8%2588,2,"+count+".html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=";
            count += 1;
            // submit a crawl task (url = page to fetch, list = shared result list)
            es.execute(new Crawler(url, list));
        }
        // stop accepting new tasks and wait for the running ones to finish
        es.shutdown();
        while (!es.awaitTermination(1, TimeUnit.SECONDS)) {
            // keep waiting; awaitTermination blocks instead of busy-spinning
        }
        // every task is done, so save the results to disk
        Crawler.save(list);
        System.out.println("over");
        long s2 = System.currentTimeMillis();
        System.out.println(s2 - s1);
    }
}
Crawling the pages with a thread pool
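One caveat: the selectors used above (.dw_table, .el, .t1 through .t5) match the 51job search-result markup as it existed when this was written. If the site changes its page structure, select() will simply match nothing and the saved JSON will be an empty array, so re-check the selectors against the live page before running the crawler.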
Original post: https://www.cnblogs.com/bananafish/p/9704814.html