Jsoup爬取职位信息

待爬取的牛客网的实习信息

https://www.nowcoder.com/job/center

首先在Eclipse新建一个maven项目

1、在maven项目的pom.xml文件中加入以下的配置

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <!-- Maven coordinates of this crawler project. -->
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.wu</groupId>
  <artifactId>TopEssay</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  
  
  <dependencies>
	<!-- jsoup: the only dependency; the spider class uses it to download the page and query it with CSS selectors. -->
	<dependency>
	    <groupId>org.jsoup</groupId>
	    <artifactId>jsoup</artifactId>
	    <version>1.11.3</version>
	</dependency>
  	
  </dependencies>
  
  
</project>

  

2、提取所需要的信息

这里手动编写CSS选择器有点麻烦,我们可以利用浏览器自带的开发者工具,帮助我们快速选择所需要的元素

比如我们这里的标题,通过这种方法得到的选择器为 body > div.nk-container > div.nk-main.clearfix > div.nk-content > div > div.module-body > ul > li:nth-child(1) > div > div.reco-job-cont > a

然后我们可以在上面这个基础上进行相应的修改,有效节省了我们的时间。

package com.jsoup;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.entity.JobInfo;

/**
 * Minimal jsoup-based scraper: downloads the Nowcoder job-center page and prints
 * one {@link JobInfo} per job entry found on it.
 */
public class NiuKeSpider {
	/** Page that lists the job postings to scrape. */
	private static final String JOB_CENTER_URL = "https://www.nowcoder.com/job/center";

	/**
	 * Fetches the job-center page, prints the number of job entries found, then
	 * prints every extracted job on its own line. If the page cannot be fetched,
	 * the failure is reported on standard error and nothing else is printed.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		Document document;
		try {
			// Download and parse the page source.
			document = Jsoup.connect(JOB_CENTER_URL).get();
		} catch (IOException e) {
			// Name the URL so the stack trace below has context.
			System.err.println("Failed to fetch " + JOB_CENTER_URL);
			e.printStackTrace();
			return;
		}

		List<JobInfo> jobs = parseJobs(document);
		System.out.println(jobs.size());
		for (JobInfo job : jobs) {
			System.out.println(job);
		}
	}

	/** Builds one {@link JobInfo} for every element with class {@code reco-job-main} in the document. */
	private static List<JobInfo> parseJobs(Document document) {
		// Keep only the fragments of the page that describe a job.
		Elements jobElements = document.getElementsByClass("reco-job-main");
		List<JobInfo> jobs = new ArrayList<>(jobElements.size());
		for (Element jobElement : jobElements) {
			jobs.add(toJobInfo(jobElement));
		}
		return jobs;
	}

	/** Extracts job description, link, company, address and salary from a single job element. */
	private static JobInfo toJobInfo(Element element) {
		JobInfo jobInfo = new JobInfo();
		jobInfo.setJobContent(element.getElementsByClass("reco-job-cont").text());
		// The "abs:" prefix asks jsoup for the link as an absolute URL.
		jobInfo.setUrl(element.select("div.reco-job-cont > a").attr("abs:href"));
		jobInfo.setCompany(element.getElementsByClass("reco-job-com").text());
		jobInfo.setAddress(element.getElementsByClass("job-address").text());
		// Salary is read from the second span inside the first child div of reco-job-info.
		jobInfo.setSalary(element.select("div.reco-job-info > div:nth-child(1) > span:nth-child(2)").text().trim());
		return jobInfo;
	}

}

  

3、封装所需的信息

package com.entity;

/**
 * Plain mutable holder for the details of one job posting: description text,
 * link, company, address and salary. All fields are {@code null} until set.
 *
 * @author Administrator
 */
public class JobInfo {
	private String jobContent;
	private String url;
	private String company;
	private String address;
	private String salary;

	/** @return the job description text, or {@code null} if not set */
	public String getJobContent() {
		return jobContent;
	}

	/** @param jobContent the job description text */
	public void setJobContent(String jobContent) {
		this.jobContent = jobContent;
	}

	/** @return the link to the job posting, or {@code null} if not set */
	public String getUrl() {
		return url;
	}

	/** @param url the link to the job posting */
	public void setUrl(String url) {
		this.url = url;
	}

	/** @return the company name, or {@code null} if not set */
	public String getCompany() {
		return company;
	}

	/** @param company the company name */
	public void setCompany(String company) {
		this.company = company;
	}

	/** @return the job location, or {@code null} if not set */
	public String getAddress() {
		return address;
	}

	/** @param address the job location */
	public void setAddress(String address) {
		this.address = address;
	}

	/** @return the salary text, or {@code null} if not set */
	public String getSalary() {
		return salary;
	}

	/** @param salary the salary text */
	public void setSalary(String salary) {
		this.salary = salary;
	}

	/**
	 * Renders all fields on one line. The {@code job [} prefix and the
	 * capitalised {@code Salary=} label are kept as-is so existing output
	 * does not change.
	 */
	@Override
	public String toString() {
		return "job [jobContent=" + jobContent + ", url=" + url + ", company=" + company + ", address=" + address
				+ ", Salary=" + salary + "]";
	}

}

  

4、运行结果:

 

总结:

原文地址:https://www.cnblogs.com/wylwyl/p/10775481.html