Java 实现爬取静态页面的新闻数据


可能需要的pom依赖包:

<!-- https://mvnrepository.com/artifact/commons-codec/commons-codec -->
<dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
<version>1.4</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.2</version>
</dependency>

<!-- https://mvnrepository.com/artifact/commons-logging/commons-logging -->
<dependency>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
<version>1.1.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/commons-httpclient/commons-httpclient -->
<dependency>
<groupId>commons-httpclient</groupId>
<artifactId>commons-httpclient</artifactId>
<version>3.1</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.8.3</version>
</dependency>




主要贴出实现类相关代码:


/**
 * Crawls one listing page and stores every article it links to.
 *
 * <p>The listing page is fetched with Jsoup; every anchor inside an element whose
 * {@code class} attribute is exactly {@code "type_content_list type-item"}, that has a
 * non-empty {@code title} and whose link contains {@code ".html"}, is treated as an
 * article. Each article page is fetched, parsed into its own {@code NewsInformation}
 * and passed to {@code newsMapperExt.inserturlNews}.
 *
 * <p>Only the xinmin.cn section URLs checked in {@link #fetchArticle} get an author and
 * publication time; for any other listing URL those two fields are left unset.
 *
 * <p>An article whose page cannot be fetched is skipped (nothing is inserted for it) and
 * the crawl continues with the next link.
 *
 * @param urls URL of the listing page to crawl
 * @return {@code true} when the listing page was fetched and processed; {@code false}
 *         when {@code urls} is null/empty or the listing page itself could not be fetched
 */
@Override
public boolean inserturlNews(String urls) {
    // TODO: 2021/5/17 only xinmin.cn pages are supported; parsing depends on that site's page tags.
    if (urls == null || urls.isEmpty()) {
        return false;
    }

    Document doc;
    try {
        doc = Jsoup.connect(urls).get();
    } catch (IOException e) {
        // No logger is visible in this part of the file, so keep the original reporting style.
        e.printStackTrace();
        return false;
    }

    Elements listDiv = doc.getElementsByAttributeValue("class", "type_content_list type-item");
    for (Element element : listDiv) {
        for (Element link : element.getElementsByTag("a")) {
            // absUrl resolves relative hrefs against the listing page; it yields "" when the
            // link cannot be made absolute, which the ".html" check below then filters out.
            String newsUrl = link.absUrl("href");
            String title = link.attr("title");
            if (title.isEmpty() || !newsUrl.contains(".html")) {
                continue;
            }

            NewsInformation article;
            try {
                article = fetchArticle(urls, title, newsUrl);
            } catch (IOException e) {
                // Skip this article instead of inserting a row with missing content.
                e.printStackTrace();
                continue;
            }
            newsMapperExt.inserturlNews(article);
        }
    }
    return true;
}

/**
 * Downloads one article page and maps it to a fresh {@code NewsInformation}.
 *
 * <p>A new object is built per article so that fields of a previously parsed article can
 * never leak into this one.
 *
 * @param sectionUrl URL of the listing page the article was found on; selects which
 *                   {@code .info span} positions hold the author and publication time
 * @param title      article title taken from the listing anchor
 * @param newsUrl    absolute URL of the article page
 * @return the populated record
 * @throws IOException if the article page cannot be fetched
 */
private NewsInformation fetchArticle(String sectionUrl, String title, String newsUrl) throws IOException {
    NewsInformation info = new NewsInformation();
    info.setTitle(title);
    info.setNewsUrl(newsUrl);

    Document newsDoc = Jsoup.connect(newsUrl).get();
    Elements infoSpans = newsDoc.select(".info").select("span");
    info.setForm(spanText(infoSpans, 0));

    // NOTE(review): `now` and `fmTime` are declared outside this block; if `now` is captured
    // once at construction time the fallback timestamp will be stale - confirm.
    if (sectionUrl.contains("http://newsxmwb.xinmin.cn/world/")
            || sectionUrl.contains("http://newsxmwb.xinmin.cn/shizheng/")) {
        // World, Politics
        info.setAuthor(spanText(infoSpans, 1));
        String published = spanText(infoSpans, 3);
        info.setDataTime(looksLikeDate(published) ? published : now.format(fmTime));
    } else if (sectionUrl.contains("http://newsxmwb.xinmin.cn/wentihui/")) {
        // Culture / Sports: the time is normally the 5th span, otherwise the 4th.
        info.setAuthor(spanText(infoSpans, 1));
        String published = spanText(infoSpans, 4);
        info.setDataTime(looksLikeDate(published) ? published : spanText(infoSpans, 3));
    } else if (sectionUrl.contains("http://shanghai.xinmin.cn/t/gdbd/")) {
        // Headlines
        info.setAuthor(spanText(infoSpans, 1));
        String published = spanText(infoSpans, 3);
        info.setDataTime(looksLikeDate(published) ? published : now.format(fmTime));
    }

    // If several "a_content" containers exist, the last one wins (same as before).
    for (Element body : newsDoc.getElementsByAttributeValue("class", "a_content")) {
        info.setImage(body.getElementsByTag("img").attr("src"));
        StringBuilder text = new StringBuilder();
        for (Element paragraph : body.getElementsByTag("p")) {
            text.append(paragraph.text().trim());
        }
        info.setContent(text.toString().trim());
        info.setStatus(1);
    }
    return info;
}

/** Returns the text of the span at {@code index}, or an empty string when there is no such span. */
private static String spanText(Elements spans, int index) {
    return index < spans.size() ? spans.get(index).text() : "";
}

/** Returns whether {@code text} contains a {@code yyyy-M-d} style date of any year (not just 2021). */
private static boolean looksLikeDate(String text) {
    return text.matches(".*\\d{4}-\\d{1,2}-\\d{1,2}.*");
}
}


爬取到的数据插入本地数据库后的展示效果



原文地址:https://www.cnblogs.com/yangsanluo/p/14845374.html