Java爬页面数据


 <!-- Web-scraping dependencies: start -->
        <!-- Apache HttpClient. NOTE(review): not referenced by the scraper code shown in this article; confirm it is needed. -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.9</version>
        </dependency>
        <!-- HtmlUnit. NOTE(review): not referenced by the scraper code shown in this article; confirm it is needed. -->
        <dependency>
            <groupId>net.sourceforge.htmlunit</groupId>
            <artifactId>htmlunit</artifactId>
            <version>2.27</version>
        </dependency>
        <!-- jsoup: provides the Jsoup / Document / Element / Elements types used by the scraper below. -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>
<!-- Web-scraping dependencies: end -->

一.创建你要爬取的字段实体
package com.tecnon.common.utils; import lombok.Data; @Data public class POItoExcel { /** * 书名 */ private String bookName; /** * 价格 */ private String price; /** * 作者 */ private String author; /** * 出版社 */ private String Press; /** * 出版时间 */ private String pressTime; } 二.单元测试实现代码 这是我要爬取的页面链接:https://www.bookuu.com/search.php?cid=101702 实现单元测试 public static void main(String[] args) { List<POItoExcel> poItoExcelList = new ArrayList<>(); for (int i = 1; i <= 2; i++) { String url = "https://www.bookuu.com/search.php?cid=101702&page=" + i; try { Document document = Jsoup.connect(url).header("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36") .header("referer", "https://www.bookuu.com/search.php?cid=101702&page=" + i).get(); Element body = document.body(); Elements a = body.getElementsByClass("wd-640"); for (Iterator it = a.iterator(); it.hasNext(); ) { POItoExcel poItoExcel = new POItoExcel(); Element e = (Element) it.next(); Elements bn = e.getElementsByClass("fs-16"); Elements p = e.getElementsByClass("fs-21"); Elements w = e.getElementsByClass("wd-30p fl to-hd mr-10"); Elements f = e.getElementsByClass("wd-30p fl to-hd cl-9 mr-10"); Elements t = e.getElementsByClass("wd-30p fl to-hd cl-9"); //爬到的数据放到list中 poItoExcel.setBookName(bn.text()); poItoExcel.setPrice(p.text()); poItoExcel.setAuthor(w.text()); poItoExcel.setPress(f.text()); poItoExcel.setPressTime(t.text()); poItoExcelList.add(poItoExcel); } } catch (Exception e) { e.printStackTrace(); } System.out.println("第" + i+ "页结束"); } System.out.println("----"+ StringUtil.getJsonFromObject(poItoExcelList) +"----"); } 有什么问题:加qq:501397578
原文地址:https://www.cnblogs.com/thcy1314/p/13565166.html