Jsoup爬取京东和融e购商品列表工具类

1. 新建 Maven 项目,在 pom.xml 中添加 Jsoup 和 Lombok 的依赖:

<?xml version="1.0" encoding="UTF-8"?>
<!-- Build descriptor for the Jsoup crawler demo. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>Jsoup-demo</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <!-- The demo code uses the diamond operator and method references, so it must be
             compiled as Java 8; without these the compiler plugin falls back to an older
             default language level and the build fails. -->
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <!-- Pin the source encoding so the build does not depend on the platform default. -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <!-- HTML fetching and parsing: https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.13.1</version>
        </dependency>
        <!-- Lombok is an annotation processor needed only at compile time, hence "provided". -->
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.12</version>
            <scope>provided</scope>
        </dependency>
    </dependencies>

</project>

2.新建实体类,代码如下:

 1 package cn.lxcourse.jsoup.pojo;
 2 
 3 import lombok.AllArgsConstructor;
 4 import lombok.Data;
 5 import lombok.NoArgsConstructor;
 6 
 7 @Data
 8 @NoArgsConstructor
 9 @AllArgsConstructor
10 public class Content {
11     private String price;
12     private String title;
13     private String imgSrc;
14 }

3.编写工具类,代码如下:

 1 package cn.lxcourse.jsoup.util;
 2 
 3 import cn.lxcourse.jsoup.pojo.Content;
 4 import org.jsoup.Jsoup;
 5 import org.jsoup.nodes.Document;
 6 import org.jsoup.nodes.Element;
 7 import org.jsoup.select.Elements;
 8 
 9 import java.net.URL;
10 import java.util.ArrayList;
11 import java.util.List;
12 
13 /**
14  * 爬虫工具
15  */
16 public class JsoupUtils {
17 
18     /**
19      * 爬取京东商品列表
20      * @param keywords
21      * @return
22      * @throws Exception
23      */
24     public static List<Content> getJDGoods(String keywords) throws Exception {
25         String url = "https://search.jd.com/Search?keyword=Java" + keywords;
26         Document document = Jsoup.parse(new URL(url), 300000);
27         //商品列表
28         Element j_goodsList = document.getElementById("J_goodsList");
29         Elements glEtemElements = j_goodsList.getElementsByClass("gl-item");
30 
31         List<Content> list = new ArrayList<>();
32         for (Element element : glEtemElements) {
33 
34             String imgSrc = element.getElementsByTag("img").eq(0).attr("source-data-lazy-img");
35             String price = element.getElementsByClass("p-price").eq(0).text();
36             String title = element.getElementsByClass("p-name").eq(0).text();
37 
38             Content content = new Content();
39             content.setImgSrc(imgSrc);
40             content.setPrice(price);
41             content.setTitle(title);
42 
43             list.add(content);
44         }
45 
46         return list;
47     }
48 
49     /**
50      * 爬取工行融e购商品列表
51      * @param keywords
52      * @return
53      * @throws Exception
54      */
55     public static List<Content> getRongYiGouGoods(String keywords) throws Exception {
56         //https://mall.icbc.com.cn/searchproducts/pv.jhtml?query=java
57         String url = "https://mall.icbc.com.cn/searchproducts/pv.jhtml?query=" + keywords;
58 
59         Document document = Jsoup.parse(new URL(url), 30000);
60         Element ajaxQueryContent = document.getElementById("ajaxQueryContent");
61 
62         Elements liElements = ajaxQueryContent.getElementsByTag("li");
63 
64         List<Content> list = new ArrayList<>();
65 
66         for (Element el : liElements) {
67             String src = el.getElementsByTag("img").eq(0).attr("src");
68             String price = el.getElementsByClass("p-price").eq(0).text();
69             String title = el.getElementsByClass("p-name").eq(0).select("a").eq(0).attr("title");
70             Content content = new Content();
71             content.setTitle(title);
72             content.setPrice(price);
73             content.setImgSrc(src);
74             list.add(content);
75         }
76 
77         return list;
78     }
79 
80     public static void main(String[] args) throws Exception {
81         //getJDGoods("Java").forEach(System.out::println);
82         getRongYiGouGoods("java").forEach(System.out::println);
83     }
84 }
原文地址:https://www.cnblogs.com/zhaoran8775/p/12773138.html