Jsoup解析页面

package com.tl.spider.parser.impl;

import com.tl.spider.download.WebPageDownLoadUtil;
import com.tl.spider.parser.interfaces.ParseFieldsInterface;
import com.tl.spider.pojos.ParserResultEntity;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.ArrayList;
import java.util.Date;
import java.util.List;

/**
 * Parser implementation that extracts news entries from an HTML page using Jsoup.
 *
 * <p>Despite the class name, extraction is done with Jsoup CSS selectors, not XPath.
 *
 * @author Administrator
 * @version 1.0
 **/
public class ParseFields4Xpath implements ParseFieldsInterface {

    /** CSS selector of the {@code <ul>} element that holds the news entries. */
    private static final String NEWS_LIST_SELECTOR = "ul.tj3_1";

    /** CSS selector of a single entry inside the news list. */
    private static final String NEWS_ITEM_SELECTOR = "li";

    /**
     * Parses the given HTML and builds one {@link ParserResultEntity} per {@code <li>} found
     * inside the first {@code ul.tj3_1} element.
     *
     * <p>For each entry the title is the text of its {@code <a>} elements, the post date is the
     * text of its {@code <font>} elements, and the insert date is the time of this call
     * (formatted with {@link Date#toString()}).
     *
     * @param htmlContent the HTML source of the page; may be {@code null} or empty
     * @return a mutable list of parsed entries; empty when the input is {@code null}/empty or
     *         the page does not contain the expected news list (never {@code null})
     */
    @Override
    public List<ParserResultEntity> parseHtml(String htmlContent) {
        List<ParserResultEntity> results = new ArrayList<>();
        if (htmlContent == null || htmlContent.isEmpty()) {
            return results;
        }

        Document doc = Jsoup.parse(htmlContent);

        // Elements.first() yields null when nothing matched; guard against it so a page with a
        // changed layout produces an empty result instead of a NullPointerException.
        Element newsList = doc.select(NEWS_LIST_SELECTOR).first();
        if (newsList == null) {
            return results;
        }

        // Every entry of one parse run shares the same insert timestamp.
        String insertDate = new Date().toString();

        for (Element item : newsList.select(NEWS_ITEM_SELECTOR)) {
            ParserResultEntity entity = new ParserResultEntity();
            entity.setTitle(item.select("a").text());
            entity.setPostDate(item.select("font").text());
            entity.setInsertDate(insertDate);
            results.add(entity);
        }

        return results;
    }

    /**
     * Manual smoke test: fetches a news index page, parses it and prints every parsed entry.
     *
     * @param args unused
     * @throws Exception if fetching the page fails
     */
    public static void main(String[] args) throws Exception {

        String url = "http://news.youth.cn/gn/";
        // The encoding can be found by looking at the meta charset in the page source.
        String charSet = "gb2312";
        String content = WebPageDownLoadUtil.getHtmlSourceBySocket(url, charSet);
        System.out.println(content.length());

        ParseFields4Xpath parseFields4Xpath = new ParseFields4Xpath();
        List<ParserResultEntity> results = parseFields4Xpath.parseHtml(content);
        for (ParserResultEntity message : results) {
            System.out.println(message.toString());
        }
    }

}

  

原文地址:https://www.cnblogs.com/wylwyl/p/10890050.html