进行网页内容爬取的方法

1.使用webzip进行整站或指定页面的下载

2.使用jsoup进行在线网页内容获取

  例子:

// Repository through which importTmCategoryOne saves each TmCategoryOne it builds.
@Autowired
    private TmCategoryOneRepository tmCategoryOneRepository;

    public HashMap<String, String> importTmCategoryOne(String url) {

        try {
            Document doc = Jsoup.connect(url).get();
            Elements results = doc.getElementsByClass("dw");
            Elements results1 = doc.getElementsByTag("h3");
            Elements results2 = doc.getElementsByTag("h1");
            Elements results3 = results.select("a");
            List<String> list = new ArrayList<String>();
            List<String> list1 = new ArrayList<String>();
            List<String> list2 = new ArrayList<String>();
            List<String> list3 = new ArrayList<String>();
            for (Element element : results3) {
                list3.add(element.attr("href"));
            }
            for (Element element : results2) {
                list1.add(element.text());
            }
            for (Element element : results1) {
                list.add(element.text());
            }
            for (Element element : results) {
                list2.add(element.text());
            }
            for (int i = 0; i < list2.size(); i++) {

                TmCategoryOne tmo = new TmCategoryOne();

                tmo.setId(OidMgr.requestOID("tm_category_one").toString());
                tmo.setName(list2.get(i));
                tmo.setUrl(list3.get(i));
                tmo.setParticulars(list1.get(i));
                tmo.setCode(i + "");
                tmo.setAnnotation(list.get(i));

                tmCategoryOneRepository.save(tmo);

            }

        } catch (IOException e) {
            e.printStackTrace();
        }
        return new HashMap<>();
    
原文地址:https://www.cnblogs.com/zixiaopiaomiao/p/5849637.html