Java 抓取一个网站页面中的全部图片

直接上代码:

package com.jeecg.util;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads the images referenced by the {@code <img>} tags of a single web page.
 *
 * <p>The page at {@link #URL} is fetched, every {@code <img src=...>} tag is extracted
 * with a regular expression, the absolute image URLs inside those tags are collected,
 * and each image is written to {@link #SAVE_DIR} as {@code <index><extension>}.
 */
public class CatchImage {
    // Page whose images are downloaded.
    private static final String URL = "http://news.163.com/";
    // Charset used to decode the downloaded page.
    private static final String ECODING = "UTF-8";
    // Matches one complete <img src=...> tag.
    private static final String IMGURL_REG = "<img src=(.*?)[^>]*?>";
    // Matches an absolute http/https image URL together with the double quote that closes
    // the attribute. "[^\"]+?" keeps the match inside a single quoted attribute value.
    private static final String IMGSRC_REG = "https?:[^\"]+?(\\.jpeg|\\.jpg|\\.png|\\.gif)\"";
    // Matches a supported image extension anywhere in a URL.
    private static final String IMGEXT_REG = "\\.(jpeg|jpg|png|gif)";
    // Directory the downloaded images are written to.
    private static final String SAVE_DIR = "D:/imgPage/";

    // Patterns are compiled once instead of once per tag / per URL.
    private static final Pattern IMGURL_PATTERN = Pattern.compile(IMGURL_REG);
    private static final Pattern IMGSRC_PATTERN = Pattern.compile(IMGSRC_REG);
    private static final Pattern IMGEXT_PATTERN = Pattern.compile(IMGEXT_REG);

    /**
     * Fetches {@link #URL}, extracts its image URLs and downloads them.
     *
     * @param args unused
     * @throws Exception if fetching the page or downloading an image fails
     */
    public static void main(String[] args) throws Exception {
        CatchImage cm = new CatchImage();

        // Fetch the HTML text of the page.
        String html = cm.getHTML(URL);

        // Extract the <img> tags.
        List<String> imgUrl = cm.getImageUrl(html);

        // Extract the image URLs from the tags.
        List<String> imgSrc = cm.getImageSrc(imgUrl);

        // Download the images.
        cm.Download(imgSrc);
    }

    /**
     * Fetches the content of a page with an HTTP GET request.
     *
     * <p>The body is decoded with {@link #ECODING} and read line by line; the line
     * terminators are dropped, so the result is one long line.
     *
     * @param oldLink address of the page to fetch
     * @return the page content, or an empty string if the response code is not 200
     * @throws Exception if the address is malformed or the request fails
     **/
    private String getHTML(String oldLink) throws Exception {
        StringBuilder sb = new StringBuilder();
        HttpURLConnection connection = (HttpURLConnection) new URL(oldLink).openConnection();
        try {
            connection.setRequestMethod("GET");
            connection.setConnectTimeout(2000);
            connection.setReadTimeout(2000);
            if (connection.getResponseCode() == 200) {
                InputStream inputStream = connection.getInputStream();
                BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, ECODING));
                try {
                    String line;
                    while ((line = reader.readLine()) != null) {
                        sb.append(line);
                    }
                } finally {
                    // Closing the reader also closes the underlying connection stream.
                    reader.close();
                }
            }
        } finally {
            connection.disconnect();
        }
        return sb.toString();
    }

    /**
     * Extracts every {@code <img src=...>} tag from the given HTML.
     *
     * @param HTML the HTML text to scan
     * @return the matched tags in document order; empty if there are none
     */
    private List<String> getImageUrl(String HTML) {
        Matcher matcher = IMGURL_PATTERN.matcher(HTML);
        List<String> listImgUrl = new ArrayList<String>();
        while (matcher.find()) {
            listImgUrl.add(matcher.group());
        }
        return listImgUrl;
    }

    /**
     * Extracts the absolute image URLs contained in the given {@code <img>} tags.
     *
     * <p>Only double-quoted {@code http}/{@code https} URLs ending in {@code .jpeg},
     * {@code .jpg}, {@code .png} or {@code .gif} are recognised.
     *
     * @param listImageUrl the tags to scan
     * @return the URLs found, in order; empty if there are none
     **/
    private List<String> getImageSrc(List<String> listImageUrl) {
        List<String> listImgSrc = new ArrayList<String>();
        for (String image : listImageUrl) {
            Matcher matcher = IMGSRC_PATTERN.matcher(image);
            while (matcher.find()) {
                String quoted = matcher.group();
                // The pattern includes the closing double quote; strip it.
                listImgSrc.add(quoted.substring(0, quoted.length() - 1));
            }
        }
        return listImgSrc;
    }

    /**
     * Downloads every URL in the list into {@link #SAVE_DIR}.
     *
     * <p>Files are named by their position in the list followed by the extension taken
     * from the URL itself, so names and extensions cannot get out of step. The target
     * directory is created if it does not exist.
     *
     * @param listImgSrc the image URLs to download
     * @throws Exception if the directory cannot be created or a download fails; the
     *         remaining URLs are then not processed
     **/
    private void Download(List<String> listImgSrc) throws Exception {
        java.io.File dir = new java.io.File(SAVE_DIR);
        if (!dir.isDirectory() && !dir.mkdirs()) {
            throw new FileNotFoundException("Cannot create directory " + SAVE_DIR);
        }

        int count = 0;
        for (String url : listImgSrc) {
            System.out.println(url);
            InputStream in = new URL(url).openStream();
            try {
                FileOutputStream fo = new FileOutputStream(SAVE_DIR + count + extensionOf(url));
                try {
                    byte[] buf = new byte[1024];
                    int length;
                    System.out.println("开始下载:" + url);
                    while ((length = in.read(buf, 0, buf.length)) != -1) {
                        fo.write(buf, 0, length);
                    }
                } finally {
                    fo.close();
                }
            } finally {
                in.close();
            }
            System.out.println("下载完成");
            count++;
        }
        System.out.println(count);
    }

    /**
     * Returns the last supported image extension occurring in the URL.
     *
     * @param url the URL to inspect
     * @return the extension including the leading dot, or an empty string if none occurs
     */
    private static String extensionOf(String url) {
        Matcher matcher = IMGEXT_PATTERN.matcher(url);
        String extension = "";
        while (matcher.find()) {
            extension = matcher.group();
        }
        return extension;
    }
}

 

原文地址:https://www.cnblogs.com/shuilangyizu/p/11157333.html