Java crawlers: crawling URLs, videos, and images

1. Crawling URLs

import java.io.*;
import java.net.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A simple crawler implemented in Java
 */
public class Robot {
    public static void main(String[] args) {
        URL url = null;
        URLConnection urlconn = null;
        BufferedReader br = null;
        PrintWriter pw = null;
//        String regex = "http://[\\w+\\.?/?]+\\.[A-Za-z]+";
        String regex = "https://[\\w+\\.?/?]+\\.[A-Za-z]+";// URL matching pattern (backslashes must be doubled in a Java string literal)
        Pattern p = Pattern.compile(regex);
        try {
            url = new URL("https://www.cnblogs.com/peachh/p/9740229.html");// the page to crawl (a cnblogs post in this example)
            urlconn = url.openConnection();
            pw = new PrintWriter(new FileWriter("C:/SiteURL.txt"), true);// write the extracted links to C:/SiteURL.txt
            br = new BufferedReader(new InputStreamReader(
                    urlconn.getInputStream()));
            String buf = null;
            while ((buf = br.readLine()) != null) {
                Matcher buf_m = p.matcher(buf);
                while (buf_m.find()) {
                    pw.println(buf_m.group());
                }
            }
            System.out.println("爬取成功^_^");
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (pw != null) {
                pw.close();
            }
        }
    }
}
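The regex above only picks up absolute http(s) links written out literally in the HTML, so relative links are missed. Since the image crawler later in this post already uses Jsoup, a minimal alternative sketch that extracts links with Jsoup's DOM selectors instead of a regex is shown below (same target page and output file as above; this sketch is mine, not part of the original program):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.io.FileWriter;
import java.io.PrintWriter;

public class JsoupLinkExtractor {
    public static void main(String[] args) throws Exception {
        // Fetch and parse the page; Jsoup handles both the HTTP request and the DOM
        Document doc = Jsoup.connect("https://www.cnblogs.com/peachh/p/9740229.html").get();
        try (PrintWriter pw = new PrintWriter(new FileWriter("C:/SiteURL.txt"), true)) {
            // "abs:href" resolves relative links against the page URL
            for (Element a : doc.select("a[href]")) {
                pw.println(a.attr("abs:href"));
            }
        }
        System.out.println("Done");
    }
}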

2. Crawling Videos

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
 
/**
 * Crawls short videos from budejie.com
 * @author cxd
 *
 */
public class WebSpiderDemo1 {
 
    public static void main(String[] args) throws Exception {
 
        String source = "http://www.budejie.com/video/";
        String destDir = "C:/rob/";
        new File(destDir).mkdirs();// make sure the download folder exists before writing files into it

        Map<String, String> urlMap = getUrlInSource(source);
 
        for (Map.Entry<String, String> entry : urlMap.entrySet()) {
            String title = entry.getKey();// video title
            String url = entry.getValue();// video URL
            File destFile = new File(destDir + title + ".mp4");
            download(url, destFile);
        }
    }
 
    /**
     * Downloads a video from its URL and saves it to a local file
     * 
     * @param url      the video URL
     * @param destFile the file to save the video to
     * @throws IOException
     */
    public static void download(String url, File destFile) throws IOException {
        URL videoUrl = new URL(url);
 
        InputStream is = videoUrl.openStream();
        FileOutputStream fos = new FileOutputStream(destFile);
 
        int len = 0;
        byte[] buffer = new byte[1024];
        while ((-1) != (len = is.read(buffer))) {
            fos.write(buffer, 0, len);
        }
        fos.flush();
 
        if (null != fos) {
            fos.close();
        }
 
        if (null != is) {
            is.close();
        }
    }
 
    /**
     * Collects each video's title and URL from the listing page into a HashMap
     * 
     * @param source
     * @return
     * @throws IOException
     */
    public static Map<String, String> getUrlInSource(String source) throws IOException {
 
        Map<String, String> hashMap = new HashMap<>();
 
        for (int index = 1; index <= 1; index++) { // the site has up to 50 pages; this demo only crawls the first one
            String pageUrl = source + index;
            URL url = new URL(pageUrl);
            InputStream is = url.openStream();
 
//            If the site blocks plain requests, open the connection like this and set a browser User-Agent to disguise the crawler:
//            HttpURLConnection conn = (HttpURLConnection) url.openConnection();
//            conn.setRequestMethod("GET");
//            conn.setRequestProperty("user-agent",
//                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36");
//            BufferedReader br = new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8"));
 
            BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
 
            String info = null;
            String title = null;
            // Don't use readLine() == null as the loop condition here: the page contains many null lines and that check would lead to a java.lang.NullPointerException, so cap the loop and null-check each line instead.
            for (int i = 0; i < 10000; i++) {
                info = br.readLine();
 
                if (null != info) {
                    String urlRegex = "data-mp4=\"(.*?\\.mp4)\"";
 
                    if (info.contains("data-title")) {
                        title = info;
                    }
 
                    Pattern pattern = Pattern.compile(urlRegex);
                    Matcher matcher = pattern.matcher(info);
                    if (matcher.find()) {
                        for (int j = 0; j <= matcher.groupCount(); j++) {
                            String tmp = matcher.group(j);
                            if (!tmp.startsWith("data-mp4=")) {
                                String videoTitle = getTitle(title.trim());
                                hashMap.put(videoTitle, tmp);
                            }
                        }
                    }
                }
            }
        }
        return hashMap;
    }
 
    /**
     * Extracts the title text from a raw data-title line (strips the 12-character data-title=" prefix and the trailing quote)
     * 
     * @param info
     * @return
     */
    private static String getTitle(String info) {
 
        int len = info.length();
        String title = info.substring(12, len - 1);
        return title;
    }

}
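The commented-out block in getUrlInSource shows how to disguise the crawler as a browser when the site rejects plain requests. Below is a minimal sketch that applies the same idea to the download step itself, using the same User-Agent string and try-with-resources so the streams are always closed (the class and method names are mine, not the original author's):

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;

public class BrowserDownload {

    /** Downloads a URL to a file while sending a browser User-Agent header. */
    public static void download(String url, File destFile) throws IOException {
        HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection();
        conn.setRequestMethod("GET");
        conn.setRequestProperty("User-Agent",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36");

        // try-with-resources closes both streams even if an exception is thrown
        try (InputStream is = conn.getInputStream();
             FileOutputStream fos = new FileOutputStream(destFile)) {
            byte[] buffer = new byte[8192];
            int len;
            while ((len = is.read(buffer)) != -1) {
                fos.write(buffer, 0, len);
            }
        } finally {
            conn.disconnect();
        }
    }
}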

3. Crawling Images

import com.obcy.util.DownLoad;
import com.obcy.util.GetHTML;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.junit.Test;

import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;

public class BiAn {

    //Collect all listing pages, from page 2 to page 946
    public ArrayList<String> getTopUrl(){
        //String topurl = "http://www.netbian.com/hd3840x2160/index_2.htm"

        //Collection holding every listing page URL
        ArrayList<String> list = new ArrayList<String>();
        for (int i = 2; i <= 946; i++) {
            list.add("http://www.netbian.com/hd3840x2160/index_"+i+".htm");
        }

        return list;
    }


    //Collect every picture detail-page URL on one listing page
    //The parameter is the listing page URL
    public HashMap<String,String> getGpjView(String topUrl){

        String url = topUrl;
        String html = GetHTML.getHTML(url);
        //Parse the page source into a Document object
        Document document = Jsoup.parse(html);
        //Walk the Document to find each picture's detail-page link
        Elements list = document.getElementsByClass("list");
        //"list" matches a single element that contains all the picture <a> tags on the page
        Elements a = null;
        try {
            a = list.get(0).select("ul>li>a");
        } catch (Exception e) {
            System.out.println("No <a> tags found");
        }
        //Walk the <a> tags, build the full detail-page URL from each href and store it in a map keyed by title
        HashMap<String,String> map = new HashMap<String, String>();
        for (int i = 0; i < a.size(); i++) {
            String href = "http://www.netbian.com"+a.get(i).attr("href");
            String name = a.get(i).attr("title");
            //System.out.println(href); // e.g. http://www.netbian.com/desk/22138.htm
            map.put(name,href);
        }

        //All detail-page URLs on this listing page have been collected
        return map;
    }


    //Visit every detail page, extract the actual image URL it contains and download it
    //The parameter is the map of detail-page URLs collected from one listing page
    public void getDownload(HashMap<String,String> map){

        //For each detail page in the map, extract one image download URL and download the image
        for (Map.Entry<String, String> entry : map.entrySet()) {

            String html = GetHTML.getHTML(entry.getValue());

            Document document = Jsoup.parse(html);

            //Grab the single <img> tag that holds the full-size image
            Elements endpage = null;
            try {
                endpage = document.getElementsByClass("endpage").get(0).select("div>p>a>img");
            } catch (Exception e) {
                System.out.println("Page element not found, skipping to the next one");
                continue;
            }

            //System.out.println(endpage.get(0).attr("src"));

            //The image download URL
            String target = endpage.get(0).attr("src");

            String path = "F:/BiAn/"+entry.getKey()+".jpg";
            //Start the download
            DownLoad.downLoad(target,path);
        }
    }

    @Test
    public void test(){
        //Make sure the download folder F:/BiAn exists
        File file = new File("F:/BiAn");
        if (!file.exists()){
            file.mkdirs();
            System.out.println("Created download folder F:/BiAn");
        }else {
            System.out.println("Folder already exists, ready to download");
        }

        //Single-threaded:
        //1. collect all listing pages, 2. collect the detail-page URLs on each, 3. download every image
        ArrayList<String> topUrl = getTopUrl();

        for (String url : topUrl) {
            HashMap<String, String> gpjView = getGpjView(url);
            getDownload(gpjView);
        }
    }
}
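The image crawler relies on two helper classes, com.obcy.util.GetHTML and com.obcy.util.DownLoad, whose source is not included in the post. Judging only from how they are called (GetHTML.getHTML(url) returns the page source as a String, and DownLoad.downLoad(url, path) saves a remote file to a local path), a rough sketch of what such helpers could look like follows; this is an assumption about their behavior, not the author's actual implementation, and the UTF-8 charset is a guess:

import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;

// Hypothetical stand-ins for com.obcy.util.GetHTML and com.obcy.util.DownLoad
public class CrawlerUtil {

    /** Fetches a page and returns its source as a String (charset assumed to be UTF-8). */
    public static String getHTML(String url) {
        try {
            HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection();
            conn.setRequestProperty("User-Agent", "Mozilla/5.0");
            try (InputStream is = conn.getInputStream()) {
                // readAllBytes() requires Java 9+
                return new String(is.readAllBytes(), StandardCharsets.UTF_8);
            }
        } catch (Exception e) {
            e.printStackTrace();
            return "";
        }
    }

    /** Downloads a URL and writes the raw bytes to the given local path. */
    public static void downLoad(String url, String path) {
        try {
            HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection();
            conn.setRequestProperty("User-Agent", "Mozilla/5.0");
            try (InputStream is = conn.getInputStream();
                 FileOutputStream fos = new FileOutputStream(path)) {
                byte[] buffer = new byte[8192];
                int len;
                while ((len = is.read(buffer)) != -1) {
                    fos.write(buffer, 0, len);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}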

4. How do you analyze a web page's structure in order to crawl it?

https://www.cnblogs.com/518894-lu/p/9021548.html
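The linked post explains how to inspect a page's HTML to decide which elements to extract. A quick way to check a candidate CSS selector is to fetch the page with Jsoup and print what the selector matches; the URL and selector below are placeholders based on the wallpaper example above, not taken from the linked article:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class SelectorProbe {
    public static void main(String[] args) throws Exception {
        // Placeholder page and selector: swap in the page you want to analyze
        Document doc = Jsoup.connect("http://www.netbian.com/hd3840x2160/index_2.htm").get();
        for (Element e : doc.select(".list ul > li > a")) {
            // Print the attributes you plan to extract to confirm the selector is right
            System.out.println(e.attr("title") + " -> " + e.attr("abs:href"));
        }
    }
}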

Original article: https://www.cnblogs.com/ciscolee/p/12655604.html