A small Java crawler

Crawling jandan.net

1. Find the pattern in the page URLs

2. Design a regular expression for the image URLs on each page (a small demo of this step is shown below)

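To illustrate step 2, here is a minimal sketch that runs the same regular expression used by the crawler against a made-up line of page source. The sample HTML line and image URL are placeholders, not taken from the site; the crawler's own logic implies that jandan.net embeds images with protocol-relative src attributes (starting with //), which is why "http:" is prepended after the match.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class RegexDemo {
    public static void main(String[] args) {
        // A made-up sample line of page source; real markup may differ slightly.
        String line = "<img src=\"//ww1.sinaimg.cn/mw600/example123.jpg\" />";
        // Same regex as the crawler: a double quote, non-whitespace chars, then .jpg or .gif
        Pattern p = Pattern.compile("\"[\\S]*\\.(jpg|gif)");
        Matcher m = p.matcher(line);
        while (m.find()) {
            // Strip the leading quote and prepend the protocol, exactly as the crawler does
            System.out.println("http:" + m.group().substring(1));
        }
    }
}

Running this prints http://ww1.sinaimg.cn/mw600/example123.jpg, which is the form of URL the download step expects.
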
Code:

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class SpiderTest {
    
    private static ArrayList<String> urlStrs = new ArrayList<String>();
    private static String regx = ""[\S]*\.(jpg|gif)";   //读取jpg和gif图片的正则
    private static int num = 0;        //图片名递增量
    
    public static void main(String[] args)throws Exception{
        //String urlStr = "http://jandan.net/ooxx/page-2381#comments";        //要抓取的煎蛋妹子网页示例
        String urlStr="";
        String dstDir = "d:/dstDir";
        int start = 2340;    //起始页
        int end = 2370;        //结束页
    
        for(int i=start;i<=end;i++){
            urlStr = "http://jandan.net/ooxx/page-"+i+"#comments";
            matchAll(urlStr);
            if(urlStrs.size() > 0){
                for(String imgStr:urlStrs){
                    downFile(imgStr,dstDir);    
                    Thread.sleep(300);    // pause briefly between downloads
                }
            }
            urlStrs.clear();
        }
        System.out.println("网址抓取完毕");
    }
    /*
     * @param urlStr the page URL to crawl for image links
     */
    private static void matchAll(String urlStr)throws Exception{
        Pattern p = Pattern.compile(regx);
        Matcher m;
        URL url;
        try {
            url = new URL(urlStr);
        } catch (MalformedURLException e) {
            throw new Exception("网址不存在");
        }
        
        BufferedReader read = new BufferedReader(new InputStreamReader(url.openStream()));
        String line = "";
        while((line = read.readLine()) != null){
            m = p.matcher(line);
            while(m.find()){
                System.out.println(m.group());
                urlStrs.add("http:"+m.group().substring(1));        //将图片网址添加到ArrayList(过滤第一个双引号)
            }
        }
        read.close();
    }
    /* Download the image at the given URL
     * @param urlStr the image URL
     * @param dstDir the directory to save the image in
     */
    private static void downFile(String urlStr,String dstDir)throws Exception{
        byte[] bBuf = new byte[1024];
        File dir = new File(dstDir);
        String fileName = "";
        if(!dir.exists()){
            dir.mkdir();
        }
        if(urlStr.endsWith("jpg")){
            fileName = (num++) + ".jpg";
        }else if(urlStr.endsWith("gif")){
            fileName = (num++) + ".gif";
        }
        File imgFile = new File(dstDir,fileName);
        //if(imgFile.exists()){
        //    TODO..
        //}
        URL url = new URL(urlStr);
        BufferedInputStream in = new BufferedInputStream(url.openStream());
        BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(imgFile));
        
        System.out.println("开始下载。。");
        int len = 0;
        while((len = in.read(bBuf)) != -1){
            out.write(bBuf,0,len);
        }
        System.out.println("下载完毕");
        in.close();
        out.close();
    }
    /*
     * Fetch a page's source code and save it to a file (this method is unused)
     */
    private void getSourceCode(String u)throws Exception{
        //String u = "http://m.onepiece.cc/post/10001/";
        File f = new File("d:/tmp.txt");
        if(!f.exists()){
            f.createNewFile();
        }
        URL url = new URL(u);
        BufferedReader read = new BufferedReader(new InputStreamReader(url.openStream()));
        BufferedWriter write = new BufferedWriter(new FileWriter(f));
        String s = "";
        while((s=read.readLine()) != null){
            write.write(s);
            write.write('\n');
        }
        System.out.println("拷贝完成");
        read.close();
        write.close();
    }
}
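
One possible hardening step, not part of the original post: some sites reject or throttle requests that carry Java's default User-Agent, and url.openStream() has no timeout. The sketch below uses a hypothetical helper, openWithUserAgent, that opens the connection through HttpURLConnection so a browser-like User-Agent and timeouts can be set; the crawler's calls to url.openStream() could be swapped for it.

import java.io.BufferedInputStream;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;

public class UrlOpener {
    // Open a URL with a browser-like User-Agent instead of Java's default one
    static InputStream openWithUserAgent(String urlStr) throws Exception {
        HttpURLConnection conn = (HttpURLConnection) new URL(urlStr).openConnection();
        conn.setRequestProperty("User-Agent",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64)");   // placeholder header value
        conn.setConnectTimeout(10_000);   // fail fast instead of hanging forever
        conn.setReadTimeout(10_000);
        return new BufferedInputStream(conn.getInputStream());
    }
}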

Original article: https://www.cnblogs.com/boluoboluo/p/6511061.html