Writing my own web crawler to grab all the links of a site

    I previously worked on full-text search plus a web crawler that targeted the whole Internet. We used the open-source crawler Heritrix; I studied its source code, but only tweaked small parts of it to meet our business needs. The project was later shelved, and that work with it, so afterwards I wrote my own crawler on similar principles. Mine is, of course, very simple, and the code is not written to any standard.

    Now I had a task: load every administrative region of Hunan province into a database. So I dug out the crawler I had written before, modified it a little, and it just about runs; measured against the requirement, the task is done.

    The principle is to use jsoup to download the pages to local disk (a minimal sketch of the idea follows the list below):

  • Recursively parse out all the links and assemble the proper http paths. You could also skip the local download, but then every test run would re-fetch the pages from the network, which performs badly, so I chose to download them to local disk.
  • Then use the htmlparser toolkit to parse out the wanted data and import it into the database.
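
As a quick illustration of the principle (a minimal sketch, not the project code; the URL and selector are the same ones used below), jsoup can fetch a page and resolve relative links by itself via absUrl("href"):

import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class JsoupLinkDemo {
    public static void main(String[] args) throws IOException {
        // fetch the entry page of the region list
        Document doc = Jsoup.connect("http://www.stats.gov.cn/tjbz/cxfldm/2012/43.html")
                .timeout(30000).get();
        // absUrl("href") resolves each link against the page URL,
        // so relative paths like "43/4301.html" come back absolute
        for (Element a : doc.select(".citytable a[href]")) {
            System.out.println(a.absUrl("href"));
        }
    }
}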

   

import java.io.IOException;
import java.util.*;
import java.util.concurrent.*;
import org.htmlparser.*;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

/**
 * Starts the crawler through a thread pool, which improves throughput.
 */
public class ChoeseClass {
    public static String url = "http://www.stats.gov.cn/tjbz/cxfldm/2012/43.html";// entry page of the site
    public static String chose = ".citytable";// the selector: an element on the page with this CSS class
    static BlockingQueue<String> queue = new LinkedBlockingQueue<String>(10);

    public static void main(String[] args) {
        ExecutorService service = Executors.newCachedThreadPool();
        service.execute(new CatchURL(queue));// link extraction runs single-threaded
        for (int i = 0; i < 10; i++) {// page downloads run multi-threaded
            service.execute(new DownLoadHtml(queue));// one worker instance per thread
        }
        service.shutdown();// workers exit on their own once the queue stays empty
    }
}

   

   

class DownLoadHtml implements Runnable {
    private BlockingQueue<String> queue;

    public DownLoadHtml(BlockingQueue<String> queue) {
        this.queue = queue;
    }

    @Override
    public void run() {
        try {
            while (true) {
                System.out.println(Thread.currentThread().getName());
                // wait up to 10 seconds for the next link; once the queue
                // stays empty that long, the crawl is done and the worker exits
                String link = this.queue.poll(10, TimeUnit.SECONDS);
                if (link == null)
                    break;
                new MirroCraw(link, "G://test//local");// mirror the page to local disk
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
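
The MirroCraw class comes from the project zip and isn't listed in this post. For readers without the download, here is a minimal sketch of what such a page-mirroring class might look like, assuming its constructor simply fetches the url and writes the raw html under the target directory (the real class may well differ):

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import org.jsoup.Jsoup;

// Hypothetical reconstruction: fetch a page and mirror it to local disk.
class MirroCraw {
    public MirroCraw(String url, String localDir) {
        try {
            // keep the site's own file name, e.g. ".../43/4301.html" -> "4301.html"
            File out = new File(localDir, url.substring(url.lastIndexOf('/') + 1));
            out.getParentFile().mkdirs();
            String html = Jsoup.connect(url).timeout(30000).get().outerHtml();
            FileOutputStream fos = new FileOutputStream(out);
            fos.write(html.getBytes("GB18030"));// the site pages are GB-encoded
            fos.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}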

   

/**
 * Extracts link addresses and adds them to a thread-safe queue.
 * (I also wrote my own thread-safe queue backed by a doubly linked list,
 * but its de-duplication made it very slow, so I switched to the queue
 * that ships with the JDK.)
 */

class CatchURL implements Runnable {
    private BlockingQueue<String> queue;

    public CatchURL(BlockingQueue<String> queue) {
        this.queue = queue;
    }

    @Override
    public void run() {
        try {
            choClass(ChoeseClass.url, ChoeseClass.chose);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public void choClass(String url, String chose) throws IOException, InterruptedException {
        Document doc = Jsoup.connect(url).timeout(30000).get();
        Elements el = doc.select(chose);
        List<String> links = getPageLinks(el.toString());
        // drop duplicate links, keeping their original order
        List<String> tempLinks = new ArrayList<>();
        for (String string : links) {
            if (!tempLinks.contains(string)) {
                tempLinks.add(string);
            }
        }
        // assemble absolute urls
        for (String string : tempLinks) {
            if (!(string.equals("#"))) {
                if (!string.startsWith("http://")) {
                    if (string.startsWith("/")) {
                        // root-relative path: prefix scheme and host
                        string = assemblyUrl(url) + string;
                    } else if (string.startsWith("../")) {
                        // "../" means one directory level above the current page
                        string = string.replace("../", "");
                        string = url.substring(0,
                                url.substring(0, url.lastIndexOf("/"))
                                        .lastIndexOf("/") + 1)
                                + string;
                    } else {
                        // plain relative path: resolve against the current directory
                        string = url.substring(0, url.lastIndexOf("/") + 1) + string;
                    }
                }
                if (!string.contains(url)) {
                    this.queue.put(string);// put() blocks when the queue is full; add() would throw
                    System.out.println("added: " + Thread.currentThread().getName() + " " + string);
                    // the '/' count of the url tells which level the page belongs to
                    int i = ContainChar.ContainCharByStr(string);
                    if (i == 7) {// city page: recurse into its county table
                        choClass(string, ".countytable");
                    }
                    if (i == 8) {// county page: recurse into its town table
                        choClass(string, ".towntable");
                    }
                    if (i == 9) {
                        choClass(string, ".towntable");
                    }
                }
            }
        }
    }

   

    protected static List<String> getPageLinks(String html) {
        Parser parser = null;
        NodeList nodeList = null;
        NodeFilter filter = null;
        List<String> urlList = new ArrayList<String>();
        try {
            // create a Parser for the given encoding
            parser = Parser.createParser(html, "GB18030");
            // filter that accepts only <a> tags
            filter = new TagNameFilter("A");
            // collect the matching nodes
            nodeList = parser.extractAllNodesThatMatch(filter);
            int size = nodeList.size();
            for (int i = 0; i < size; i++) {
                LinkTag tag = (LinkTag) nodeList.elementAt(i);
                urlList.add(tag.getLink());
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return urlList;
    }

    /**
     * Assembles the base url: returns the scheme-and-host part, e.g.
     * "http://www.stats.gov.cn/tjbz/..." becomes "http://www.stats.gov.cn".
     *
     * @param url
     */
    public static String assemblyUrl(String url) {
        if (url.startsWith("http://")) {
            String newUrl = url.substring(0, url.indexOf("/", 8));
            return newUrl;
        }
        return url;
    }
}
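
ContainChar is another helper that ships in the project zip rather than in this post. Judging from how its return value is used (7, 8 and 9 select ever deeper pages), it presumably counts the '/' characters in the url; a sketch under that assumption:

// Hypothetical reconstruction: count the '/' occurrences in a string to
// infer page depth, e.g. "http://www.stats.gov.cn/tjbz/cxfldm/2012/43/4301.html"
// contains 7 slashes and is therefore treated as a city page.
class ContainChar {
    public static int ContainCharByStr(String str) {
        int count = 0;
        for (int i = 0; i < str.length(); i++) {
            if (str.charAt(i) == '/') {
                count++;
            }
        }
        return count;
    }
}

Incidentally, the manual path assembly in choClass could also be done with java.net.URI: URI.create(url).resolve(link).toString() handles absolute links, leading "/" and "../" prefixes uniformly.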

   

   

   

The part below is the batch-parsing code: it reads the mirrored pages from disk and turns them into database rows.

import java.io.*;
import java.util.*;
import org.htmlparser.*;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

public class MainAction {

    public static void main(String[] args) throws IOException {
        // the directory the crawler mirrored the site into
        // (note the escaped backslashes in the Windows path)
        File dir = new File("G:\\test\\local\\www.stats.gov.cn");
        File[] files = readFiles(dir);
        for (File file : files) {
            startAction(file);
        }
    }

      

    public static void startAction(File file) throws IOException {
        FileReader fr = null;
        StringBuffer sb = new StringBuffer();
        try {
            // read the whole mirrored html file into memory
            fr = new FileReader(file);
            char[] buf = new char[1024];
            int len = 0;
            while ((len = fr.read(buf)) != -1) {
                sb.append(new String(buf, 0, len));
            }
        } catch (FileNotFoundException e1) {
            e1.printStackTrace();
        } finally {
            if (fr != null)// guard against a NullPointerException when the file failed to open
                fr.close();
        }
        // the length of the file name tells the administrative level:
        // "4301.html" (9) lists counties, "430102.html" (11) lists towns,
        // "430102001.html" (14) lists villages
        int fileName = file.getName().length();
        String fileNametemp[] = file.getName().split("\\.");
        String anaylisisHtml = null;
        String type = null;
        if (fileName == 9) {
            anaylisisHtml = anayli(sb.toString(), ".countytable");
            type = ".countytable";
        } else if (fileName == 11) {
            anaylisisHtml = anayli(sb.toString(), ".towntable");
            type = ".towntable";
        } else if (fileName == 14) {
            anaylisisHtml = anayli(sb.toString(), ".villagetr");
            type = ".villagetr";
        }
        if (type == null)// not a page we know how to parse
            return;

        List<String> list = getPageLinks(anaylisisHtml, type);
        List<ParamModel> models = new ArrayList<>();
        if (type.equals(".villagetr")) {
            // village rows hold three cells: code, urban-rural code, name;
            // we keep the first and the third
            for (int i = 0; i < list.size(); i = i + 3) {
                System.out.println(fileNametemp[0] + " " + list.get(i) + " " + list.get(i + 2));
                ParamModel model = new ParamModel();
                model.setFileName(fileNametemp[0]);
                model.setObj1(list.get(i));
                model.setObj2(list.get(i + 2));
                models.add(model);
            }
        } else {
            // the other levels come in pairs of links: code, then name
            for (int i = 0; i < list.size(); i = i + 2) {
                System.out.println(fileNametemp[0] + " " + list.get(i) + " " + list.get(i + 1));
                ParamModel model = new ParamModel();
                model.setFileName(fileNametemp[0]);
                model.setObj1(list.get(i));
                model.setObj2(list.get(i + 1));
                models.add(model);
            }
        }
        try {
            //DateUtil.save(models);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

      

    /**
     * Returns all files in the given directory.
     *
     * @param dir
     */
    public static File[] readFiles(File dir) {
        File[] files = dir.listFiles();
        return files;
    }

    /**
     * Filters the html by the given CSS class and returns everything inside
     * the matching elements.
     *
     * @param html
     * @param choClass
     * @return
     */
    public static String anayli(String html, String choClass) {
        Document doc = Jsoup.parse(html);
        Elements el = doc.select(choClass);
        return el.toString();
    }

        

      

    protected static List<String> getPageLinks(String html, String type) {
        Parser parser = null;
        NodeList nodeList = null;
        NodeFilter filter = null;
        List<String> urlList = new ArrayList<String>();
        try {
            // create a Parser for the given encoding
            parser = Parser.createParser(html, "GB18030");
            // village rows carry plain <td> cells; the other levels carry <a> links
            if (type.equals(".villagetr")) {
                filter = new TagNameFilter("td");
            } else {
                filter = new TagNameFilter("A");
            }
            // collect the matching nodes
            nodeList = parser.extractAllNodesThatMatch(filter);
            int size = nodeList.size();
            for (int i = 0; i < size; i++) {
                if (type.equals(".villagetr")) {
                    Tag tag = (Tag) nodeList.elementAt(i);
                    urlList.add(tag.getFirstChild().toHtml());// the text inside the cell
                } else {
                    LinkTag tag = (LinkTag) nodeList.elementAt(i);
                    urlList.add(tag.getChildrenHTML());// the text inside the link
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return urlList;
    }
}
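
ParamModel (and the commented-out DateUtil.save) are also only in the project zip. Judging from the setters used in startAction, ParamModel is presumably a plain holder along these lines (field names inferred, so treat this as a sketch):

public class ParamModel {
    // fileName carries the code of the parent region (taken from the file name);
    // obj1 and obj2 carry the child region's code and name
    private String fileName;
    private String obj1;
    private String obj2;

    public String getFileName() { return fileName; }
    public void setFileName(String fileName) { this.fileName = fileName; }
    public String getObj1() { return obj1; }
    public void setObj1(String obj1) { this.obj1 = obj1; }
    public String getObj2() { return obj2; }
    public void setObj2(String obj2) { this.obj2 = obj2; }
}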

 

 

Project download: http://files.cnblogs.com/wxwall/Mycrawler.zip

Original post: https://www.cnblogs.com/wxwall/p/3326038.html