Breadth-First Crawler - Learning About Crawlers (4)

  When a crawler traverses the Internet, we can treat the Internet as a directed graph in which the links are the directed edges, so we can use graph traversal methods to walk this huge graph. Graph traversal comes in two flavors: breadth-first and depth-first. Depth-first traversal may descend too deep and waste resources, so here we use breadth-first traversal.
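  At its core, a breadth-first crawl is nothing more than a FIFO queue of URLs still to visit plus a set of URLs already visited. The sketch below shows that loop on its own, assuming hypothetical fetchPage and extractLinks helpers that stand in for the downloading and parsing steps; the full implementation of those pieces follows in the rest of this post.

        // Minimal sketch of the breadth-first crawl loop; fetchPage and extractLinks are
        // placeholders for the downloading and parsing steps implemented later in this post.
        LinkedList<String> unvisited = new LinkedList<String>(); // URLs still to crawl, in FIFO order
        Set<String> visited = new HashSet<String>();             // URLs already crawled

        unvisited.addLast("http://www.example.com");             // seed URL (example value)
        while (!unvisited.isEmpty() && visited.size() <= 1000) {
            String url = unvisited.removeFirst();                // take the oldest queued URL
            fetchPage(url);                                      // download and save the page (placeholder)
            visited.add(url);
            for (String link : extractLinks(url)) {              // extract the page's links (placeholder)
                if (!visited.contains(link) && !unvisited.contains(link)) {
                    unvisited.addLast(link);                     // enqueue newly discovered links
                }
            }
        }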

  Below is a simple breadth-first crawler implemented in Java, using two open-source toolkits: HttpClient and HtmlParser.

import java.util.LinkedList;

/**
 * @introduction A queue that holds the URLs waiting to be visited
 * @author Guo
 */
public class Queue {
    /**
     * @introduction The underlying queue is a LinkedList that stores the unvisited URLs
     */
    private LinkedList<String> queue = new LinkedList<String>();
    
    /**
     * @introduction Add a URL to the queue
     * @param url
     */
    public void in(String url) {
        queue.addLast(url);
    }
    
    /**
     * @introduction Remove and return the URL at the head of the queue
     * @return
     */
    public String out() {
        return queue.removeFirst();
    }
    
    /**
     * @introduction Check whether the queue contains the given URL
     * @param url
     * @return
     */
    public boolean contains(String url) {
        return queue.contains(url);
    }
    
    /**
     * @introduction Check whether the queue is empty
     * @return
     */
    public boolean isEmpty() {
        return queue.isEmpty();
    }
}

import java.util.HashSet;
import java.util.Set;

/**
 * @introduction A class that manages the visited and unvisited URL collections
 * @author Guo
 */
public class LinkQueue {
    /**
     * @introduction The set of visited URLs, backed by a HashSet
     */
    private static Set<String> visitedURL = new HashSet<String>();
    /**
     * @introduction The queue of unvisited URLs
     */
    private static Queue unVisitedURL = new Queue();
    
    /**
     * @introduction Add a URL to the visited set
     * @param url
     */
    public static void addVisitedURL(String url) {
        visitedURL.add(url);
    }
    
    /**
     * @introduction Remove a URL from the visited set
     * @param url
     */
    public static void removeVisitedURL(String url) {
        visitedURL.remove(url);
    }
    
    /**
     * @introduction Dequeue the next unvisited URL
     * @return
     */
    public static String unVisitedURLOutQueue() {
        return unVisitedURL.out();
    }
    
    /**
     * @introduction Add an unvisited URL, skipping empty URLs and URLs that have already been seen
     * @param url
     */
    public static void addUnVisitedURL(String url) {
        if(url != null && !url.trim().equals("") && 
                !visitedURL.contains(url) && !unVisitedURL.contains(url)) {
            unVisitedURL.in(url);
        }
    }
    
    /**
     * @introduction Get the number of visited URLs
     * @return
     */
    public static int getVisitedURLNumber() {
        return visitedURL.size();
    }

    public static Set<String> getVisitedURL() {
        return visitedURL;
    }

    public static void setVisitedURL(Set<String> visitedURL) {
        LinkQueue.visitedURL = visitedURL;
    }

    public static Queue getUnVisitedURL() {
        return unVisitedURL;
    }

    public static void setUnVisitedURL(Queue unVisitedURL) {
        LinkQueue.unVisitedURL = unVisitedURL;
    }
}

import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;

/**
 * @introduction Downloads a page and saves it to the local disk
 * @author Guo
 */
public class DownloadFile {
    /**
     * @introduction Save an input stream to a local file
     * @param input
     * @param filePath
     */
    public void saveToLocal(InputStream input, String filePath) {
        try {
            DataOutputStream out = new DataOutputStream(new FileOutputStream(new File(filePath)));
            byte[] data = new byte[1024];
            int l;
            while ((l = input.read(data)) != -1) {
                // Write only the bytes actually read, not the whole buffer
                out.write(data, 0, l);
            }
            out.flush();
            out.close();
            input.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    
    /**
     * @introduction Download the page at the given URL and return the local file path
     * @param url
     * @return
     */
    public String downloadFile(String url) {
        String filePath = null;
        try {
            HttpClient httpClient = new DefaultHttpClient();
            HttpGet httpGet = new HttpGet(url);
            HttpResponse response = httpClient.execute(httpGet);
            HttpEntity entity = response.getEntity();
            if(entity != null) {
                InputStream input = entity.getContent();
                // Build a file name from the URL, replacing characters that are not allowed in file names
                filePath = "G:/temp/" + url.replaceAll("[\\\\/:*?\"<>|]", "_") + ".html";
                saveToLocal(input, filePath);
            }
            return filePath;
        }catch(Exception e) {
            e.printStackTrace();
            return filePath;
        }
    }
}

/**
 * @introduction A filter used to decide which links should be crawled
 * @author Guo
 */
public interface LinkFilter {
    public boolean accept(String url);
}

import java.util.HashSet;
import java.util.Set;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;

/**
 * @introduction Parses HTML pages and extracts links from them
 * @author Guo
 */
public class HtmlParserTool {
    /**
     * @introduction Extract the links found in the page at the given URL, keeping those accepted by the filter
     * @param url
     * @param filter
     * @return
     */
    public static Set<String> getLinks(String url, LinkFilter filter) {
        Set<String> links = new HashSet<String>();
        try {
            Parser parser = new Parser(url);
            parser.setEncoding("utf-8");
            // Filter that picks out <frame> tags by inspecting their src attribute
            NodeFilter frameFilter = new NodeFilter() {
                public boolean accept(Node node) {
                    return node.getText().startsWith("frame src=");
                }
            };
            // Accept both <a href=...> link tags and <frame src=...> tags
            OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
            NodeList list = parser.extractAllNodesThatMatch(linkFilter);
            for(int i = 0; i < list.size(); i++) {
                Node tag = list.elementAt(i);
                if(tag instanceof LinkTag) {
                    LinkTag link = (LinkTag)tag;
                    String linkUrl = link.getLink();
                    if(filter.accept(linkUrl))
                        links.add(linkUrl);
                }else {
                    // Pull the URL out of the src="..." attribute of the <frame> tag
                    String frame = tag.getText();
                    int start = frame.indexOf("src=");
                    frame = frame.substring(start);
                    int end = frame.indexOf(" ");
                    if(end == -1)
                        end = frame.indexOf(">");
                    // Skip the leading src=" (5 characters) and drop the trailing quote
                    String frameUrl = frame.substring(5, end - 1);
                    if(filter.accept(frameUrl))
                        links.add(frameUrl);
                }
            }
        }catch(Exception e) {
            e.printStackTrace();
        }
        return links;
    }
}

import java.util.Set;

public class MyCrawler {
    /**
     * @introduction Initialize the LinkQueue with the seed URLs
     * @param seeds
     */
    private void initCrawlerWithSeeds(String[] seeds) {
        for(String seed : seeds) 
            LinkQueue.addUnVisitedURL(seed);
    }
    
    /**
     * @introduction The crawling loop itself
     * @param seeds
     */
    public void crawling(String[] seeds) {
        // Only follow links that stay inside www.baidu.com
        LinkFilter filter = new LinkFilter() {
            public boolean accept(String url) {
                return url.startsWith("http://www.baidu.com");
            }
        };
        
        initCrawlerWithSeeds(seeds);
        
        while(!LinkQueue.getUnVisitedURL().isEmpty() && LinkQueue.getVisitedURLNumber() <= 1000) {
            // Take the next URL from the head of the unvisited queue (FIFO order gives breadth-first behavior)
            String visitUrl = LinkQueue.unVisitedURLOutQueue();
            if(visitUrl == null)
                continue;
            // Download the page and mark the URL as visited
            DownloadFile downloader = new DownloadFile();
            downloader.downloadFile(visitUrl);
            LinkQueue.addVisitedURL(visitUrl);
            // Extract the page's links and enqueue those that pass the filter and have not been seen yet
            Set<String> links = HtmlParserTool.getLinks(visitUrl, filter);
            for(String link : links) 
                LinkQueue.addUnVisitedURL(link);
        }
    }
    
    public static void main(String[] args) {
        MyCrawler crawler = new MyCrawler();
        crawler.crawling(new String[]{"http://www.baidu.com"});
    }
}


Original article: https://www.cnblogs.com/rayguo/p/3489625.html