A Small Example of a Focused Crawler

Demo downloads: Java and Python. The Python version first:

# -*- coding: utf-8 -*-
import urllib2
from lxml import etree
import Queue
import time
import os


def getHtml(url):
    """Fetch url with a desktop-browser User-Agent and decode the page from GBK."""
    request = urllib2.Request(url)
    request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; rv:19.0) Gecko/20100101 Firefox/19.0')
    doc = urllib2.urlopen(request, timeout=45).read().decode('gbk')
    return doc
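
# getHtml raises on network errors and timeouts; a minimal retry wrapper
# (my addition, not in the original post; the name getHtmlWithRetry and the
# retry/sleep values are assumptions):
def getHtmlWithRetry(url, retries=3):
    for i in range(retries):
        try:
            return getHtml(url)
        except Exception, e:
            print 'retry %d/%d for %s: %s' % (i + 1, retries, url, e)
            time.sleep(2)
    return ''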

# candidate seed pages; only the uncommented assignment is crawled
# seed = 'http://it.dataguru.cn/'
# seed = 'http://bi.dataguru.cn/'
seed = 'http://science.dataguru.cn/'
que_urls = Queue.Queue()
que_urls.put(seed)


def getCurTimeStamp(root='/data/data/dataguru/science/'):
    """
    Build the output file path for one article: the root directory plus the
    current timestamp (milliseconds since midnight, 1970-01-01) and '.txt'.
    :return: e.g. '/data/data/dataguru/science/<millis>.txt'
    """
    return root + str(int(time.time() * 1000)) + '.txt'
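
# Note: the hard-coded output directory must already exist, or the open()
# calls in start() will fail. A small guard, my addition (ensureDir is not
# part of the original script):
def ensureDir(root='/data/data/dataguru/science/'):
    if not os.path.isdir(root):
        os.makedirs(root)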


def start():
    """Pop list pages off the queue, save each linked article, enqueue next pages."""
    while que_urls.qsize() > 0:
        url = que_urls.get()
        html = getHtml(url)
        dom = etree.HTML(html)
        # links = dom.xpath(u"//div[@id='ct']//a[@class='xi2']/@href")
        links = dom.xpath(u"//div[@id='ct']//a[@class='xi2']")
        print len(links)
        for lk in links:
            print lk.text, lk.xpath('./@href')
            try:
                link = lk.xpath('./@href')[0]
                html_c = getHtml(link)
                dom_c = etree.HTML(html_c)
                article = dom_c.xpath('//td[@id="article_content"]//text()')
                content = os.linesep.join(article)
                content = content.replace('\n', '')  # strip newlines embedded in the text nodes
                with open(getCurTimeStamp(), 'wb') as mf:
                    mf.write(link + os.linesep)
                    mf.write(lk.text.encode('utf-8') + os.linesep)
                    mf.write(content.encode('utf-8'))
            except Exception, e:
                print e
                continue

        links_next = dom.xpath('//div[@id="ct"]//a[@class="nxt"]')
        for lk in links_next:
            print lk.text, lk.xpath('./@href')
            que_urls.put(lk.xpath('./@href')[0])
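
# start() walks list pages breadth-first through que_urls and follows each
# "next page" link. It keeps no record of visited URLs, so a cycle of links
# would loop forever; a minimal de-duplication sketch (visited and putIfNew
# are my additions, not in the original):
visited = set()

def putIfNew(url):
    if url not in visited:
        visited.add(url)
        que_urls.put(url)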

import jieba
if __name__ == '__main__':
    # start()
    # sen = '我来到北京清华大学'  (alternate sample sentence)
    sen = '他来到了网易杭研大厦'
    seg_list = jieba.cut(sen, cut_all=False)
    print type(seg_list)  # jieba.cut returns a generator, consumable only once
    print "Default Mode:", "/ ".join(seg_list)  # precise mode

The Java demo (a crawler for Baidu Zhidao Q&A pages) follows:

package com.data.crawl.qa.baiduzhidao;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.LinkedList;
import java.util.Queue;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;

public class Crawl {

    private static Log log = LogFactory.getLog(Crawl.class);

    private HtmlCleaner cleaner = new HtmlCleaner();

    // HttpClientPool is a helper class not shown in this post; as used below,
    // it exposes downHtml(String url), which returns the page HTML as a String.
    private HttpClientPool httpPool = new HttpClientPool();

    private Queue<String> queue = new LinkedList<String>();

    private Pattern Pat_index = Pattern.compile("http://zhidao\\.baidu\\.com/browse/\\d+(\\?pn=\\d+#list)?");
    // http://zhidao.baidu.com/browse/82?pn=25#list
    // http://zhidao.baidu.com/browse/82?pn=50#list
    // http://zhidao.baidu.com/browse/82

    private Pattern Pat_content = Pattern.compile("http://zhidao\\.baidu\\.com/question/\\d+\\.html\\?entry=qb_browse_default");

    // http://zhidao.baidu.com/question/1732680699842305627.html?entry=qb_browse_default
    // http://zhidao.baidu.com/question/368440625636623924.html?entry=qb_browse_default
    // http://zhidao.baidu.com/question/1946360168489647948.html?entry=qb_browse_default

    public void start(String seed) {
        queue.add(seed);
        while (queue.size() > 0) {
            String uri = queue.poll();
            String html = httpPool.downHtml(uri);
            if (Pat_index.matcher(uri).find()) {
                getOutlinks(html, uri);
            } else if (Pat_content.matcher(uri).find()) {
                getFields(html, uri);
            } else {
                log.info("regex err: " + uri);
            }
        }
    }

    private void getFields(String html, String uri) {
        TagNode doc = cleaner.clean(html);
        try {
            // the doubled space in "ask-title  " appears to be copied verbatim
            // from the page's class attribute, so it is kept as-is
            Object[] tags_title = doc.evaluateXPath("//span[@class='ask-title  ']");
            String title = ((TagNode) tags_title[0]).getText().toString();
            log.info(title);
        } catch (XPatherException e) {
            e.printStackTrace();
        }
        
    }

    public static void main(String[] args) {

        Crawl crawl = new Crawl();
        String seed = "http://zhidao.baidu.com/browse/82";
        crawl.start(seed);
        log.info("complete");
    }

    public void getOutlinks(String html, String base) {
        TagNode doc = cleaner.clean(html);

        try {
            URL baseUrl = new URL(base);
            Object[] tags_content = doc.evaluateXPath("//a[@class='question-title']");
            for (Object object : tags_content) {
                String relativeurl = ((TagNode) object).getAttributeByName("href");
                URL url = new URL(baseUrl, relativeurl);
                queue.add(url.toString());
            }
            
            Object[] tags_next = doc.evaluateXPath("//a[@class='pager-next']");
            String relative_url_next = ((TagNode) tags_next[0]).getAttributeByName("href");
            URL url = new URL(baseUrl, relative_url_next);
            queue.add(url.toString());

        } catch (XPatherException e) {
            log.warn(e.getMessage());
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }
    }

}
Original post: https://www.cnblogs.com/i80386/p/3282064.html