Building a Dedicated Search Engine for My Own Blog: Fetching the Data

博客园 (cnblogs) has its own lucene.net search engine, and there is also Google site search. But even Google does not index my content completely; it is picky about what it takes, and even for posts that are indexed, recall is never very high because computing-related search terms are so niche in its dictionary. So I have long wanted to build a full-text index of my own blog.

I originally wanted to build something workable and scalable based on rake + hbase + whenever + massive_record. Halfway through, it was clear the whole project would take too long, so I set it aside, picked up my old code, and patched it up just enough to get something usable first.

What I am using are parts of the scripts from my earlier 15-1688 small-lot wholesale search engine, whose crawl script templates used to be customized through a web UI; here I simply reuse them as they are.

The whole data-fetching process breaks down into 4 steps, one script per step:

A. Generate the list-page links

B. Fetch the list pages

C. Extract the detail-page links

D. Fetch the detail pages
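
Assuming the four scripts are saved as a.py, b.py, c.py, and d.py (placeholder names I picked for illustration, not the project's actual filenames), a full crawl can be chained with a small driver along these lines; this is only a sketch of how the steps connect:

#!/usr/bin/env python
#encoding=utf-8
# Driver sketch: run the four steps in order.
# a.py / b.py / c.py / d.py are placeholder names for scripts A-D below.
import subprocess
import sys

steps = [
    ["python", "a.py", "1", "154"],  # A: write list_links.txt
    ["python", "b.py"],              # B: download list pages into lists/
    ["python", "c.py"],              # C: write extract_detail_links.txt
    ["python", "d.py", "loop"],      # D: download detail pages into details/ until done
]

for cmd in steps:
    print "running:", " ".join(cmd)
    if subprocess.call(cmd) != 0:
        print "step failed:", " ".join(cmd)
        sys.exit(1)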

Now, the code itself.

A

#!/usr/bin/env python
#encoding=utf-8
"""
2010.10.15,v0.2
2010.10.07,v0.1
Generate the list-page links in batch
"""
import sys,os,time
list_url_template = "http://www.cnblogs.com/lexus/default.html?page=%s"
list_url_start    = 1
list_url_end      = 154
list_links_file   = os.path.join("./","list_links.txt")
g_step=1

def cost_time(func):
    def newFunc(*args, **args2):
        t0 = time.time()
        print "@%s, {%s} start" % (time.strftime("%X", time.localtime()), func.__name__)
        back = func(*args, **args2)
        print "@%s, {%s} end" % (time.strftime("%X", time.localtime()), func.__name__)
        print "@%.3fs taken for {%s}" % (time.time() - t0, func.__name__)
        return back
    return newFunc

@cost_time
def show(list_url_start=list_url_start,\
         list_url_end=list_url_end,\
         list_url_template=list_url_template):
    lines=[]
    for i in xrange(list_url_start,list_url_end+1):
        line="http://www.cnblogs.com/lexus/default.html?page=%s\n"%(i*g_step)
        print line.rstrip()
        lines.append(line)
    open(list_links_file,"w").writelines(lines)
    print "total count:%s"%len(lines)
    print "done!"

#import os.path
#print os.path.abspath(".")
if __name__=="__main__":
    l=len(sys.argv)
    if l==1:
        show()
    elif l==2:
        show(list_url_end=int(sys.argv[1]))
    elif l==3:
        show(list_url_start=int(sys.argv[1]),list_url_end=int(sys.argv[2]))
    elif l==4:
        show(list_url_start=int(sys.argv[1]),list_url_end=int(sys.argv[2]),list_url_template=sys.argv[3])

B

#!/usr/bin/env python
#encoding=utf-8
"""
2010.10.16,v0.3
2010.10.09,v0.2
2010.10.07,v0.1
Fetch the list pages in batch
"""
from __future__ import with_statement
from __future__ import division

import socket as original_socket
original_socket.setdefaulttimeout(10)
from eventlet.timeout import with_timeout
from eventlet.green import urllib2

import sys
####reload(sys)
####sys.setdefaultencoding('utf-8')

import eventlet
from eventlet import pools
#httplib2 = eventlet.import_patched('httplib2')
#httppool = pools.Pool(create=lambda: httplib2.Http(timeout=90),max_size=20)

import time

import os

import os.path

import stat

import select

import shutil

import re

import gzip
import StringIO

list_list_folder    = os.path.join("./","lists")
list_info_folder    = os.path.join("./","lists_infos")
status_file         = os.path.join("./","lists_infos/status.txt")
error_file          = os.path.join("./","lists_infos/error.txt")
error_file_bak      = os.path.join("./","lists_infos/error.txt.bak")
success_file        = os.path.join("./","lists_infos/success.txt")
list_links_file     = os.path.join("./","list_links.txt")
g_headers={}
g_pool_num          = 5

def init():
    if not os.path.exists(list_list_folder):
        os.mkdir(list_list_folder)
    if not os.path.exists(list_info_folder):
        os.mkdir(list_info_folder)
    print "完成初始化"

def delete(src):
    '''delete files and folders'''
    permission(src)
    if os.path.isfile(src):
        try:
            os.remove(src)
        except:
            pass
    elif os.path.isdir(src):
        for item in os.listdir(src):
            itemsrc=os.path.join(src,item)
            delete(itemsrc)
        try:
            os.rmdir(src)
        except:
            pass

def permission(src):
    os.chmod(src,stat.S_IRWXU|stat.S_IRWXO|stat.S_IRWXG)    

def clear():
    delete(list_list_folder)
    delete(list_info_folder)
    print "还原为初始"

def size(src):
    "检查文件或文件夹大小"
    r = 0L
    if os.path.isfile(src):
        r=os.path.getsize(src)
    else:
        for root, dirs, files in os.walk(src):
           r += sum([os.path.getsize(os.path.join(root, name)) for name in files])
    l=len(str(r))

    if l>9:
        r=r/1024/1024/1024
        r="%.2f GiB"%r
    elif l>6:
        r=r/1024/1024
        r="%.2f MiB"%r
    elif l>3:
        r=r/1024
        r="%.2f KiB"%r
    print "%s 大小为:%s"%(src,r)

def status(str):
    "running/stop"
    f=open(status_file,"w")
    f.write(str)
    f.close()    

def error(url,ex):
    f=open(error_file,"a")
    f.write("%s\n"%(url,))
    f.close()

def success(url):
    f=open(success_file,"a")
    f.write("%s\n"%url)
    f.close()

def url2filename(url):
    import base64
    return base64.urlsafe_b64encode(url)

def url2filename2(url):
    url=url.strip()
    idx=url.rfind("/")
    r=url[idx+1:]
    if idx==-1 or len(r)==0:
#       raise ValueError("url2filename function parser error")
        print "启用特殊url2filename"
        r = re.findall(r"\d+", url)[-1]
    return r

def cost_time(func):
    def newFunc(*args, **args2):
        t0 = time.time()
        print "@%s, {%s} start" % (time.strftime("%X", time.localtime()), func.__name__)
        back = func(*args, **args2)
        print "@%s, {%s} end" % (time.strftime("%X", time.localtime()), func.__name__)
        print "@%.3fs taken for {%s}" % (time.time() - t0, func.__name__)
        return back
    return newFunc

def statistics(func):
    def tongji():
        total,successed=0,0
        if os.path.exists(list_links_file):
            total=len(set(open(list_links_file,"r").readlines()))
            print "total lines:%s"%total
        if os.path.exists(success_file):
            successed=len(set(open(success_file,"r").readlines()))
            print "successed lines:%s"%successed
        print "left lines:%s"%(total-successed)
    def newFunc(*args,**args2):
        tongji()
        back = func(*args, **args2)
        tongji()
        return back
    return newFunc

def get_html(url):
    def do(url):
        html=""
        try:
            req = urllib2.Request(url = url,headers = g_headers)
            html = urllib2.urlopen(req).read()
            return html
        except Exception,e:
            print url,"error",e
            error(url,e)
            return None
    rr = with_timeout(10, do, url, timeout_value=None)
    return rr

def get_html22(url):
    import types
    def do(url):
        html=""
        try:
            req = urllib2.Request(url = url,headers = g_headers)
            html = urllib2.urlopen(req).read()
            if isinstance(html,types.StringTypes):
                return html
            else:
                print url,"error======"
                return ""
        except Exception,e1:
            pdata = StringIO.StringIO(html)# the lines below decompress gzipped content
            gzipper = gzip.GzipFile(fileobj = pdata)
            try:
                html = gzipper.read()
                return html
            except Exception,e2:
                print url,e1,e2
                error(url,e1)
            return ""
    rr = with_timeout(10, do, url, timeout_value="")
    return rr

def get_html2(url):
    "when use gzipped page will get fetch error"
    #print url
    with httppool.item() as http:
        #eventlet.sleep(0)
        resp, content = http.request(url)
        print content
        return content

def save_html2file(filename,html):
    f=open(filename,"w")
    f.write(html)
    f.close()

def save_url2file(url):
    #html=""
    #try:
    #    html=get_html(url)
    #except Exception,e:
    #    print url,"fetch error",e
    #    error(url,e)
    #    return
    html=get_html(url)
    if html is not None and html<>"":
        filename=os.path.join(list_list_folder,url2filename(url))
        save_html2file(filename,html)
        if os.path.getsize(filename)<1024*20:
            error(url,"size小于%s"%(1024*20))
            print url,"error"
            return
        success(url)#the success list is the baseline; whatever is missing failed or has not run yet
        print url,"success"
    else:
        print url,"error"
        error(url,"html为None或为空")

@cost_time
@statistics
def batch_get_html(urls):
    print "执行批量下载网页工作"
    pool=eventlet.GreenPool(g_pool_num)
    for url in urls:
        pool.spawn_n(save_url2file,url)
    pool.waitall()
    print "done!"

def process_continue():
    "接着success抓取剩下的部分"
    #读取完整的部分和已完成的部分进行取非交集合
    done=set(open(success_file,"r").read().split("\n"))
    all=set(open(list_links_file,"r").read().split("\n"))
    left=all-done
    batch_get_html(left)

if __name__=="__main__":
    init()
    l=len(sys.argv)
    if l==1:
        content=""
        if not select.select([sys.stdin,],[],[],0.0)[0]:
            print "load from %s"%list_links_file
            content=open(list_links_file,"r").read()
        else:
            print "load from stdin"
            content=sys.stdin.read()
        urls=content.strip().split("\n")
        #print urls
        batch_get_html(urls)
        size(list_list_folder)
    elif l==2:
        argv=sys.argv[1]
        if argv=="clear":
            clear()
        if argv=="continue":
            process_continue()
    elif l==3:
        argv=sys.argv[1]
        if argv=="load":
            url=sys.argv[2]
            print url
            save_url2file(url)
    print "done!"

C

#!/usr/bin/env python
#encoding=utf-8
"""
2010.10.16,v0.22
2010.10.11,v0.21
2010.10.09,v0.2
2010.10.07,v0.1
Extract detail-page links (and thumbnail links) from the list pages
"""
import sys
import re
import os.path

list_list_folder      = os.path.join("./","lists")
success_file        = os.path.join("./","lists_infos/success.txt")
detail_links_file   = os.path.join("./","extract_detail_links.txt")

#g_pattern=r"""
[^"]*?)\1[\s\S]*?[^"]*?)\3 """ g_pattern=r"""http://www.cnblogs.com/lexus/archive/\d{4}/\d{1,2}/\d{1,2}/\d{1,}\.html)\1[\s\S]*?>(?P[\s\S]*?)<[\s\S]*?/[\s\S]*?a[\s\S]*?>""" if g_pattern[-2]=='"': g_pattern=g_pattern[:-2]+'\\"' else: g_pattern=g_pattern[:-1] def url2filename(url): import base64 return base64.urlsafe_b64encode(url) def url2filename2(url): url=url.strip() idx=url.rfind("/") r=url[idx+1:] if idx==-1 or len(r)==0: # raise ValueError("url2filename function parser error") print "启用特殊url2filename" r = re.findall(r"\d+", url)[-1] return r def delete(src): '''delete files and folders''' #permission(src) if os.path.isfile(src): try: os.remove(src) print "删除文件%s"%src except: pass elif os.path.isdir(src): for item in os.listdir(src): itemsrc=os.path.join(src,item) delete(itemsrc) try: os.rmdir(src) print "删除文件夹%s"%src except: pass def clear(): delete(detail_links_file) def extract_detail_link(url): lines=[] regex=re.compile(g_pattern) file=os.path.join(list_list_folder,url2filename(url)) subject=open(file,"r").read() for match in regex.finditer(subject): #line="%s,%s\n"%(match.group("link").replace("&","&"),match.group("img").replace("http:/www","http://www").replace(","," ")) line="%s,\n"%(match.group("link").replace("&","&"),) lines.append(line) return lines def batch_extract_detail_links(): f=open(detail_links_file,"w") urls=open(success_file,"r").read().strip().split("\n") total=[] for url in urls: lines=extract_detail_link(url) total.extend(lines) print "%s,%s"%(url,len(lines)) s=set(total) f.writelines(s) f.close() print "done!" print "repeat count:%s"%(len(total)-len(s)) print "total lines:%s"%len(s) if __name__=="__main__": l=len(sys.argv) if l==1: batch_extract_detail_links() elif l==2: if sys.argv[1]=="clear": clear()

D

#!/usr/bin/env python
#encoding=utf-8
"""
2010.10.16,v0.13
2010.10.15,v0.12
2010.10.13,v0.11
2010.10.07,v0.1
Fetch the detail pages in batch
"""
from __future__ import with_statement
from __future__ import division

import socket as original_socket
original_socket.setdefaulttimeout(10)
from eventlet.timeout import with_timeout
from eventlet.green import urllib2

from urlparse import urljoin
import sys
####reload(sys)
####sys.setdefaultencoding('utf-8')

import eventlet
from eventlet import pools
#httplib2 = eventlet.import_patched('httplib2')
#httppool = pools.Pool(create=lambda: httplib2.Http(timeout=90),max_size=20)

import time

import os

import os.path

import stat

import select

import re

g_host                  = "http://www.cnblogs.com/lexus"
g_data_folder           = os.path.join("./","details")
g_info_folder           = os.path.join("./","details_infos")
g_status_file           = os.path.join("./","details_infos/status.txt")
g_error_file            = os.path.join("./","details_infos/error.txt")
g_success_file          = os.path.join("./","details_infos/success.txt")
g_result_links_file     = os.path.join("./","extract_detail_links.txt")
g_pool_num              = 1
g_headers={}
headers                 = """"""
headers                 = headers.strip().replace("\r\n","\n")
if headers<>"":
    for elem in headers.split("\n"):
        if elem.strip()=="":
            continue
        a,b=elem.split(":",1)
        a=a.strip()
        b=b.strip()
        g_headers[a]=b

def init():
    if not os.path.exists(g_data_folder):
        os.mkdir(g_data_folder)
    if not os.path.exists(g_info_folder):
        os.mkdir(g_info_folder)
    print "完成初始化"

def delete(src):
    '''delete files and folders'''
    permission(src)
    if os.path.isfile(src):
        try:
            os.remove(src)
        except:
            pass
    elif os.path.isdir(src):
        for item in os.listdir(src):
            itemsrc=os.path.join(src,item)
            delete(itemsrc)
        try:
            os.rmdir(src)
        except:
            pass

def permission(src):
    os.chmod(src,stat.S_IRWXU|stat.S_IRWXO|stat.S_IRWXG)    

def clear():
    delete(g_data_folder)
    delete(g_info_folder)
    print "还原为初始"

def size(src):
    "检查文件或文件夹大小"
    r = 0L
    if os.path.isfile(src):
        r=os.path.getsize(src)
    else:
        for root, dirs, files in os.walk(src):
           r += sum([os.path.getsize(os.path.join(root, name)) for name in files])
    l=len(str(r))

    if l>9:
        r=r/1024/1024/1024
        r="%.2f GiB"%r
    elif l>6:
        r=r/1024/1024
        r="%.2f MiB"%r
    elif l>3:
        r=r/1024
        r="%.2f KiB"%r
    print "%s 大小为:%s"%(src,r)

def status(str):
    "running/stop"
    f=open(g_status_file,"w")
    f.write(str)
    f.close()    

def error(url,ex):
    f=open(g_error_file,"a")
    f.write("%s\n"%(url,))
    f.close()

def success(url):
    f=open(g_success_file,"a")
    f.write("%s\n"%url)
    f.close()

def url2filename(url):
    import base64
    return base64.urlsafe_b64encode(url)

def url2filename2(url):
    url=url.strip()
    idx=url.rfind("/")
    r=url[idx+1:]
    if idx==-1 or len(r)==0:
#       raise ValueError("url2filename function parser error")
        print "启用特殊url2filename"
        r = re.findall(r"\d+", url)[-1]
    return r

def statistics(func):
    def tongji():
        total,successed=0,0
        if os.path.exists(g_result_links_file):
            total=len(set(open(g_result_links_file,"r").readlines()))
            print "total lines:%s"%total
        if os.path.exists(g_success_file):
            successed=len(set(open(g_success_file,"r").readlines()))
            print "successed lines:%s"%successed
        print "left lines:%s"%(total-successed)
    def newFunc(*args,**args2):
        tongji()
        back = func(*args, **args2)
        tongji()
        return back
    return newFunc

def cost_time(func):
    def newFunc(*args, **args2):
        t0 = time.time()
        print "@%s, {%s} start" % (time.strftime("%X", time.localtime()), func.__name__)
        back = func(*args, **args2)
        print "@%s, {%s} end" % (time.strftime("%X", time.localtime()), func.__name__)
        print "@%.3fs taken for {%s}" % (time.time() - t0, func.__name__)
        return back
    return newFunc

def get_html(url):
    def do(url):
        html=""
        try:
            req = urllib2.Request(url = url,headers = g_headers)
            html = urllib2.urlopen(req).read()
            return html
        except Exception,e:
            print url,"error",e
            error(url,e)
            return None
    rr = with_timeout(10, do, url, timeout_value=None)
    return rr

def get_html2(url):
    #print url
    with httppool.item() as http:
        #eventlet.sleep(0)
        resp, content = http.request(url,'GET',headers=g_headers)
        #resp, content = http.request(url)
        return content

def save_html2file(filename,html):
    f=open(filename,"w")
    f.write(html)
    f.close()

def save_url2file(url):
    a,b=url.strip().split(",")
    if not a.startswith("http://"):
        a=urljoin(g_host,a)
    #a=a.replace("&","&")
    html=get_html(a)
    if html is not None and html<>"":
        filename=os.path.join(g_data_folder,url2filename(a))
        save_html2file(filename,html)
        if os.path.getsize(filename)<1024*10:
            error(url,"size小于%s"%(1024*10))
            print url,"error"
            return
        success(url)#the success list is the baseline; whatever is missing failed or has not run yet
        print url,"success"
    else:
        print url,"error"
        error(url,"html为None或为空")

def save_url2file2(url):
    a,b=url.strip().split(",")
    if not a.startswith("http://"):
        a=urljoin(g_host,a)
    html=""
    try:
        html=get_html(a)
    except Exception,e:
        print url,e,"fetch error"
        error(url,e)
        return

    if html<>"":
        filename=os.path.join(g_data_folder,url2filename(a))
        save_html2file(filename,html)
        if os.path.getsize(filename)<1024*10:
            error(url,"size is less than %s"%(1024*10))
            print url,"error"
            return
        success(url)#the success list is the baseline; whatever is missing failed or has not run yet
        print url,"success"

@cost_time
@statistics
def batch_get_html(urls):
    print "starting batch page download"
    pool=eventlet.GreenPool(g_pool_num)
    for url in urls:
        pool.spawn_n(save_url2file,url)
    pool.waitall()
    size(g_data_folder)
    print "done!"

def count():
    total,successed=set(),set()
    if os.path.exists(g_success_file):
        successed=set(open(g_success_file,"r").read().strip().split("\n"))
    if os.path.exists(g_result_links_file):
        total=set(open(g_result_links_file,"r").read().strip().split("\n"))
    left=total-successed
    return total,successed,left

def process_continue():
    "continue from the success list and fetch what is left"
    #take the full set minus the completed set
    total,successed,left=count()
    batch_get_html(left)

def process_forever():
    "keep looping until everything is done"
    total,successed,left=count()
    print "left"
    while len(left)>0:
        print "由于还没未完成页面,再次循环执行"
        process_continue()
        total,successed,left=count()

if __name__=="__main__":
    init()
    l=len(sys.argv)
    if l==1:
        content=""
        if not select.select([sys.stdin,],[],[],0.0)[0]:
            print "load from %s"%g_result_links_file
            content=open(g_result_links_file,"r").read()
        else:
            print "load from stdin"
            content=sys.stdin.read()
        urls=content.strip().split("\n")
        #print urls
        batch_get_html(urls)
    elif l==2:
        argv=sys.argv[1]
        if argv=="clear":
            clear()
        if argv=="continue":
            process_continue()
        if argv=="loop":
            process_forever()
    elif l==3:
        if sys.argv[1]=="load":
            url=sys.argv[2]
            save_url2file(url)
    print "done!"

The code is wrapped in pre tags, so viewing it directly in the browser may run lines together. I wanted to find a syntax-highlighting tool, but my blog client doesn't have that feature, so never mind; if you want to copy the code, view the page source and locate the pre tags, where the line breaks are intact and the code can be used without changes.
Over the past few years I have written roughly 3060 posts, probably about 3/4 of them reposts with some annotations of my own; the whole crawl comes to about 265 MB.
The code uses coroutines to speed up the downloads.
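Stripped of the bookkeeping, the download pattern in scripts B and D comes down to eventlet's green urllib2 plus a small GreenPool; roughly this sketch:

#encoding=utf-8
# Distilled form of the coroutine download pattern used in scripts B and D:
# the green urllib2 yields on I/O, so a small pool of coroutines overlaps requests.
import eventlet
from eventlet.green import urllib2

def fetch(url):
    try:
        html = urllib2.urlopen(url).read()
        print url, "ok", len(html)
    except Exception, e:
        print url, "error", e

pool = eventlet.GreenPool(5)  # same role as g_pool_num in the scripts
for i in range(1, 4):
    pool.spawn_n(fetch, "http://www.cnblogs.com/lexus/default.html?page=%s" % i)
pool.waitall()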
There used to be an overarching fabric script that could push these scripts to a remote server, use the server's I/O to fetch and parse there, and then send the results back locally, but I can't find it anymore; it was probably lost in the last computer crash. I'll keep looking. I have the good habit of keeping backups, so it should turn up, heh.
That's it for now; in the next post I'll cover how to index the data.
PS: I have no idea how cnblogs calculates its points ranking. I've written quite a few posts, yet I can never make it onto the front page; I always sit at around 470th or so.

One problem with this crawler is that it has no way to discover new links or detect content changes, which is exactly why I wanted to build it on hbase + whenever. For this round, though, it just serves as a backup; keeping a local copy comes first.

Original post: https://www.cnblogs.com/lexus/p/2285393.html