python实战--csdn博客专栏下载器

打算利用业余时间好好研究Python的web框架——web.py，深入剖析其实现原理，体会web.py的精巧之美。但在研究源码之前，至少得先会用web.py。思前想后，没有想到特别好的Idea，于是打算开发一个csdn博客专栏下载器，界面不是很好看，但还过得去。

效果图如下：

为了简单，下载以html格式保存。

下载我自己的博客专栏，目录列表

主界面html文件非常简单，如下：

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>CSDN 博客专栏下载器</title>
<link rel="stylesheet" type="text/css" href="/static/main.css"/>
<!-- Script expected to provide category() and down(), which the buttons on this page invoke. -->
<script type="text/javascript" src="static/main.js"></script>
</head>

<body>
<!-- CSDN user name; category() reads this field through its id "csdnid". -->
<input type="text" class="name" name="csdnname" id="csdnid"/><br/>
<button type="button" class="btn" onclick="category()">获取专栏</button>
<!-- category() replaces the content of this element with the server response (column list and checkboxes). -->
<div id="categorylist">
</div>
<!-- down() writes the download result (or a progress text) into this element. -->
<div id="download">
</div>
<!-- The two elements below are not referenced by the script shown in this article. -->
<div id="status">
</div>
<div id="footer">
</div>
</body>
</html>

获取后端数据使用Ajax,没有用封装好的库，所以看起来很简洁。

/**
 * Send an asynchronous GET request and pass the response body to a callback.
 *
 * @param {string} requesturl URL to request.
 * @param {function(string)} handler Called with responseText once the request
 *        has completed (readyState 4) with HTTP status 200; never called otherwise.
 */
function ajax(requesturl,handler){
	// Fall back to the ActiveX object when the browser has no native XMLHttpRequest.
	var request = window.XMLHttpRequest
		? new XMLHttpRequest()
		: new ActiveXObject("Microsoft.XMLHTTP");

	request.onreadystatechange = function(){
		var finished = request.readyState == 4;
		// status is only inspected once the request has finished.
		if (!finished || request.status != 200){
			return;
		}
		handler(request.responseText);
	};

	request.open("GET", requesturl, true);
	request.send();
}

/**
 * Request the column list of the CSDN user typed into the "csdnid" field and
 * show the server's HTML response inside the "categorylist" element.
 *
 * The list is cleared first; an empty user name only triggers an alert and no
 * request is sent.
 */
function category(){
	var name=document.getElementById('csdnid').value;
	var list=document.getElementById('categorylist');
	list.innerHTML="";
	if (name==""){
		alert("用户名不可以为空");
		return;
	}
	// Percent-encode the name so characters such as '&', '#', '+', spaces or
	// non-ASCII text cannot break or truncate the query string.
	ajax('category'+'?name='+encodeURIComponent(name),function(content){
		list.innerHTML=content;
	});
}

/**
 * Collect the values (column URLs) of all checked "check" checkboxes, join
 * them with '*' and ask the server to download them. The server's response is
 * shown in the "download" element; an empty response shows a progress text.
 *
 * When nothing is selected, an alert is shown and no request is sent.
 */
function down(){
	var box=document.getElementsByName('check');
	var atag=[];
	for (var i=0; i<box.length; i++) {
		if(box[i].checked){
			atag.push(box[i].value);
		}
	}
	if (atag.length==0){
		// Without this guard the server would receive an empty URL list.
		alert("请先选择要下载的专栏");
		return;
	}
	// The joined value contains full URLs (':', '/', possibly '?' and '&'), so it
	// must be percent-encoded as a whole. encodeURIComponent leaves '*' as is,
	// and the server still splits the decoded value on '*'.
	var astring=encodeURIComponent(atag.join('*'));
	ajax('down'+'?urls='+astring,function(content){
		var target=document.getElementById('download');
		if(content!=""){
			target.innerHTML=content;
		}
		else{
			target.innerHTML="正在下载";
		}
	});
}

ajax函数有两个参数：一个是请求url，另一个是请求成功后的回调函数。函数里创建一个XMLHttpRequest对象（旧版IE下为ActiveXObject），向url发送异步GET请求，并在请求完成且状态码为200时，以响应文本为参数调用回调函数。

category函数，主要是获取csdn用户ID,并发送给ajax请求获取专栏信息，成功后显示。

down函数，获取选中的复选框，将选中专栏的url用“*”拼接后发送ajax请求。

主文件，如下

#coding=utf-8
import web
import os
import urllib2
import cookielib
import re
import threading
import thread
import sys

# Reload the sys module and set the interpreter-wide default string encoding
# to UTF-8 (Python 2 only).
# NOTE(review): presumably the implicit str/unicode conversions further down
# (str(e), writing decoded text to files) rely on this setting -- confirm
# before removing it.
reload(sys)
sys.setdefaultencoding('utf8')

# Routing table handed to web.application: alternating URL pattern and the
# name of the handler class that serves it.
urls = (
    '/', 'index',
    '/category(.*)', 'category',
    '/down', 'down',
)

# Renderer for the templates stored in the templates/ directory.
render = web.template.render('templates/')

# All regular expressions used by the scraper, compiled once at import time.
# The handlers read results by group position (list_match[i][3] and [6],
# m[4], page_match.group(3)), so the number and order of the capture groups
# must stay as they are.
contentMatch={
    # Column panel on a user's blog home page.
    'category':re.compile(r"<div id=\"panel_Category\"(.*?)博(.*?)>(.*?)</div>", re.I|re.DOTALL),
    # Links inside the column panel: group 4 is the href, group 7 the link text.
    # \3 repeats whichever quote character opened the attribute value.
    'zlalink':re.compile(r"a(\s*)href(\s*)=(\"|')(.*?)(\3)(.*?)>(.*?)</a>",re.I|re.DOTALL),
    # Article links on a column page: group 5 is the href; \4 closes the quote.
    'blogalink': re.compile(r"<a(\s*)name(.*?)href(\s*)=(\"|')(.*?)(\4)",re.I|re.DOTALL),
    # "Last page" pager link: group 3 is the page count. \d+ so that columns
    # with ten or more pages are matched as well.
    'lastpagenum': re.compile(r"<a(\s*)href=(.*?)\?page=(\d+)\">尾页",re.I|re.DOTALL),
    # Page title.
    'title':re.compile(r"<title>(.*?)</title>",re.I|re.DOTALL)
    }

class Http:
    """
       由于CSDN做了特殊处理，如果使用简单的httplib2.Http().request()会抓取不到数据，所以我们需要模拟真实用户行为，
    """
    def __init__(self):
        cookie_support= urllib2.HTTPCookieProcessor(cookielib.CookieJar())
        self.opener = urllib2.build_opener(cookie_support,urllib2.HTTPHandler)
        #urllib2.install_opener(opener)
        self.opener.addheaders = [("User-agent","Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11"),("Accept","*/*"),("Referer","http://www.google.com")]

    def open(self,url):
        return self.opener.open(url)

class index:
    """
    Handler for '/': serves the main page.
    """
    def GET(self):
        # Render the "index" template through the module-level renderer.
        page = render.index()
        return page

class category:
    """
    Handler for '/category': looks up a CSDN user's column panel and returns
    it together with one checkbox per column and a download button.
    """
    def GET(self,name):
        """
        Return an HTML fragment for the user given in the ``name`` query
        parameter.

        The ``name`` argument captured by the route pattern is ignored; the
        value is always read from the query string. Returns an error text
        when the name is missing, the page cannot be fetched, or the page has
        no column panel.
        """
        name=web.input(name=None).name
        if not name:
            # A missing name used to surface as a TypeError swallowed by the
            # handler below; answer with the same message without a request.
            return "请检查网络和用户名"
        url = "http://blog.csdn.net/"+name
        # Checkbox row; kept byte-for-byte so the emitted markup is unchanged.
        row="""
                        <input type="checkbox" name="check" value="%s" /> %s<br />
                        """
        try:
            res=Http().open(url)
            try:
                content = res.read()
            finally:
                # Release the connection even when reading fails.
                res.close()
            category_match=contentMatch['category'].search(content)
            if not category_match:
                return "该用户没有开通专栏"
            left=category_match.group()
            list_match=contentMatch['zlalink'].findall(left)
            # Only every second link is used (indexes 1, 3, 5, ...).
            # NOTE(review): presumably each column appears as two links in the
            # panel and the second one carries the title -- confirm on a live page.
            right="".join([row % (link[3],link[6]) for link in list_match[1::2]])
            submit="""<button type="button" class="btn" onclick="down()">下载专栏</button>"""
            return left+right+submit
        except Exception:
            # Request boundary: any failure is reported to the page as text.
            return "请检查网络和用户名"

class createfile(threading.Thread):
    """
    下载专栏中文章的线程类
    """
    def __init__(self,zlurl):
        threading.Thread.__init__(self)
        self.blogurl=[]
        self.opener=Http()
        self.zlname=""
        try:
            res=self.opener.open(zlurl)
            content = res.read()
            zlname_match=contentMatch['title'].search(content)
            if zlname_match:
                self.zlname="".join(zlname_match.group(1).split('-')[:-2]).decode('utf8')
                if not os.path.exists(self.zlname):
                    os.mkdir(self.zlname)
            else:
                thread.exit_thread()
            if content.find("尾页") < 0:
                self.addblog(content)
            else:
                page_match=contentMatch['lastpagenum'].search(content)
                page=int(page_match.group(3))
                for x in range(1,page+1):
                    url="%s?page=%d" %(zlurl,x)  #分页处理
                    content=self.opener.open(url).read()
                    self.addblog(content)

        except Exception,e:
            print 'init:'+str(e)
            thread.exit_thread()

    def addblog(self,content): #获取文章url
        try:
            blogs_match=contentMatch['blogalink'].findall(content)
            if blogs_match:
                for m in blogs_match:
                    if m[4] not in self.blogurl:
                        self.blogurl.append(m[4])
        except Exception,e:
            print "addblog:"+str(e)

    def write(self,content): #写入文件
        try:
            if content !="":
                blogtitle="".join(contentMatch['title'].search(content).group(1).split('-')[:-3])
                #path="%s%s%s.html" %(self.zlname.encode('utf8'),os.sep,blogtitle)
                path="%s.html" % blogtitle
                f = open(path.decode('utf8'),"w")
                f.write(content.decode('utf8'))
                f.close()
        except Exception,e:
            print "write:"+str(e)

    def run(self):
        try:
            print len(self.blogurl)
            for blog in self.blogurl:
                self.write(self.opener.open(blog).read())
        except Exception,e:
            print "run:"+str(e)



class down:
    """
    Handler for '/down': downloads the columns listed in the ``urls`` query
    parameter, a '*'-separated list of column URLs.
    """
    def GET(self):
        """
        Start one createfile thread per column URL, wait for all of them and
        return the completion text.
        """
        # Default to an empty list instead of raising when "urls" is missing.
        urls=web.input(urls="").urls
        workers=[]
        for url in urls.split('*'):
            if not url:
                # Skip empty entries (e.g. nothing selected).
                continue
            worker=createfile(url)
            worker.start()
            workers.append(worker)
        # Join only after every thread has been started; joining inside the
        # loop above would run the downloads one after another.
        for worker in workers:
            worker.join()
        return "下载完成"

# Start the web.py application only when this file is executed as a script.
if __name__=='__main__':
    web.application(urls,globals()).run()

使用了多线程：选中了几个专栏，就会启动几个线程，主程序等待线程结束后返回“下载完成”。

在这里也遇到了一个问题：最初的设想是每个专栏保存到一个单独的文件夹，但实现时有点问题，拼接path后open时总是报没有这个文件或文件夹，猜测是编码的问题。

留个遗憾，等待各位亲的指教。