# Python crawler: scrape a serialized novel from book.sohu.com

#coding:utf-8
import urllib2
import sys
import re


def getPage(url,offset = '0'):
    realurl = "%s%s%s" %(url,offset,'.shtml')
    print realurl
    resp = urllib2.urlopen(realurl)
    content = resp.read()
    #print content
    p = re.compile('<[^>]+>')
    p1=re.compile('<[^>p]+>')
    print p
    rematch = re.compile(r'(<h1.*</h1>)')
    h1 = rematch.findall(content)
    print h1[0],'ok'
    try:
        h1content = p.sub("",h1[0])
        print h1content
    except Exception,e:
        print str(e),'error'
        return
    fp = open(r'juyudao.txt','a')
    fp.write(h1content+ '
')
    fp.flush()
   
    #print content
    
    content = content.replace('
','')    
    content = content.replace('
','')

    content = content.replace(' ','')

    content = content.replace('     ','')
    cont = re.search('articleBody(.*)class="pages">', content, re.S)#先获取一部分html
    #print 'cont1',cont.group()
    cont1=cont.group()
    articleBody=re.findall('</script>(.*)<divclass="pages">',cont1)
    #print articleBody
    articleBody=articleBody[0].replace('</p>','')
    articleBody=p1.sub('',articleBody)
    txt=articleBody.split('<p>')
    for i in txt:
        fp.write(i+ '
')
        fp.flush()
    
    fp.close()


def getBook(url, startoffset, endOffset):
    """Fetch pages startoffset .. endOffset-1 of the book rooted at *url*.

    Each page number is converted to a string and appended to *url* by
    getPage() to form '<url><n>.shtml'.  The end bound is exclusive,
    matching the original while-loop semantics.
    """
    # range() replaces the original manual counter-increment while loop.
    for pagenum in range(startoffset, endOffset):
        getPage(url, offset=str(pagenum))

if __name__ == '__main__':
    # The first page of the book has no numeric suffix, so fetch it with an
    # empty offset ('...n389762800.shtml').
    getPage(url = 'http://book.sohu.com/20131107/n389762800',offset='')
    # Remaining pages use URLs of the form '..._<n>.shtml' for n = 1..19
    # (endOffset is exclusive).
    getBook(url = 'http://book.sohu.com/20131107/n389762800_',startoffset=1,endOffset=20)
# Original article: https://www.cnblogs.com/paisen/p/3539635.html