# 爬取四大名著 (Scraping the Four Great Classical Novels)

'''
 诗词名句网
 1. 爬取固定书籍
 2. 爬取书名
 3. 爬取本部书的章回目录
 4. 灵活处理,爬取任意书籍的章回目录
 5. 加入异常处理
 6. 爬取任意整本书
'''

import requests
import re

def bookSpider(oldurl,bookName):
    """Download a book's index page, then its title, chapter list and chapters.

    The index page at ``oldurl + ".html"`` is saved to ``demo.txt`` in the
    current working directory. That file is then handed to ``findTitle`` and
    ``findTileOfPages``, and the chapter count returned by the latter tells
    ``getWholeBook`` how many chapter pages to fetch.

    Args:
        oldurl: Base URL of the book, without the ``.html`` suffix.
        bookName: Book identifier, forwarded unchanged to the helper functions.

    Returns:
        None. The function stops early if the index page cannot be downloaded
        or cannot be saved, instead of parsing a stale or empty ``demo.txt``.
    """
    html = loadPage(oldurl + ".html")
    if html is None:
        # loadPage signals failure with None; without the index page there is
        # no title or chapter list to parse.
        return
    try:
        with open("demo.txt", 'w', encoding='utf-8') as f:
            f.write(html)
    except OSError:
        print("FILE OPERATION ERROR")
        return
    findTitle("demo.txt", bookName)
    cnt = findTileOfPages("demo.txt", bookName)
    getWholeBook(oldurl, bookName, cnt)

def findTitle(filename,bookName):
    """Extract the book title from a saved HTML page and start ``book.txt`` with it.

    Every line of ``filename`` is searched for ``<title>《...》``. Each title
    found is printed after the ``书名:`` label and written, followed by a
    newline, to ``book.txt`` in the current working directory. ``book.txt`` is
    opened in write mode, so any previous content is discarded.

    Args:
        filename: Path of the UTF-8 encoded HTML file to scan.
        bookName: Not used; kept so existing callers keep working.

    Returns:
        None. If either file cannot be opened, ``FILE OPERATION ERROR`` is
        printed and the function returns without raising.
    """
    title_pattern = re.compile(r'<title>(《.{0,10}》)')
    try:
        with open(filename, encoding='utf-8') as page, \
                open("book.txt", 'w', encoding='utf-8') as book:
            for line in page:
                found = title_pattern.search(line)
                if not found:
                    continue
                # Take the title from the capture group; str(match) is the
                # match object's repr and contains more than the title.
                title = found.group(1)
                print("书名:" + title)
                book.write(title + '\n')
    except OSError:
        print("FILE OPERATION ERROR")

def findTileOfPages(filename,bookName):
    """Append the chapter list found in a saved index page to ``book.txt``.

    A ``目录:`` header line is appended to ``book.txt`` first. Then every
    ``<li><a href="/book/<bookName>/<digits>.html">title</a></li>`` link in
    ``filename`` whose title is 10 to 40 characters long is printed and
    appended to ``book.txt``, one title per line.

    Args:
        filename: Path of the UTF-8 encoded HTML file to scan.
        bookName: Book identifier as it appears in the chapter link paths.

    Returns:
        The number of chapter links found. If either file cannot be opened,
        ``FILE OPERATION ERROR`` is printed and 0 is returned.
    """
    cnt = 0
    # bookName is escaped so it is matched literally; the page number must be
    # digits and the dot before "html" must be a real dot. The title is
    # captured directly and may not contain '<', so a single match can never
    # span two links on the same line.
    link_pattern = re.compile(
        r'<li><a href="/book/' + re.escape(bookName)
        + r'/\d+\.html">([^<]{10,40})</a></li>'
    )
    try:
        with open(filename, encoding='utf-8') as page, \
                open("book.txt", 'a', encoding='utf-8') as book:
            book.write("目录:\n")
            for line in page:
                for chapter_title in link_pattern.findall(line):
                    cnt += 1
                    print(chapter_title)
                    book.write(chapter_title + '\n')
    except OSError:
        print("FILE OPERATION ERROR")
    return cnt

def _heading_text(fragment):
    """Return the characters of *fragment* that follow a '>' up to the next '<' or '>'."""
    return ''.join(re.findall(r'>([^<>]*)', fragment))


def _paragraph_text(fragment):
    """Return the paragraph text that follows the leading ``&nbsp;`` run.

    Copying starts after a ';' that is preceded by 'p' and not followed by
    '&' (the end of a run of ``&nbsp;`` entities) and stops at the next '<'.
    The ';' that switches copying on is itself never copied.
    """
    kept = []
    copying = False
    last = len(fragment) - 1
    for pos, ch in enumerate(fragment):
        if ch == '<':
            copying = False
        elif (ch == ';' and pos > 0 and fragment[pos - 1] == 'p'
                and (pos == last or fragment[pos + 1] != '&')):
            copying = True
        elif copying:
            kept.append(ch)
    return ''.join(kept)


def getWholeBook(url,bookName,cnt):
    """Download chapter pages 1..cnt and append their text to ``book.txt``.

    Each chapter page ``<url>/<n>.html`` is fetched with ``loadPage``, saved to
    ``bookHtml.txt`` (overwritten for every chapter) and read back line by
    line. For every line, the text of each ``<h1>...</h1>`` match and then of
    each ``<p>&nbsp;&nbsp;&nbsp;&nbsp;...</p>`` match is appended to
    ``book.txt``, one match per output line.

    Args:
        url: Base URL of the book, without a trailing slash.
        bookName: Not used; kept so existing callers keep working.
        cnt: Number of chapter pages to download; nothing is fetched when it
            is 0 or negative.

    Returns:
        None. A chapter whose page cannot be loaded, or whose files cannot be
        written or read, is skipped.
    """
    print("正在下载全本书,请稍后...")
    # Compiled once here rather than once per input line.
    paragraph_pattern = re.compile(r'<p>&nbsp;&nbsp;&nbsp;&nbsp;.+</p>')
    heading_pattern = re.compile(r'<h1>.+</h1>')
    for chapter_no in range(1, cnt + 1):
        newUrl = url + '/' + str(chapter_no) + ".html"
        print(newUrl)
        html = loadPage(newUrl)
        if html is None:
            # loadPage signals failure with None; there is nothing to extract.
            continue
        try:
            with open("bookHtml.txt", 'w', encoding='utf-8') as raw:
                raw.write(html)
            with open('bookHtml.txt', 'r', encoding='utf-8') as raw, \
                    open('book.txt', 'a', encoding='utf-8') as bookContent:
                for line in raw:
                    for heading in heading_pattern.findall(line):
                        bookContent.write(_heading_text(heading) + '\n')
                    for paragraph in paragraph_pattern.findall(line):
                        bookContent.write(_paragraph_text(paragraph) + '\n')
        except OSError:
            print("FILE OPERATION ERROR")

def loadPage(url, timeout=10):
    """Fetch *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block the program forever.

    Returns:
        The page text, or None if the request fails or the body is not valid
        UTF-8. ``PAGE LOAD ERROR`` is printed in that case.
    """
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    try:
        response = requests.get(url, headers=header, timeout=timeout)
        return response.content.decode('utf-8')
    except (requests.RequestException, UnicodeDecodeError):
        # Only network and decoding failures are handled here; anything else
        # (including KeyboardInterrupt) propagates to the caller.
        print("PAGE LOAD ERROR")
        return None

if __name__ == "__main__":
    # The book identifier typed by the user is appended to the site's book
    # path to form the base URL, and is also passed on unchanged.
    requested = input("请输入想看的书名:(全拼)")
    bookSpider("{}{}".format("http://www.shicimingju.com/book/", requested), requested)
# 原文地址: https://www.cnblogs.com/TheSilverMoon/p/11143203.html