Python爬虫-《神雕侠侣》

Python3.5

爬取《神雕侠侣》http://www.kanunu8.com/wuxia/201102/1610.html

我是武侠迷，所以喜欢爬取武侠小说。

#!/usr/bin/python
# -*- coding: utf-8 -*-

from selenium import webdriver
import os
from docx import Document
import re

class House():
    """Scrape the chapters of a novel index page into a single .docx file.

    The index page at ``baseUrl`` is loaded in a PhantomJS browser, every
    chapter link on it is collected, each chapter page is visited for its
    title and text, and the result is saved as a Word document inside a
    ``StoryFiles`` directory next to this script.
    """

    # Chapter pages are recognised by a file name of exactly five digits,
    # e.g. ".../12345.html". Compiled once instead of once per inspected link;
    # the dot is escaped so that only a literal ".html" suffix matches.
    _chapterLinkPattern = re.compile(r".+/[0-9]{5}\.html$")

    def __init__(self):
        # NOTE(review): no method of this class reads `headers`; it is kept
        # because outside code may still access the attribute.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'}
        self.baseUrl = 'http://www.kanunu8.com/wuxia/201102/1610.html'
        # __file__ may be a relative path and makedir() changes the working
        # directory, so resolve it to an absolute path up front.
        self.basePath = os.path.dirname(os.path.abspath(__file__))

    @staticmethod
    def _isChapterLink(href):
        """Return True when *href* is a string that looks like a chapter URL.

        Anchors without an ``href`` yield ``None``; those (and empty strings)
        are rejected instead of being fed to the regular expression.
        """
        if not href:
            return False
        return House._chapterLinkPattern.match(href) is not None

    def makedir(self, name):
        """Ensure ``basePath/name`` exists and make it the working directory.

        Args:
            name: Directory name, relative to ``self.basePath``.
        """
        path = os.path.join(self.basePath, name)
        if os.path.isdir(path):
            print('The file is existed.')
        else:
            # exist_ok covers the case where the directory appears between
            # the check above and this call.
            os.makedirs(path, exist_ok=True)
            print('File has been created.')
        # Switch into the directory; getData() saves with a relative file name.
        os.chdir(path)

    def connect(self, url):
        """Open *url* in a new PhantomJS browser.

        Args:
            url: Address of the page to load.

        Returns:
            The webdriver instance with the page loaded, or ``None`` when the
            browser could not be started or the page could not be loaded.
            The caller owns the returned driver and must ``quit()`` it.
        """
        driver = None
        try:
            driver = webdriver.PhantomJS()
            driver.get(url)
            return driver
        except Exception as err:
            print('This page is not existed.', err)
            # Do not leave a browser process behind when only the load failed.
            if driver is not None:
                driver.quit()
            return None

    def getBookLinkList(self, url):
        """Collect the chapter links found on the index page at *url*.

        Args:
            url: Address of the index page.

        Returns:
            A list of chapter URLs in page order; empty when the page could
            not be opened or contains no matching links.
        """
        bookLinkList = []
        driver = self.connect(url)
        if driver is None:
            return bookLinkList
        try:
            # Inspect every anchor on the page and keep the chapter links.
            for link in driver.find_elements_by_xpath("//a"):
                href = link.get_attribute('href')
                print(href)
                if self._isChapterLink(href):
                    print('ok')
                    bookLinkList.append(href)
        except Exception as err:
            print('Error', err)
        finally:
            # Always release the browser process started by connect().
            driver.quit()

        return bookLinkList

    def getBookDetail(self, url):
        """Fetch the title and text of the chapter page at *url*.

        Args:
            url: Address of a chapter page.

        Returns:
            A ``(title, content)`` tuple of strings. A part that could not be
            read is returned as an empty string, so the tuple is always
            well-formed even when the page fails to load.
        """
        title, content = '', ''
        driver = self.connect(url)
        if driver is None:
            return title, content
        try:
            # The title is the first <h2>, the text the first <p> of the page.
            title = driver.find_element_by_xpath('//h2').text
            content = driver.find_element_by_xpath('//p').text
            print(title)
            print(content)
        except Exception as err:
            print('Error.', err)
        finally:
            driver.quit()
        return title, content

    def getData(self):
        """Download every chapter and save them all into one .docx file.

        The document is written as ``神雕侠侣.docx`` inside the ``StoryFiles``
        directory, which makedir() creates and switches into.
        """
        doc = Document()
        self.makedir('StoryFiles')
        for linkUrl in self.getBookLinkList(self.baseUrl):
            title, content = self.getBookDetail(linkUrl)
            if not (title or content):
                # Nothing could be read from this chapter; skip it rather
                # than adding an empty section.
                continue
            # add_paragraph expects text, not a (title, content) tuple, so
            # the title becomes a heading and the body its own paragraph.
            doc.add_heading(title, level=1)
            doc.add_paragraph(content)

        doc.save('神雕侠侣.docx')

if __name__ == '__main__':
    # Script entry point: build the scraper and run the full download.
    House().getData()
原文地址:https://www.cnblogs.com/fredkeke/p/7761100.html