# 笔趣阁小说 selenium爬取

import re
from time import sleep
from urllib.parse import urljoin

from lxml import etree
from selenium import webdriver

# Chrome launch options used when the driver is created in the __main__ block.
options = webdriver.ChromeOptions()
# Uncomment to run without a visible browser window.
# options.add_argument('--headless')
# Chrome's switch for overriding the user agent is the lowercase
# ``--user-agent``; the previous "User-Agent=..." spelling is not a switch
# name Chrome recognises on platforms where switches are case-sensitive.
options.add_argument(
    "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36")
# NOTE(review): "Referer=..." is not a documented Chrome command-line switch,
# so this presumably has no effect on outgoing requests -- confirm and drop,
# or set the header through the DevTools protocol instead.
options.add_argument("Referer=https://s.weibo.com/")
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# Skip image loading; only page text is scraped.
options.add_argument('blink-settings=imagesEnabled=false')
options.add_argument('--disable-gpu')
options.add_argument('--hide-scrollbars')  # hide scrollbars, to cope with some special pages
# NOTE(review): 'Cookie: ' is likewise not a Chrome switch and carries an
# empty value; kept unchanged pending confirmation that it can be removed.
options.add_argument(
    'Cookie: ')


class Qidian:
    """Scrape one novel's chapters through Selenium into a single text file.

    The chapter-index page at ``url`` is loaded with the supplied driver, the
    book title found there names the output file, and each linked chapter is
    fetched in turn and appended to that file.
    """

    def __init__(self, url, driver):
        """Remember the crawl target and derive the output file name.

        Loads ``url`` once to read the book title.

        Args:
            url: Address of the novel's chapter-index page. Relative chapter
                links are resolved against this URL.
            driver: Selenium WebDriver used for every page load.

        Raises:
            IndexError: If the index page has no ``#info > h1`` title text.
        """
        self.url = url
        self.driver = driver
        content = self.get_content(url)
        self.file_name = self.pase_file_name(content)

    def crawl_start(self):
        """Fetch the chapter index and download every chapter it lists."""
        content = self.get_content(self.url)
        self.parse_detail(content)

    def get_content(self, url):
        """Navigate the instance's driver to ``url`` and return the page source."""
        # Use the driver handed to __init__, not a module-level global, so the
        # class works no matter what the caller named its driver variable.
        self.driver.get(url)
        return self.driver.page_source

    def pase_file_name(self, content):
        """Build the output file name from the book title in ``content``.

        Args:
            content: HTML source of the chapter-index page.

        Returns:
            The text of the ``<h1>`` under the element with id ``info``,
            followed by ``".txt"``.

        Raises:
            IndexError: If no such title text is present.
        """
        html = etree.HTML(content)
        file_info = html.xpath('//*[@id="info"]/h1/text()')
        return file_info[0] + ".txt"

    def parse_detail(self, content):
        """Download each chapter linked from the index page and save it.

        Args:
            content: HTML source of the chapter-index page.
        """
        html = etree.HTML(content)
        chapters = html.xpath('//div[@id="list"]/dl//dd')

        # Start from an empty file; the context manager closes the handle
        # immediately instead of leaking it.
        with open(self.file_name, 'w', encoding='utf-8'):
            pass

        for chapter in chapters:
            title = chapter.xpath('./a/text()')
            href = chapter.xpath('./a/@href')
            if not title or not href:
                # A <dd> without a usable link is skipped rather than
                # aborting the whole crawl with an IndexError.
                continue

            item = {
                'title': title[0],
                # Resolve against the index URL instead of a hard-coded book.
                'href': urljoin(self.url, href[0]),
            }
            print(item)

            self.driver.get(item['href'])
            page = etree.HTML(self.driver.page_source)
            detail = ''.join(page.xpath('//*[@id="content"]//text()'))

            self.save_to_file(self.file_name, item['title'], detail)
            # Pause between chapter requests.
            sleep(3)

    def save_to_file(self, file_name, title, content):
        """Append one chapter (title line, body, trailing newline) to a file.

        Args:
            file_name: Path of the file to append to; created if missing.
            title: Chapter title, written on its own line.
            content: Chapter body text.
        """
        # Explicit UTF-8 so Chinese text is written the same way regardless
        # of the platform's default encoding.
        with open(file_name, 'a', encoding='utf-8') as f:
            f.write(title + '\n')
            f.write(content)
            f.write('\n')


# Script entry point: crawl one hard-coded novel with a fresh Chrome driver.
if __name__ == "__main__":
    url = "http://www.biquge.info/0_273/"
    driver = webdriver.Chrome(options=options)
    try:
        qidian = Qidian(url, driver)
        qidian.crawl_start()
    except Exception as e:
        # Top-level boundary: report the failure, then fall through to cleanup.
        print(str(e))
    finally:
        # Always shut the browser down, including when the crawl raised;
        # previously an exception skipped quit() and left Chrome running.
        driver.quit()

  

# 原文地址: https://www.cnblogs.com/brady-wang/p/12541164.html