爬取笔趣阁_完本书籍

爬取笔趣阁_完本书籍

爬取地址:http://www.biquge.tv/wanben/1_1

import os, time, shutil, requests, sqlite3
from bs4 import BeautifulSoup
from threading import Thread
from datetime import datetime


def fun_makedir(file_path):
    """
    Create the directory ``file_path`` if it is missing, then make it the
    current working directory.

    :param file_path: path of the directory to create and enter
    :return: None
    """
    # exist_ok=True removes the check-then-create race of a separate
    # os.path.exists() test (another thread/process may create the directory
    # between the check and the makedirs call).
    os.makedirs(file_path, exist_ok=True)
    os.chdir(file_path)


def main():
    """
    Run the whole crawl: recreate the database, fetch the book list from the
    entry page, download every book in its own thread, then export the books.

    Reads the module-level ``delay`` and ``count`` values; ``delay`` is
    reassigned by ``save_book_todb`` once a book's chapter count is known.

    :return: None
    """
    create_db()  # start from an empty database file
    start = datetime.now()
    url = "http://www.biquge.tv/wanben/1_1"  # entry page of the crawl

    book_urls = get_book(url)  # list of [book name, book url]
    total_book = len(book_urls)
    threads = [
        Thread(target=save_book_todb, args=(book_id, name, link))
        for book_id, (name, link) in enumerate(book_urls, start=1)
    ]
    for thread_no, t in enumerate(threads, start=1):
        # progress bar: one '>' per started book, one '.' per pending book
        print("下载进度:[{}{}]".format(">" * thread_no, "." * (total_book - thread_no)))
        t.start()
        # The fixed pause runs before ``delay`` is read; presumably it gives the
        # freshly started thread time to update ``delay`` -- TODO confirm.
        time.sleep(5)
        time.sleep(delay)
    for t in threads:
        t.join()
        print(t)
    print("\n共抓取{}部小说\t".format(count))
    run_time = (datetime.now() - start).total_seconds()
    print("总共用时{}秒".format(run_time), end="\t")
    print("{}正在导出小说".format(">" * 100))
    show_books()


def create_db():
    """
    Replace any existing database file named ``dbname`` with a new empty one.

    :return: None
    """
    stale = os.path.exists(dbname)
    if stale:
        os.remove(dbname)
    # Opening a connection and closing it straight away; no tables are created here.
    sqlite3.connect(dbname).close()


def set_delay(total_chapter):
    """
    Pick a delay from the number of chapters in a book.

    The delay is 30 for up to 1000 chapters and grows by 30 for every further
    1000 chapters, capped at 180 for more than 5000 chapters.

    :param total_chapter: number of chapters in the book
    :return: the delay value (the caller treats it as seconds)
    """
    base = 30
    # (exclusive lower bound, multiplier), checked from the largest bound down
    tiers = ((5000, 6), (4000, 5), (3000, 4), (2000, 3), (1000, 2))
    for lower_bound, factor in tiers:
        if total_chapter > lower_bound:
            return base * factor
    return base


def get_book(url):
    """
    Download the listing page at ``url`` and collect the books linked from it.

    The page is decoded as GBK; every ``<li>`` inside the first
    ``<div class="r">`` contributes the text and ``href`` of its first link.

    :param url: address of the listing page
    :return: list of ``[book name, book url]`` pairs
    """
    page = requests.get(url, headers=headers)
    page.encoding = "gbk"
    listing = BeautifulSoup(page.text, "html.parser").find('div', class_='r')
    books = []
    for entry in listing.findAll('li'):
        link = entry.find('a')
        href = link['href']
        title = link.get_text()
        books.append([title, href])
    return books


def get_chapter(book_url):
    """
    Download a book's index page and collect its chapter links.

    The page is decoded as GBK and the ``<dd>`` items of ``<div id="list">``
    are read. The first nine items are skipped (presumably a "latest chapters"
    teaser that duplicates later entries -- TODO confirm against the site).

    :param book_url: address of the book's index page
    :return: list of ``[chapter name, absolute chapter url]`` pairs
    """
    index_page = requests.get(book_url, headers=headers)
    index_page.encoding = 'gbk'
    parsed = BeautifulSoup(index_page.text, "html.parser")
    entries = parsed.find('div', id="list").findAll('dd')
    chapters = []
    for entry in entries[9:]:
        anchor = entry.find('a')
        # hrefs are site-relative, so prefix the host
        link = "http://www.biquge.tv" + anchor['href']
        chapters.append([anchor.get_text(), link])
    return chapters


def save_book_todb(book_id, book_name, book_url):
    """
    Download every chapter of one book and store it in the database.

    Increments the module-level ``count`` and reassigns the module-level
    ``delay`` from the book's chapter count, then starts one thread per
    chapter (10 ms apart) and waits for all of them.

    :param book_id: sequence number of the book
    :param book_name: title of the book
    :param book_url: address of the book's index page
    :return: None
    """
    global count, delay
    count += 1
    chapters = get_chapter(book_url)  # every [name, url] pair of the book
    total_chapter = len(chapters)
    delay = set_delay(total_chapter)  # pause used by main() between books
    print("正在下载----小说 {}{:<2s}{},共有{}章节,请等待{}秒".format('>' * 50, str(book_id), book_name, total_chapter, delay))
    workers = [
        Thread(target=save_chapter_todb, args=(number, title, link, book_id, book_name, book_url))
        for number, (title, link) in enumerate(chapters, start=1)
    ]
    for worker in workers:
        worker.start()
        time.sleep(0.01)
    for worker in workers:
        worker.join()
    print("下载完成 {}{:<2s}{},共有{}章节".format('*' * 30, str(book_id), book_name, len(chapters)))


def _report_chapter_error(book_name, chapter_id, chapter_name, chapter_url, reason=None):
    """Print the per-chapter failure line, followed by the cause when known."""
    message = "保存章节出错 {}书名:{},章节{:<4s}{},章节链接:{}".format(
        '.' * 10, book_name, str(chapter_id), chapter_name, chapter_url)
    if reason is None:
        print(message)
    else:
        print(message, reason)


def save_chapter_todb(chapter_id, chapter_name, chapter_url, book_id, book_name, book_url):
    """
    Download one chapter page, extract its text and store it via ``save_db``.

    The page is decoded as GBK and the text of ``<div id="content">`` is taken.
    Blank lines are dropped. The text is passed on unescaped because
    ``save_db`` binds it as a query parameter.

    :param chapter_id: sequence number of the chapter within the book
    :param chapter_name: title of the chapter
    :param chapter_url: address of the chapter page
    :param book_id: sequence number of the book
    :param book_name: title of the book
    :param book_url: address of the book's index page
    :return: None
    """
    down_chapter_res = requests.get(chapter_url, headers=headers)
    down_chapter_res.encoding = 'gbk'
    down_chapter_soup = BeautifulSoup(down_chapter_res.text, "html.parser")
    content = down_chapter_soup.find('div', id="content")
    if content is None:
        # Page without a content div: report it instead of dying with AttributeError.
        _report_chapter_error(book_name, chapter_id, chapter_name, chapter_url)
        return
    # Keep line endings, drop lines that contain only whitespace.
    chapter_text = "".join(line for line in content.text.splitlines(True) if line.strip())
    save_db(chapter_id, chapter_name, chapter_text, chapter_url, book_id, book_name, book_url)


def save_db(chapter_id, chapter_name, chapter_text, chapter_url, book_id, book_name, book_url):
    """
    Insert one chapter row into the book's table, creating the table if needed.

    Values are bound as parameters, so quotes in names or text need no
    escaping. The ``book_id`` column receives the table name (``book_<id>``).
    Database errors are reported on stdout and not re-raised, so one failing
    chapter does not stop the rest of the book.

    :param chapter_id: sequence number of the chapter within the book
    :param chapter_name: title of the chapter
    :param chapter_text: text of the chapter
    :param chapter_url: address of the chapter page
    :param book_id: sequence number of the book; selects the table
    :param book_name: title of the book
    :param book_url: address of the book's index page
    :return: None
    """
    try:
        table_name = create_table_book(book_id)
        conn = sqlite3.connect(dbname)
        try:
            # The table name cannot be a bound parameter; it is generated by
            # create_table_book, not taken from scraped content.
            conn.execute(
                "insert into " + table_name + " values(?,?,?,?,?,?,?)",
                (chapter_id, chapter_name, chapter_text, chapter_url, table_name, book_name, book_url),
            )
            conn.commit()
        finally:
            conn.close()
    except sqlite3.Error as exc:
        _report_chapter_error(book_name, chapter_id, chapter_name, chapter_url, exc)


def create_table_book(table_id):
    """
    Create the table ``book_<table_id>`` in the database if it does not exist.

    :param table_id: sequence number of the book
    :return: the table name, ``"book_" + str(table_id)``
    """
    table_name = "book_" + str(table_id)
    # IF NOT EXISTS makes the call safe to repeat for every chapter of a book.
    sql = (
        "create table IF NOT EXISTS " + table_name
        + "(chapter_id int,chapter_name varchar(20),"
        + "chapter_text varchar(10000),chapter_url varchar(60),book_id varchar(20),"
        + "book_name varchar(100),book_url varchar(100))"
    )
    conn = sqlite3.connect(dbname)
    try:
        conn.execute(sql)
        conn.commit()
    finally:
        # close the connection even when the statement fails
        conn.close()
    return table_name


def show_books():
    """
    Export every book table in the database to a text file, one thread per book.

    The ids are read from the table names that actually exist, so a missing
    ``book_<k>`` table neither breaks the export nor hides later books.

    :return: None
    """
    start = datetime.now()
    prefix = "book_"
    conn = sqlite3.connect(dbname)
    try:
        rows = conn.execute("select name from sqlite_master where type = 'table'").fetchall()
    finally:
        conn.close()
    # Keep only names of the form book_<number> and export them in numeric order.
    table_ids = sorted(
        int(name[len(prefix):])
        for (name,) in rows
        if name.startswith(prefix) and name[len(prefix):].isdecimal()
    )
    threads = [Thread(target=show_book, args=(table_id,)) for table_id in table_ids]
    count = len(threads)
    for t in threads:
        t.start()
        time.sleep(0.1)
    for t in threads:
        t.join()
    run_time = (datetime.now() - start).total_seconds()
    print("\n\n导出小说完成,共导出{}部小说。".format(count), end="\t")
    print("花费{}秒。".format(run_time), end="\n\n")


def show_book(table_id):
    """
    Export the table ``book_<table_id>`` to a UTF-8 text file in the current
    working directory.

    The file is named ``<book_id> <book_name>[共N章].txt`` and holds a header
    followed by every chapter in ``chapter_id`` order. An existing file of the
    same name is overwritten. An empty table is skipped.

    :param table_id: sequence number of the book table to export
    :return: None
    """
    sql = "select book_id,book_name,chapter_name,chapter_text,chapter_id,book_url from book_" + str(
        table_id) + " order by chapter_id"
    conn = sqlite3.connect(dbname)
    try:
        results = conn.execute(sql).fetchall()
    finally:
        # all rows are in memory, so the connection is not needed while writing
        conn.close()
    if not results:
        # A table can be empty when every insert for the book failed.
        print("跳过空表 book_{}".format(table_id))
        return
    first = results[0]
    file_name = first[0] + " " + first[1] + "[共" + str(len(results)) + "章]" + ".txt"
    print("正在导出小说>>>{}".format(file_name))
    # Mode 'w' truncates an old export, and the file is opened once rather
    # than once per chapter.
    with open(file_name, 'w', encoding='utf-8') as f:
        # header: title banner, book name, chapter count, source url
        f.write("{}完本小说{}\n\n{}**【{}】**\n{}共有{}章节\n{}在线阅读网址:{}\n\n{}"
                .format("*" * 40, "*" * 40, " " * 20, first[1], " " * 20, len(results), " " * 20, first[5],
                        "*" * 90))
        for r in results:
            f.write("{}第{}章:{}{}".format("\n\n", r[4], r[2], "\n\n"))  # chapter heading
            f.write(r[3])  # chapter text


if __name__ == '__main__':  # script entry point
    # Module-level settings read by the functions above.
    count = 0
    delay = 20  # pause between books
    category = "笔趣阁"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}
    # The database file name carries today's date.
    dbname = "笔趣阁小说_{}.sqlite".format(time.strftime("%Y-%m-%d"))
    save_path = "{}/down/{}".format(os.getcwd(), category)

    fun_makedir(save_path)  # create the output folder and switch into it
    # main()  # uncomment to crawl before exporting
    show_books()

原文地址:https://www.cnblogs.com/yuexiao/p/12823899.html