Python爬取小说+Servlet+C3P0+MVC构建小说api

一、摘要:

使用python爬取网络小说数据存入数据库,利用C3P0数据库连接池获取数据库数据,采用MVC三层架构对数据库数据进行操作,输出JSON格式数据到前端页面

二、内容:

1.数据库设计

1.1 小说种类表

1.1.1 表数据类型

image-20201015093342540

1.1.2 表数据格式

image-20201015093349559

1.2 小说信息表

1.1.1 表数据类型

image-20201015093753173

1.1.2 表数据格式

image-20201015093750302

1.3 小说章节表

1.1.1 表数据类型

image-20201015093836976

1.1.2 表数据格式

image-20201015093834492

2.核心源码分析

2.1 通过Python爬取小说数据存入数据库

import os
from time import sleep
import requests
import parsel
from parsel import Selector #请求网页html数据
import pymysql #请求数据库


# 从chapter_url中得到小说的名字对应的xiaoshuoid  chapter_urlshuzu[4],然后从章节中得到zhangjieid chapter_urlshuzu[5].split('.')[0],章节内容是for循环里所有 利用test 和章节名称就是title好拿
def download_one_chapter(chapter_url,bookname,lebieming): #爬取单一章节内容代码块,函数
    response=requests.get(chapter_url)
    chapter_urlshuzu=chapter_url.split('/') #拆分章节url,通过切片获取需要的信息
    response.encoding=response.apparent_encoding    #字符编码 自动识别
    # response.encoding = 'utf-8' 手动改变字符编码
    # 提取网页内容的方法:正则表达式:提取字符串   xpath css选择器 提取网页数据结构(html) 语法糖 或者 lxml pyquery parsel

    sel=Selector(response.text) # 提取网页小说内容
    title=sel.css('h1::text').get()
    f=open('biquge/'+lebieming+'/'+bookname+'/'+title+'.txt',mode='w',encoding='utf-8')     # f 打开的文件对象 打开文件,创建文件
    f.write(title)
    for line in sel.css('#content::text').getall():
        print(line.strip(),file=f) #逐行读入
    f.close()  # 关闭文件

    # 连接数据库
    conn = pymysql.connect(host='localhost',port=3306,user='root',passwd='123456',db='xiaoshuo',charset='utf8',)
    cursor = conn.cursor()
    f = open('biquge/' + lebieming + '/' + bookname + '/' + title + '.txt', mode='r', encoding='utf-8')
    while True:
        # 逐行读取
        line = f.readlines()
        if line:
            # 处理每行

            line = "".join(line)
            line = line.strip('
')
            line = line.split(",")
            content = line[0]
            cursor.execute(
                "insert into zhangjie(zhangjieid,zhangjiename,zhangjieleirong,xiaoshuoid) values(%s,%s,%s,%s)",
                [chapter_urlshuzu[5].split('.')[0],title,content,chapter_urlshuzu[4]])
        else:
            break
    f.close()
    cursor.close()
    conn.commit()
    conn.close()

# xiaoshuoming就是bookname xiaoshuoid就是book_url里的参数book_urlshuzu[4] xiaoshuofenleiid就是用lebieming对应数组来取leibieshuzu.index(lebieming)+1
def download_one_book(book_url,bookname,lebieming): # 下载一本小说
    response=requests.get(book_url)
    book_urlshuzu=book_url.split('/')
    response.encoding=response.apparent_encoding
    sel=Selector(response.text)
    if os.path.exists('biquge/' + lebieming + '/' + bookname): # 创建小说的文件夹
        print("已存储有" + bookname + "小说")
        return
    if not os.path.exists('biquge/' + lebieming + '/' + bookname):
        os.mkdir('biquge/' +lebieming+'/'+ bookname)
        leibieshuzu=['玄幻','武侠','都市','历史','侦探','网游','科幻']
        xiaoshuoimg='http://www.shuquge.com/files/article/image/'+book_urlshuzu[4][0:len(book_urlshuzu[4])-3]+'/'+book_urlshuzu[4]+'/'+book_urlshuzu[4]+'s.jpg'
        # 连接数据库
        conn = pymysql.connect(host='localhost',port=3306,user='root',passwd='123456',db='xiaoshuo',charset='utf8',)
        cursor = conn.cursor()
        cursor.execute(
            "insert into xiaoshuoinfo(xiaoshuoid,xiaoshuoname,xiaoshuofenleiid,xiaoshuoimg) values(%s,%s,%s,%s)",
            [book_urlshuzu[4],bookname,leibieshuzu.index(lebieming)+1,xiaoshuoimg])
        cursor.close()
        conn.commit()
        conn.close()
        i=0;
        index=sel.css('.listmain a::attr(href)').getall()
        # 限制不下载前十二章
        # print(index)
        # for line in index:
        #     i+=1
        #     # print('http://www.shuquge.com/txt/'+ book_urlshuzu[4]+'/'+line)
        #     if i>12:
        #         download_one_chapter('http://www.shuquge.com/txt/'+ book_urlshuzu[4]+'/'+line,bookname,lebieming)
        download_one_chapter('http://www.shuquge.com/txt/'+ book_urlshuzu[4]+'/'+index[12],bookname,lebieming)
        download_one_chapter('http://www.shuquge.com/txt/'+ book_urlshuzu[4]+'/'+index[13],bookname,lebieming)
        print("读取"+bookname+"小说完成")

# 下载单一类别小说
def download_category(category_url,leibieming):
    response = requests.get(category_url)
    response.encoding = response.apparent_encoding
    sel = Selector(response.text)
    os.mkdir('biquge/' + leibieming)
    if os.path.exists('biquge/' + leibieming):
        index2 = sel.css('span.s2 a::text').getall()
        i=0
        index = sel.css('span.s2 a::attr(href)').getall()
        for line in index:
            download_one_book(line,index2[i],leibieming)
            i+=1

def download_allcategory(website_url): # 下载全部类别的小说
    response = requests.get(website_url)
    response.encoding = response.apparent_encoding
    sel = Selector(response.text)
    index2 = sel.css('.nav a::text').getall()
    i=1
    j=1
    index = sel.css('.nav a::attr(href)').getall()
    for line in index:
        if i>=8:
            break
        if j!=0 and j<8:
            print(line)
            download_category(line, index2[i])
        i+=1
        j+=1

download_allcategory('http://www.shuquge.com/')

2.2 获取所有小说信息

2.2.1 Controller—BookServlet
 //获取当前book路由下的方法
String method = req.getParameter("method");
System.out.println("BookServlet中使用方法为:" + method);
Gson gson = new Gson();
String json="";
List<NovelInfo> list=new ArrayList<>();
//如果为空默认使用findAll方法
if (method == null) {
    method = "findAllBook";
}
switch(method){
    case "findAllBook":
        list=bookService.findAllBook();
        json = gson.toJson(list);
        break;
}
2.2.2 Service—BookService
public interface BookService {
    public List<NovelInfo> findAllBook();
}
public class BookServiceImpl implements BookService {
    private BookRepository bookRepository=new BookRepositoryImpl();
    @Override
    public List<NovelInfo> findAllBook() {
        return bookRepository.findAllBook();
    }
}
2.2.3 Repository—Book Repository
public interface BookRepository {
    public List<NovelInfo> findAllBook();
}
public class BookRepositoryImpl implements BookRepository {
    List<NovelInfo> list=new ArrayList<>();
    @Override
    public List<NovelInfo> findAllBook() {
        //        获取连接对象
        Connection connection= JdbcTools.getConnection();
        //        运行数据库语句的对象
        PreparedStatement statement=null;
        //        保存结果集的对象
        ResultSet resultSet =null;
        //需要执行的sql语句
        String sql="SELECT * FROM xiaoshuoinfo";
        try {
            statement=connection.prepareStatement(sql);

            resultSet=statement.executeQuery();

            while (resultSet.next()){
                list.add(new NovelInfo(resultSet.getString(1),resultSet.getString(2),resultSet.getString(3),resultSet.getString(4)));
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }finally {
            JdbcTools.release(connection,statement,resultSet);
        }
        return list;
    }
}

2.3 其他

根据小说种类ID获取该种类所有小说信息,获取小说所有种类,获取所有小说章节,获取一本小说所有章节,根据小说ID获取该小说信息参看源码链接

python爬取小说

小说api

原文地址:https://www.cnblogs.com/dongxuelove/p/13819251.html