Fetching novel content with Python

Before running the script, install the third-party Python libraries BeautifulSoup (the bs4 package) and pymysql; the script also parses pages with lxml, so install that too (e.g. pip install beautifulsoup4 pymysql lxml).

The script stores its results in a MySQL database.

The novel site the script scrapes is http://www.kbiquge.com.
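
Before looking at the full script, here is a minimal sketch of its basic fetch step: request a page with a browser User-Agent header and parse it with BeautifulSoup. The /86_86683/ catalog path is the one hard-coded in the script below, and the site's markup (novel title inside <div id="info">) may of course have changed since this was written.

#coding=utf-8
from urllib import request
from bs4 import BeautifulSoup

head = {'User-Agent': 'Mozilla/5.0'}
req = request.Request('http://www.kbiquge.com/86_86683/', headers=head)
html = request.urlopen(req).read()
soup = BeautifulSoup(html, 'lxml')
# on this site the novel title is the <h1> inside <div id="info">
print(soup.find('div', id='info').find('h1').text)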

The MySQL table definitions:

CREATE TABLE `story` (
  `id` varchar(200) NOT NULL DEFAULT '',
  `name` varchar(200) DEFAULT NULL COMMENT 'name',
  `start` varchar(20) DEFAULT NULL COMMENT 'status',
  `end_start` varchar(200) DEFAULT NULL COMMENT 'update time',
  `author` varchar(200) DEFAULT NULL COMMENT 'author',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;


CREATE TABLE `chapter` (
  `chapter_id` varchar(200) NOT NULL DEFAULT '0' COMMENT 'chapter ID',
  `story_id` varchar(200) DEFAULT NULL COMMENT 'novel ID',
  `chapter_name` varchar(200) DEFAULT NULL COMMENT 'chapter name',
  `chapter_content` mediumtext COMMENT 'content',
  `chapter_href` varchar(2000) DEFAULT NULL COMMENT 'URL',
  PRIMARY KEY (`chapter_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
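
If you prefer to set the schema up from Python rather than from a MySQL client, a minimal sketch along these lines works; it assumes the same local server, root/123456 credentials, and `python` database that the script below connects to.

import pymysql

db = pymysql.connect(host='localhost', user='root', password='123456', database='python')
try:
    with db.cursor() as cursor:
        # run each CREATE TABLE from the schema above;
        # IF NOT EXISTS makes the setup safe to re-run
        cursor.execute("""CREATE TABLE IF NOT EXISTS `story` (
          `id` varchar(200) NOT NULL DEFAULT '',
          `name` varchar(200) DEFAULT NULL COMMENT 'name',
          `start` varchar(20) DEFAULT NULL COMMENT 'status',
          `end_start` varchar(200) DEFAULT NULL COMMENT 'update time',
          `author` varchar(200) DEFAULT NULL COMMENT 'author',
          PRIMARY KEY (`id`)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8""")
        cursor.execute("""CREATE TABLE IF NOT EXISTS `chapter` (
          `chapter_id` varchar(200) NOT NULL DEFAULT '0' COMMENT 'chapter ID',
          `story_id` varchar(200) DEFAULT NULL COMMENT 'novel ID',
          `chapter_name` varchar(200) DEFAULT NULL COMMENT 'chapter name',
          `chapter_content` mediumtext COMMENT 'content',
          `chapter_href` varchar(2000) DEFAULT NULL COMMENT 'URL',
          PRIMARY KEY (`chapter_id`)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8""")
    db.commit()
finally:
    db.close()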

The full source code:

#coding=utf-8
import pymysql
import time
import uuid


from urllib import request
from bs4 import BeautifulSoup


# Batch-insert rows into the chapter table; each tuple in usersvalues holds
# (chapter_id, story_id, chapter_name, chapter_content, chapter_href)
def Write_info(usersvalues):
    db = pymysql.connect(host="localhost", user="root", password="123456", database="python")
    cursor = db.cursor()
    try:
        sql = ("INSERT INTO chapter(chapter_id,story_id,chapter_name,chapter_content,chapter_href) "
               "VALUES(%s,%s,%s,%s,%s)")
        # executemany runs the parameterized INSERT once per tuple
        cursor.executemany(sql, usersvalues)
        db.commit()
    except pymysql.Error as e:
        print("Error: unable to insert data:", e)
        db.rollback()
    db.close()

# Look up a novel by name in the story table; insert it if absent.
# Returns the novel's id either way.
def Story_name(story_name):
    db = pymysql.connect(host="localhost", user="root", password="123456", database="python")
    uuids = str(uuid.uuid1()).replace('-', '')
    cursor = db.cursor()
    try:
        # parameterized query, so the title cannot break the SQL
        cursor.execute("SELECT id FROM story WHERE name=%s", (story_name,))
        fname = ""
        results = cursor.fetchall()
        for row in results:
            fname = row[0]
        if cursor.rowcount != 1:
            # not stored yet: insert it with a fresh uuid as its id
            sql = ("INSERT INTO story(id, name, start, end_start, author) "
                   "VALUES (%s, %s, '1', '1', 'wangyh')")
            cursor.execute(sql, (uuids, story_name))
            db.commit()
            return uuids
        else:
            return fname
    except pymysql.Error as e:
        print("Error: unable to fetch data:", e)
        db.rollback()
    finally:
        db.close()


if __name__ == '__main__':
    # catalog (table of contents) page
    url_xs = 'http://www.kbiquge.com'
    url = url_xs + '/86_86683/'
    head = {}
    head['User-Agent'] = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166  Safari/535.19'
    req = request.Request(url, headers=head)
    response = request.urlopen(req)
    html = response.read()
    # parse the catalog page
    soup = BeautifulSoup(html, 'lxml')
    # the novel title is the <h1> inside <div id="info">
    story_name = soup.find('div', id='info').find('h1').text
    # look the novel up in the story table (inserting it if needed) to get its id
    story_id = Story_name(story_name)
    print("story_id:" + story_id)
    # the chapter list lives in <div id="list">, inside a <dl>
    soup_texts = soup.find('div', id='list')
    usersvalues = []
    # walk the children of the <dl>, collecting each chapter title and link
    for link in soup_texts.dl.children:
        if link != '\n':
            print('start')
            list_tmp = link.find_all('a')
            for a in list_tmp:
                # pause 0.5 s between requests to be gentle on the site
                time.sleep(0.5)
                download_url = url_xs + a.get('href')
                download_req = request.Request(download_url, headers=head)
                download_response = request.urlopen(download_req)
                download_html = download_response.read()
                download_soup = BeautifulSoup(download_html, 'lxml')
                download_soup_texts = download_soup.find('div', id='content').text
                # replace non-breaking spaces with ordinary spaces
                download_soup_texts = download_soup_texts.replace(u'\xa0', u' ')
                # millisecond timestamp as the chapter id
                uuids = "w" + str(int(round(time.time() * 1000)))
                data = (uuids, story_id, a.text, download_soup_texts, download_url)
                usersvalues.append(data)
    Write_info(usersvalues)
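
A quick way to verify a run is to read the rows back out. This is a minimal sketch using the same connection settings; the chapter title passed to the second query is just a hypothetical example.

import pymysql

db = pymysql.connect(host='localhost', user='root', password='123456', database='python')
try:
    with db.cursor() as cursor:
        # how many chapters were stored per novel
        cursor.execute("SELECT s.name, COUNT(*) FROM story s "
                       "JOIN chapter c ON c.story_id = s.id GROUP BY s.name")
        for name, count in cursor.fetchall():
            print(name, count, "chapters")
        # preview the start of one chapter's text (hypothetical title)
        cursor.execute("SELECT chapter_content FROM chapter WHERE chapter_name=%s LIMIT 1",
                       ("第一章",))
        row = cursor.fetchone()
        if row:
            print(row[0][:200])
finally:
    db.close()

Note that the script calls Write_info only once, after every chapter has been downloaded, so an interruption mid-scrape stores nothing; flushing the list every N chapters would trade a little speed for resilience.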
Original post: https://www.cnblogs.com/heyy520/p/9835303.html