Crawling Douban Classical Literature (stored in a database)

Notes:

This crawler scrapes the Classical Literature tag on Douban Books, extracts the eight main fields (Name, Author, Tag, Rating, ContentIntro, AuthorIntro, Catalogue, Commentary), and saves the data to SQLite.

Because no IP proxy was used, Douban temporarily banned the crawler's IP after roughly the first thousand books. The next step, then, is to look into using IP proxies.
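For reference, requests accepts a proxies argument, so switching to a proxy would mostly be a matter of routing each request through one. The sketch below only illustrates that idea and is not the code used in this post; the PROXY_POOL addresses and the get_with_proxy helper are made-up placeholders.

import random
import requests

# Hypothetical proxy pool -- fill in real, working proxy addresses.
PROXY_POOL = [
    'http://123.45.67.89:8080',
    'http://98.76.54.32:3128',
]

def get_with_proxy(url, headers):
    # Route the request through a randomly chosen proxy.
    proxy = random.choice(PROXY_POOL)
    proxies = {'http': proxy, 'https': proxy}
    return requests.get(url, headers=headers, proxies=proxies, timeout=10)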

 

Crawling the data:

The code is as follows:

# coding:utf-8
import cPickle
import random
import requests
from lxml import etree
import time
import re
import sqlite3
from pyquery import PyQuery as Q


class Spider:
    def __init__(self):
        # Open (or create) the SQLite database that stores the book records.
        self.con = sqlite3.connect(r'BookInformation.db')
        self.cur = self.con.cursor()
        # Run once to create the table, then leave it commented out:
        # sql = '''
        # CREATE TABLE BookInfo(
        #     Name varchar(200),
        #     Author varchar(100),
        #     Tag text,
        #     Rating double,
        #     ContentIntro text,
        #     AuthorIntro text,
        #     Catalogue text,
        #     Commentary text )
        # '''
        # self.cur.execute(sql)
        # self.con.commit()
        self.home = ''
        self.Referer = 'https://book.douban.com/'
        # user_agent.txt is a pickled list of User-Agent strings.
        self.user_agent_list = []
        with open('user_agent.txt', 'rb') as f:
            self.user_agent_list = cPickle.load(f)

    def GetHeaders(self):
        # Build request headers with a randomly chosen User-Agent.
        UserAgent = random.choice(self.user_agent_list)
        headers = {'Referer': self.Referer, 'User-Agent': UserAgent}
        return headers

    def SaveBook(self, info):
        # Insert one book record (eight fields) into the BookInfo table.
        sql = 'INSERT INTO BookInfo VALUES(?,?,?,?,?,?,?,?)'
        info_list = (info["Name"], info["Author"], info["Tag"], info["Rating"],
                     info["ContentIntro"], info["AuthorIntro"], info["Catalogue"], info["Commentary"])
        self.cur.execute(sql, info_list)
        self.con.commit()

    def Crawl(self):
        # Walk the list pages of the Classical Literature tag, 20 books per page.
        for index in range(0, 50):
            self.home = 'https://book.douban.com/tag/%E5%8F%A4%E5%85%B8%E6%96%87%E5%AD%A6?start=' + str(index * 20) + '&type=T'
            html = requests.get(self.home, headers=self.GetHeaders()).text
            html_tree = etree.HTML(html)
            booksList = html_tree.xpath('/html/body/div[3]/div[1]/div/div[1]/div/ul/li')
            for book in booksList:
                # Pause between requests to reduce the chance of being blocked.
                time.sleep(random.randint(2, 5))
                bookUrl = book.xpath('div[2]/h2/a')[0].get('href')
                pageHtml = requests.get(bookUrl, headers=self.GetHeaders()).text
                page_tree = etree.HTML(pageHtml)
                book_info = self.GetPage(bookUrl, page_tree)
                print book_info['Name']
                # Uncomment to persist each record:
                # self.SaveBook(book_info)
        self.con.close()

    def GetPage(self, page_url, page_tree):
        # Extract every field from a book detail page; fall back to an empty
        # string whenever a field is missing on the page.
        book_info = {}
        try:
            book_info['Name'] = self.GetName(page_tree)
        except:
            book_info['Name'] = ''
        try:
            book_info['Author'] = self.GetAuthor(page_tree)
        except:
            book_info['Author'] = ''
        try:
            book_info['Rating'] = self.GetRating(page_tree)
        except:
            book_info['Rating'] = ''
        try:
            book_info['ContentIntro'] = self.GetContentIntro(page_tree)
        except:
            book_info['ContentIntro'] = ''
        try:
            book_info['AuthorIntro'] = self.GetAuthorIntro(page_tree)
        except:
            book_info['AuthorIntro'] = ''
        try:
            book_info['Catalogue'] = self.GetCatalogue(page_url, page_tree)
        except:
            book_info['Catalogue'] = ''
        try:
            book_info['Tag'] = self.GetTag(page_tree)
        except:
            book_info['Tag'] = ''
        try:
            book_info['Commentary'] = self.GetCommentary(page_tree)
        except:
            book_info['Commentary'] = ''
        return book_info

    def GetName(self, page_tree):
        return page_tree.xpath('/html/body/div[3]/h1/span')[0].text

    def GetAuthor(self, page_tree):
        # Authors may appear as several <a> tags inside a <span>, or as a single <a>.
        author_list = page_tree.xpath('/html/body/div[3]/div[2]/div/div[1]/div[1]/div[1]/div[1]/div[2]/span[1]/a')
        result = ''
        if len(author_list) != 0:
            names = []
            for author in author_list:
                names.append(author.text.strip())
            result = '/'.join(names)
        else:
            result = page_tree.xpath('/html/body/div[3]/div[2]/div/div[1]/div[1]/div[1]/div[1]/div[2]/a')[0].text.strip()
        # Collapse runs of whitespace into a single space.
        return re.sub(r'\s+', ' ', result)

    def GetRating(self, page_tree):
        rating = page_tree.xpath('/html/body/div[3]/div[2]/div/div[1]/div[1]/div[1]/div[2]/div/div[2]/strong')[0].text.strip()
        return float(rating)

    def GetContentIntro(self, page_tree):
        # When both a short and a full introduction exist, the last .intro block is the full one.
        para_div = page_tree.xpath('//*[@id="link-report"]//div[@class="intro"]')
        result = ''
        if len(para_div) != 0:
            para_para = para_div[-1].xpath('p')
            for para in para_para:
                result = result + '\t' + para.text + '\n'
        return result

    def GetAuthorIntro(self, page_tree):
        para_div = page_tree.xpath('/html/body/div[3]/div[2]/div/div[1]/div[3]/div[@class="indent "]//div[@class="intro"]')
        result = ''
        if len(para_div) != 0:
            para_para = para_div[-1].xpath('p')
            for para in para_para:
                result = result + '\t' + para.text + '\n'
        return result

    def GetCatalogue(self, page_url, page_tree):
        # The table of contents lives in an element whose id contains the book id,
        # e.g. dir_<bookid>_full; fall back to the short version if the full one is absent.
        bookid = page_url.split('/')[-2]
        result = ''
        para_div = page_tree.xpath('//*[@id="dir_' + bookid + '_full"]')
        if len(para_div) == 0:
            para_div = page_tree.xpath('//*[@id="dir_' + bookid + '_short"]')
        if len(para_div) != 0:
            result = Q(etree.tostring(para_div[0])).text()
        return result

    def GetTag(self, page_tree):
        tag_list = page_tree.xpath('//*[@id="db-tags-section"]/div[@class="indent"]//a[@class="  tag"]')
        result = []
        if len(tag_list) != 0:
            for tag in tag_list:
                result.append(tag.text)
        return '/'.join(result)

    def GetCommentary(self, page_tree):
        # Collect the hot short comments, numbered one per line.
        comment_list = page_tree.xpath('//*[@class="comment-list hot show"]//p[@class="comment-content"]')
        result = ''
        num = 0
        if len(comment_list) != 0:
            for comment in comment_list:
                num = num + 1
                result = result + '\t' + str(num) + '.' + comment.text + '\n'
        return result


if __name__ == '__main__':
    s = Spider()
    s.Crawl()
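One note on the user_agent.txt file loaded in __init__: the code expects it to be a pickled list of User-Agent strings (that is what cPickle.load returns and what random.choice later draws from). The original file isn't shown in the post; the sketch below only illustrates how such a file could be produced, and the User-Agent strings are arbitrary examples.

# coding:utf-8
import cPickle

# Any list of real browser User-Agent strings will do; these are just examples.
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0 Safari/537.36',
]

with open('user_agent.txt', 'wb') as f:
    cPickle.dump(user_agents, f)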
Original article: https://www.cnblogs.com/DOLFAMINGO/p/9210568.html