Crawler — books (all 50 pages of books.toscrape.com)

The spider code is shown below.

import scrapy
from bookspider.items import BooksItem


class BooksSpider(scrapy.Spider):
    name = 'books'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/catalogue/page-1.html']

    def parse(self, response):
        # response is a scrapy.http.response.html.HtmlResponse.
        # Each book on the page is one <li> inside <ol class="row">.
        books = response.xpath("//div/ol[@class='row']/li")
        for book in books:
            # The book title is stored in the title attribute of the <a> tag;
            # get() returns a single string (getall() would return a one-element list).
            book_name = book.xpath(".//h3/a/@title").get()
            book_price = book.xpath(".//div[@class='product_price']/p[@class='price_color']/text()").get()
            yield BooksItem(book_name=book_name, book_price=book_price)

        # The "next" link href is relative (e.g. "page-2.html"), so resolve it
        # against the current page URL with response.urljoin() before requesting it.
        # Naive string concatenation, "http://books.toscrape.com" + next_url,
        # fails because the result lacks both the slash and the "/catalogue/"
        # prefix, producing a malformed URL — that is why the earlier
        # concatenation-based attempt raised an error.
        next_url = response.xpath("//ul[@class='pager']/li[@class='next']/a/@href").get()
        if next_url:
            yield scrapy.Request(response.urljoin(next_url), callback=self.parse)
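The spider imports BooksItem from bookspider.items, but the post does not show that file. Below is a minimal sketch of what items.py presumably looks like, inferred from the two fields the spider fills in; the class and field names come from the code above, and everything else is standard Scrapy boilerplate, not taken from the original post.

import scrapy

class BooksItem(scrapy.Item):
    # Field names must match the keyword arguments used in the spider.
    book_name = scrapy.Field()   # title attribute of the book link
    book_price = scrapy.Field()  # price text, e.g. "£51.77"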
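Assuming the standard Scrapy project layout, the spider can be run from the project root and its items exported with the built-in feed export:

    scrapy crawl books -o books.json

As a side note, Scrapy (1.4+) also provides response.follow(next_url, callback=self.parse), which accepts relative URLs directly and makes the explicit urljoin() call unnecessary.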
Original article: https://www.cnblogs.com/cfancy/p/11860937.html