# JDBOOK — jd.com book-catalogue spider

# -*- coding: utf-8 -*-
import scrapy

from BOOK.items import BookItem
import json

from copy import deepcopy

class BookSpider(scrapy.Spider):
    """Crawl jd.com's book catalogue.

    Flow: parse (big category + small categories) -> parse_book_info
    (per-book fields on each listing page, plus pagination) ->
    parse_price (price from the p.3.cn endpoint) -> item to pipelines.
    """

    name = 'book'

    # Crawl scope: the book site itself plus the separate price-API host.
    allowed_domains = ['jd.com', 'p.3.cn']
    start_urls = ['https://book.jd.com/booksort.html']

    # Listing pages followed so far; used to cap pagination below.
    index = 0

    # 1. Parse the big-category names and their <dt> nodes (the <dd>
    #    sibling of each <dt> holds the small categories).
    def parse(self, response):
        """Yield one request per small category, carrying a partly
        filled item in ``meta['key']``."""
        dt_list = response.xpath('//*[@id="booksort"]/div[2]/dl/dt')

        # [:1] deliberately limits the crawl to one big category while
        # the pipeline is being validated.
        for dt in dt_list[:1]:
            item = BookItem()
            item['big_name'] = dt.xpath('.//a/text()').extract_first()

            # 2. Small categories: name and url from the following <dd>.
            em_list = dt.xpath('./following-sibling::*[1]/em')

            for em in em_list[:1]:
                item['small_name'] = em.xpath('./a/text()').extract_first()
                href = em.xpath('./a/@href').extract_first()
                # Guard: a missing href would make 'https:' + None raise
                # TypeError; skip such entries instead of crashing.
                if href is None:
                    continue
                item['small_link'] = 'https:' + href

                # 3. Request each small category's listing page. deepcopy
                #    so concurrent callbacks never share one mutable item.
                yield scrapy.Request(
                    item['small_link'],
                    callback=self.parse_book_info,
                    meta={'key': deepcopy(item)}
                )

    # Parse every book on one listing page.
    def parse_book_info(self, response):
        """Fill per-book fields, request each book's price, then follow
        the next listing page (capped by ``self.index``)."""

        # Item carried over from the small-category request.
        item = response.meta['key']

        # 3.1 Book list on this page.
        book_list = response.xpath('//*[@id="plist"]/ul/li')

        # 3.2 [:1] limits to one book per page while validating.
        for book in book_list[:1]:
            # Cover image
            item['book_img_src'] = book.xpath('.//div[@class="p-img"]/a/img/@src').extract_first()

            # Title
            item['book_name'] = book.xpath('.//div[@class="p-name"]/a/em/text()').extract_first()

            # Author
            item['book_auth'] = book.xpath('.//span[@class="p-bi-name"]/span/a/text()').extract_first()
            # Publisher
            item['book_store'] = book.xpath('.//span[@class="p-bi-store"]/a/text()').extract_first()
            # Publication date
            item['book_time'] = book.xpath('.//span[@class="p-bi-date"]/text()').extract_first()

            # The price is served front-end via a JSONP endpoint keyed on
            # the SKU id; fetched here as plain JSON (cross-origin API).
            price_link = 'https://p.3.cn/prices/mgets?ext=11000000&pin=&type=1&area=1_72_4137_0&skuIds={}'
            sku_id = book.xpath('./div/@data-sku').extract_first()

            # Request this book's price; item is deepcopied per request.
            yield scrapy.Request(
                price_link.format(sku_id),
                callback=self.parse_price,
                meta={'key': deepcopy(item)}
            )

        self.index += 1
        # 4. Pagination: follow the "next" link until the cap is hit.
        next_url = response.xpath('//*[@id="J_bottomPage"]/span[1]/a[10]/@href').extract_first()

        if next_url is not None:

            # Validation cap: stop after 4 listing pages (index > 3).
            if self.index > 3:
                return

            # deepcopy here too, for the same shared-mutation reason as
            # the other requests above.
            yield response.follow(
                next_url,
                callback=self.parse_book_info,
                meta={'key': deepcopy(item)}
            )

    # Parse the price-API response.
    def parse_price(self, response):
        """Extract the 'op' price from the JSON array (one object per
        requested SKU) and hand the finished item to the pipelines."""
        # Item carried over from the per-book request.
        item = response.meta['key']
        item['book_price'] = json.loads(response.text)[0]['op']
        # Hand over to the engine -> item pipelines.
        yield item

# Original source: https://www.cnblogs.com/hanjian200ok/p/9534447.html