Scrapy 异步写入 MySQL 和 ItemLoader 的基本使用

Scrapy 异步写入 MySQL 和 ItemLoader 的基本使用

ItemLoader的基本使用

使用 ItemLoader 能够提高 Scrapy 解析数据时的便利性,并提升代码的可维护性;ItemLoader 与 Field 配合使用,还可以更好地提升代码的可读性。

class QuanshuSpider(scrapy.Spider):
    """Spider that scrapes the book list on the quanshu start page.

    Every ``<li>`` under the ``b-all-content cf`` list is loaded into a
    ``BookItem`` through an ``ItemLoader`` and yielded.
    """

    name = 'quanshu'
    # allowed_domains = ['quanshu.92kaifa.com']
    start_urls = ['http://quanshu.92kaifa.com/']

    def parse(self, response, **kwargs):
        """Yield one loaded item per book entry found in ``response``."""
        # Item field -> XPath evaluated relative to a single <li> entry.
        # The order matches the order in which values are added to the loader.
        field_queries = {
            'title': 'a[@class="msgBorder"]/@title',
            'image_url': 'a[@class="msgBorder"]/img/@src',
            'type': 'a[3]/text()',
            'id': 'a[@class="msgBorder"]/@href',
            'author': 'a[4]/text()',
        }

        for entry in response.xpath('//ul[@class="b-all-content cf"]/li'):
            loader = ItemLoader(item=BookItem(), response=response)
            for field_name, query in field_queries.items():
                # .get() returns the first match (or None when nothing matches).
                loader.add_value(field_name, entry.xpath(query).get())
            yield loader.load_item()

spider文件

import scrapy
import re
from scrapy.loader.processors import TakeFirst, MapCompose


class BookItem(scrapy.Item):
    """Book record filled in by an ``ItemLoader``.

    Every field uses ``TakeFirst`` as its output processor, so the loaded
    item holds a single value per field rather than a list.
    """

    title = scrapy.Field(output_processor=TakeFirst())
    image_url = scrapy.Field(output_processor=TakeFirst())
    type = scrapy.Field(output_processor=TakeFirst())
    author = scrapy.Field(output_processor=TakeFirst())
    id = scrapy.Field(
        # Extract the runs of digits from the incoming string (the spider in
        # this file feeds it the book link's href). The pattern must be the
        # escaped r'\d+'; a bare 'd+' only matches literal "d" characters.
        input_processor=MapCompose(lambda x: re.findall(r'\d+', x)),
        output_processor=TakeFirst()
    )

item文件

scrapy异步写入mysql

from twisted.enterprise import adbapi

import MySQLdb
import MySQLdb.cursors


class BookPipeline:
    """Item pipeline that stores book items in MySQL through Twisted adbapi.

    ``process_item`` hands the insert to an ``adbapi.ConnectionPool`` and
    returns the item right away; failures are reported by ``handle_err``.
    """

    # Columns written by ``insert``, in the order of the SQL placeholders.
    _INSERT_FIELDS = ('title', 'image_url', 'type', 'author', 'id')

    # Values are bound by the driver via ``%s`` placeholders instead of being
    # formatted into the statement, so quotes or backslashes in scraped text
    # can neither break the query nor inject SQL.
    _INSERT_SQL = """
        INSERT INTO books
            (`title`, `image_url`, `type`, `author`, `id`)
        VALUES
            (%s, %s, %s, %s, %s)
    """

    def __init__(self, **kwargs):
        """Store the connection pool passed as the ``db_pool`` keyword."""
        self.db_pool: adbapi.ConnectionPool = kwargs.get('db_pool')

    @classmethod
    def from_settings(cls, settings):
        """Build the pipeline from the ``MYSQL_CONFIG`` setting.

        Example setting::

            MYSQL_CONFIG = {
                'host': 'localhost',
                'user': 'root',
                'passwd': 'qwe123',
                'db': 'scrapy_test',
                'charset': 'utf8',
            }

        :param settings: settings object providing ``MYSQL_CONFIG``.
        :return: a pipeline instance bound to a new connection pool.
        """
        # Work on a copy so the shared settings dict is not modified.
        config = dict(settings['MYSQL_CONFIG'])
        config['cursorclass'] = MySQLdb.cursors.DictCursor
        config['use_unicode'] = True

        db_pool = adbapi.ConnectionPool('MySQLdb', **config)

        return cls(db_pool=db_pool)

    def process_item(self, item, spider):
        """Schedule the insert of ``item`` and return the item unchanged.

        The Deferred returned by ``runInteraction`` is not awaited; errors
        raised while inserting are routed to ``handle_err``.
        """
        deferred = self.db_pool.runInteraction(self.insert, item)
        deferred.addErrback(self.handle_err, item, spider)
        return item

    def insert(self, cursor, item):
        """Insert one item into the ``books`` table.

        :param cursor: DB-API cursor supplied by ``runInteraction``.
        :param item: mapping that holds every field in ``_INSERT_FIELDS``.
        :raises KeyError: if a field is missing from ``item``.
        """
        params = tuple(item[field] for field in self._INSERT_FIELDS)
        cursor.execute(self._INSERT_SQL, params)

    def handle_err(self, error, item, spider):
        """Log a failed insert together with the item that caused it.

        :param error: the Twisted ``Failure`` describing the error.
        :param item: the item whose insert failed.
        :param spider: the running spider; its logger receives the message.
        """
        spider.logger.error(
            'Failed to insert item %r into MySQL:\n%s',
            item,
            error.getTraceback(),
        )

将 MySQLdb 与 Twisted(adbapi)结合使用,可以让写入 MySQL 数据库的操作异步化;最后在 settings.py 的 ITEM_PIPELINES 中启用这个 pipeline 即可。

原文地址:https://www.cnblogs.com/ivy-blogs/p/13462902.html