Scrapy 异步写入 MySQL 和 ItemLoader 的基本使用
ItemLoader的基本使用
使用 ItemLoader 能够提高 Scrapy 解析数据时的便利性,并提升代码的可维护性;ItemLoader 与 Field 配合使用,还可以更好地提升代码的可读性。
class QuanshuSpider(scrapy.Spider):
    """Spider that loads one ``BookItem`` per entry of the site's book list."""

    name = 'quanshu'
    # allowed_domains = ['quanshu.92kaifa.com']
    start_urls = ['http://quanshu.92kaifa.com/']

    def parse(self, response, **kwargs):
        """Yield a loaded ``BookItem`` for every ``<li>`` in the book list.

        Each field value is read with an XPath evaluated relative to the
        ``<li>`` node and handed to the loader through ``add_value``.
        """
        # (item field, XPath relative to the <li>) pairs, listed in the
        # order in which the values are added to the loader.
        field_paths = (
            ('title', 'a[@class="msgBorder"]/@title'),
            ('image_url', 'a[@class="msgBorder"]/img/@src'),
            ('type', 'a[3]/text()'),
            ('id', 'a[@class="msgBorder"]/@href'),
            ('author', 'a[4]/text()'),
        )
        for node in response.xpath('//ul[@class="b-all-content cf"]/li'):
            loader = ItemLoader(item=BookItem(), response=response)
            for field, path in field_paths:
                loader.add_value(field, node.xpath(path).get())
            yield loader.load_item()
spider文件
import scrapy
import re
from scrapy.loader.processors import TakeFirst, MapCompose
class BookItem(scrapy.Item):
    """Item describing one book.

    Every field uses ``TakeFirst`` as its output processor, so the loaded
    item holds a single value per field rather than a list.
    """

    title = scrapy.Field(output_processor=TakeFirst())
    image_url = scrapy.Field(output_processor=TakeFirst())
    type = scrapy.Field(output_processor=TakeFirst())
    author = scrapy.Field(output_processor=TakeFirst())
    # Keep only the digit runs of the incoming string (the spider feeds this
    # field a link href). The pattern has to be the raw string r'\d+':
    # a plain 'd+' matches the letter "d" instead of digits.
    id = scrapy.Field(
        input_processor=MapCompose(lambda x: re.findall(r'\d+', x)),
        output_processor=TakeFirst(),
    )
item文件
scrapy异步写入mysql
from twisted.enterprise import adbapi
import MySQLdb
import MySQLdb.cursors
class BookPipeline:
    """Item pipeline that inserts items into the ``books`` table.

    Inserts are scheduled on a Twisted ``adbapi.ConnectionPool`` through
    ``runInteraction``; ``process_item`` returns the item right away
    without waiting for the insert to finish.
    """

    # Item keys read by ``insert``, in the same order as the columns and
    # placeholders of ``_INSERT_SQL``.
    _COLUMNS = ('title', 'image_url', 'type', 'author', 'id')

    # Values are bound by the driver through %s placeholders instead of
    # being formatted into the statement, so quotes inside scraped text can
    # neither break the statement nor inject SQL.
    _INSERT_SQL = (
        'INSERT INTO books (`title`, `image_url`, `type`, `author`, `id`) '
        'VALUES (%s, %s, %s, %s, %s)'
    )

    def __init__(self, **kwargs):
        """Store the connection pool passed as the ``db_pool`` keyword."""
        self.db_pool: adbapi.ConnectionPool = kwargs.get('db_pool')

    @classmethod
    def from_settings(cls, settings):
        """Build the pipeline from the ``MYSQL_CONFIG`` setting.

        ``MYSQL_CONFIG`` is a dict of ``MySQLdb`` connection keyword
        arguments, for example::

            MYSQL_CONFIG = {
                'host': 'localhost',
                'user': 'root',
                'passwd': 'qwe123',
                'db': 'scrapy_test',
                'charset': 'utf8',
            }

        Raises:
            KeyError: if ``MYSQL_CONFIG`` is not defined in the settings.
        """
        # Work on a copy so the dict stored in the settings is not mutated.
        config = dict(settings['MYSQL_CONFIG'])
        config['cursorclass'] = MySQLdb.cursors.DictCursor
        config['use_unicode'] = True
        db_pool = adbapi.ConnectionPool('MySQLdb', **config)
        return cls(db_pool=db_pool)

    def process_item(self, item, spider):
        """Schedule the insert of ``item`` on the pool and return the item."""
        deferred = self.db_pool.runInteraction(self.insert, item)
        deferred.addErrback(self.handle_err, item, spider)
        return item

    def insert(self, cursor, item):
        """Insert one item into ``books`` using a parameterized statement.

        Args:
            cursor: cursor supplied by ``runInteraction``.
            item: mapping providing every key listed in ``_COLUMNS``.

        Raises:
            KeyError: if ``item`` lacks one of the expected keys.
        """
        params = tuple(item[column] for column in self._COLUMNS)
        cursor.execute(self._INSERT_SQL, params)

    def handle_err(self, error, item, spider):
        """Log a failed insert together with the item that caused it."""
        spider.logger.error('Failed to insert item %r: %s', item, error)
结合 MySQLdb 和 Twisted 的 adbapi 可以让写入 MySQL 数据库的操作异步化,最后在 settings.py 的 ITEM_PIPELINES 中启用这个 pipeline 即可。