Scrapy用Pipeline写入MySQL

编辑pipelines.py,添加自定义pipelines类:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# class HongxiuPipeline(object):
# def process_item(self, item, spider):
# return item
import datetime
from twisted.enterprise import adbapi


class HongxiuMysqlPipeline(object):
    """Scrapy item pipeline that inserts scraped items into MySQL.

    Inserts are submitted through a Twisted ``adbapi.ConnectionPool`` so the
    database work does not run on the reactor thread. Connection parameters
    are read from the crawler settings in :meth:`from_crawler`.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and load MySQL connection settings.

        Reads ``MYSQL_HOST``, ``MYSQL_PORT``, ``MYSQL_USER`` and
        ``MYSQL_PASSWORD`` from ``crawler.settings`` and stores them as class
        attributes, then returns a new pipeline instance.

        ``MYSQL_DB_NAME`` is not read here; the INSERT statement in
        :meth:`insert_db` names the database explicitly.
        """
        settings = crawler.settings
        cls.HOST = settings.get("MYSQL_HOST")
        cls.PORT = settings.get("MYSQL_PORT")
        cls.USER = settings.get("MYSQL_USER")
        cls.PASSWD = settings.get("MYSQL_PASSWORD")
        return cls()

    def open_spider(self, spider):
        """Create the pymysql-backed connection pool when the spider opens."""
        self.dbpool = adbapi.ConnectionPool(
            'pymysql',
            host=self.HOST,
            port=self.PORT,
            user=self.USER,
            passwd=self.PASSWD,
            charset='utf8',
        )

    def process_item(self, item, spider):
        """Schedule an asynchronous insert for ``item`` and return it unchanged.

        The item is returned immediately; the insert itself runs later in a
        pool thread via :meth:`insert_db`.
        """
        deferred = self.dbpool.runInteraction(self.insert_db, item)
        # Attach the errback so a failed insert is reported by handle_error
        # instead of being left as an unhandled error on the Deferred.
        deferred.addErrback(self.handle_error)
        return item

    def handle_error(self, failure):
        """Report an exception raised by an asynchronous insert."""
        print(failure)

    def close_spider(self, spider):
        """Close the connection pool when the spider finishes."""
        self.dbpool.close()

    def insert_db(self, cur, item):
        """Insert one item as a row using the cursor ``cur``.

        Called by ``runInteraction`` with a cursor as the first argument.
        The values are passed separately from the SQL text so the driver
        performs the parameter substitution.
        """
        # The first and last columns are sent as NULL.
        # NOTE(review): presumably an auto-increment id and a column filled
        # in by the database (e.g. a timestamp) - confirm against the schema.
        values = (
            None,
            item['book_id'],
            item['book_name'],
            item['book_author'],
            item['book_type'],
            item['tag'],
            item['brief'],
            item['website'],
            None,
        )
        # One placeholder per value, so the two can never get out of sync.
        placeholders = ','.join(['%s'] * len(values))
        sql = 'INSERT INTO 库名.表名 VALUES (' + placeholders + ')'
        cur.execute(sql, values)

接着在settings.py中写入相关配置参数,并将上面定义的pipeline类添加至ITEM_PIPELINES中:

# MySQL connection parameters for the pipeline.
# MYSQL_HOST, MYSQL_PORT, MYSQL_USER and MYSQL_PASSWORD are read in the
# pipeline's from_crawler(); the MYSQL_DB_NAME lookup there is commented out.
MYSQL_DB_NAME = 'scrapy_db'
MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'new.1234'

# Register the pipeline by its dotted import path. The class name must match
# the one defined in pipelines.py (HongxiuMysqlPipeline).
# NOTE(review): 'hongxiu' is assumed to be the project package name - adjust
# it to match your own Scrapy project.
ITEM_PIPELINES = {
    'hongxiu.pipelines.HongxiuMysqlPipeline': 400,
}
原文地址:https://www.cnblogs.com/HugJun/p/12176858.html