scrapy之持久化存储

持久化存储的两种方式

1基于磁盘文件存储

基于终端指令

　　基于终端指令需要注意两点：

保证parse方法返回一个可迭代类型的对象（存储解析到的页面内容）
使用终端指令完成数据存储到指定磁盘文件中的操作　　scrapy crawl 爬虫文件名称 -o 磁盘文件.后缀

# -*- coding: utf-8 -*-
import scrapy

class QiubaiSpider(scrapy.Spider):
    """Spider that collects author/content pairs from the qiushibaike text page.

    ``parse`` returns a list of plain dicts (an iterable), which is what the
    ``scrapy crawl <spider> -o <file>`` export described in this article needs.
    """

    name = 'qiubai'
    # allowed_domains = ['www.qiushibaike.com/text']
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        """Return one ``{'author': ..., 'content': ...}`` dict per post block."""
        # The response exposes XPath selection directly; each match is a
        # Selector, and extract_first() pulls the first matched string out of it.
        posts = response.xpath('//div[@id="content-left"]/div')
        return [
            {
                'author': post.xpath('./div/a[2]/h2/text()').extract_first(),
                'content': post.xpath('.//div[@class="content"]/span/text()').extract_first(),
            }
            for post in posts
        ]

爬虫文件

不要忘记在执行之前先进行配置，每一个工程都需要配置。

基于管道

items：存储解析到的页面数据
pipelines：处理持久化存储的相关操作
代码实现流程：
1. 将解析到的页面数据存储到items对象
2. 使用yield关键字将items提交给管道文件进行处理
3. 在管道文件中编写代码完成数据存储的操作
4. 在配置文件中开启管道操作

# -*- coding: utf-8 -*-
import scrapy
from qiubaiPro.items import QiubaiproItem

class QiubaiSpider(scrapy.Spider):
    """Spider that yields one ``QiubaiproItem`` per post for the item pipeline."""

    name = 'qiubai'
    # allowed_domains = ['www.qiushibaike.com/text']
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        """Yield an item holding the author and content of every post block."""
        # Each matched <div> is a Selector; extract_first() pulls the first
        # matched string out of the nested XPath query.
        for post in response.xpath('//div[@id="content-left"]/div'):
            # Step 1: store the parsed values on an item object.
            entry = QiubaiproItem()
            entry['author'] = post.xpath('./div/a[2]/h2/text()').extract_first()
            entry['content'] = post.xpath('.//div[@class="content"]/span/text()').extract_first()

            # Step 2: hand the item over to the pipeline.
            yield entry

爬虫文件

import scrapy


class QiubaiproItem(scrapy.Item):
    # Item carrying the parsed page data from the spider to the pipeline.
    # Each attribute below declares one field of the item.
    # Author text of a post.
    author = scrapy.Field()
    # Content text of a post.
    content = scrapy.Field()

items.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class QiubaiproPipeline(object):
    """Item pipeline that writes every received item to ``./qiubai_pipe.txt``."""

    # Handle of the output file; opened in open_spider, closed in close_spider.
    fp = None

    def open_spider(self, spider):
        """Open the output file. Called once, when the crawl starts."""
        print('开始爬虫')
        self.fp = open('./qiubai_pipe.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Persist one item as ``author:content`` followed by three newlines.

        Called once for every item the spider submits to the pipeline.

        Args:
            item: mapping with ``'author'`` and ``'content'`` entries.
            spider: the spider that produced the item (unused).

        Returns:
            The same item, so that later pipelines can keep processing it.
        """
        # A field is None when the spider's XPath matched nothing; write an
        # empty string instead of failing on None + str.
        author = item['author'] or ''
        content = item['content'] or ''

        self.fp.write(author + ":" + content + '\n\n\n')
        return item

    def close_spider(self, spider):
        """Close the output file. Called once, when the crawl ends."""
        print('爬虫结束')
        # The file may never have been opened if open_spider failed.
        if self.fp is not None:
            self.fp.close()

pipelines.py

# Setting that enables the item pipeline.
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Maps the pipeline class's dotted path to an integer value (300 here).
ITEM_PIPELINES = {
    'qiubaiPro.pipelines.QiubaiproPipeline': 300,
}

2基于数据库存储

编码流程：

将解析到的页面数据存储到items对象
使用yield关键字将items提交给管道文件进行处理
在管道文件中编写代码完成数据存储的操作
在配置文件中开启管道操作

上述的文件只需要改一下pipelines.py 就可以了

基于MySQL数据库

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
class QiubaiproPipeline(object):
    """Item pipeline that inserts every received item into the ``qiubai`` MySQL table."""

    # Database connection, created in open_spider.
    conn = None
    # Cursor shared by all inserts; created on first use, closed in close_spider.
    cursor = None

    def open_spider(self, spider):
        """Connect to the database. Called once, when the crawl starts."""
        print('开始爬虫')
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='123456', db='qiubai')

    def process_item(self, item, spider):
        """Insert one item's author and content as a row.

        Args:
            item: mapping with ``'author'`` and ``'content'`` entries.
            spider: the spider that produced the item (unused).

        Returns:
            The same item, so that later pipelines can keep processing it.
        """
        # Placeholders are filled in by the driver, so quotes or other SQL
        # metacharacters in scraped text cannot break or alter the statement.
        sql = 'insert into qiubai values(%s,%s)'
        # Reuse one cursor for the whole crawl instead of leaking one per item.
        if self.cursor is None:
            self.cursor = self.conn.cursor()
        try:
            self.cursor.execute(sql, (item['author'], item['content']))
            self.conn.commit()
        except Exception as e:
            # Best effort: report the failure, undo the partial transaction
            # and keep the crawl running.
            print(e)
            self.conn.rollback()

        return item

    def close_spider(self, spider):
        """Release the cursor and the connection. Called once, when the crawl ends."""
        print('爬虫结束')
        # Either may still be None if no item arrived or the connect failed.
        if self.cursor is not None:
            self.cursor.close()
        if self.conn is not None:
            self.conn.close()

数据库和表必须事先创建好，然后才可以向里面插入数据；直接创建一个含有两个字段（varchar类型字段）的表即可。

基于redis数据库

import redis

class QiubaiproPipeline(object):
    """Item pipeline that pushes every received item onto the Redis list ``data``."""

    # Redis client, created in open_spider.
    conn = None

    def open_spider(self, spider):
        """Create the Redis client. Called once, when the crawl starts."""
        print('开始爬虫')
        self.conn = redis.Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        """Push one item's author and content onto the ``data`` list as JSON.

        Args:
            item: mapping with ``'author'`` and ``'content'`` entries.
            spider: the spider that produced the item (unused).

        Returns:
            The same item, so that later pipelines can keep processing it.
        """
        # Imported locally so this block does not depend on a file-level import.
        import json

        record = {
            'author': item['author'],
            'content': item['content'],
        }
        # Redis values must be bytes/str/numbers; a dict is rejected by current
        # redis-py, so store a JSON string (non-ASCII text kept readable).
        self.conn.lpush('data', json.dumps(record, ensure_ascii=False))
        return item