05 Pipelines

A pipeline receives the items scraped by the spider, filters, deduplicates, and cleans them, and then persists them to storage.
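
Pipelines are turned on in settings.py through the ITEM_PIPELINES setting; the integer value (0-1000) decides the order items flow through them. A minimal sketch, assuming the classes below live in a hypothetical myproject/pipelines.py:

# settings.py -- lower numbers run earlier
ITEM_PIPELINES = {
    'myproject.pipelines.DuplicatePipeline': 300,
    'myproject.pipelines.BlockGamePipeline': 400,
    'myproject.pipelines.ProductPricePipeline': 500,
    'myproject.pipelines.CsvFeedPipeline': 800,
}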

Deduplication pipeline:
  If two images share the same name, we treat them as duplicates and drop the item.

from scrapy.exceptions import DropItem

class DuplicatePipeline(object):
    def __init__(self):
        # we use a set of picture names as the data fingerprint store
        self.fingerprints = set()

    def process_item(self, item, spider):
        if item['picname'] in self.fingerprints:
            raise DropItem('duplicate picture: %s' % item['picname'])
        self.fingerprints.add(item['picname'])
        return item

# Store the data in a CSV file

import csv

class CsvFeedPipeline(object):
    def __init__(self):
        # newline='' keeps the csv module from inserting blank rows on Windows;
        # note that append mode writes a fresh header row on every run
        self.fp = open('data.csv', 'a', encoding='utf8', newline='')
        fieldnames = ['classname', 'picname', 'picurl']
        self.writer = csv.DictWriter(self.fp, fieldnames=fieldnames)
        self.writer.writeheader()
    
    def process_item(self, item, spider):
        self.writer.writerow(dict(item))
        return item
       
    def close_spider(self, spider):
        self.fp.close()

# Filter: drop news items whose title relates to games

class BlockGamePipeline(object):
    def process_item(self, item, spider):
        filter_key = '游戏'  # 'game' -- the Chinese keyword matched against titles
        # compare str to str; calling .encode('utf8') here would raise a
        # TypeError in Python 3 (searching str inside bytes)
        if filter_key in item['title']:
            raise DropItem('game-related news: %s' % item['title'])
        return item

# Transformation pipeline: derive new fields from scraped ones

class ProductPricePipeline:
    def process_item(self, item, spider):
        # derive the total price from the scraped unit price and count
        item['total'] = float(item['price']) * float(item['count'])
        return item
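
For item['total'] to be assignable, the item class must declare a total field: assigning an undeclared key on a scrapy.Item raises a KeyError. A minimal sketch of such an item (the ProductItem name is an assumption, not from the original post):

import scrapy

class ProductItem(scrapy.Item):
    price = scrapy.Field()
    count = scrapy.Field()
    total = scrapy.Field()  # filled in by ProductPricePipeline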

# Storage pipeline

import json

class JsonFeedPipeline:
    def __init__(self):
        self.json_file = open('feed.json', 'wt', encoding='utf8')
        self.json_file.write('[\n')
        self.first_item = True

    def process_item(self, item, spider):
        # write a comma before every item except the first, so the
        # finished file is valid JSON (no trailing comma before ']')
        if not self.first_item:
            self.json_file.write(',\n')
        self.first_item = False
        self.json_file.write(json.dumps(dict(item), ensure_ascii=False))
        return item

    def close_spider(self, spider):
        self.json_file.write('\n]')
        self.json_file.close()
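
For simple JSON or CSV output like the two storage pipelines above, Scrapy's built-in feed exports can do the same job without custom code. A minimal sketch using the FEEDS setting (available since Scrapy 2.1; the file names are just examples):

# settings.py
FEEDS = {
    'feed.json': {'format': 'json', 'encoding': 'utf8'},
    'data.csv': {'format': 'csv'},
}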

Source: https://www.cnblogs.com/zhangjian0092/p/11693450.html