管道接收从爬虫爬取到的item, 并对item进行过滤, 去重, 清洗等工作, 然后对item进行持久化存储
去重管道:
如果图片名称相同, 我们就认为图片重复, 删除该项目
from scrapy.exceptions import DropItem
class DuplicatePipeline(object):
    """Drop any item whose picture name was already seen in this crawl."""

    def __init__(self):
        # Picture names double as the dedup fingerprint for the whole run.
        self.fingerprints = set()

    def process_item(self, item, spider):
        name = item['picname']
        if name in self.fingerprints:
            raise DropItem
        self.fingerprints.add(name)
        return item
# 将数据存储到csv文件中
import csv


class CsvFeedPipeline(object):
    """Append each item as a row to data.csv (columns: classname, picname, picurl)."""

    def __init__(self):
        # newline='' per the csv module docs: prevents blank rows on Windows.
        self.fp = open('data.csv', 'a', encoding='utf8', newline='')
        # Fix: original misspelled this local as 'filednames' but passed
        # 'fieldnames' to DictWriter, raising NameError at construction.
        fieldnames = ['classname', 'picname', 'picurl']
        self.writer = csv.DictWriter(self.fp, fieldnames=fieldnames)
        self.writer.writeheader()

    def process_item(self, item, spider):
        self.writer.writerow(dict(item))
        return item

    def close_spider(self, spider):
        self.fp.close()
# 过滤, 删除掉与游戏相关的新闻内容
class BlockGamePipeline(object):
    """Drop news items whose title mentions games ('游戏')."""

    def process_item(self, item, spider):
        # Fix: the original tested a str key against item['title'].encode('utf8')
        # (bytes), which raises TypeError for every item in Python 3.
        # Compare str-in-str directly instead.
        filter_key = '游戏'
        if filter_key in item['title']:
            raise DropItem
        return item
# 加工性管道:
class ProductPricePipeline():
    """Enrich each item with a computed total amount (price * count)."""

    def process_item(self, item, spider):
        price = float(item['price'])
        count = float(item['count'])
        item['total'] = price * count
        return item
# 存储型管道
import json


class JsonFeedPipeline:
    """Write all items to feed.json as one JSON array.

    Fixes: the original header was garbled ('class json class ...', a syntax
    error; it should import json), and it appended ', ' after EVERY item, so
    the closing ' ]' left a trailing comma and the output was invalid JSON.
    A first-item flag now writes the separator only between items.
    """

    def __init__(self):
        self.json_file = open('feed.json', 'wt')
        self.json_file.write('[ ')
        # True until the first item is written; controls comma placement.
        self._first = True

    def process_item(self, item, spider):
        if not self._first:
            self.json_file.write(', ')
        self._first = False
        self.json_file.write(json.dumps(dict(item)))
        return item

    def close_spider(self, spider):
        self.json_file.write(' ]')
        self.json_file.close()