Hands-on: scraping novels with scrapy + pymongo

Using pymongo

  • pymongo is a third-party library for working with MongoDB from Python; it is simple and convenient to use

  • Installing pymongo - link on the official site
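
Assuming a standard pip environment, installation is a single command:

pip install pymongo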

Getting started with pymongo

# -*- coding: utf-8 -*-
import pymongo

# Connect to MongoDB (defaults to localhost:27017)
connection = pymongo.MongoClient()
# Select (lazily create) a database
tdb = connection.Jikexueyuan
# Select (lazily create) a collection in that database
post_info = tdb.test
# Documents are plain dictionaries
jike = {'name': '张三', 'age': 3, 'skill': 'python'}
god = {'name': '李四', 'age': 10, 'skill': 'pymongo'}

# Insert the documents into the collection
post_info.insert_one(jike)
post_info.insert_one(god)

# Delete a document
post_info.delete_one({'name': '张三'})

# Query and print the remaining documents
for doc in post_info.find():
    print(doc)
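
To confirm that the inserts actually reached the server, a quick check can list what now exists (list_database_names and list_collection_names are available from pymongo 3.6 onward):

# The database and collection only show up once they hold data
print(connection.list_database_names())
print(tdb.list_collection_names())
print(post_info.find_one({'name': '李四'}))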

Using pymongo in Scrapy

  • In pipelines.py:
from scrapy.utils.project import get_project_settings
import pymongo

class PymongoPipeline(object):
    def __init__(self):
        # Read the MongoDB connection details from settings.py
        settings = get_project_settings()
        host = settings['MONGO_HOST']
        port = settings['MONGO_PORT']
        db = settings['MONGO_DB']
        client = pymongo.MongoClient(host=host, port=port)
        tdb = client[db]
        self.post = tdb[settings['MONGO_DOCNAME']]

    def process_item(self, item, spider):
        # Store each item as a plain dict
        book_info = dict(item)
        self.post.insert_one(book_info)
        return item

  • A more idiomatic version (the example from the Scrapy documentation) opens the MongoDB connection when the spider opens and closes it when the spider closes:
import pymongo

class MongoPipeline(object):

    collection_name = 'scrapy_items'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.db[self.collection_name].insert_one(dict(item))
        return item
  • Then enable the pipeline in settings.py so Scrapy actually runs it (the number controls execution order):
ITEM_PIPELINES = {
    'myproject.pipelines.MongoPipeline': 300,
}
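
Both pipelines above also read their MongoDB connection details from settings.py. A minimal sketch of those settings (the key names match what the pipelines look up; the host, port and database values are assumptions for a local MongoDB):

# Used by MongoPipeline
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = 'xiaoshuo'

# Used by PymongoPipeline
MONGO_HOST = 'localhost'
MONGO_PORT = 27017
MONGO_DB = 'xiaoshuo'
MONGO_DOCNAME = 'chapters'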

Novel scraping in practice

  • In items.py, define the fields to store:
from scrapy import Field, Item

class XsItem(Item):
    bookName = Field()
    bookTitle = Field()
    chapterNum = Field()
    chapterName = Field()
    chapterURL = Field()
  • In the spider (the concrete XPath rules depend on the page you are analyzing; this is only an example):
from scrapy.selector import Selector


def parse(self, response):
    selector = Selector(response)
    table = selector.xpath('//table')
    for each in table:
        bookName = each.xpath('tr/td[@colspan="3"]/center/h2/text()').extract()[0]
        # The link text holds "title chapterNum chapterName"; the href is the chapter URL
        content = each.xpath('tr/td/a/text()').extract()
        url = each.xpath('tr/td/a/@href').extract()
        for i in range(len(url)):
            # Import your own Item class, e.g. from <yourproject>.items import XsItem
            item = XsItem()
            item['bookName'] = bookName
            item['chapterURL'] = url[i]
            try:
                item['bookTitle'] = content[i].split(' ')[0]
                item['chapterNum'] = content[i].split(' ')[1]
            except Exception:
                continue
            try:
                item['chapterName'] = content[i].split(' ')[2]
            except Exception:
                item['chapterName'] = content[i].split(' ')[1][-3:]
            yield item
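
For orientation, the parse method above belongs inside a spider class. A minimal skeleton, with a placeholder name and start URL rather than the real novel site:

from scrapy.spiders import Spider
from myproject.items import XsItem  # adjust to your project's items module


class XsSpider(Spider):
    name = 'xiaoshuo'
    # Replace with the actual novel listing page you analyzed
    start_urls = ['http://example.com/book/']

    def parse(self, response):
        # ... the parsing logic shown above ...
        pass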
[Note:] For the actual parsing, analyze the data on the novel site itself; this only sketches the approach.
Original post: https://www.cnblogs.com/yymor/p/10232202.html