Scraping Qidian novels, day 02

Overall, Qidian is fairly easy to scrape; the only real problem is speed. Fetching the novels themselves is slow: 4,000-odd books took two days and a night to crawl.
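If the crawl is that slow, the usual first lever in Scrapy is the concurrency settings. A minimal sketch of a settings.py tweak; the values below are illustrative assumptions, not what this project actually used, and raising them increases the risk of being throttled or banned:

# settings.py -- illustrative values, not taken from the original project.
CONCURRENT_REQUESTS = 32             # global cap; Scrapy's default is 16
CONCURRENT_REQUESTS_PER_DOMAIN = 16  # cap per domain
DOWNLOAD_DELAY = 0.25                # seconds between requests to one domain
AUTOTHROTTLE_ENABLED = True          # back off automatically if the server slows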

The first step is to crawl all of the category links on the site, storing each category name in MongoDB and each link in Redis:

import scrapy
from bson.objectid import ObjectId
from scrapy.http import Request
from qidianNovel.spiders.connectionSQL import getredis, getMongodb

# Crawl every category listed on Qidian's "all works" page.
class spider_list(scrapy.Spider):
    name = "spider_list"              # name used to launch the spider
    allowed_domains = ["qidian.com"]  # restrict the crawl to this domain
    start_urls = [                    # entry point of the crawl
        "https://www.qidian.com/all?orderId=&page=1&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0"
    ]

    # parse() is called back with the response of each fetched page.
    def parse(self, response):
        hx = response.xpath('//div[@class="work-filter type-filter"]/ul/li/a')
        # Start at 1 to skip the first filter link (the "All" entry).
        for i in range(1, len(hx)):
            name = hx[i].xpath("text()").extract()[0]
            print(name)
            Pid = self.insertMongodb(name, None)
            url = "https:" + hx[i].xpath("@href").extract()[0]
            print(url)
            # Bind Pid as a default argument so each callback keeps its own
            # id instead of all closures sharing the loop's final value.
            request = Request(url, callback=lambda response, Pid=str(Pid): self.detail_list(response, Pid))
            yield request

    # Parse the sub-category links on one top-level category page.
    def detail_list(self, response, Pid):
        links = response.xpath('//div[@class="sub-type"]/dl[@class=""]/dd/a')
        pid = ObjectId(Pid)
        for link in links:
            name = link.xpath("text()").extract()[0]
            href = link.xpath("@href").extract()[0]
            print("***************")
            print(name)
            print(href)
            print("***************")
            cid = self.insertMongodb(name, pid)
            self.pushRedis(pid, cid, "https:" + href)

    # Insert a category into MongoDB; pid links a sub-category to its parent.
    def insertMongodb(self, className, pid):
        bcollection = getMongodb('novel', 'boy_list')
        id = bcollection.insert_one({'list_name': className, 'pid': pid}).inserted_id
        return id

    # Push "pid,cid,href" onto a Redis list for the next stage to consume.
    def pushRedis(self, pid, cid, href):
        red = getredis()
        item = "%s,%s,%s" % (pid, cid, href)
        red.lpush('bnovel_all_list', item)
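The spider imports getredis and getMongodb from a project-local connectionSQL module that the post never shows. A minimal sketch of what it might look like, assuming pymongo and redis-py against default local servers (the hosts, ports, and db index are assumptions):

# connectionSQL.py -- hypothetical reconstruction; the original is not shown.
import pymongo
import redis

def getMongodb(db_name, collection_name):
    # Return a handle to one MongoDB collection on a local server.
    client = pymongo.MongoClient('localhost', 27017)
    return client[db_name][collection_name]

def getredis():
    # Return a Redis connection; db index 0 is an assumption.
    return redis.Redis(host='localhost', port=6379, db=0)

Since pushRedis uses lpush, a downstream spider can pop entries FIFO with red.rpop('bnovel_all_list') and split the value on commas to recover pid, cid, and href.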


Original post: https://www.cnblogs.com/qieyu/p/7989233.html