某家简单爬虫记录

定义爬取数据

import scrapy

class LianjianItem(scrapy.Item):
    """Container for the listing fields scraped from a Lianjia new-homes page.

    NOTE(review): the spider in this write-up assigns the whole list returned
    by ``.extract()`` to each field, so a field holds a list of strings for
    one page rather than a single value.
    """
    # Text of the links inside ``div.resblock-name`` (presumably project names).
    name = scrapy.Field()
    # Text of the links inside ``div.resblock-location``.
    address = scrapy.Field()
    # Text of the spans inside ``a.resblock-room`` (presumably room layouts).
    type = scrapy.Field()
    # Text of the spans inside ``div.resblock-area``.
    size = scrapy.Field()
    # Text of ``div.second`` inside ``div.resblock-price``.
    price = scrapy.Field()

编写爬虫文件

# -*- coding: utf-8 -*-
import scrapy
from lianjian.items import LianjianItem
from scrapy.http import Request

class LianjiaspiderSpider(scrapy.Spider):
    """Spider that extracts new-home listing fields from Lianjia's Hangzhou pages.

    All listing pages (pg1 .. pg51) are enumerated in ``start_urls`` so that
    each page is requested exactly once; ``parse`` only extracts data.
    """

    name = 'lianjiaSpider'
    allowed_domains = ['lianjia.com']
    # Enumerate every page up front instead of re-yielding the full page range
    # from each response: start requests bypass the duplicate filter, so the
    # old approach fetched pg1 twice (duplicate data) and produced 51 mostly
    # redundant requests per response.
    start_urls = [
        'https://hz.fang.lianjia.com/loupan/yuhang-xiaoshan-binjiang/pg%d/' % page
        for page in range(1, 52)
    ]

    def parse(self, response):
        """Yield one item holding the field lists extracted from ``response``.

        Each field is assigned the list of every matching text node on the
        page, so one item describes a whole listing page.

        Args:
            response: the downloaded listing page.

        Yields:
            LianjianItem: the per-page lists for name, address, type, size
            and price.
        """
        item = LianjianItem()
        item['name'] = response.xpath('//div[@class="resblock-name"]/a/text()').extract()
        item['address'] = response.xpath('//div[@class="resblock-location"]/a/text()').extract()
        item['type'] = response.xpath('//a[@class="resblock-room"]/span/text()').extract()
        item['size'] = response.xpath('//div[@class="resblock-area"]/span/text()').extract()
        item['price'] = response.xpath('//div[@class="resblock-price"]/div[@class="second"]/text()').extract()
        yield item

定义管道

编写管道文件

# -*- coding: utf-8 -*-

import xlwt
import xlrd

class LianjianPipeline(object):
    """Item pipeline that prints every listing contained in a page-level item.

    The item is expected to map each field name to a list of values, where
    position ``i`` of every list belongs to the same listing.
    """

    # Fields printed for each listing, in output order.
    _FIELDS = ('name', 'address', 'type', 'size', 'price')
    _SEPARATOR = "-----------------------"

    def process_item(self, item, spider):
        """Print each listing's fields one per line, then a separator line.

        The number of listings is taken from ``item['name']``. If another
        field's list is shorter, an empty string is printed for the missing
        value instead of raising ``IndexError`` part-way through the output.

        Args:
            item: mapping of field name to list of values.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, unchanged, for later pipeline stages.
        """
        for index in range(len(item['name'])):
            for field in self._FIELDS:
                values = item[field]
                # Field lists can be shorter than the name list when the page
                # markup is uneven; fall back to a blank value.
                print(values[index] if index < len(values) else '')
            print(self._SEPARATOR)
        return item

settings.py开启管道

# Register the project's pipeline with Scrapy; 300 is its order value
# (pipelines with lower values run first).
ITEM_PIPELINES = {'lianjian.pipelines.LianjianPipeline': 300}

启动爬虫文件

在项目根目录下执行 `scrapy crawl lianjiaSpider` 启动爬虫。(原文此处为运行结果截图 image.png,图片未能保留。)

原文地址:https://www.cnblogs.com/yiweiblog/p/12652493.html