scrapy 爬取前程无忧

spider

# -*- coding: utf-8 -*-
import scrapy
from Jobs.items import JobsItem

class Job51spiderSpider(scrapy.Spider):
    """Spider that walks 51job search-result pages and yields one JobsItem per row.

    Pages are fetched one after another: ``parse`` scrapes the current page,
    reads the total page count from the pagination widget and, while more
    pages remain, schedules the next page with itself as the callback.
    """

    name = 'Job51Spider'
    allowed_domains = ['www.51job.com', 'search.51job.com']
    # Number of the most recently scheduled result page.
    offset = 1
    # Start URL prefix; the page number and ".html" are appended to it.
    url = "https://search.51job.com/list/090200,000000,0000,00,9,99,php,2,"
    start_urls = [url + str(offset) + ".html"]

    def parse(self, response):
        """Scrape one result page and schedule the next page if there is one.

        Args:
            response: The downloaded search-result page.

        Yields:
            A ``JobsItem`` for every non-header row of ``#resultList``,
            followed by a ``scrapy.Request`` for the next page when the
            current page is not the last one.
        """
        self.logger.info("Parsing %s", response.url)

        # The row carrying the ``title`` class is the table header; skip it.
        for row in response.css('#resultList .el:not(.title)'):
            # Initialise the model object.
            item = JobsItem()
            # Job title
            item['zwname'] = row.css('.t1 a').xpath('./@title').extract_first()
            # Company name
            item['gsname'] = row.css('.t2 a').xpath('./@title').extract_first()
            # Work location
            item['gzdd'] = row.css('.t3::text').extract_first()
            # Salary
            item['gz'] = row.css('.t4::text').extract_first()
            # Publish time
            item['fbtime'] = row.css('.t5::text').extract_first()
            yield item

        # Take the first run of digits from the pagination text as the total
        # page count. The previous ``split('')`` raised ValueError (empty
        # separator), and calling it on a missing node raised AttributeError.
        total_pages = response.xpath(
            '//div[@class="dw_page"]/div/div/div/span/text()'
        ).re_first(r'\d+')
        if total_pages is None:
            self.logger.warning(
                "No page count found on %s; stopping pagination", response.url
            )
            return

        # Only request a further page when one remains; re-requesting the
        # current page would merely rely on the duplicate filter to stop.
        if self.offset < int(total_pages):
            self.offset += 1
            next_url = self.url + str(self.offset) + ".html"
            yield scrapy.Request(url=next_url, callback=self.parse)

items

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class JobsItem(scrapy.Item):
    """Container for one scraped job listing; each attribute is a scrapy.Field."""

    # Job title
    zwname = scrapy.Field()
    # Company name
    gsname = scrapy.Field()
    # Work location
    gzdd = scrapy.Field()
    # Salary
    gz = scrapy.Field()
    # Publish time
    fbtime = scrapy.Field()

原文地址:https://www.cnblogs.com/sxqfuture/p/10256462.html