Crawling Baidu job postings with Scrapy

Baidu's recruitment site returns all of its data through AJAX, so scraping the rendered page with Scrapy is awkward; the workaround is to request the AJAX interface directly and parse the JSON it returns.
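Before writing the spider, it is worth confirming what that interface actually returns. Below is a minimal sketch with requests, assuming the getPostList endpoint still responds the way it did when this post was written; it is only a quick check, not part of the project code.

import json
import requests

# The paginated AJAX endpoint the listing page calls behind the scenes.
url = ('http://talent.baidu.com/baidu/web/httpservice/getPostList'
       '?workPlace=0%2F4%2F7%2F9&recruitType=2&pageSize=10&curPage=1')

resp = requests.get(url, timeout=10)
data = json.loads(resp.text)

# Expecting a 'postList' key holding ten postings per page.
print(list(data.keys()))
print(len(data.get('postList', [])))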
Start by modelling the data in the items file:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class BaiduItem(scrapy.Item):
    # job title
    job_name = scrapy.Field()
    # job category
    job_type = scrapy.Field()
    # work location
    address = scrapy.Field()
    # number of openings
    number = scrapy.Field()
    # publish date
    pub_time = scrapy.Field()
    # detail page, e.g. ref="#/jobDetail/2/1345536716"
    detail_link = scrapy.Field()
    # job duties
    duty = scrapy.Field()
    # job requirements
    require = scrapy.Field()
items.py
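A BaiduItem behaves like a dict restricted to the declared fields, which is worth keeping in mind when populating it in the spider. A small illustration with made-up values:

from BaiDu.items import BaiduItem

item = BaiduItem()
item['job_name'] = 'Test Engineer'   # hypothetical value
item['number'] = 3                   # hypothetical value

print(dict(item))        # {'job_name': 'Test Engineer', 'number': 3}
# Assigning to a field that was not declared raises KeyError, e.g.:
# item['salary'] = '20k'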

The settings.py file is shown below. It defines the MongoDB connection settings (MONGO_HOST and friends) and registers the two item pipelines; everything else is left at the commented-out defaults.

# -*- coding: utf-8 -*-

# Scrapy settings for BaiDu project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'BaiDu'

SPIDER_MODULES = ['BaiDu.spiders']
NEWSPIDER_MODULE = 'BaiDu.spiders'

MONGO_HOST = "127.0.0.1"
MONGO_PORT = 27017
MONGO_DBNAME = "baidu"
MONGO_COLNAME = "zhaopin"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'BaiDu (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'BaiDu.middlewares.BaiduSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'BaiDu.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'BaiDu.pipelines.BaiduPipeline': 300,
    'BaiDu.pipelines.MongPipeline': 301,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
settings.py
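The MONGO_* keys are ordinary custom settings, so they are available anywhere the Scrapy settings object is. For instance, outside a running crawl they can be inspected with get_project_settings; this is just a quick sketch and assumes it is run from inside the project directory so scrapy.cfg is found.

from scrapy.utils.project import get_project_settings

settings = get_project_settings()
print(settings.get('MONGO_HOST'))     # 127.0.0.1
print(settings.getint('MONGO_PORT'))  # 27017
print(settings.get('MONGO_DBNAME'))   # baidu

Inside a pipeline the same values come from crawler.settings, as the MongPipeline below shows.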

Now the main code: the spider (baidu.py). Instead of scraping the HTML page, it requests Baidu's AJAX interface and reads the job data from the JSON it returns, paging until the interface runs out of postings.

# -*- coding: utf-8 -*-
import scrapy
import json
from BaiDu.items import BaiduItem


class BaiduSpider(scrapy.Spider):
    name = 'baidu'
    allowed_domains = ['baidu.com']
    # The listing page (http://talent.baidu.com/external/baidu/index.html#/social/2)
    # is rendered by JavaScript, so the spider requests the AJAX interface directly.
    base_url = ('http://talent.baidu.com/baidu/web/httpservice/getPostList'
                '?workPlace=0%2F4%2F7%2F9&recruitType=2&pageSize=10&curPage={}')

    def start_requests(self):
        # Only the first page is requested here; parse() keeps paging as long as
        # the endpoint still returns postings. (Wrapping an endless `while True`
        # loop in try/except does not work: yielding a Request never raises when
        # the last page is reached, so such a loop would never stop.)
        yield scrapy.Request(self.base_url.format(1), meta={'page': 1})

    def parse(self, response):
        # with open('baidu.html', 'wb') as f:  # debug: dump the raw response
        #     f.write(response.body)
        host = 'http://talent.baidu.com/external/baidu/index.html#/jobDetail/2/'
        json_datas = json.loads(response.body)['postList']  # ten postings per page
        for json_data in json_datas:
            item = BaiduItem()  # a fresh item for each posting
            item['job_name'] = json_data['name']
            item['job_type'] = json_data['postType']
            item['address'] = json_data['workPlace']
            item['number'] = json_data['recruitNum']
            item['pub_time'] = json_data['publishDate']
            item['detail_link'] = host + str(json_data['postId'])
            item['duty'] = json_data['workContent']
            item['require'] = json_data['serviceCondition']
            # TODO: duty/require still contain raw HTML and need cleaning
            yield item

        # Follow the next page until postList comes back empty.
        if json_datas:
            next_page = response.meta['page'] + 1
            yield scrapy.Request(self.base_url.format(next_page),
                                 meta={'page': next_page})
baidu.py
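The workContent and serviceCondition strings come back with embedded HTML, which is what the TODO in parse() refers to. One way to clean them is with w3lib, which ships as a Scrapy dependency; the clean_text helper below is a hypothetical sketch, not part of the original project.

import re
from w3lib.html import remove_tags, replace_entities

def clean_text(raw):
    """Strip tags and entities from a duty/requirement string and tidy whitespace."""
    if not raw:
        return ''
    text = replace_entities(remove_tags(raw))   # drop <br> etc., decode &nbsp; etc.
    text = re.sub(r'\s+', ' ', text)            # collapse runs of whitespace
    return text.strip()

# In parse() this could be applied before yielding:
# item['duty'] = clean_text(json_data['workContent'])
# item['require'] = clean_text(json_data['serviceCondition'])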

The pipelines.py file writes each item out as JSON and inserts it into a local MongoDB database.

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from pymongo import MongoClient


class BaiduPipeline(object):
    """Write each item to baidu.json, one JSON object per line."""

    def __init__(self):
        self.file = open('baidu.json', 'wb')

    def process_item(self, item, spider):
        str_data = json.dumps(dict(item), ensure_ascii=False) + ',\n'
        self.file.write(str_data.encode())
        return item

    def close_spider(self, spider):
        self.file.close()


class MongPipeline(object):
    """Insert the items into a local MongoDB instance.
    The crawl itself is fast; the inserts are a little slower."""

    def __init__(self, host, port, dbname, colname):
        # connect to MongoDB
        self.client = MongoClient(host, port)
        # select the database
        self.db = self.client[dbname]
        # select the collection
        self.col = self.db[colname]

    @classmethod
    def from_crawler(cls, crawler):
        # Read the MONGO_* values defined in settings.py. (The old
        # `from scrapy.conf import settings` import is deprecated and has been
        # removed in recent Scrapy versions.)
        s = crawler.settings
        return cls(s.get('MONGO_HOST'), s.getint('MONGO_PORT'),
                   s.get('MONGO_DBNAME'), s.get('MONGO_COLNAME'))

    def process_item(self, item, spider):
        # insert_one replaces the deprecated Collection.insert
        self.col.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
pipelines.py
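After running `scrapy crawl baidu`, the stored data can be spot-checked directly with pymongo, using the database and collection names configured above. A quick sketch (count_documents assumes pymongo 3.7 or newer):

from pymongo import MongoClient

client = MongoClient('127.0.0.1', 27017)
col = client['baidu']['zhaopin']

print(col.count_documents({}))  # total postings stored
print(col.find_one({}, {'_id': 0, 'job_name': 1, 'address': 1}))

client.close()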
Original post: https://www.cnblogs.com/jianxiaoguo/p/7667546.html