2019.10.24

# -*- coding: utf-8 -*-
import scrapy
from ..items import JobscrawlerQianchengwuyouItem
import datetime

class QianchengSpiderSpider(scrapy.Spider):
    name = 'qiancheng_spider'
    # allowed_domains = ['qq.com']
    # One search-result start URL per keyword (the keyword is URL-encoded in the path).
    start_urls = [
        # Data analyst (数据分析师)
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590%25E5%25B8%2588,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        # Artificial intelligence (人工智能)
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E4%25BA%25BA%25E5%25B7%25A5%25E6%2599%25BA%25E8%2583%25BD,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        # Algorithm engineer (算法工程师)
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E7%25AE%2597%25E6%25B3%2595%25E5%25B7%25A5%25E7%25A8%258B%25E5%25B8%2588,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        # Deep learning (深度学习)
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%25B7%25B1%25E5%25BA%25A6%25E5%25AD%25A6%25E4%25B9%25A0,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        # Data mining (数据挖掘)
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E6%258C%2596%25E6%258E%2598,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=',
        # Machine learning (机器学习)
        'https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%259C%25BA%25E5%2599%25A8%25E5%25AD%25A6%25E4%25B9%25A0,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
    ]
    # Keyword tags, index-aligned with start_urls.
    start_url_tags = [
        "数据分析师",  # data analyst
        "人工智能",    # artificial intelligence
        "算法工程师",  # algorithm engineer
        "深度学习",    # deep learning
        "数据挖掘",    # data mining
        "机器学习",    # machine learning
    ]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Date stamp recorded on every scraped item.
        self.record_date = datetime.datetime.now().strftime('%Y-%m-%d')

    def start_requests(self):
        # Issue one request per keyword and carry the keyword tag along in meta.
        for url, tag in zip(self.start_urls, self.start_url_tags):
            yield scrapy.Request(url, callback=self.parse, meta={'tag': tag}, dont_filter=True)

    def parse(self, response):
        tag = response.meta['tag']
        # Each posting in the search-result list is a div with class "el".
        items = response.xpath('//div[@class="el"]')
        for item in items:
            # Skip rows without a title cell (e.g. the header row).
            if not item.xpath('./p[@class="t1 "]'):
                continue
            url = item.xpath('./p[@class="t1 "]//a/@href').extract_first()
            title = item.xpath('./p[@class="t1 "]//a/text()').extract_first()
            # For the algorithm-engineer keyword, keep only titles that mention 算法.
            if tag == '算法工程师' and '算法' not in title:
                continue
            yield scrapy.Request(url, callback=self.detail_parse, meta={'tag': tag}, dont_filter=True)
        # Follow the "next page" link, if present, and parse it with this same callback.
        next_page_url = response.xpath('//a[@id="rtNext"]/@href').extract_first()
        if next_page_url is not None:
            yield scrapy.Request(next_page_url, callback=self.parse, meta={'tag': tag}, dont_filter=True)

    def detail_parse(self, response):
        item = JobscrawlerQianchengwuyouItem()
        item['job_tag'] = response.meta['tag']
        item['job_url'] = response.url
        item['record_date'] = self.record_date
        # Job title, job description, salary, benefits, experience requirement, education requirement.
        item['job_name'] = response.xpath('//div[@class = "cn"]/h1/text()').extract_first().strip()
        item['job_info'] = "".join(response.xpath('//div[@class = "bmsg job_msg inbox"]//text()').extract()).strip()
        item['job_salary'] = "".join(response.xpath('//div[@class = "cn"]/strong/text()').extract()).strip()
        item['job_welfare'] = ",".join(response.xpath('//span[@class="sp4"]/text()').extract()).strip()
        item['job_exp_require'] = response.xpath('//p[@class="msg ltype"]/text()').extract()[1].strip()
        item['job_edu_require'] = response.xpath('//p[@class="msg ltype"]/text()').extract()[2].strip()
        # Company name, industry, ownership type, headcount, address, overview, financing stage.
        item['company_name'] = response.xpath('//div[@class = "com_msg"]//p/text()').extract_first().strip()
        item['company_industry'] = "".join(response.xpath('//span[@class = "i_trade"]/..//text()').extract()).strip()
        item['company_nature'] = "".join(response.xpath('//span[@class = "i_flag"]/../text()').extract()).strip()
        item['company_people'] = "".join(response.xpath('//span[@class = "i_people"]/../text()').extract()).strip()
        item['company_location'] = ""  # placeholder; not extracted by this spider
        item['company_overview'] = "".join(response.xpath('//div[@class = "tmsg inbox"]//text()').extract()).strip()
        item['company_financing_stage'] = ""  # placeholder; not extracted by this spider
        yield item
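
The item class imported at the top is not shown in the post. Below is a minimal sketch of what items.py might contain; the field names are taken directly from the assignments in detail_parse, while the file itself is an assumption rather than the original author's code.

# items.py (sketch; field names mirror the spider above)
import scrapy

class JobscrawlerQianchengwuyouItem(scrapy.Item):
    job_tag = scrapy.Field()
    job_url = scrapy.Field()
    record_date = scrapy.Field()
    job_name = scrapy.Field()
    job_info = scrapy.Field()
    job_salary = scrapy.Field()
    job_welfare = scrapy.Field()
    job_exp_require = scrapy.Field()
    job_edu_require = scrapy.Field()
    company_name = scrapy.Field()
    company_industry = scrapy.Field()
    company_nature = scrapy.Field()
    company_people = scrapy.Field()
    company_location = scrapy.Field()
    company_overview = scrapy.Field()
    company_financing_stage = scrapy.Field()

With the item defined, the spider can be run from the Scrapy project root with something like: scrapy crawl qiancheng_spider -o jobs.csv (the output filename is arbitrary).
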
Original post: https://www.cnblogs.com/luochen918/p/11733404.html