A job-posting crawler for the keyword “数据” (data)
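This script drives Chrome with Selenium through Lagou's listing pages for the keyword “数据” (data) in Guangzhou, opens each posting in a new tab, extracts the job fields with lxml XPath queries, and writes every collected row to an Excel workbook with openpyxl. Note that it targets the Selenium 3 API (`executable_path`, `find_element_by_xpath`), both of which Selenium 4 has since removed, so pin an older selenium release or port those two calls if you run it today (a port sketch follows the script).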

# -*- coding:utf-8 -*-
# Author:Sure Feng

from selenium import webdriver
from lxml import etree
import time
import json
import openpyxl


class LaGou(object):
    # Path to the local ChromeDriver executable (adjust as needed)
    driver_path = r'E:\sureware\chromedriver.exe'

    def __init__(self):
        # Create a browser instance (Selenium 3 API)
        self.drive = webdriver.Chrome(executable_path=LaGou.driver_path)
        # Listing-page URL: keyword "数据" (data), city 广州 (Guangzhou)
        self.url = 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE?city=%E5%B9%BF%E5%B7%9E'
        self.positions = []

    def request_detail_page(self, url):
        '''Open a detail page in a new tab, parse it, then close the tab.'''
        # Open the detail page in a new browser window
        self.drive.execute_script("window.open('%s')" % url)
        # Switch to the detail window
        self.drive.switch_to.window(self.drive.window_handles[1])
        # Grab the rendered page source
        source = self.drive.page_source
        # Parse the page and extract the job fields
        self.parse_detail_page(source)
        # Close the detail window
        self.drive.close()
        # Switch back to the listing window
        self.drive.switch_to.window(self.drive.window_handles[0])

    def parse_detail_page(self, source):
        '''Parse a detail page and extract the job fields.'''
        html = etree.HTML(source)
        info_list = []
        # Extract each field with XPath, falling back to None when a node is missing
        position_id = html.xpath("//a[@class='send-CV-btn s-send-btn fr']/@data-position-id")
        position_web = "https://www.lagou.com/jobs/{}.html".format(position_id[0]) if len(position_id) > 0 else None
        info_list.append(position_web)
        position_name = html.xpath("//div[@class='job-name']/@title")
        position_name = position_name[0].strip() if len(position_name) > 0 else None
        info_list.append(position_name)
        salary = html.xpath("//dd[@class='job_request']/p/span[@class='salary']/text()")
        salary = salary[0].strip() if len(salary) > 0 else None
        info_list.append(salary)
        job_year = html.xpath("//dd[@class='job_request']/p/span[3]/text()")
        job_year = job_year[0].replace("/", "").strip() if len(job_year) > 0 else None
        info_list.append(job_year)
        grade = html.xpath("//dd[@class='job_request']/p/span[4]/text()")
        grade = grade[0].replace("/", "").strip() if len(grade) > 0 else None
        info_list.append(grade)
        publish_time = html.xpath("//p[@class='publish_time']/text()")
        publish_time = publish_time[0].replace("\xa0 发布于拉勾网", "").strip() if len(publish_time) > 0 else None
        info_list.append(publish_time)
        company_name = html.xpath("//img[@class='b2']/@alt")
        company_name = company_name[0] if len(company_name) > 0 else None
        info_list.append(company_name)
        company = html.xpath("//h2[@class='fl']/text()")
        company = company[0].strip() if len(company) > 0 else None
        info_list.append(company)
        job_advantage = html.xpath("//dd[@class='job-advantage']/p/text()")
        job_advantage = job_advantage[0].strip() if len(job_advantage) > 0 else None
        info_list.append(job_advantage)
        job_detail = html.xpath("//div[@class='job-detail']//text()")
        job_detail = "".join(job_detail).replace(" ", "").strip() if len(job_detail) > 0 else None
        info_list.append(job_detail)
        work_addr = html.xpath("//div[@class='work_addr']/a[2]/text()")
        work_addr = work_addr[0].strip() if len(work_addr) > 0 else None
        info_list.append(work_addr)
        work_addr_detail = html.xpath("//div[@class='work_addr']//text()")
        # A negative index needs at least three matches
        work_addr_detail = work_addr_detail[-3].strip() if len(work_addr_detail) >= 3 else None
        info_list.append(work_addr_detail)
        position_label_clearfix = html.xpath("//ul[@class='position-label clearfix']/li[@class='labels']//text()")
        position_label_clearfix = ",".join(position_label_clearfix) if len(position_label_clearfix) > 0 else None
        info_list.append(position_label_clearfix)
        c_feature = html.xpath("//ul[@class='c_feature']/li/text()")
        # These three take the second text node, so guard with len() > 1
        zone = c_feature[1].strip() if len(c_feature) > 1 else None
        info_list.append(zone)
        development = html.xpath("//i[@class='icon-glyph-trend']/../text()")
        development = development[1].strip() if len(development) > 1 else None
        info_list.append(development)
        people_num = html.xpath("//i[@class='icon-glyph-figure']/../text()")
        people_num = people_num[1].strip() if len(people_num) > 1 else None
        info_list.append(people_num)
        Investment_institution = html.xpath("//p[@class='financeOrg']/text()")
        Investment_institution = Investment_institution[0].strip() if len(Investment_institution) > 0 else None
        info_list.append(Investment_institution)

        # Alternative: collect the fields into a dict instead
        # info_dict = {
        #     'company': company,
        #     'position_name': position_name,
        #     'salary': salary,
        #     'job_year': job_year,
        #     'grade': grade,
        #     'publish_time': publish_time,
        #     'zone': zone,
        #     'job_advantage': job_advantage,
        #     'job_detail': job_detail,
        # }
        # ... and append the dict to the positions list
        # self.positions.append(info_dict)
        self.positions.append(info_list)
        # with open("lagou.json", encoding="utf-8", mode="a") as f:
        #     f.write(json.dumps(info_dict, ensure_ascii=False, indent=2))
        #     f.write("\n")
        # Re-encode for a GBK Windows console, dropping characters it cannot display
        print(str(self.positions).encode('GBK', 'ignore').decode('GBK'))

    def parse_list_page(self, source):
        '''Parse the listing page and visit each detail page on it.'''
        html = etree.HTML(source)
        # Collect the detail-page URLs
        links = html.xpath("//a[@class='position_link']/@href")
        for link in links:
            print(link)
            # Visit the detail page
            self.request_detail_page(link)
            time.sleep(1)

    def save_excel(self, positions):
        '''Write the collected rows to an Excel workbook.'''
        wb = openpyxl.Workbook()
        # Use the default sheet and give it a timestamped name
        ws = wb.active
        ws.title = "lagou" + time.strftime("%Y%m%d%H%M%S")
        # Header order matches the append order in parse_detail_page
        title = ["position_web", "position_name", "salary", "job_year", "grade", "publish_time", "company_name", "company", "job_advantage", "job_detail", "work_addr", "work_addr_detail", "position_label_clearfix", "zone", "development", "people_num", "Investment_institution"]
        ws.append(title)
        for row in positions:
            ws.append(row)
        # openpyxl writes the xlsx format, so use an .xlsx extension
        wb.save("lagou.xlsx")

    def run(self):
        # Open the listing page
        self.drive.get(self.url)
        while True:
            source = self.drive.page_source
            # Parse the listing page (this also visits every detail page on it)
            self.parse_list_page(source)
            # Find the "next page" button; stop when it is disabled on the last page
            next_btn = self.drive.find_element_by_xpath("//div[@class='pager_container']/span[last()]")
            if "pager_next pager_next_disabled" in next_btn.get_attribute("class"):
                break
            else:
                next_btn.click()
            time.sleep(3)
        # Save the collected data to Excel
        self.save_excel(self.positions)


if __name__ == '__main__':
    # Create the spider
    spider = LaGou()
    # Run it
    spider.run()
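
parse_detail_page repeats the same pattern for all seventeen fields: run an XPath query, take one match, strip it, and fall back to None when the node is missing. A small helper could factor that out. The sketch below is a suggestion, not part of the original script, and first_text is a hypothetical name:

def first_text(html, xpath, index=0):
    '''Return one stripped XPath match from `html`, or None if it is missing.'''
    # Hypothetical helper, not in the original script
    results = html.xpath(xpath)
    # Guard both positive and negative indexes before subscripting
    if -len(results) <= index < len(results):
        value = results[index]
        return value.strip() if isinstance(value, str) else value
    return None

# Inside parse_detail_page each field then shrinks to one line, e.g.:
# salary = first_text(html, "//dd[@class='job_request']/p/span[@class='salary']/text()")
# zone = first_text(html, "//ul[@class='c_feature']/li/text()", index=1)

And for reference, a minimal sketch of the two Selenium 4 replacements mentioned at the top, assuming a current selenium release is installed:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# executable_path= is gone in Selenium 4; pass a Service instead
drive = webdriver.Chrome(service=Service(r'E:\sureware\chromedriver.exe'))
# find_element_by_xpath is gone; use find_element(By.XPATH, ...)
next_btn = drive.find_element(By.XPATH, "//div[@class='pager_container']/span[last()]")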
Original article: https://www.cnblogs.com/sure-feng/p/10204229.html