A crawler for Lagou (拉勾网)

    # coding:utf-8
    import json
    import re
    from lxml import etree
    import requests
    import time


    class Lagou(object):
        def __init__(self):
            # Build the initial url for the job-list Ajax endpoint
            self.url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false&isSchoolJob=0'
            # Build the request headers. Lagou needs a logged-in cookie: log in, then copy it
            # from the browser's developer tools. Referer is the page the request is sent from;
            # that page could also be requested separately.
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36',
                'Cookie': 'paste the full cookie obtained after logging in here',
                'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
            }
            self.pattern = re.compile(r'"positionId":(\d+)')  # regex used to extract the position ids for the detail urls
            self.base_url = 'https://www.lagou.com/jobs/{}.html'
            self.file = open('lagou.json', 'w', encoding='utf-8')

        def get_post_data(self, page=1):
            """Fetch the JSON data of one list page"""
            print('Requesting list page----')
            post_data = {
                'first': 'true',
                'pn': page,  # page number
                'kd': 'python',  # search keyword, can be changed dynamically
            }
            response = requests.post(self.url, headers=self.headers, data=post_data)
            print('Got list page response')
            return response.content.decode()

        def get_page(self, url):
            """Fetch the response body of a detail page"""
            response = requests.get(url, headers=self.headers)  # detail pages are ordinary HTML pages, fetched with GET
            return response.content

        def parse_url(self, data):
            """Parse the list page data and build the detail urls from the position ids"""
            print('Parsing list page data to extract the ids')
            id_list = self.pattern.findall(data)
            url_list = []
            for position_id in id_list:
                url_list.append(self.base_url.format(position_id))
            print('Ids extracted')
            return url_list

        def parse_detail_data(self, str_data):
            """Parse a detail page and extract the job fields"""
            print('Parsing detail page data')
            html = etree.HTML(str_data)

            def first(xpath):
                # Return the first match of the xpath expression, or None if there is no match
                result = html.xpath(xpath)
                return result[0] if result else None

            data = {}
            data['name'] = first('//div/span[@class="name"]/text()')
            data['salary'] = first('//span[@class="salary"]/text()')
            temp = first('//dd[@class="job_request"]/p[1]/span[2]/text()')
            data['city'] = temp.replace('/', '').strip() if temp else None
            data['company'] = first('//div[@class="company"]/text()')
            temp = first('//dd/p[1]/span[4]/text()')
            data['education'] = temp.replace('/', '').strip() if temp else None
            data['job_type'] = first('//dd/p[1]/span[5]/text()')
            data['advantage'] = first('//dd[@class="job-advantage"]/p/text()')
            desc_list = html.xpath('//dd[@class="job_bt"]/div/p/text()')
            # Join the description paragraphs and strip non-breaking spaces
            data['responsibilities'] = ''.join(desc_list).replace('\xa0', '')
            return data

        def parse_detail(self, url_list):
            """Fetch and parse the detail pages for one list page"""
            print('Fetching detail pages')
            data_list = []
            for url in url_list:
                str_data = self.get_page(url)
                data_list.append(self.parse_detail_data(str_data))
                time.sleep(1)  # pause between requests to reduce the chance of being blocked
            print('Detail pages done')
            return data_list

        def save_data(self, data_list):
            """Save the parsed records"""
            print('Saving data')
            for data in data_list:
                # Convert the Python dict to a JSON string, one record per line
                str_data = json.dumps(data, ensure_ascii=False) + ',\n'
                self.file.write(str_data)

        def run(self):
            """Main crawl logic"""
            for page in range(1, 10):  # paginate through the list pages
                data = self.get_post_data(page)  # Lagou data can only be crawled while logged in; using POST is a little safer, probably
                url_list = self.parse_url(data)  # get the ids needed for the detail pages
                data_list = self.parse_detail(url_list)  # fetch and parse the detail pages of this list page
                self.save_data(data_list)  # save the records

        def __del__(self):
            print('Data saved')
            self.file.close()  # close the output file


    if __name__ == '__main__':
        lagou = Lagou()
        lagou.run()
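
Note that save_data writes each record as a JSON object followed by a trailing comma and a newline, so lagou.json as a whole is not a valid JSON document. A minimal sketch of one way to read the records back, assuming the file produced by the code above:

    # Minimal sketch: load the records written by save_data
    import json

    records = []
    with open('lagou.json', encoding='utf-8') as f:
        for line in f:
            line = line.strip().rstrip(',')  # drop the trailing comma added by save_data
            if line:
                records.append(json.loads(line))

    print(len(records), 'records loaded')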

  A simple crawler that scrapes detail-page information from Lagou. It is very rough: it mainly uses requests for the HTTP calls, the full data is only available after logging in, so the cookie from a logged-in session is required, and it is best to request the list endpoint with POST, even though that is only marginally safer. For reference only.
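
If you would rather not paste the whole cookie string into the Cookie header, here is a hedged sketch of turning the string copied from the browser's developer tools into a dict that requests can use (cookie_string_to_dict is a hypothetical helper, not part of the original script):

    # Hypothetical helper: split a raw "k1=v1; k2=v2" cookie string copied from the browser
    def cookie_string_to_dict(cookie_string):
        cookies = {}
        for pair in cookie_string.split(';'):
            if '=' in pair:
                key, value = pair.strip().split('=', 1)
                cookies[key] = value
        return cookies

    # Usage: pass the dict via the cookies argument instead of the Cookie header, e.g.
    # requests.post(url, headers=headers, data=post_data, cookies=cookie_string_to_dict(raw_cookie))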

Original post: https://www.cnblogs.com/qiukujun/p/one_simple_spider.html