# 拉勾爬虫 (Lagou job crawler)

# -*-coding:utf-8-*-
'''
    FileName:LaG爬取岗位信息
    CreatTime:2018-4-10
    Author: ___dx___
    FileDescript:
'''
import requests
import xlwt
import ssl

# Swap the ssl module's default HTTPS context factory for the unverified one, so
# code that relies on this default skips certificate verification.
# NOTE(review): this is a process-wide security downgrade; confirm it is
# actually required by the HTTP calls below before keeping it.
ssl._create_default_https_context = ssl._create_unverified_context    # HTTPS certificate verification override

class Lagou_job(object):
    """Query Lagou's position search endpoint and export matching jobs to .xls.

    The search keyword is fixed to '测试' (testing); only full-time jobs
    located in Shenzhen are written to the spreadsheet.
    """

    # (column header, key in a job record) pairs, in spreadsheet column order.
    _COLUMNS = (
        (u'公司全名', 'companyFullName'),
        (u'公司简称', 'companyShortName'),
        (u'城市', 'city'),
        (u'区域', 'district'),
        (u'工作性质', 'jobNature'),
        (u'职位名称', 'positionName'),
        (u'薪资范围', 'salary'),
        (u'职位', 'secondType'),
        (u'工作年限', 'workYear'),
        (u'公司规模', 'companySize'),
        (u'学历要求', 'education'),
    )

    def __init__(self):
        """Set up the endpoint URL and the request headers sent with each query."""
        self.url = 'https://www.lagou.com/jobs/positionAjax.json?px=new&needAddtionalResult=false'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
            'Referer': 'https://www.lagou.com/jobs/list_%E6%B5%8B%E8%AF%95?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=',
            # The value used to be 'keep - alive', which is not a valid token.
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Origin': 'https://www.lagou.com',
            'X-Anit-Forge-Code': '0',
            'X-Anit-Forge-Token': 'None',
            'X-Requested-With': 'XMLHttpRequest',
        }

    def getJobList(self, page, timeout=10):
        """Fetch one page of search results.

        Args:
            page: Page number, sent as the ``pn`` form field.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The list found at ``content.positionResult.result`` in the JSON reply.

        Raises:
            KeyError: If the reply does not contain the expected fields; the
                message includes the reply's ``msg`` field when present.
        """
        self.data = {
            'first': 'true',
            'pn': page,
            'kd': '测试'
        }
        # The context manager releases the session's pooled connections.
        with requests.Session() as session:
            res = session.post(self.url, data=self.data,
                               headers=self.headers, timeout=timeout)
            result = res.json()
        print(res.status_code)
        try:
            return result['content']['positionResult']['result']
        except KeyError as err:
            raise KeyError('Lagou response has no %s field; server message: %r'
                           % (err, result.get('msg')))

    @staticmethod
    def _is_wanted(job):
        """Return True when *job* is a full-time position located in Shenzhen."""
        nature = job.get('jobNature') or u''
        city = job.get('city') or u''
        return u'全职' in nature and u'深圳' in city

    def saveExcel(self, pages=None, filename="深圳测试_By_dx.xls"):
        """Fetch the given result pages and write the matching jobs to an .xls file.

        Args:
            pages: Iterable of page numbers to fetch; defaults to page 1 only.
            filename: Path of the workbook to write.
        """
        if pages is None:
            pages = range(1, 2)

        workbook = xlwt.Workbook()
        # cell_overwrite_ok=True stops xlwt raising if a cell is written twice.
        sheet = workbook.add_sheet('daixiang', cell_overwrite_ok=True)
        for col, (title, _key) in enumerate(self._COLUMNS):
            sheet.write(0, col, title)

        row = 1
        for page in pages:
            last_job = None
            for job in self.getJobList(page=page):
                last_job = job
                if not self._is_wanted(job):
                    continue
                for col, (_title, key) in enumerate(self._COLUMNS):
                    sheet.write(row, col, job[key])
                row += 1
            # Progress line: last posting seen on this page. Skipped for an
            # empty page, where there is no posting to report.
            if last_job is not None:
                print (last_job['companyShortName'], last_job['salary'])

        workbook.save(filename)

if __name__ == '__main__':
    # Script entry point: build the crawler and export the filtered postings.
    Lagou_job().saveExcel()
# Original article (原文地址): https://www.cnblogs.com/jsondai/p/11393056.html