Scraping Lagou job listings with Python
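The script below posts to Lagou's positionAjax.json interface page by page, pulls the salary, education, and work-experience fields (plus the position ID) out of the JSON response, and appends each page of results to csv1.csv.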

import requests
import random
import time
import os
import csv
import pandas as pd
req_url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
def file_do(list_info):
    # Create the file with a header row on the first write,
    # then append rows on later calls.
    if not os.path.exists('csv1.csv') or os.path.getsize('csv1.csv') == 0:
        # Header row
        name = ['ID', '薪资', '学历要求', '工作经验']
        # Build a DataFrame and write it out
        file_test = pd.DataFrame(columns=name, data=list_info)
        file_test.to_csv('csv1.csv', encoding='gbk', index=False)
    else:
        # Append to the existing file, using the same gbk encoding as the first write
        with open('csv1.csv', 'a+', newline='', encoding='gbk') as file_test:
            writer = csv.writer(file_test)
            writer.writerows(list_info)
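The header-vs-append branching above is easy to get wrong, so here is an equivalent sketch (my own simplification, not from the original post) that handles both cases with the csv module alone:

def file_do_simple(list_info, path='csv1.csv'):
    # Write the header only when the file is new or empty
    need_header = not os.path.exists(path) or os.path.getsize(path) == 0
    with open(path, 'a+', newline='', encoding='gbk') as f:
        writer = csv.writer(f)
        if need_header:
            writer.writerow(['ID', '薪资', '学历要求', '工作经验'])
        writer.writerows(list_info)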
def get_info(headers):
    # 3. Loop over the result pages (30 pages in total)
    for i in range(1, 31):
        # Pagination parameters
        data = {
            'first': 'true',
            'kd': 'Python爬虫',
            'pn': i
        }
        # 3.1 Send the POST request
        req_result = requests.post(req_url, data=data, headers=headers)
        req_result.encoding = 'utf-8'
        print('Page %d: %s' % (i, req_result.status_code))
        # 3.2 Parse the JSON response
        req_info = req_result.json()
        # Drill down to the list of positions we need
        req_info = req_info['content']['positionResult']['result']
        print(len(req_info))
        list_info = []
        # 3.3 Pull out the individual fields
        for j in range(0, len(req_info)):
            salary = req_info[j]['salary']
            education = req_info[j]['education']
            workYear = req_info[j]['workYear']
            positionId = req_info[j]['positionId']
            list_one = [positionId, salary, education, workYear]
            list_info.append(list_one)
        print(list_info)
        # Persist this page's rows
        file_do(list_info)
        time.sleep(1.5)
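Lagou's Ajax interface is known to reject requests whose cookies have gone stale ("操作太频繁" errors). A common workaround, not part of the original post, is to open a requests.Session, GET the HTML search page first so the server issues fresh cookies, and then POST through that same session. A minimal sketch (the helper name is my own):

def get_page_with_session(page, headers):
    s = requests.Session()
    # Visiting the HTML search page sets the cookies the Ajax endpoint expects
    s.get('https://www.lagou.com/jobs/list_Python?labelWords=&fromSearch=true&suginput=',
          headers=headers, timeout=10)
    data = {'first': 'true', 'kd': 'Python爬虫', 'pn': page}
    return s.post(req_url, data=data, headers=headers, timeout=10).json()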

def main():
    # 1. Pool of User-Agent strings to rotate through
    agent = [
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    ]
    # Pick one User-Agent string at random for this run
    # (random.choice returns a string; random.sample would return a list)
    agents = random.choice(agent)


    # 2. Request headers
    headers = {
        'Accept': 'application/json,text/javascript,*/*;q=0.01',
        'Connection': 'keep-alive',
        # Session cookie captured from a browser; it expires quickly and must be refreshed
        'Cookie': 'JSESSIONID=ABAAABAAADEAAFI0BBF29567D8C34FBFF43B86890ADE515; user_trace_token=20180807134835-a45c92e4-b9fb-4cae-a241-37adc3e358e9; _gat=1; PRE_UTM=; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3D5qzER67kT-h5iPuzvS7sxo2ttborkvB08HUuHwUFIjR8foCXJttuRTX1prFUuy2TSMeQSeQ9Os1GNoyAEOvO-K%26wd%3D%26eqid%3Da28c59de00008677000000035b690aa6; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2F2715155.html; LGUID=20180807134836-86c198ef-9a05-11e8-a341-5254005c3644; index_location_city=%E5%8C%97%E4%BA%AC; TG-TRACK-CODE=index_search; SEARCH_ID=25c15d546900454886c333c88a2bb732; _gid=GA1.2.775929706.1533620922; _ga=GA1.2.124209160.1533620922; LGSID=20180807134836-86c19555-9a05-11e8-a341-5254005c3644; LGRID=20180807134913-9d36f9dd-9a05-11e8-b762-525400f775ce; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1533620922; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1533620960',
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_Python?labelWords=&fromSearch=true&suginput=',
        'User-Agent': agents,
    }
    get_info(headers)

if __name__ == '__main__':
    main()
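To sanity-check the output, the CSV can be read back with the same gbk encoding used when writing it; for example (a quick sketch, not part of the original):

df = pd.read_csv('csv1.csv', encoding='gbk')
print(df.head())
print(df['学历要求'].value_counts())  # rough distribution of education requirements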
Original post: https://www.cnblogs.com/wxc1/p/9679405.html