爬取51job招聘信息(一)

目标:爬取网页上的招聘信息,实现自动翻页,并将结果存储为 CSV 文件。

import csv
import os
import time
from concurrent.futures.thread import ThreadPoolExecutor
from json import loads
from multiprocessing import Queue
from re import findall
from threading import Thread

import pandas as pd
import pymysql
import requests

# Fetch the content of a single result page.
def get_one_page(page, city_code='000000'):
    """Fetch one page of 51job search results for the keyword "数据分析".

    The result list is not in the HTML markup itself; it is embedded as a
    JSON object assigned to ``window.__SEARCH_RESULT__`` inside a <script>
    tag, so it is pulled out with a regex and then parsed.

    Args:
        page: Page number interpolated into the search URL.
        city_code: Value for the first comma-separated field of the URL
            path. The default '000000' reproduces the previously hard-coded
            URL. NOTE(review): presumably 51job's area code with '000000'
            meaning "all areas" -- confirm against the site.

    Returns:
        The value stored under 'engine_search_result' in the embedded JSON,
        or None when the response status is not 200 or the embedded payload
        cannot be found.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 90.0.4430.212 Safari / 537.36'
    }
    url = f'https://search.51job.com/list/{city_code},000000,0000,00,9,99,数据分析,2,{page}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=headers, timeout=10)

    if response.status_code != 200:
        print('请求失败!')
        return None

    # `\s*` tolerates optional whitespace around '='; the dot and braces are
    # escaped so they match literally.
    matches = findall(
        r'window\.__SEARCH_RESULT__\s*=\s*(\{.+?\})</script>',
        response.text,
    )
    if not matches:
        # The page layout changed or an anti-crawler page was served.
        print('未找到搜索结果数据!')
        return None
    return loads(matches[0]).get('engine_search_result')
# How many pages to request, starting from `start_page`.
start_page = 1
ts = []  # one entry per successfully fetched page
for page_no in range(start_page, start_page + 10):
    result = get_one_page(page_no)
    if not result:
        # A falsy result (None or an empty list) ends the crawl early.
        print('没有更多数据')
        break
    ts.append(result)

# Flatten the per-page lists into one list of job records. Extending with
# the whole page avoids assuming that every page holds exactly 50 entries.
data_1 = []
for page_jobs in ts:
    data_1.extend(page_jobs)
# Fields kept for each job posting, in the same order as the column labels
# in `name` below.
jobs = [
    [
        job.get('job_name'),
        job.get('providesalary_text'),
        job.get('company_name'),
        job.get('companytype_text'),
        job.get('workarea_text'),
        # attribute_text is joined into one '-'-separated string; a
        # placeholder list is used when the key is missing.
        '-'.join(job.get('attribute_text', ['-', '-', '-', '-', '-'])),
        job.get('jobwelf'),
    ]
    for job in data_1
]
# NOTE(review): 'workarea_tex' looks like a typo of 'workarea_text'; it is
# kept as-is because it becomes the header of the saved CSV file.
name = ['job_name', 'providesalary_text', 'company_name', 'companytype_text', 'workarea_tex', 'attribute_text', 'jobwelf']
# Requires `import pandas as pd` at the top of the file.
test = pd.DataFrame(columns=name, data=jobs)
test.to_csv("testcsv.csv")  # save as CSV
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   job_name            500 non-null    object
 1   providesalary_text  500 non-null    object
 2   company_name        500 non-null    object
 3   companytype_text    500 non-null    object
 4   workarea_tex        500 non-null    object
 5   attribute_text      500 non-null    object
 6   jobwelf             500 non-null    object
dtypes: object(7)
memory usage: 27.5+ KB


重要参考:https://gitee.com/wenhaha8/job51_analysis
原文地址:https://www.cnblogs.com/Cookie-Jing/p/15149865.html