# 04 爬取拉勾网 Python 岗位分析报告

# 导入需要的包
import requests
import time,random
from openpyxl import Workbook
import pymysql.cursors
#@ 连接数据库;
# Helper that asks the proxy-pool program running on this machine for a proxy.
def get_proxy():
    """Return a random proxy address from the local proxy pool, or None.

    Sends a GET request to ``http://localhost:5555/random`` and prints the
    response body. The body is returned (with surrounding whitespace
    stripped) only when the service answers with HTTP 200.

    Returns:
        str | None: The proxy text (presumably ``host:port`` -- TODO confirm
        against the proxy-pool service), or None when the service cannot be
        reached, times out, or answers with a non-200 status.
    """
    PROXY_POOL_URL = 'http://localhost:5555/random'
    try:
        # A timeout keeps the scraper from hanging forever on a stuck pool.
        response = requests.get(PROXY_POOL_URL, timeout=10)
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        # requests raises its own exception classes; the builtin
        # ConnectionError is unrelated to them and would never match here.
        return None
    print(response.text)
    if response.status_code == 200:
        return response.text.strip()
    return None
# Connects to the local MySQL server. Optional: results can be written to
# Excel only, without a database.
def get_conn():
    """Open and return a connection to the local MySQL database.

    The host, account, password, database name and character set are fixed
    in the settings below. Cursors created from the returned connection are
    ``pymysql.cursors.DictCursor`` instances.

    Returns:
        The connection object produced by ``pymysql.connect``.
    """
    settings = {
        'host': 'localhost',
        'user': 'root',
        'password': '123456',
        'db': 'python',
        'charset': 'utf8mb4',
        'cursorclass': pymysql.cursors.DictCursor,
    }
    return pymysql.connect(**settings)
# Writes one record into the database.
def insert(conn, info):
    """Insert one job record into the ``python`` table and commit it.

    Args:
        conn: Open database connection providing ``cursor()`` (usable as a
            context manager) and ``commit()``.
        info: The seven column values, in the order companyShortName,
            companyFullName, industryField, companySize, salary, city,
            education. They are passed as query parameters, not formatted
            into the SQL text.
    """
    statement = "INSERT INTO `python` (`companyShortName`, `companyFullName`, `industryField`, `companySize`, `salary`, `city`, `education`) VALUES (%s, %s, %s, %s, %s, %s, %s)"
    with conn.cursor() as cur:
        cur.execute(statement, info)
    # Each row is committed on its own.
    conn.commit()
# Fetches the postings listed on one result page of the given URL.
def get_json(url, page, lang_name):
    """Return the job postings on one result page as a list of rows.

    Posts the form fields ``first``, ``pn`` and ``kd`` to *url* through the
    module-level session ``ses`` and reads the postings from
    ``content -> positionResult -> result`` in the JSON response.

    Args:
        url: Endpoint to POST the search form to.
        page: Page number, sent as the ``pn`` form field.
        lang_name: Search keyword, sent as the ``kd`` form field.

    Returns:
        list[list]: One seven-item list per posting, holding
        companyShortName, companyFullName, industryField, companySize,
        salary, city and education in that order. A key missing from a
        posting is replaced by the placeholder '无'.

    Raises:
        KeyError: If the response JSON lacks the expected
            ``content``/``positionResult``/``result`` keys.
    """
    fields = (
        'companyShortName',
        'companyFullName',
        'industryField',
        'companySize',
        'salary',
        'city',
        'education',
    )
    form = {'first': 'false', 'pn': page, 'kd': lang_name}

    proxy = get_proxy()
    if proxy:
        proxy_url = "http://" + proxy
        # requests selects the proxy by the scheme of the target URL, so the
        # "https" entry is required for an https endpoint to use the proxy.
        proxies = {"http": proxy_url, "https": proxy_url}
    else:
        # No proxy available: send the request without one instead of
        # failing on string concatenation with None.
        proxies = None

    payload = ses.post(url, form, proxies=proxies).json()
    positions = payload['content']['positionResult']['result']
    return [[position.get(field, '无') for field in fields]
            for position in positions]

def main():
    """Scrape 'python' job postings for five cities into MySQL and Excel.

    For each city, result pages are fetched with ``get_json``. Every row is
    inserted into the database with ``insert`` and appended to the active
    worksheet. After all cities are processed the workbook is saved as
    ``python职位信息.xlsx``.

    To run without MySQL, remove the connection handling (``get_conn``,
    the ``try``/``finally`` and ``conn.close()``) and the ``insert`` call.
    """
    lang_name = 'python'
    # With this bound only page 1 of each city is fetched; raise it
    # (e.g. to 30) to collect more pages per city.
    last_page = 1

    wb = Workbook()  # new Excel workbook
    # The active sheet is the same object for every city, so set it up once.
    wsl = wb.active
    wsl.title = lang_name

    conn = get_conn()  # database connection
    try:
        for city in ['北京', '上海', '广州', '深圳', '杭州']:  # five cities
            url = 'https://www.lagou.com/jobs/positionAjax.json?city={}&needAddtionalResult=false'.format(city)
            for page in range(1, last_page + 1):
                info = get_json(url, page, lang_name)
                # time.sleep(random.randint(10,20))
                for row in info:
                    insert(conn, tuple(row))
                    wsl.append(row)
    finally:
        # Close the connection even when a request or an insert fails.
        conn.close()
    wb.save('{}职位信息.xlsx'.format(lang_name))

if __name__ == "__main__":
    # One shared session: get_json() posts through the module-level `ses`.
    ses = requests.session()

    # Request headers applied to every call made through the session.
    my_headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36",
        "Referer": "https://www.lagou.com/jobs/list_Python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=",
        "Content-Type": "application/x-www-form-urlencoded;charset = UTF-8",
    }
    ses.headers.update(my_headers)

    # Request the listing page once before scraping; the response is
    # discarded (presumably this is done to pick up session cookies --
    # TODO confirm).
    ses.get("https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=")

    main()












# 原文地址:https://www.cnblogs.com/cong12586/p/13376765.html