Web Scraping in Practice, Revisited (Part 1): Scraping Zhaopin (智联招聘) Job Listings

  I'm a statistics student, more or less, so I figured I'd scrape Zhaopin's statistics job postings. Yeah, it's a field in strong demand...

  The script uses requests and bs4 to fetch and parse the pages, and stores the data in a MySQL database. The code is a bit messy; bear with it for now, and I'll tidy it up when I find time...
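  For reference, the script needs requests, beautifulsoup4, lxml, and a MySQL driver; on Python 3 the MySQLdb module is provided by the mysqlclient package, so "pip install requests beautifulsoup4 lxml mysqlclient" should cover the dependencies.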

import requests
from bs4 import BeautifulSoup
import re
import time
import datetime
import MySQLdb


# Fetch one results page and hand it to the extractor.
def get_save_url(url, headers):
	resp = requests.get(url, headers=headers)
	if resp.status_code == 200:
		print("Opened page " + url)
	soup = BeautifulSoup(resp.text, 'lxml')
	# pagenum is set in the main loop at the bottom of the script
	print("Scraping job listings on page " + pagenum)
	# Pass both the parsed soup and the raw HTML down to the extractor.
	get_post_fact_city_num(soup, resp.text)
	


# Parse one results page: pull out the job title, company, city, headcount,
# industry, and posting date, then hand everything to save_to_sql().
def get_post_fact_city_num(soupx, htmlx):
	l_post_name = []
	l_fact_name = []
	l_city_name = []
	l_num = []
	l_lab = []
	l_post_time = []

	# Job-title links carry a long tracking class name; match it with a regex.
	regex = "__ga__fullResult(.*)postname_clicksfullresult(.*)postnames_001"
	posts = soupx.find_all("a", {"class": re.compile(regex)})
	# Each listing appears twice in the markup, so take every other match.
	for post in posts[::2]:
		l_post_name.append(post.get_text())

	facts = soupx.find_all("p", {"class": "searchResultCompanyname"})
	for fact in facts[::2]:
		l_fact_name.append(fact.get_text())

	cities = soupx.find_all("em", {"class": "searchResultJobCityval"})
	for city in cities[::2]:
		l_city_name.append(city.get_text())

	nums = soupx.find_all("em", {"class": "searchResultJobPeopnum"})
	for num in nums:
		l_num.append(num.get_text())

	labs = soupx.find_all("p", {"class": "searchResultCompanyIndustry"})
	for lab in labs:
		l_lab.append(lab.get_text())

	# The posting date sits next to a "发布时间:" (publish date) label;
	# pull it out of the raw HTML with a regex.
	time_pa = re.compile("<span>发布时间:<em></em>(.*)</span>")
	for t in re.findall(time_pa, htmlx):
		l_post_time.append(t)

	save_to_sql(l_post_name, l_fact_name, l_city_name, l_num, l_lab, l_post_time)

headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"}

# The page range below can be anything within what the search results cover.
urls = ["http://xiaoyuan.zhaopin.com/full/0/0_0_0_0_0_-1_%E7%BB%9F%E8%AE%A1_{}_0".format(x) for x in range(2, 20)]
db = MySQLdb.connect(host="localhost", user="root", passwd="your_password_here", db="test", use_unicode=True, charset="utf8")
cursor = db.cursor()

# Keep the column sizes generous; some company descriptions run long...
sqlxx = """CREATE TABLE IF NOT EXISTS zhilian_tongji(
		post_name VARCHAR(100),
		fact_name VARCHAR(100),
		city_name VARCHAR(20),
		num VARCHAR(50),
		lab VARCHAR(200),
		post_time VARCHAR(50),
		now_time VARCHAR(50)
		) """

cursor.execute(sqlxx)

# Insert one page's worth of rows. zip() stops at the shortest list, so a
# partially parsed page cannot raise an IndexError here.
def save_to_sql(l_post_name, l_fact_name, l_city_name, l_num, l_lab, l_post_time):
	now_time = datetime.datetime.now()
	sql = """INSERT INTO zhilian_tongji
	   SET post_name=%s, fact_name=%s, city_name=%s, num=%s, lab=%s, post_time=%s, now_time=%s"""
	for row in zip(l_post_name, l_fact_name, l_city_name, l_num, l_lab, l_post_time):
		cursor.execute(sql, row + (now_time,))
	db.commit()
	print("Page scraped and saved to the database!")

for url in urls:
	try:
		time.sleep(1)  # be polite and pause between requests
		pagenum = url.split("_")[-2]
		get_save_url(url=url, headers=headers)
	except Exception:
		print("Page " + str(pagenum) + " failed...")
db.close()
print("All done!")

  The console output of a run is shown below.

  The database query results are shown below.
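  For a quick spot check of what landed in the table, something like the following works (a minimal sketch, assuming the same local test database and credentials as in the script above):

import MySQLdb

db = MySQLdb.connect(host="localhost", user="root", passwd="your_password_here",
		db="test", use_unicode=True, charset="utf8")
cursor = db.cursor()
cursor.execute("SELECT post_name, fact_name, city_name, post_time FROM zhilian_tongji LIMIT 5")
for row in cursor.fetchall():
	print(row)
db.close()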
