# -*- coding: utf-8 -*-
# @Time: 2020/7/7 14:47
# @Author: liruifeng
# @File: zhuaqu.py
# @Software: PyCharm
"""Scrape job postings from search.51job.com for a user-supplied keyword.

The script asks for a keyword, fetches the first search-result page, follows
every job link found on it and prints the details parsed from each job page.
Helpers for storing rows in a SQLite database are provided as well.
"""

import sqlite3
import urllib.error
import urllib.request
from urllib import parse

from bs4 import BeautifulSoup

# The keyword is URL-quoted twice before it is placed in the search URL.
# NOTE: the prompt runs at import time, exactly as in the original script.
kw = input("请输入你要搜索的岗位关键字:")
keyword = parse.quote(parse.quote(kw))
pageNum = 1
jobData = {}  # Template for one record: a dict of key/value pairs.
jobList = []  # All collected jobs; every element is a dict like jobData.


def main():
    """Crawl the first result page and print the details of every job on it."""
    # Only one page is requested; pageNum is never advanced.
    for _ in range(1):
        url = (
            "https://search.51job.com/list/010000,000000,0000,00,9,99,"
            + keyword + ",2," + str(pageNum) + ".html"
        )
        pageList = getLink(url)  # All job links found on the list page.
        if not pageList:
            break
        for jobpage in pageList:
            getData(jobpage)  # One detail-page link at a time.
    # TODO: persist the results with saveData2(datalist, "./51job.db") once
    # getData produces structured rows instead of printing them.
    print(jobList)


def getLink(url):
    """Return the job detail links found on the list page at *url*.

    Every link is also appended to the module-level ``jobList`` as
    ``{"link": href}``. Anchors without an ``href`` attribute are skipped.
    """
    html = askURL(url)  # Fetch the list page.
    bs = BeautifulSoup(html, "html.parser")
    jobLink = []
    for link in bs.select(".el > .t1 > span > a"):
        href = link.get("href")
        if not href:
            continue
        jobLink.append(href)
        jobList.append({"link": href})
    return jobLink


def getData(jobpage):
    """Fetch the detail page *jobpage*, print its parsed fields, return its HTML.

    The fields are printed once for every ``jobList`` entry whose ``"link"``
    equals *jobpage*. Printed, in order: job title(s), company name(s), the
    "|"-separated requirement fields, the welfare tags and the description.
    """
    jobHtml = askURL(jobpage)  # Fetch the detail page.
    bs = BeautifulSoup(jobHtml, "html.parser")  # Parse it.
    for job in jobList:
        if jobpage != job["link"]:
            continue

        for name in bs.select(".cn > h1"):  # Job title.
            print(name["title"])

        for company in bs.select(".catn"):  # Company name.
            print(company["title"])

        # Requirement summary. The page may lack it entirely (askURL returns
        # "" on failure) and the number of "|"-separated fields varies, so
        # print whatever is present instead of indexing fixed positions.
        requirements = bs.select(".ltype")
        if requirements:
            for field in requirements[0]["title"].split("|"):
                print(field.strip(), end=' ')

        for benefit in bs.select(".sp4"):  # Welfare tags.
            print(benefit.text, end=' ')

        # Job description: the text of every paragraph, concatenated.
        description = "".join(p.text for p in bs.select(".job_msg > p"))
        print(description.lstrip())
    return jobHtml


def askURL(url):
    """Return the page at *url* decoded as GBK, or ``""`` on a URL error.

    On ``urllib.error.URLError`` the error's ``code`` and/or ``reason`` are
    printed when present and an empty string is returned.
    """
    # Request header that imitates a browser.
    # NOTE(review): the User-Agent value contains unusual spaces; it is kept
    # exactly as it was — confirm whether the server accepts it.
    head = {
        "User-Agent": "Mozilla / 5.0(Windows NT 10.0;WOW64) AppleWebKit / "
                      "537.36(KHTML, likeGecko) Chrome / 81.0.4044.138Safari / 537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the response even if read() fails.
        with urllib.request.urlopen(request) as response:
            # Replace undecodable bytes instead of aborting the whole crawl
            # on a single bad character.
            html = response.read().decode("gbk", errors="replace")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


def init_db(dbpath):
    """Create the ``job`` table in the SQLite file *dbpath* if it is missing.

    The database file is opened (and created when absent). Calling this
    function again on an existing database is a no-op.
    """
    sql = '''
        create table if not exists job
        (
            id integer primary key autoincrement,
            job_link text,
            job_name text,
            cname varchar,
            area varchar,
            ssalary numeric,
            educate numeric,
            info text
        )
    '''
    conn = sqlite3.connect(dbpath)  # Open or create the database file.
    try:
        conn.execute(sql)
        conn.commit()
    finally:
        conn.close()  # Always release the connection.


def saveData2(datalist, dbpath):
    """Insert every row of *datalist* into the ``job`` table of *dbpath*.

    Each row must hold seven values in the order job_link, job_name, cname,
    area, ssalary, educate, info. Values are bound as SQL parameters, so
    quotes inside them are stored verbatim, and the caller's rows are not
    modified. All rows are committed together.

    Raises:
        sqlite3.Error: if a row cannot be inserted, e.g. because it does not
            contain exactly seven values.
    """
    init_db(dbpath)
    sql = (
        "insert into job "
        "(job_link, job_name, cname, area, ssalary, educate, info) "
        "values (?, ?, ?, ?, ?, ?, ?)"
    )
    conn = sqlite3.connect(dbpath)
    try:
        conn.executemany(sql, (tuple(data) for data in datalist))
        conn.commit()
    finally:
        conn.close()


if __name__ == '__main__':
    main()