Scraping job-site data with Python

# -*- coding: utf-8 -*-

# Crawler module
from bs4 import BeautifulSoup
from lxml import etree
from selenium import webdriver
import time
from pymongo import MongoClient


class WorkSpider:
    def __init__(self):
        self.client = MongoClient('mongodb://localhost:27017/')
        self.zfdb = self.client.zfdb
        #self.zfdb.authenticate("mongodbUser", "yourPassword")

    # List of cities to crawl
    def getCity(self):
        return [
            "全国",
            "北京",
            "郑州",
            #"上海",
            #"深圳",
            #"广州",
        ]

    # List of languages (search keywords) to crawl
    def getLanguage(self):
        return [
             "Java",
             "Python",
            # "C",
            # "机器学习",
            # "图像识别",
            # "自然语言处理",
            # "区块链",
            # "精准推荐",
            # "Node.js",
            # "Go",
            # "Hadoop",
            # "Php",
            # ".NET",
            # "Android",
            # "iOS",
            # "web前端",
        ]

    # Observation shows that Lagou's URL varies with language and city as follows
    def getUrl(self, language, city):
        url = "https://www.lagou.com/jobs/list_" + language + "?px=default&city=" + city
        return url
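
    # Added sketch, not in the referenced source: Chrome percent-encodes the
    # Chinese city name in the URL by itself, but pre-encoding with
    # urllib.parse.quote is safer if the URL is ever fetched outside a browser:
    def getQuotedUrl(self, language, city):
        from urllib.parse import quote
        return "https://www.lagou.com/jobs/list_" + quote(language) + "?px=default&city=" + quote(city)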

    # For one city, build the URL list over all languages
    def getCityUrl(self, city):
        urlList = []
        for language in self.getLanguage():
            urlList.append(self.getUrl(language, city))
        return urlList

    # For one language, build the URL list over all cities
    def getLanguageUrl(self, language):
        urlList = []
        for city in self.getCity():
            urlList.append(self.getUrl(language, city))
        return urlList

    # Unused placeholder (kept from the referenced source)
    def getOnePageData(self):
        pass

    # Document structure stored in MongoDB
    def getRentMsg(self, name, company, welfare, salaryMin, salaryMid, salaryMax, experience, education, companyType,
                   companyLevel, companySize):
        return {
            "name": name,  # job title (e.g. python工程师)
            "company": company,  # company name (e.g. xxx有限公司)
            "welfare": welfare,  # perks (e.g. meal allowance, afternoon tea, paid annual leave)
            "salaryMin": salaryMin,  # salary lower bound (9k)
            "salaryMid": salaryMid,  # salary midpoint, (9k + 15k) / 2
            "salaryMax": salaryMax,  # salary upper bound (15k)
            "experience": experience,  # experience required (e.g. 3-5 years)
            "education": education,  # education required (e.g. bachelor's)
            "companyType": companyType,  # company sector (e.g. mobile internet / infosec)
            "companyLevel": companyLevel,  # funding stage (e.g. listed company)
            "companySize": companySize,  # headcount (e.g. 150-500)
        }
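
    # An inserted document then looks like this (illustrative values only,
    # drawn from the field comments above):
    # {"name": "python工程师", "company": "xxx有限公司", "welfare": "餐补、带薪年假",
    #  "salaryMin": 9, "salaryMid": 12.0, "salaryMax": 15, "experience": "经验3-5年",
    #  "education": "本科", "companyType": "移动互联网", "companyLevel": "上市公司",
    #  "companySize": "150-500人"}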


    # Fetch the page source and page through the results
    # language => programming language (search keyword)
    # city => city
    # collectionType => True: name the MongoDB collection after the language
    #                   False: name it after the city
    def main(self, language, city, collectionType):
        print(" Current language => " + language + "  current city => " + city)
        url = self.getUrl(language, city)
        print(" 当前爬取的路径为 => " + url )
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--start-maximized')  # maximize the window; element lookups can fail without it
        chrome_options.add_argument('--disable-infobars')  # suppress the "browser is being controlled" infobar
        chrome_options.add_argument('--incognito')  # incognito (private) mode
        #chrome_options.add_argument('--headless')  # run without a visible browser window
        browser = webdriver.Chrome(executable_path="chromedriver", options=chrome_options)
        #browser = webdriver.Chrome("chromedriver")
        browser.get(url)
        browser.implicitly_wait(10)
        for i in range(30):
            selector = etree.HTML(browser.page_source)  # snapshot of the current page's source
            self.getItemData(selector, language, city, collectionType)
            print('Page {} scraped'.format(i + 1))
            soup = BeautifulSoup(browser.page_source, "html.parser")
            span = soup.find("div", attrs={"class": "pager_container"}).find("span", attrs={"action": "next"})
            # On the last page the span reads:
            # <span action="next" class="pager_next pager_next_disabled" hidefocus="hidefocus">下一页<strong class="pager_lgthen pager_lgthen_dis"></strong></span>
            print("span => " + str(span))
            classArr = span['class']
            print("classArr => " + str(classArr))  # e.g. ['pager_next', 'pager_next_disabled']
            if "pager_next_disabled" in classArr:
                print("Reached the last page; stopping")
                break
            else:
                print("More pages remain; continuing")
                #browser.find_element_by_xpath('//*[@id="order"]/li/div[4]/div[2]').click()  # alternative "next" locator
                browser.find_element_by_xpath('//span[@class="pager_is_current"]/following-sibling::span').click()  # click "next page"
            time.sleep(5)
        browser.close()

    # Parse the items on one page and store them in the database
    def getItemData(self, selector, language, city, collectionType):
        items = selector.xpath('//*[@id="s_position_list"]/ul/li')
        for item in items:
            try:
                name = item.xpath('div[1]/div[1]/div[1]/a/h3/text()')[0]
                company = item.xpath('div[1]/div[2]/div[1]/a/text()')[0]
                welfare = item.xpath('div[2]/div[2]/text()')[0]
                salaryArray = item.xpath('div[1]/div[1]/div[2]/div/span/text()')[0].strip().split("-")
                salaryMin = salaryArray[0][:-1]  # strip the trailing 'k'
                salaryMax = salaryArray[1][:-1]
                salaryMid = (int(salaryMin) + int(salaryMax)) / 2
                educationArray = item.xpath('div[1]/div[1]/div[2]/div//text()')[3].strip().split("/")
                education = educationArray[0].strip()
                experience = educationArray[1].strip()
                companyMsgArray = item.xpath('div[1]/div[2]/div[2]/text()')[0].strip().split("/")
                companyType = companyMsgArray[0].strip()
                companyLevel = companyMsgArray[1].strip()
                companySize = companyMsgArray[2].strip()

                data = self.getRentMsg(
                    name,
                    company,
                    welfare,
                    int(salaryMin),
                    salaryMid,
                    int(salaryMax),
                    experience,
                    education,
                    companyType,
                    companyLevel,
                    companySize
                )
                if collectionType:
                    self.zfdb["z_" + language].insert_one(data)
                else:
                    self.zfdb["z_" + city].insert_one(data)

                print(data)
            except Exception:
                print("=======  exception  =======")
                continue




spider = WorkSpider()  # job-posting spider
for language in spider.getLanguage():
    for city in spider.getCity():
        spider.main(language, city, True)
        time.sleep(5)
Spider.py
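
The fixed time.sleep(5) between pages is the most fragile part of the pagination loop. A minimal sketch of an explicit wait, using Selenium's WebDriverWait and expected_conditions (browser is the driver created in main; "s_position_list" is the same element id getItemData reads):

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    # Block for at most 10 s until the job list is present, then continue.
    WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.ID, "s_position_list"))
    )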

The code above is the complete crawler.

It was adapted from open-source code found on GitHub.

The main steps are:

1. Assemble the URL.

2. Scrape the data with Selenium.

3. Store the results in MongoDB.

4. Dismiss the ad overlay:

        browser.get(url)
        browser.implicitly_wait(10)
        try:
            browser.find_element_by_xpath('//div[@class="body-container showData"]/div/div[2]').click()  # click to close the ad overlay
        except Exception:
            pass
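
The salary parsing in getItemData assumes the field always looks like "15k-25k". A slightly more defensive helper, offered only as a sketch (parse_salary is not part of the original code), tolerates stray whitespace and bails out on unexpected shapes such as "15k以上":

    import re

    def parse_salary(text):
        """Parse a Lagou salary string such as '15k-25k' into (min, mid, max)."""
        numbers = re.findall(r'\d+', text)
        if len(numbers) < 2:
            return None  # unexpected format, e.g. '15k以上'
        low, high = int(numbers[0]), int(numbers[1])
        return low, (low + high) / 2, high

    # parse_salary(" 15k-25k ")  ->  (15, 20.0, 25)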

------------------------------------------------------------------------

Data analysis:

# -*- coding: utf-8 -*-
# Data analysis and visualization
from os import path
from wordcloud import WordCloud, ImageColorGenerator
import jieba.analyse
import matplotlib.pyplot as plt
import imageio  # replaces the deprecated scipy.misc.imread
import os
import time
from pymongo import MongoClient


class Analycis:
    def __init__(self):
        self.client = MongoClient('mongodb://localhost:27017/')
        self.zfdb = self.client.zfdb
        #self.zfdb.authenticate("mongodbUser", "yourPassword")

    def getCity(self):
        return [
            "全国",
            "北京",
            "郑州",
            #"上海",
            #"深圳",
            #"广州",
        ]

    def getLanguage(self):
        return [
            "Java",
            "Python",
            # "C",
            # "机器学习",
            # "图像识别",
            # "自然语言",
            # "区块链",
            # "Go",
            # "Php",
            # ".NET",
            # "Android",
            # "iOS",
            # "web前端",
            # "精准推荐",
            # "Node.js",
            # "Hadoop",

        ]

    # Metrics computed below:
    # - sample size per language
    # - average salary per language
    # - education requirements per language
    # - experience requirements per language
    #
    # - welfare (perks) word cloud
    # - company funding-stage ranking (Series A, Series B, ...)
    # - company type ranking

    # Get the sample count for each language
    def getLanguageNum(self):
        analycisList = []
        for index, language in enumerate(self.getLanguage()):
            collection = self.zfdb["z_" + language]
            totalNum = collection.aggregate([{'$group': {'_id': '', 'total_num': {'$sum': 1}}}])
            totalNum2 = list(totalNum)[0]["total_num"]
            analycisList.append(totalNum2)
        return (self.getLanguage(), analycisList)
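
    # Note (added): with pymongo >= 3.7 the same count is available via
    # collection.count_documents({}), avoiding the aggregation pipeline.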

    # Get the average salary for each language
    def getLanguageAvgSalary(self):
        analycisList = []
        for index, language in enumerate(self.getLanguage()):
            collection = self.zfdb["z_" + language]
            totalSalary = collection.aggregate([{'$group': {'_id': '', 'total_salary': {'$sum': '$salaryMid'}}}])
            totalNum = collection.aggregate([{'$group': {'_id': '', 'total_num': {'$sum': 1}}}])
            totalNum2 = list(totalNum)[0]["total_num"]
            totalSalary2 = list(totalSalary)[0]["total_salary"]
            analycisList.append(round(totalSalary2 / totalNum2, 2))
        return (self.getLanguage(), analycisList)
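
    # Note (added): MongoDB's $avg accumulator can fold the two pipelines into one:
    #   collection.aggregate([{'$group': {'_id': None, 'avg': {'$avg': '$salaryMid'}}}])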

    # Get the education requirements for one language (for a pyecharts pie chart)
    def getEducation(self, language):
        results = self.zfdb["z_" + language].aggregate([{'$group': {'_id': '$education', 'weight': {'$sum': 1}}}])
        educationList = []
        weightList = []
        for result in results:
            educationList.append(result["_id"])
            weightList.append(result["weight"])
        # print(list(result))
        return (educationList, weightList)

    # Get the experience requirements for one language (used for the treemap)
    def getExperience(self, language):
        results = self.zfdb["z_" + language].aggregate([{'$group': {'_id': '$experience', 'weight': {'$sum': 1}}}])
        totalAvgPriceDirList = []
        for result in results:
            totalAvgPriceDirList.append(
                {"value": result["weight"], "name": result["_id"] + "  " + str(result["weight"])})
        return totalAvgPriceDirList

    # Fetch the welfare field, used to build the perks word cloud
    def getWelfare(self):
        content = ''
        queryArgs = {}
        projectionFields = {'_id': False, 'welfare': True}  # projection given as a dict
        for language in self.getLanguage():
            collection = self.zfdb["z_" + language]
            searchRes = collection.find(queryArgs, projection=projectionFields).limit(1000)
            for result in searchRes:
                print(result["welfare"])
                content += result["welfare"]
        return content

    # Company funding-stage ranking (for the bar chart)
    def getAllCompanyLevel(self):
        levelList = []
        weightList = []
        newWeightList = []
        attrList = ["A轮", "B轮", "C轮", "D轮及以上", "不需要融资", "上市公司"]
        for language in self.getLanguage():
            collection = self.zfdb["z_" + language]
            # searchRes = collection.find(queryArgs, projection=projectionFields).limit(1000)
            results = collection.aggregate([{'$group': {'_id': '$companyLevel', 'weight': {'$sum': 1}}}])
            for result in results:
                levelList.append(result["_id"])
                weightList.append(result["weight"])
        for index, attr in enumerate(attrList):
            newWeight = 0
            for index2, level in enumerate(levelList):
                if attr == level:
                    newWeight += weightList[index2]
            newWeightList.append(newWeight)
        return (attrList, newWeightList)



    # ========================================================

    # Render a pie chart (pyecharts 0.x API)
    def showPie(self, title, attr, value):
        from pyecharts import Pie
        pie = Pie(title)
        # pie.add("aa", attr, value, is_label_show=True, title_pos='center')
        pie.add("",
                attr,
                value,
                radius=[40, 75],
                label_text_color=None,
                is_label_show=True,
                legend_orient="vertical",
                legend_pos="left", )
        pie.render()

    # Render a treemap (pyecharts 0.x API)
    def showTreeMap(self, title, data):
        from pyecharts import TreeMap
        treemap = TreeMap(title, width=1200, height=600)
        treemap.add("Shenzhen", data, is_label_show=True, label_pos='inside', label_text_size=19)
        treemap.render()

    # Render a bar chart (despite its name, this draws a pyecharts Bar)
    def showLine(self, title, attr, value):
        from pyecharts import Bar
        bar = Bar(title)
        bar.add("Shenzhen", attr, value, is_convert=False, is_label_show=True, label_text_size=18, is_random=True,
                xaxis_interval=0,
                # xaxis_label_textsize=9,
                legend_text_size=18, label_text_color=["#000"])
        bar.render()

    # Render a word cloud with the wordcloud library
    def showWorkCloud(self, content, image_filename, font_filename, out_filename):
        d = path.dirname(__file__)
        # content = open(path.join(d, filename), 'rb').read()
        # TF-IDF based keyword extraction; topK returns the highest-weighted
        # terms (default 20), withWeight controls whether weights are returned
        tags = jieba.analyse.extract_tags(content, topK=100, withWeight=False)
        text = " ".join(tags)
        # background image that defines the shape of the cloud
        img = imageio.imread(path.join(d, image_filename))
        # a Chinese font must be supplied, otherwise the text is garbled
        wc = WordCloud(font_path=font_filename,
                       background_color='black',
                       # cloud shape
                       mask=img,
                       # maximum number of words
                       max_words=500,
                       # largest font size; defaults to the image height
                       max_font_size=130,
                       # canvas width/height are ignored when mask is set
                       # width=600,
                       # height=400,
                       margin=2,
                       # fraction of words laid out horizontally (default 0.9)
                       prefer_horizontal=0.9
                       )
        wc.generate(text)
        img_color = ImageColorGenerator(img)
        plt.imshow(wc.recolor(color_func=img_color))
        plt.axis("off")
        plt.show()
        wc.to_file(path.join(d, out_filename))

    # Render a pyecharts word cloud
    def showPyechartsWordCloud(self, attr, value):
        from pyecharts import WordCloud
        wordcloud = WordCloud(width=1300, height=620)
        wordcloud.add("", attr, value, word_size_range=[20, 100])
        wordcloud.render()


analycis = Analycis()


# Sample count per language
(attr, value) = analycis.getLanguageNum()
analycis.showLine("Sample count", attr, value)
os.rename("render.html", "sampleNum.html")

# Average salary per language
(attr, value) = analycis.getLanguageAvgSalary()
analycis.showLine("Average salary per language", attr, value)
os.rename("render.html", "languageAvgSalary.html")


# Education requirements per language
for language in analycis.getLanguage():
    (attr, value) = analycis.getEducation(language)
    print(attr, value)
    analycis.showPie("                       " + language + " education requirements", attr, value)  # leading spaces roughly center the title
    os.rename("render.html", "./languageEducation/" + language + "Education.html")


# Experience requirements per language
for language in analycis.getLanguage():
    data = analycis.getExperience(language)
    print(data)
    analycis.showTreeMap("                       " + language + " experience requirements", data)
    os.rename("render.html", "./languageExperience/" + language + "Experience.html")

# Perks word cloud
analycis.showWorkCloud(analycis.getWelfare(), "docker.jpeg", "kh.ttf", out_filename="loutput.jpeg")

# Company funding-stage ranking (Series A, Series B, ...) bar chart
(attr, value) = analycis.getAllCompanyLevel()
print(attr, value)
analycis.showLine("Company funding stage", attr, value)
os.rename("render.html", "companyLevel.html")
Analysis
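
One note on the render/rename pattern in the driver script above: pyecharts 0.x writes render.html to the working directory by default, which is why every chart gets renamed afterwards. If the installed version's render() accepts a path argument (pyecharts 0.5.x does), the detour can be skipped, e.g. in showLine:

    bar.render(path="sampleNum.html")  # write straight to the target file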
Original post: https://www.cnblogs.com/liangblog/p/11943966.html