Data acquisition for data analysis (analysing data analyst job postings)

This script crawls 51job search results for the keyword "数据分析师" (data analyst), collects each posting's detail-page link, extracts the job title, salary, company name and job requirements from the detail page, and stores the records in MySQL.

import requests
import re
import time
from lxml import etree
import pymysql
class my_spider:
    
    # Initialisation (step 1)
    def __init__(self, num1, num2):
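        # The %25E6... run in base_url below is the search keyword "数据分析师"
        # (data analyst) URL-encoded twice: each %25 decodes to %, which then
        # decodes to the UTF-8 bytes of the keyword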
        self.base_url = "https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590%25E5%25B8%2588,2,{}.html"
        self.headers = {
            "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding":"gzip,deflate,br",
            "Accept-Language":"zh-CN,zh;q=0.9",
            "Cache-Control":"max-age=0",
            "Connection":"keep-alive",
            "Host":"search.51job.com",
            "Sec-Fetch-Mode":"navigate",
            "Sec-Fetch-Site":"none",
            "Sec-Fetch-User":"?1",
            "Upgrade-Insecure-Requests":"1",
            "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"
        }
        self.page_num1 = num1
        self.page_num2 = num2
        # A list that will hold the detail-page links for later fetching
        self.det_link = []
    # Build the list-page URLs (step 1)
    def get_url(self):
        url_List = []
        for i in range(self.page_num1,self.page_num2):
            url_List.append(self.base_url.format(i))
        return url_List
    # Fetch a main list page (step 1)
    def get_pages(self, url):
        # Abuyun proxy settings; defined here but never passed to requests.get,
        # so all requests actually go out directly
        proxy = {
            "http": "http://H53AXE0994W90HAD:720EEA0408F81FA2@http-dyn.abuyun.com:9020",
            "https": "http://H53AXE0994W90HAD:720EEA0408F81FA2@http-dyn.abuyun.com:9020"
        }
        response = requests.get(url=url, headers=self.headers)
        # 51job serves these pages as GBK, hence the explicit decode
        return self.parse_pages(response.content.decode('gbk'))
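    # A more defensive decode (a sketch, not part of the original post) would
    # fall back on requests' own charset detection instead of hard-coding GBK:
    #   text = response.content.decode(response.apparent_encoding or 'gbk', errors='replace')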
    
    # Parse the detail-page links out of a main list page (step 1)
    def parse_pages(self, text):
        html_51job = etree.HTML(text)
        # Each job row on the result page is a div.el inside #resultList
        all_div = html_51job.xpath("//div[@id='resultList']//div[@class='el']")
        info_List = []
        for item in all_div:
            info = {}
            info['job_info_link'] = item.xpath("./p/span/a/@href")[0]
            info_List.append(info)
        return info_List

    # Loop over the list pages and collect every detail-page link
    def run(self):
        index_urlList = self.get_url()
        for url in index_urlList:
            time.sleep(1)  # be polite: pause between list-page requests
            page_info = self.get_pages(url)
            for job_info_link in page_info:
                self.det_link.append(job_info_link['job_info_link'])

    # Fetch a single detail page
    def get_page_info(self, url):
        print(url)  # progress output
        # Same (unused) Abuyun proxy settings as in get_pages
        proxy = {
            "http": "http://H53AXE0994W90HAD:720EEA0408F81FA2@http-dyn.abuyun.com:9020",
            "https": "http://H53AXE0994W90HAD:720EEA0408F81FA2@http-dyn.abuyun.com:9020"
        }
        response = requests.get(url=url, headers=self.headers)
        return self.parse_det_info(response.content.decode('gbk'))

    # Parse the fields we want out of a detail page
    def parse_det_info(self, pages):
        item = etree.HTML(pages)
        info = {}
        # Any of these fields can be missing, so fall back to 'NaN' on IndexError
        try:
            info['job_name'] = item.xpath("//div[@class='cn']/h1/@title")[0]
        except IndexError:
            info['job_name'] = 'NaN'
        try:
            info['job_money'] = item.xpath("//div[@class='cn']/strong/text()")[0]  # this xpath raised on some pages
        except IndexError:
            info['job_money'] = 'NaN'
        try:
            info['company_name'] = item.xpath("//div[@class='cn']/p[@class='cname']/a/@title")[0]
        except IndexError:
            info['company_name'] = 'NaN'
        try:
            info['job_request'] = item.xpath("//div[@class='cn']/p[@class='msg ltype']/@title")[0]
        except IndexError:
            info['job_request'] = 'NaN'
        return info
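
    # The four lookups above all follow one pattern; a small helper (a sketch,
    # not part of the original post) would remove the repetition:
    #   def first_or_nan(item, xp):
    #       nodes = item.xpath(xp)
    #       return nodes[0] if nodes else 'NaN'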
    
    # Entry point: collect all detail links, then fetch, parse and store each one
    def main(self):
        self.run()
        print(self.det_link)
        for url in self.det_link:
            time.sleep(1)  # pause between detail-page requests
            det_pageinfo = self.get_page_info(url)
            print(det_pageinfo)
            self.save_to_mysql(det_pageinfo)

    # Save one record to MySQL
    def save_to_mysql(self, page_Info):
        # Connect to the database
        conn = pymysql.connect(host='localhost', user='root', passwd='root123', db='baidu', port=3306)
        # Cursor object
        cursor = conn.cursor()
        # Insert the record; a parameterised query so quotes in the scraped
        # text cannot break (or inject into) the SQL
        tt = page_Info
        cursor.execute(
            "insert into det_job_info(job_name,company_name,job_money,job_request) VALUES(%s,%s,%s,%s)",
            (tt['job_name'], tt['company_name'], tt['job_money'], tt['job_request'])
        )
        conn.commit()
        # Close the cursor and the connection
        cursor.close()
        conn.close()
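
# A minimal sketch (not part of the original post) of the MySQL table the
# script assumes exists: the column names match save_to_mysql, but the types
# and sizes here are assumptions.
def create_table():
    conn = pymysql.connect(host='localhost', user='root', passwd='root123', db='baidu', port=3306)
    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS det_job_info (
            id INT AUTO_INCREMENT PRIMARY KEY,
            job_name VARCHAR(255),
            company_name VARCHAR(255),
            job_money VARCHAR(64),
            job_request TEXT
        ) DEFAULT CHARSET=utf8mb4
    """)
    conn.commit()
    cursor.close()
    conn.close()
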
if __name__ == "__main__":
    # Crawl the list pages two at a time
    for i in range(159, 161, 2):
        time.sleep(1)
        spider = my_spider(i, i + 2)
        print('Fetching data for pages {}-{}'.format(i, i + 2))
        spider.main()
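
The Abuyun proxy dictionaries above are built but never handed to requests, so every request goes out from the local IP. To actually route traffic through the proxy (a sketch; the credentials are the post's own and may well have expired), pass the dictionary in via the proxies argument:

    response = requests.get(url=url, headers=self.headers, proxies=proxy, timeout=10)
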
Original article: https://www.cnblogs.com/luweilehei/p/11446485.html