Python hands-on projects: web crawlers (Part 1)

With my senior-year internship coming up, I couldn't be bothered to page through the school career sites one posting at a time, so I just wrote crawlers to pull all the data down

and put it into spreadsheets, which feels a lot simpler. Sadly I still haven't found a job, so sad.

I picked three schools in total: Hunan University, Central South University, and Xiangtan University.

The code for the three projects follows (beginner code, painful to look at). All three share the same skeleton (fetch listings with requests, write rows with xlwt), sketched right below.
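For orientation, here is that shared skeleton as a minimal, runnable sketch; the endpoint URL and the 'field' key are placeholders, not any school's real API:

#!/usr/bin/python3
#coding=utf-8
import requests
import xlwt

workbook = xlwt.Workbook()
sheet = workbook.add_sheet('list1')
sheet.write(0, 0, 'field')  # header row

# hypothetical endpoint that returns {'data': [{'field': ...}, ...]}
resp = requests.get('http://example.com/api', timeout=10)
for row, item in enumerate(resp.json()['data'], start=1):
    sheet.write(row, 0, item['field'])

workbook.save('output.xls')  # xlwt writes the legacy .xls format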

Xiangtan University:

#!/usr/bin/python3
#coding=utf-8
import requests
import logging
import xlwt
basic_url = 'http://jobs.xtu.edu.cn/index/getdaycareers?day=2018-10-'
logging.basicConfig(level=logging.DEBUG, format='%(message)s')  # an empty format string would log blank lines

workbook = xlwt.Workbook()
sheet1 = workbook.add_sheet('list1')
sheet1.write(0,0,'时间')
sheet1.write(0,1,'地点')
sheet1.write(0,2,'公司名称')
sheet1.write(0,3,'专业要求')
sheet1.write(0,5,'详细信息')
count=1
for day in range(1, 32):  # October has 31 days
    url = basic_url + str(day)
    logging.debug('the crawler URL is: ' + url)
    resp = requests.get(url)
    payload = resp.json()  # parse the JSON once instead of on every access
    logging.debug(payload['data'])

    data_list = payload['data']  # the useful data

    for item in data_list:  # don't reuse the outer loop variable here
        sheet1.write(count, 0, item['meet_day'])
        sheet1.write(count, 1, item['address'])
        sheet1.write(count, 2, item['meet_name'])
        sheet1.write(count, 3, item['professionals'])
        sheet1.write(count, 5, 'http://jobs.xtu.edu.cn/detail/career?id=' + item['career_talk_id'])
        count = count + 1
workbook.save('湘潭大学十月份招聘信息.xls')  # xlwt writes the legacy .xls format, not .xlsx
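Before committing to the full 31-day loop, it helps to poke one day's payload by hand; a minimal sketch, assuming the endpoint still returns a JSON object with a 'data' list the way it did in 2018:

#!/usr/bin/python3
#coding=utf-8
import requests

# fetch a single day and inspect the shape of the response
resp = requests.get('http://jobs.xtu.edu.cn/index/getdaycareers?day=2018-10-15', timeout=10)
payload = resp.json()
print(payload.keys())  # expect a 'data' key holding a list of talks
if payload.get('data'):
    print(payload['data'][0].keys())  # the field names used in the script above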

Central South University:

This one was the biggest pain and took me over an hour: the monthly calendar comes from a POST endpoint that returns JSON, but the details live in each company's HTML page, so it needs BeautifulSoup on top of requests.

#!/usr/bin/python3
#coding=utf-8
import requests
import xlwt
import logging
from bs4 import BeautifulSoup
# initialise the log file path and format
logging.basicConfig(filename='log.txt',level=logging.DEBUG,format='%(asctime)s - %(levelname)s - %(message)s')
logging.getLogger('requests').setLevel(logging.WARNING)  # silence requests' own logging

# initialise the spreadsheet
workbook = xlwt.Workbook()
sheet1 = workbook.add_sheet('list')
sheet1.write(0,0,'时间')
sheet1.write(0,1,'地点')
sheet1.write(0,2,'公司名称')
sheet1.write(0,3,'职位名称')
sheet1.write(0,4,'教育水平')
sheet1.write(0,5,'专业要求')
sheet1.write(0,6,'空缺数量')
sheet1.write(0,7,'详细信息')

# endpoints
json_all_url = 'http://jobsky.csu.edu.cn/Home/SearchDateAllMonth'
dt1 = {'Date': '2018-09-04'}
resp = requests.post(json_all_url, data=dt1)  # POST the date to get the month's calendar
json_data = resp.json()
logging.debug(type(json_data))
'''with open('json.txt','w') as fileTxt:  # debug dump of the raw JSON
    for i in json_data:
        fileTxt.write(str(i) + '\n')
'''
basic_html_url = 'http://jobsky.csu.edu.cn/Home/ArticleDetails/'

counter_all = 1
for data in json_data:
    company_Id = data['NewsID']
    #logging.debug('the companyID is: ' + company_Id)
    html_url = basic_html_url + str(company_Id)
    #html_url = basic_html_url + '13713'  # static URL for testing; remove it when done

    html_txt = requests.get(html_url)
    #logging.debug('the status code is: ' + str(html_txt.status_code))
    bs = BeautifulSoup(html_txt.text, 'lxml')

    # get the company name
    list_soup_CN = bs.find('h1', attrs={'class': 'text-center title'})
    try:
        advertise_company_name = list_soup_CN.getText()
        sheet1.write(counter_all, 2, advertise_company_name)
    except AttributeError:  # find() returns None when the element is missing
        logging.debug('the url ' + html_url + ' has some problem')

    # get the time and place
    try:
        list_soup_TP = bs.find('div', attrs={'id': 'placeAndTime'})
        advertise_time = list_soup_TP.find('p', attrs={'class': 'text-center time'}).getText()
        advertise_place = list_soup_TP.find('p', attrs={'class': 'text-center place'}).getText()
        sheet1.write(counter_all, 0, advertise_time)
        sheet1.write(counter_all, 1, advertise_place)
    except AttributeError:
        logging.debug('the url ' + html_url + ' has some problem')

    # the demands table alternates label and value cells, so the values we
    # want sit at positions 1, 3, 5 and 7 (found by reading the page source)
    try:
        list_soup_demand = bs.find('table', attrs={'class': 'table table-bordered'})
        list_td = list_soup_demand.find_all('td')
        counter_even = 0
        for td in list_td:
            if counter_even == 1:
                sheet1.write(counter_all, 3, td.getText())  # position title
            if counter_even == 3:
                sheet1.write(counter_all, 4, td.getText())  # education level
            if counter_even == 5:
                sheet1.write(counter_all, 5, td.getText())  # majors wanted
            if counter_even == 7:
                sheet1.write(counter_all, 6, td.getText())  # openings
            counter_even = counter_even + 1
        sheet1.write(counter_all, 7, html_url)
        counter_all += 1
    except AttributeError:
        logging.debug('the url ' + html_url + ' has some problem')
    # save after every company so partial results survive a crash
    workbook.save('中南大学招聘信息.xls')  # xlwt writes legacy .xls, not .xlsx
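The label/value layout of that demands table is the whole trick: the cells alternate header, value, header, value, so the values sit at odd indices 1, 3, 5, 7. A minimal sketch of the same extraction with slicing, run against made-up markup since I can't verify today's page:

from bs4 import BeautifulSoup

html = '''<table class="table table-bordered">
<tr><td>职位名称</td><td>软件工程师</td><td>教育水平</td><td>本科</td></tr>
<tr><td>专业要求</td><td>计算机类</td><td>空缺数量</td><td>10</td></tr>
</table>'''  # invented example markup, not a real CSU page

cells = [td.get_text(strip=True)
         for td in BeautifulSoup(html, 'lxml').find_all('td')]
# the values sit right after their labels, at the odd indices
position, education, majors, openings = cells[1::2]
print(position, education, majors, openings)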

Last is Hunan University. I don't know why, but its recruiting listings are pitifully sparse.

#!/usr/bin/python3
#coding=utf-8
import requests
import logging
import xlwt
json_url = 'https://hnu.bysjy.com.cn/module/getcareers?start_page=1&keyword=&type=inner&day=&count=15&start=1&_=1536044186160'
logging.basicConfig(level=logging.DEBUG, format='%(message)s')  # an empty format string would log blank lines

resp = requests.get(json_url)
#print(resp.text)

workbook = xlwt.Workbook()
sheet1 = workbook.add_sheet('list1')
sheet1.write(0,0,'时间')
sheet1.write(0,1,'地点')
sheet1.write(0,2,'公司名称')
sheet1.write(0,3,'招聘会')
sheet1.write(0,4,'专业要求')
sheet1.write(0,5,'详细信息')  # same column the detail link is written to below
count = 1

data_list = resp.json()['data']  # the useful data

for item in data_list:
    sheet1.write(count, 0, item['meet_day'] + item['meet_time'])
    sheet1.write(count, 1, item['address'])
    sheet1.write(count, 2, item['company_name'])
    sheet1.write(count, 3, item['meet_name'])
    sheet1.write(count, 4, item['professionals'])
    sheet1.write(count, 5, 'https://hnu.bysjy.com.cn/detail/career?id=' + item['career_talk_id'])
    count = count + 1
workbook.save('湖南大学招聘信息.xls')  # xlwt writes the legacy .xls format, not .xlsx
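One thing I didn't handle: the captured URL carries start_page=1 and count=15, which strongly suggests the endpoint is paginated, so a single GET only sees the first 15 talks; that may even be part of why the results looked so sparse. A sketch of walking the pages until one comes back empty; the parameter behaviour is my assumption from the URL, not something I verified:

#!/usr/bin/python3
#coding=utf-8
import requests

base = 'https://hnu.bysjy.com.cn/module/getcareers'
page = 1
all_items = []
while True:
    # start_page/count mirror the parameters visible in the captured URL
    params = {'start_page': page, 'keyword': '', 'type': 'inner',
              'day': '', 'count': 15}
    data = requests.get(base, params=params, timeout=10).json().get('data', [])
    if not data:  # an empty page means we have run out of listings
        break
    all_items.extend(data)
    page += 1
print(len(all_items), 'listings in total')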
Original article: https://www.cnblogs.com/gambler/p/9584689.html