某市近年毕业情况的一个可视化

数据来源于:https://data.cityofnewyork.us/Education/2005-2010-Graduation-Outcomes-By-Borough/avir-tzek

数据理解

原数据其实是有点乱的,第一列Demographic可以说是一些标签吧,有English Language Learners和English Proficient Students,有Special Education和General Education,有Asian、Black、Hispanic、white,有Female和Male,还有Borough Total(这个便是总数了,只可惜我当初没发现,而是把男女加在一起来算总数的,哎哎哎走了弯路)

第二列便是所谓的Borough,一共有5个镇Bronx、Brooklyn、Manhattan、Queens、Staten Island

第三列Cohort,年份;第四列Total Cohort,本年到毕业年所有的人数

第五列Total Grads - n,本年毕业的人数。

其他的列基本上就是毕业生中再区分类别的人数及比例了。本文没用到,便不再描述了。

数据预处理

library(dplyr)
library(ggplot2)

# Load the raw graduation-outcomes table. The later code refers to the
# graduate-count column as Total.Grads...n, i.e. the syntactic name that
# read.csv() derives from the "Total Grads - n" header.
dat <- read.csv("Graduation.csv", header = TRUE)
# Convert the cohort year to a factor so it is treated as a discrete category.
dat$Cohort <- as.factor(dat$Cohort)
# Only the first five columns are used in the analysis below.
dat <- dat[, c(1, 2, 3, 4, 5)]
# as_tibble() is the supported replacement for the deprecated tbl_df().
dat_df <- as_tibble(dat)

  

每年的毕业生总人数

# Keep only the rows that already hold the per-borough totals.
borough_total <- dat[dat$Demographic == "Borough Total", ]
# as_tibble() is the supported replacement for the deprecated tbl_df().
borough_total_df <- as_tibble(borough_total)
# Sum the graduate counts of all boroughs within each cohort year.
by_Cohort <- group_by(borough_total_df, Cohort)
total <- summarise(by_Cohort, total = sum(Total.Grads...n))

# Bar chart of total graduates per cohort.
# The `+` must END each line: a line that starts with `+` is parsed as a new
# expression, so the layers would never be added to the plot.
ggplot(total, aes(x = Cohort, y = total)) +
  geom_col(aes(fill = total)) +
  theme(
    plot.title = element_text(hjust = 0.5),
    panel.background = element_rect(fill = "white", colour = "grey50")
  ) +
  labs(title = "Total Graduations of 2001-2006")

 

毕业生男女分布

# Split the data into the "Female" and "Male" demographic rows.
female <- dat[dat$Demographic == "Female", ]
male <- dat[dat$Demographic == "Male", ]

# Graduates per cohort for female students, tagged with a sex label.
# as_tibble() is the supported replacement for the deprecated tbl_df().
female_df <- as_tibble(female)
by_Cohort_fe <- group_by(female_df, Cohort)
female_tol <- summarise(by_Cohort_fe, total = sum(Total.Grads...n))
female_tol$sex <- "female"

# Same aggregation for male students.
male_df <- as_tibble(male)
by_Cohort_male <- group_by(male_df, Cohort)
male_tol <- summarise(by_Cohort_male, total = sum(Total.Grads...n))
male_tol$sex <- "male"

# Stack both summaries. Alternatively, the female and male rows could be
# combined first and grouped once.
s1 <- rbind(female_tol, male_tol)
names(s1) <- c("Cohort", "num", "sex")

# Side-by-side bars of female vs. male graduates for each cohort.
ggplot(s1, aes(x = Cohort, y = num, fill = factor(sex))) +
  geom_col(position = "dodge") +
  theme(
    legend.title = element_blank(),
    plot.title = element_text(hjust = 0.5),
    panel.background = element_rect(fill = "white", colour = "grey50")
  ) +
  labs(title = "female & male graduations of 2001-2006")

  

不同ethnic groups所占比例

这里计算的是Asian、Black、Hispanic、White这四类学生各自在2001-2006年毕业的总人数,占这四类学生6年毕业总人数之和的比例。

# Total graduates per Demographic label, summed over all boroughs and cohorts.
by_Demographic <- group_by(dat_df, Demographic)
Demographic <- summarise(by_Demographic, num = sum(Total.Grads...n))

# Select the four ethnic groups by name rather than by row position, so the
# selection does not depend on how many categories exist or how they sort.
# The comparison is case-insensitive to tolerate "white" vs. "White".
bing <- Demographic[
  tolower(Demographic$Demographic) %in% c("asian", "black", "hispanic", "white"),
]
# Legend label: group name followed by its share of the four-group total.
bing$rat <- paste0(
  bing$Demographic, "(", round(bing$num / sum(bing$num) * 100, 2), "%)"
)

# Pie chart: a single stacked bar drawn in polar coordinates.
ggplot(bing, aes(x = "", y = num, fill = Demographic)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar(theta = "y") +
  labs(x = "", y = "", title = "total graduations of different ethnic groups") +
  theme(
    axis.ticks = element_blank(),
    plot.title = element_text(hjust = 0.5),
    panel.background = element_rect(fill = "white", colour = "grey50"),
    legend.title = element_blank()
  ) +
  scale_fill_discrete(breaks = bing$Demographic, labels = bing$rat)

  

不同城镇的毕业人数及毕业率

毕业人数

# Aggregate the "Borough Total" rows per cohort year and borough:
# gra = graduates, all = cohort size, rat = gra / all (graduation rate).
by_year_Borough <- borough_total_df %>% group_by(Cohort, Borough)
year_Borough_num <- by_year_Borough %>%
  summarise(gra = sum(Total.Grads...n), all = sum(Total.Cohort))
year_Borough_num$rat <- with(year_Borough_num, gra / all)

# Graduates per cohort, one dodged bar per borough.
ggplot(year_Borough_num, aes(x = Cohort, y = gra, fill = factor(Borough))) +
  geom_col(position = "dodge") +
  labs(title = " graduations of different boroughs of 2001-2006") +
  theme(
    legend.title = element_blank(),
    plot.title = element_text(hjust = 0.5),
    panel.background = element_rect(fill = "white", colour = "grey50")
  )

  

 

毕业率

# Graduation rate per cohort, one line per borough. The title names all
# boroughs because the plot is grouped and coloured by Borough, not limited
# to Brooklyn.
ggplot(
  year_Borough_num,
  aes(x = Cohort, y = rat, group = Borough, color = Borough)
) +
  geom_line() +
  geom_point(size = 4, shape = 20) +
  theme(
    plot.title = element_text(hjust = 0.5),
    panel.background = element_rect(fill = "white", colour = "grey50")
  ) +
  labs(title = " graduation rate of different boroughs from 2001 to 2006")

  

 

不同Education

# Graduates (num), cohort size (all) and graduation rate (rat = num / all)
# for every Demographic label within every cohort year.
by_Demographic_Cohort <- group_by(dat_df, Demographic, Cohort)
Demographic_Cohort <- summarise(
  by_Demographic_Cohort,
  num = sum(Total.Grads...n),
  all = sum(Total.Cohort)
)
Demographic_Cohort$rat <- Demographic_Cohort$num / Demographic_Cohort$all

# Education type: "Special Education" vs. "General Education".
# These statements must sit on their own lines; when they share a line with
# the comment above, R treats them as comment text and never runs them.
Demographic_Cohort_education <- Demographic_Cohort[
  Demographic_Cohort$Demographic == "Special Education" |
    Demographic_Cohort$Demographic == "General Education",
]

# One facet per cohort year, one bar per education type.
ggplot(Demographic_Cohort_education, aes(x = Demographic, y = rat)) +
  geom_col(aes(fill = Demographic)) +
  facet_grid(. ~ Cohort) +
  theme(
    plot.title = element_text(hjust = 0.5),
    panel.background = element_rect(fill = "white", colour = "grey50"),
    axis.text.x = element_blank()
  ) +
  labs(title = " graduation rate of different degree of Education")

  

  

不同English

# English level: "English Proficient Students" vs. "English Language Learners".
Demographic_Cohort_english <- Demographic_Cohort[
  Demographic_Cohort$Demographic == "English Proficient Students" |
    Demographic_Cohort$Demographic == "English Language Learners",
]

# Graduation rate per English level, filled by cohort year.
# NOTE(review): geom_col() stacks by default, so each bar's height is the sum
# of the per-cohort rates; consider position = "dodge" if that is unintended.
ggplot(Demographic_Cohort_english, aes(x = Demographic, y = rat)) +
  geom_col(aes(fill = Cohort)) +
  labs(title = " graduation rate of different degree of English") +
  theme(
    plot.title = element_text(hjust = 0.5),
    panel.background = element_rect(fill = "white", colour = "grey50")
  )

  

 

原文地址:https://www.cnblogs.com/Hyacinth-Yuan/p/7995122.html