R 代码积累

R 代码积累不定期更新

1.阶乘、递归、reduce、sprintf

#NO.1
# 阶乘函数
fact <- function(n){
  if(n==0) return(1) #基例在这
  else return(n*fact(n-1))
}

#1+2!+3!+...+20!的和
#测试 Reduce('+', lapply(1:3,fact)) 结果是9
Reduce('+', lapply(1:20,fact))

#NO.2
#判断101-200之间有多少个素数，并输出所有素数
is.prime <- function(num) {
  if (num == 2) {
    TRUE
  } else if (any(num %% 2:(num-1) == 0)) {
    FALSE
  } else { 
    TRUE
  }
}
#101共有多少个素数
Reduce('+', lapply(101:200,is.prime))
#打印出每一个素数
for(i in 101:200){
  if(!is.prime(i)) next
  print(sprintf('%d 是素数 %s',i,is.prime(i)))
}

2.MD5加密卡号

library(magrittr)
library(digest)
library(data.table)
library(readxl)
library(lubridate)

dat=read_excel('data_union.xlsx',sheet=4)
dim(dat)
get_len <- function(a){
  return(paste0(nchar(a),a))
}
dat$card_len=lapply(dat$bank_card, get_len)%>%unlist()
dat$card_md5 <- lapply(dat$card_len, digest, algo="md5",serialize=F)%>%unlist()
dat$card_md5_upper <- toupper(dat$card_md5)
write.table(
  dat[,c('card_md5_upper')]
  ,'yqb_card_md5.txt',row.names = F
  ,col.names=F
  ,quote = F)

#ym_apply
dat$ym_apply=dat$time %m-% months(1)%>%
  format("%Y%m")%>%as.numeric()

#write out card_md5 and ym_apply
write.table(
  dat[c('card_md5_upper','ym_apply')]
  ,sep=','
  ,'yqb_card_ym_md5.txt',row.names = F
  ,col.names=F
  ,quote = F)

unique(dat$ym_apply)

3.时间函数

https://cran.r-project.org/web/packages/lubridate/vignettes/lubridate.html

4.随机森林可视化

library(randomForest)
library(kernlab) # spam数据集 来源于这个 包
library(magrittr)
library(caret)
#setwd('...')
#data <- read.csv('...')

data(spam) #加载数据集
# spam 是一个数据集，一条记录代表一封邮件包含哪些特殊词
# ，这封邮件是否是垃圾邮件（y变量）等等一些变量
View(spam)
table(spam$type) #看一下Y变量的分布，是否存在不均衡（imbalanced data）
#type 是y变量 也可以是0,1，但注意，如果你的数据集是 df,  要预先 将y变量转换为factor ,df$y <- as.factor(df$y)
set.seed(2016) #为使模型可再现，这里预先设定随机种子，因为随机森林的 每棵树对行是进行随机有放回的抽样的

rf <- randomForest(type~.
                   ,data=spam
                   ,importance=TRUE #保留变量的重要性 TURE的时候,可通过rf$importance 查看
                   ,proximity=FALSE #在n棵树种，一个矩阵任意两条记录 落在同一个叶子节点的概率，可以表示两条记录的相似程度
                   ,ntree=500 #种多少棵树
                   ,do.trace=20 #每20条显示 误分率
                   ,na.action=na.omit#一行中只要存在一个NA,这条记录就删掉
                   ,strata=spam$type #按照Y变量中（0,1的比例，进行上下采样，对占比少的用oversampling，对占比多的用downsampling）
                   ,sampsize=c(1500,1500) #对0,1采样的个数分别是多少，都是有放回的
                   ,mtry=7 #每一棵决策树，选取几个feature?，对于classification 一般是feature总数的开平方（这个是default）
                   ,keep.forest=FALSE) #保留每一棵数

rf$confusion #混淆矩阵

##变量重要性,如果你的数据是有上千个变量，可以根据变量的重要性对数据进行降维
par(mfrow = c(2,2))
for(i in 1:4){
  plot(sort(rf$importance[,i],decreasing =TRUE)%>%head(20)
       ,type='h'
       ,main=paste0(colnames(rf$importance)[i])
       ,xlab='variable'
       ,ylab='importance')
  
  text(sort(rf$importance[,i],decreasing =TRUE)
       ,labels=names(sort(rf$importance[,i]))%>%head(20)
       ,pos=1
       ,cex=0.9)
}

## 下面画ROC 曲线，计算AUC
library(ROCR)
predictions=as.vector(rf$votes[,2])
pred=prediction(predictions,spam$type)

perf_AUC=performance(pred,"auc") #Calculate the AUC value
AUC=perf_AUC@y.values[[1]]

perf_ROC=performance(pred,"tpr","fpr") #plot the actual ROC curve
plot(perf_ROC, main="ROC plot")
text(0.5,0.5,paste("AUC = ",format(AUC, digits=5, scientific=FALSE)))

#cutoff accuracy
perf <- performance(pred, measure="acc", x.measure="cutoff")
# Get the cutoff for the best accuracy
bestAccInd <- which.max(perf@"y.values"[[1]])
bestMsg <- paste("best accuracy=", perf@"y.values"[[1]][bestAccInd], 
                 " at cutoff=", round(perf@"x.values"[[1]][bestAccInd], 4))
plot(perf, sub=bestMsg)
abline(v=round(perf@"x.values"[[1]][bestAccInd], 4))

# calculate the confusion matrix and plot
cm <- confusionMatrix(rf$predicted, reference = spam$type)
draw_confusion_matrix(cm)

#confusion matrix visualization
draw_confusion_matrix <- function(cm) {
  
  layout(matrix(c(1,1,2)))
  par(mar=c(2,2,2,2))
  plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
  title('CONFUSION MATRIX', cex.main=2)
  
  # create the matrix 
  rect(150, 430, 240, 370, col='#3F97D0')
  text(195, 435,  rf$classes[1], cex=1.2)
  rect(250, 430, 340, 370, col='#F7AD50')
  text(295, 435, rf$classes[2], cex=1.2)
  text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
  text(245, 450, 'Actual', cex=1.3, font=2)
  rect(150, 305, 240, 365, col='#F7AD50')
  rect(250, 305, 340, 365, col='#3F97D0')
  text(140, 400,  rf$classes[1], cex=1.2, srt=90)
  text(140, 335, rf$classes[2], cex=1.2, srt=90)
  
  # add in the cm results 
  res <- as.numeric(cm$table)
  text(195, 400, res[1], cex=1.6, font=2, col='white')
  text(195, 335, res[2], cex=1.6, font=2, col='white')
  text(295, 400, res[3], cex=1.6, font=2, col='white')
  text(295, 335, res[4], cex=1.6, font=2, col='white')
  
  # add in the specifics 
  plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
  text(10, 85, names(cm$byClass[1]), cex=1.2, font=2)
  text(10, 70, round(as.numeric(cm$byClass[1]), 3), cex=1.2)
  text(30, 85, names(cm$byClass[2]), cex=1.2, font=2)
  text(30, 70, round(as.numeric(cm$byClass[2]), 3), cex=1.2)
  text(50, 85, names(cm$byClass[5]), cex=1.2, font=2)
  text(50, 70, round(as.numeric(cm$byClass[5]), 3), cex=1.2)
  text(70, 85, names(cm$byClass[6]), cex=1.2, font=2)
  text(70, 70, round(as.numeric(cm$byClass[6]), 3), cex=1.2)
  text(90, 85, names(cm$byClass[7]), cex=1.2, font=2)
  text(90, 70, round(as.numeric(cm$byClass[7]), 3), cex=1.2)
  
  # add in the accuracy information 
  text(30, 35, names(cm$overall[1]), cex=1.5, font=2)
  text(30, 20, round(as.numeric(cm$overall[1]), 3), cex=1.4)
  text(70, 35, names(cm$overall[2]), cex=1.5, font=2)
  text(70, 20, round(as.numeric(cm$overall[2]), 3), cex=1.4)
}

　5.一个多线程爬虫

##  99健康网 医院抓取  ##

#step1:by city
url_0='http://yyk.99.com.cn/city.html'
content_0=url_0%>%read_html()
city_nm=c(
  content_0%>%html_nodes('.cityarea li a')%>%html_text(trim=T)
  ,content_0%>%html_nodes('.cityarea .background li a')%>%html_text(trim=T)
)
city_url=c(
  content_0%>%html_nodes('.cityarea li a')%>%html_attr('href')%>%paste0('http://yyk.99.com.cn',.)
  ,content_0%>%html_nodes('.cityarea .background li a')%>%html_attr('href')%>%paste0('http://yyk.99.com.cn',.)
)
length(city_nm)
length(city_url)

#step2:get hospital by city
get_hospital_1 <- function(i){
  # i=1
  content_1=city_url[i]%>%read_html(encoding='gbk')
  hospital_nm=content_1%>%html_nodes('.tablist li span')%>%html_text(trim=T)
  hospital_url=content_1%>%html_nodes('.tablist li a')%>%html_attr('href')%>%paste0(.,'jianjie.html')
  return(data.frame(
    stringsAsFactors = F
    ,city_nm=rep(city_nm[i],length(hospital_nm))
    ,hospital_nm=hospital_nm
    ,hospital_url=hospital_url
  ))
}
system.time({
  x <- 301:length(city_url)
  res_99jk_3 <- do.call('rbind',lapply(x,get_hospital_1))
})
write.table(res_99jk_1,'res_99jk_1.csv',row.names = F,sep='	')
write.table(res_99jk_2,'res_99jk_2.csv',row.names = F,sep='	')
write.table(res_99jk_3,'res_99jk_3.csv',row.names = F,sep='	')
res_99jk <- rbind(res_99jk_1
                  ,res_99jk_2
                  ,res_99jk_3)

#step3 get all fileds level beds_cnt...
res_99jk <- read.table('res_99jk.csv',header = T)
hospital_url_pool=res_99jk$hospital_url%>%unique()
get_detail <- function(hospital_url){
  tryCatch({
    res=hospital_url%>%read_html(encoding='gbk')%>%html_nodes('.tdr')%>%html_text(trim=T)
    df=data.frame(stringsAsFactors = F
                  ,hospital_url=hospital_url
                  ,nm=res[1]
                  ,addr=res[2]
                  ,hp_admin=res[3]
                  ,start_dt=res[4]
                  ,type=res[5]
                  ,level=res[6]
                  ,section_cnt=res[7]
                  ,doc_cnt=res[8]
                  ,bed_cnt=res[9]
                  ,traffic=res[10]
                  ,insurance=res[11]
                  ,web_site=res[12]
                  ,phone=res[13]
                  ,addr=res[14]
                  ,postcode=res[15])
  },error = function(e) {
    rm(df)
    df=data.frame(stringsAsFactors = F
                  ,hospital_url=hospital_url
                  ,nm=NA
                  ,addr=NA
                  ,hp_admin=NA
                  ,start_dt=NA
                  ,type=NA
                  ,level=NA
                  ,section_cnt=NA
                  ,doc_cnt=NA
                  ,bed_cnt=NA
                  ,traffic=NA
                  ,insurance=NA
                  ,web_site=NA
                  ,phone=NA
                  ,addr=NA
                  ,postcode=NA)
  })
  return(df)
}

op <- pboptions(type = "timer")
x <- hospital_url_pool[1:5000]
system.time(rdf_99jk_1 <- do.call(rbind.data.frame,pblapply(x,get_detail)))
pboptions(op)
write.table(rdf_99jk_1,'rdf_99jk_1_5000.csv',row.names = F,sep='	')

##多线程爬虫
pkg <- c('RMySQL','DBI','xml2','rvest','magrittr','xml2','stringr','httpuv','R2HTML','pbapply','curl')
for(i in pkg){
  if(!i %in% installed.packages()[,1]) (install.packages(i))
}

library(magrittr)
library(bitops)
library(rvest)
library(stringr)
library(DBI)
library(RCurl)
library(curl)
library(sqldf)
library(gdata)
library(xml2)
library(pbapply)
library(parallel)

res_99jk <- read.table('res_99jk.csv',header = T,stringsAsFactors = F)
hospital_url_pool=res_99jk$hospital_url%>%unique()
get_detail <- function(hospital_url){
  tryCatch({
    res=hospital_url%>%read_html(encoding='gbk')%>%html_nodes('.tdr')%>%html_text(trim=T)
    df=data.frame(stringsAsFactors = F
                  ,hospital_url=hospital_url
                  ,nm=res[1]
                  ,addr=res[2]
                  ,hp_admin=res[3]
                  ,start_dt=res[4]
                  ,type=res[5]
                  ,level=res[6]
                  ,section_cnt=res[7]
                  ,doc_cnt=res[8]
                  ,bed_cnt=res[9]
                  ,traffic=res[10]
                  ,insurance=res[11]
                  ,web_site=res[12]
                  ,phone=res[13]
                  ,addr=res[14]
                  ,postcode=res[15])
  },error = function(e) {
    rm(df)
    df=data.frame(stringsAsFactors = F
                  ,hospital_url=hospital_url
                  ,nm=NA
                  ,addr=NA
                  ,hp_admin=NA
                  ,start_dt=NA
                  ,type=NA
                  ,level=NA
                  ,section_cnt=NA
                  ,doc_cnt=NA
                  ,bed_cnt=NA
                  ,traffic=NA
                  ,insurance=NA
                  ,web_site=NA
                  ,phone=NA
                  ,addr=NA
                  ,postcode=NA)
  })
  return(df)
}

# 用system.time来返回计算所需时间
system.time({
  x <- hospital_url_pool[5001:length(hospital_url_pool)]
  cl <- makeCluster(4) # 初始化四核心集群
  clusterEvalQ(cl, library(magrittr))
  clusterEvalQ(cl, library(rvest))
  clusterEvalQ(cl, library(RCurl))
  clusterEvalQ(cl, library(curl))
  clusterEvalQ(cl, library(xml2))
  results <- parLapply(cl,x,get_detail) # lapply的并行版本
  res.df <- do.call('rbind',results) # 整合结果
  stopCluster(cl) # 关闭集群
})

# res2=do.call(rbind.data.frame,results)
res2=data.frame(stringsAsFactors = F)
for(i in 1:length(results)){
  print(paste0('this is ',i))
  if (!identical(dim(results[[i]]),dim(results[[1]]))) next
  res2=rbind(res2,results[[i]])
}