玩一个预测人品的比赛

玩一个预测人品的比赛－代码积累

用xgboost进行训练，代码见下面

# Set the working directory and attach the packages used by the xgboost pipeline.
setwd("/Users/litao/R/eXtreme Gradient Boosting/eXtreme Gradient Boosting/比赛来一发数据集")
library(xgboost)
library(magrittr)
library(Matrix)
library(dplyr)
# step 1: load the data
# All four files are read relative to the working directory set above.
train <- read.csv("train_x.csv")       # training features
test <- read.csv("test_x.csv")         # test features
train.y <- read.csv("train_y.csv")     # training labels (column `y` is used later)
ft <- read.csv("features_type.csv")    # per-feature type table (column `feature` + type)
# step 2: build row indices for the training and test sets so that each part
# can be picked out of the combined table later on.
# seq_len() is used instead of seq(1, n, 1) so that an empty table yields an
# empty index instead of an error.
train.index <- seq_len(nrow(train))
test.index <- nrow(train) + seq_len(nrow(test))
# Combine train and test: stack the rows, then prepend the running row number
# as a column named `index`.
traintest.combine <- rbind(train, test) %>%
    cbind(index = c(train.index, test.index), .)
# Collect the distinct feature names from the first column of the type table.
fea <- unique(ft[, 1])
# Convert data types: every feature marked 'category' becomes a factor.
for (f in fea) {
    # any() keeps the condition a single TRUE/FALSE even when a feature is
    # listed more than once in `ft`; a vector condition is an error in `if`.
    if (any(ft[which(ft$feature == f), 2] == 'category', na.rm = TRUE)) {
        traintest.combine[, f] <- as.factor(traintest.combine[, f])
    }
}
# Check the converted column types against the types listed in `ft`:
# str(traintest.combine, list.len=ncol(traintest.combine))
# step 3: expand the table into a design matrix and store it as a sparse matrix.
# Each column is passed through model.matrix() on its own (formula ` ~<name>-1`,
# i.e. without intercept) and the per-column results are bound side by side.
# NOTE(review): model.matrix() applies the default na.action, so this assumes
# the columns contain no NA values (otherwise row counts would differ) - confirm.
df <- traintest.combine
res <- do.call(
    'cbind',
    lapply(names(df), function(x) {
        model.matrix(as.formula(paste0(' ~', x, '-1')), df[x])
    })
)
# Drop the generated columns whose name ends in "-1" (presumably the indicator
# columns for the factor level -1 - verify against the data).
X <- colnames(res)
ol <- grep(glob2rx("*-1"), X)
# Select the columns to keep by positive index: negative indexing with an empty
# `ol` would select no columns at all. drop = FALSE keeps a matrix even when a
# single column remains.
dat <- Matrix(res[, setdiff(seq_along(X), ol), drop = FALSE], sparse = TRUE)
# step 4: modeling
# Build the xgboost input matrices. c(-1, -2) removes the first two columns of
# `dat` (presumably `index` and `uid` - verify against the combine step).
dtrain <- xgb.DMatrix(data = dat[train.index, c(-1, -2)], label = train.y$y)
dtest <- xgb.DMatrix(data = dat[test.index, c(-1, -2)])

# Fix the RNG state before training.
set.seed(1)
model100 <- xgboost(
    data = dtrain,
    nrounds = 3820,
    # booster hyper-parameters
    booster = "gbtree",
    objective = "binary:logistic",
    scale_pos_weight = 1542 / 13458,
    gamma = 0,
    lambda = 700,
    subsample = 0.7,
    colsample_bytree = 0.30,
    min_child_weight = 5,
    max_depth = 8,
    eta = 0.01,
    eval_metric = "auc",
    nthread = 4
)

# Score the test set, write the submission file, and show its first rows.
pred <- predict(model100, dtest)
write.csv(
    data.frame("uid" = test["uid"], "score" = pred),
    file = "submit100.csv",
    row.names = FALSE
)
head(data.frame("uid" = test[, 1], "score" = pred))

用随机森林训练，代码见下面

# how to calculate AUC in R?
# http://stackoverflow.com/questions/4903092/calculate-auc-in-r
if(!'ROCR' %in% installed.packages()[,1]) (install.packages('ROCR'))
library(ROCR)
library(randomForest)
library(e1071)
library(gbm)
library(xgboost)
library(data.table)
library(magrittr)
library(stringr)
library(foreach)
# randomForest
# step 1: load data into R and convert data types in batch
setwd('/Users/litao/R/eXtreme Gradient Boosting/eXtreme Gradient Boosting/比赛来一发数据集')
list.files()
features_type <- read.csv('features_type.csv')
train_x <- fread('train_x.csv', header = TRUE) %>% as.data.frame()
train_y <- fread('train_y.csv', header = TRUE) %>% as.data.frame()
train_y$y <- as.factor(train_y$y)
test_x <- fread('test_x.csv', header = TRUE) %>% as.data.frame()
# Convert every 'category' feature to a factor in both train_x and test_x.
# Row i of features_type is taken to describe column i + 1 of the feature
# tables (column 1 is skipped). The rows come from features_type itself rather
# than from a hard-coded count.
for (i in which(features_type[, 2] == 'category')) {
    # Both tables get the same level set, built from the values actually seen
    # in either table. Assigning `levels(test) <- levels(train)` instead would
    # relabel the test values by position and silently change their meaning
    # whenever the two level sets differ.
    shared.levels <- union(
        levels(as.factor(train_x[, i + 1])),
        levels(as.factor(test_x[, i + 1]))
    )
    train_x[, i + 1] <- factor(train_x[, i + 1], levels = shared.levels)
    test_x[, i + 1] <- factor(test_x[, i + 1], levels = shared.levels)
}
# step 2: are there any missing values in train_x?
## Compute the ratio of missing (NA) values for every column.
##
## df: a data frame, or any object as.data.frame() can convert (e.g. a matrix).
## Returns a numeric vector with one element per column, named after the
## columns: number of NA entries divided by the number of rows.
missingvalue.ratio <- function(df) {
    df <- as.data.frame(df)
    # is.na() gives a logical matrix; its column sums are the NA counts.
    # nrow() is used for the denominator so zero-column input does not error.
    colSums(is.na(df)) / nrow(df)
}
# Per-column missing-value ratio of the training features.
missingvalue.ratio(train_x)

## Training table: the label (second column of train_y) as `y`, followed by
## every column of train_x except the first.
dat <- cbind(y = train_y[, 2], train_x[, -1])

set.seed(12)
# ---- 5000 trees
## Stratified sampling on the class label: `strata` is y and `sampsize` gives
## the number of cases drawn from each class for every tree.
train.rf.1000 <- randomForest(
    y ~ .,
    data = dat,
    ntree = 5000,
    mtry = 34,
    nodesize = 2,
    strata = dat$y,
    sampsize = c(1542, 5000),
    do.trace = 1
)
# calculate AUC in randomForest
library(ROCR)
# Compute the AUC of a fitted random forest and draw its ROC curve.
#
# rf_output: fitted model object with a `votes` matrix (as produced by
#            randomForest for classification); column 2 is used as the score.
# target:    true class labels, one per row of rf_output$votes.
#
# Side effect: plots the ROC curve and writes the AUC value onto the plot.
# Returns the AUC invisibly.
calculate.auc <- function(rf_output, target) {
    # Use the arguments that were passed in instead of objects looked up in the
    # global environment.
    predictions <- as.vector(rf_output$votes[, 2])
    pred <- prediction(predictions, target)

    # Calculate the AUC value.
    perf_AUC <- performance(pred, "auc")
    AUC <- perf_AUC@y.values[[1]]

    # Plot the actual ROC curve and annotate it with the AUC.
    perf_ROC <- performance(pred, "tpr", "fpr")
    plot(perf_ROC, main = "ROC plot")
    text(0.5, 0.5, paste("AUC = ", format(AUC, digits = 5, scientific = FALSE)))

    invisible(AUC)
}
# Example call:
# calculate.auc(train.rf.1000, dat$y)