玩一个预测人品的比赛-代码积累
用xgboost进行训练,代码见下面
# xgboost pipeline: load the competition data, one-hot encode the categorical
# features into a sparse matrix, train a boosted-tree classifier and write the
# test-set scores to 'submit100.csv'.

# Set the working directory and load packages.
# NOTE(review): this path is machine-specific; adjust it before running elsewhere.
setwd('/Users/litao/R/eXtreme Gradient Boosting/eXtreme Gradient Boosting/比赛来一发数据集')
library(xgboost)
library(magrittr)
library(Matrix)
library(dplyr)

# Step 1: load data ----
train <- read.csv('train_x.csv')
test <- read.csv('test_x.csv')
train.y <- read.csv('train_y.csv')
ft <- read.csv('features_type.csv')

# Step 2: build row indices for the train and test parts so they can be
# separated again after the two sets are stacked.
train.index <- seq_len(nrow(train))
test.index <- nrow(train) + seq_len(nrow(test))

# Stack train and test and prepend the running row index as the first column.
traintest.combine <- rbind(train, test) %>%
  cbind(index = c(train.index, test.index), .)

# Convert every feature that features_type.csv marks as 'category' to a factor.
# Column 1 of `ft` holds the feature name, column 2 its type.
cat.features <- unique(as.character(ft[ft[, 2] == 'category', 1]))
traintest.combine[cat.features] <- lapply(
  traintest.combine[cat.features],
  as.factor
)
# To check the converted types against `ft`:
# str(traintest.combine, list.len = ncol(traintest.combine))

# Step 3: expand each column into its design-matrix form (factors become one
# indicator column per level, numeric columns stay as they are) and bind the
# pieces together column-wise.
df <- traintest.combine
res <- do.call('cbind', lapply(names(df), function(x) {
  model.matrix(as.formula(paste0(' ~', x, '-1')), df[x])
}))

# Drop the indicator columns whose name ends in "-1", i.e. the dummy for factor
# level -1 (presumably the missing-value code in this data set -- confirm).
# A logical mask is used instead of negative indices: with `res[, -ol]` an
# empty match would have selected *zero* columns rather than all of them.
X <- colnames(res)
is.minus.one <- grepl(glob2rx("*-1"), X)
dat <- Matrix(res[, !is.minus.one, drop = FALSE], sparse = TRUE)

# Step 4: modelling ----
# The first two columns are excluded from the features: the added `index` and
# the first original column (presumably `uid`, given `test[, 1]` below).
# NOTE(review): labels are taken positionally from train_y.csv; this assumes
# its rows are in the same order as train_x.csv.
dtrain <- xgb.DMatrix(data = dat[train.index, c(-1, -2)], label = train.y$y)
dtest <- xgb.DMatrix(data = dat[test.index, c(-1, -2)])

set.seed(1)
model100 <- xgboost(
  booster = 'gbtree',
  objective = 'binary:logistic',
  # NOTE(review): hard-coded class ratio; presumably the class counts of the
  # training labels -- confirm against train_y.csv.
  scale_pos_weight = 1542 / 13458,
  gamma = 0,
  lambda = 700,
  subsample = 0.7,
  colsample_bytree = 0.30,
  min_child_weight = 5,
  max_depth = 8,
  eta = 0.01,
  data = dtrain,
  nrounds = 3820,
  eval_metric = 'auc',
  nthread = 4
)

# Score the test set and write the submission file.
pred <- predict(model100, dtest)
write.csv(
  data.frame('uid' = test['uid'], 'score' = pred),
  file = 'submit100.csv',
  row.names = FALSE
)
head(data.frame('uid' = test[, 1], 'score' = pred))
用随机森林训练,代码见下面
# Random-forest pipeline, part 1: load packages and data, and convert the
# categorical features of train and test to factors with identical levels.

# how to calculate AUC in R?
# http://stackoverflow.com/questions/4903092/calculate-auc-in-r
if (!requireNamespace('ROCR', quietly = TRUE)) {
  install.packages('ROCR')
}
library(ROCR)
library(randomForest)
library(e1071)
library(gbm)
library(xgboost)
library(data.table)
library(magrittr)
library(stringr)
library(foreach)

# randomForest
# Step 1: load data into R and convert data types in batch ----
# NOTE(review): this path is machine-specific; adjust it before running elsewhere.
setwd('/Users/litao/R/eXtreme Gradient Boosting/eXtreme Gradient Boosting/比赛来一发数据集')
list.files()

features_type <- read.csv('features_type.csv')
train_x <- fread('train_x.csv', header = TRUE) %>% as.data.frame()
train_y <- fread('train_y.csv', header = TRUE) %>% as.data.frame()
train_y$y <- as.factor(train_y$y)
test_x <- fread('test_x.csv', header = TRUE) %>% as.data.frame()

# Row i of features_type describes column i + 1 of train_x / test_x (column 1
# is skipped; presumably the uid column -- confirm). The number of features is
# derived from the data instead of the previously hard-coded 1138.
n.features <- min(nrow(features_type), ncol(train_x) - 1)

for (i in seq_len(n.features)) {
  if (features_type[i, 2] == 'category') {
    col <- i + 1
    train_x[, col] <- as.factor(train_x[, col])
    # Encode the test column against the *training* levels. Assigning
    # `levels(test) <- levels(train)` would relabel the test levels by
    # position and silently change values whenever the two level sets differ.
    # Test values never seen in training become NA here.
    test_x[, col] <- factor(test_x[, col], levels = levels(train_x[, col]))
  }
}

# Step 2: is there any missing value in train_x?
## Calculate the missing-value ratio of every column.
##
## @param df A data frame (or anything `as.data.frame()` accepts).
## @return A named numeric vector: the fraction of NA entries per column.
missingvalue.ratio <- function(df) {
  df <- as.data.frame(df)
  colMeans(is.na(df))
}
missingvalue.ratio(train_x)

## Stratified sampling with replacement: down-sample the majority class and
## up-sample the minority class via per-class `sampsize`.
# Response is column 2 of train_y; column 1 of train_x is dropped from the
# predictors (presumably the uid column -- confirm).
dat <- cbind(y = train_y[, 2], train_x[, -1])
set.seed(12)
# ---- 5000 trees
# NOTE(review): the object is named `train.rf.1000` although ntree is 5000;
# the name is kept in case other code refers to it.
train.rf.1000 <- randomForest(
  y ~ .,
  data = dat,
  mtry = 34,
  ntree = 5000,
  sampsize = c(1542, 5000),
  strata = dat$y,
  do.trace = 1,
  nodesize = 2
)

# Calculate AUC for a randomForest classifier.
library(ROCR)

## Compute the AUC of a fitted random forest from its vote fractions, and
## draw the ROC curve with the AUC printed on the plot.
##
## @param rf_output A fitted classification `randomForest` object; the second
##   column of its `votes` matrix is used as the score.
## @param target The true class labels, in the same row order as the votes.
## @return The AUC value, invisibly.
calculate.auc <- function(rf_output, target) {
  # Use the arguments rather than the globals `train.rf` / `dat$y`, which the
  # earlier version read (and `train.rf` was never defined).
  predictions <- as.vector(rf_output$votes[, 2])
  pred <- prediction(predictions, target)

  # Calculate the AUC value.
  perf_AUC <- performance(pred, "auc")
  AUC <- perf_AUC@y.values[[1]]

  # Plot the actual ROC curve and annotate it with the AUC.
  perf_ROC <- performance(pred, "tpr", "fpr")
  plot(perf_ROC, main = "ROC plot")
  text(0.5, 0.5, paste("AUC = ", format(AUC, digits = 5, scientific = FALSE)))

  invisible(AUC)
}
# calculate.auc(train.rf.1000, dat$y)