泰坦尼克号生还者概率

加载数据 

赋值新数据集

字段值之间的关系

探索缺失值的方法

填补缺失值

使用决策树或随机森林处理缺失值

使用逻辑回归 

 

#加载数据

test <- read.csv(file.choose())
train <- read.csv(file.choose())

#注意:字符串要带有双引号

#“Sex”字段类型转换/生成新的字段值(因子类型字段处理)

train$sex[train$Sex == "female"] <- 0
train$sex[train$Sex == "male"] <- 1

test$sex[test$Sex == "female"] <- 0
test$sex[test$Sex == "male"] <- 1

#“Embarked“字段转换/生成新的字段值(因子类型字段处理)

train$embarked[train$Embarked == "C"] <-1
train$embarked[train$Embarked == "Q"] <-2
train$embarked[train$Embarked == "S"] <-3

test$embarked[test$Embarked == "C"] <-1
test$embarked[test$Embarked == "Q"] <-2
test$embarked[test$Embarked == "S"] <-3

#拆分字段"Name",获取称谓

title_train <- as.character(train$Name)
title_train <- strsplit(title_train," ")

title_test <- as.character(test$Name)
title_test <- strsplit(title_test," ")

#生成新字段title

train$title <- train$Survived
train$title <- as.character(train$title)

test$title <- test$PassengerId
test$title <- as.character(test$PassengerId)

#提取姓名中的称呼字段"Name" Warning

for(i in 1:length(train$title)){
temp_num <- grep("\\.",title_train[[i]]);
#if else 完善结构,无意义
if(is.integer(temp_num)) {
train$title[i] <- title_train[[i]][temp_num];
}
else {
train$title[i] <- NA;
}
}

#定义需要删除的字段

temp_name <- names(train)
temp_delete <- c("Embarked","Name","Sex")

#循环删除字段

for(i in 1:length(temp_delete))
temp_name <- temp_name[-grep(temp_delete[i],temp_name)]

#赋值新数据集 train_set

train_set <- train[temp_name]


for(i in 1:length(test$title)){
temp_num <- grep("\\.",title_test[[i]]);
#if else 完善结构,无意义
if(is.integer(temp_num)) {
test$title[i] <- title_test[[i]][temp_num];
}
else {
test$title[i] <- NA;
}
}

#定义需要删除的字段

temp_name <- names(test)
temp_delete <- c("Embarked","Name","Sex")

#循环删除字段

for(i in 1:length(temp_delete))
temp_name <- temp_name[-grep(temp_delete[i],temp_name)]

#赋值新数据集 test_set

test_set <- test[temp_name]

#剔除多余列的优化方案
# n<-names(train)
# head(train_ [n [n !="title" & n!="sex" & n!="Fare" ] ] )

#2017-08-06
#数据处理的目标:字段是否可用,如何转化为可用字段
#Ticket 可以根据字段特征(长度+前几位数字、相同字幕),确定字段是否属于同类型(级别)
#缺失值处理,可以对train和test数据集合并后进行处理

#作图

#导出图片

#savePlot(filename = "",
# type = c("wmf", "emf", "png", "jpg", "jpeg", "bmp",
# "tif", "tiff", "ps", "eps", "pdf"),
# device = dev.cur(),
# restoreConsole = TRUE)

#字段值之间的关系

#性别和生存关系

#savePlot(filename = "",
# type = c("wmf", "emf", "png", "jpg", "jpeg", "bmp",
# "tif", "tiff", "ps", "eps", "pdf"),
# device = dev.cur(),
# restoreConsole = TRUE)

 


#登船位置与生存关系

#barplot(table(train_set$Survived,train_set$embarked),col=c("red","green"),args.legend = list(x = "topleft"),legend.text = c("0", "1"))

 

 #年龄的统计量

summary(train$Age)
#Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
#0.42 20.12 28.00 29.70 38.00 80.00 177

#箱线图查看生存和年龄的关系

#boxplot(Age~Survived, data=train)

 

#查看年龄变量缺失值对应title统计量

table(train_set$title[is.na(train_set$Age)])


 #查看train数据集中的title变量年龄的分布

#boxplot(Age~title,data=train_set)

 

 

#合并test和train数据集

train_age <- data.frame(train$Age,train$title)
names(train_age) <- c("age","title")
test_age <- data.frame(test$Age,test$title)
names(test_age) <- c("age","title")
combin_age <- rbind(train_age,test_age)
head(combind_age)

#剔除title出现次数较少行

temp <- combin_age[which(combin_age$title !="Capt."
& combin_age$title !="Countess."
& combin_age$title !="Don."
& combin_age$title !="Jonkheer."
& combin_age$title !="Lady."
& combin_age$title !="Mme."
& combin_age$title !="Sir."
& combin_age$title !="Dona."
& combin_age$title !="Ms."
& combin_age$title !="Mlle."
& combin_age$title !="Major."
& combin_age$title !="Col."
),]


#作图,合并后的age和title

#boxplot(age~title,data=temp)

 

#备份数据

train_ <- train_set
test_ <- test_set

#由于和年龄相关变量较少,故此处仅使用中位数作为缺失值

#获取年龄中位数

summary(temp[which(temp$title=="Dr."),]$age)
#Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
#23.00 38.00 49.00 43.57 51.50 54.00 1
summary(temp[which(temp$title=="Master."),]$age)
#Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
#0.330 2.000 4.000 5.483 9.000 14.500 8
summary(temp[which(temp$title=="Miss."),]$age)
#Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
#0.17 15.00 22.00 21.77 30.00 63.00 50
summary(temp[which(temp$title=="Mr."),]$age)
#Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
#11.00 23.00 29.00 32.25 39.00 80.00 176
summary(temp[which(temp$title=="Mrs."),]$age)
#Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
#14.00 27.00 35.50 36.99 46.50 76.00 27

#填补缺失值

train_$Age[train_$title =="Miss." & is.na(train_$Age)]<-median(temp[which(temp$title=="Miss."),]$age,na.rm=TRUE)
train_$Age[train_$title =="Master." & is.na(train_$Age)]<-median(temp[which(temp$title=="Master."),]$age,na.rm=TRUE)
train_$Age[train_$title =="Mr." & is.na(train_$Age)]<-median(temp[which(temp$title=="Mr."),]$age,na.rm=TRUE)
train_$Age[train_$title =="Mrs." & is.na(train_$Age)]<-median(temp[which(temp$title=="Mrs."),]$age,na.rm=TRUE)
train_$Age[train_$title =="Dr." & is.na(train_$Age)]<-median(temp[which(temp$title=="Dr."),]$age,na.rm=TRUE)


test_$Age[test_$title =="Miss." & is.na(test_$Age)]<-median(temp[which(temp$title=="Miss."),]$age,na.rm=TRUE)
test_$Age[test_$title =="Master." & is.na(test_$Age)]<-median(temp[which(temp$title=="Master."),]$age,na.rm=TRUE)
test_$Age[test_$title =="Mr." & is.na(test_$Age)]<-median(temp[which(temp$title=="Mr."),]$age,na.rm=TRUE)
test_$Age[test_$title =="Mrs." & is.na(test_$Age)]<-median(temp[which(temp$title=="Mrs."),]$age,na.rm=TRUE)
test_$Age[test_$title =="Dr." & is.na(test_$Age)]<-median(temp[which(temp$title=="Dr."),]$age,na.rm=TRUE)

#剔除embarked为NA的行

train_ <- train_[!is.na(train_$embarked),]
test_ <- test_[!is.na(test_$embarked),]

#剔除多余的列

train_ <- train_[names(train_)[(names(train_) !="title")]]
test_ <- test_[names(test_)[(names(test_) !="title")]]

#票价有为0的  

#从下图可以看出,船舱等级和票价之间的联系,然后对0票价进行处理

#summary(train_$Fare) 
#发现存在票价为0的情况,查看为0的数量
#length(train_$Fare[train_$Fare == 0])

#剔除(未分析的)干扰项目

train_ <- train_[names(train_)[(names(train_) !="Cabin")]]
train_ <- train_[names(train_)[(names(train_) !="Ticket")]]

test_ <- test_[names(test_)[(names(test_) !="Cabin")]]
test_ <- test_[names(test_)[(names(test_) !="Ticket")]]

#查看变量直接的关联强度(剔除NA值)

#此处应该先把非定量序数变量变成定量序数变量后,采用Speaman进行相关分析

 

heatmap(cor(as.matrix(na.omit(train_))),symm = TRUE, Rowv=NA, Colv=NA, col = cm.colors(256))

 

 

 

#查看票价和船仓等级之间的关系

boxplot(Fare~Pclass,data=subset(train_,Fare !=  "512.3292"),horizontal=TRUE)

 

#剔除0值后再观察

boxplot(Fare~Pclass,data=subset(train_,Fare !=  "512.3292" & Fare !="0"),horizontal=TRUE)

 


 

 #探索缺失值的方法

 

library(mice)
md.pattern(train_)

 

#使用决策树或随机森林处理缺失值

#用于熟悉分类方法,目前掌握的资料暂未发现使用该方法进行缺失值处理

#采用随机抽取的方法,把train_分为训练集和测试集

#分析两个模型

#获取到如下数据 head(newData)

#注意数据集中不能出现字符串,可以更改为因子

#指定百分比抽样数据,分作训练集和测试集

sampleData <- sample(2,replace= TRUE, nrow(newData), prob=c(0.7, 0.3)) 
trainData <- newData[sampleData==1,]
testData <- newData[sampleData==2,]

#条件推断树

library(party)
f1 <- emba ~ Survived + Pclass + Age + SibSp + Parch + Fare + sex 
c_tree <- ctree(f1, data=trainData)

#绘制决策树

plot(c_tree)

#对测试数据进行预测

testPred <- predict(c_tree, newdata = testData)
table(testPred,testData$emba)

#随机森林方法进行预测

 library(randomForest)
 rf <- randomForest(f1, data=trainData, ntree=100, proximity=TRUE)
 table(predict(rf), trainData$emba)

#由于预测效果极差,放弃该方法

#处理test中的NA的情况

#此处忽略操作......

#使用逻辑回归

f1<-glm(Survived~Pclass+Age+SibSp+Parch+Fare+sex+embarked,data=train_,family=binomial())

f2<-glm(Survived~Pclass+Age+SibSp+sex+embarked,data=train_,family=binomial())

anova(f1,f2,test="Chisq")

#Analysis of Deviance Table

#Model 1: Survived ~ Pclass + Age + SibSp + Parch + Fare + sex + embarked
#Model 2: Survived ~ Pclass + Age + SibSp + sex + embarked
#  Resid. Df Resid. Dev Df Deviance Pr(>Chi)
#1       881     781.22                     
#2       883     782.29 -2  -1.0685   0.5861
f3<-glm(Survived~Pclass+Age+SibSp+sex+embarked,data=train_,family=binomial())

test_p<-test_[,c("Pclass","Age","Fare","sex","embarked")]

pred<-predict(f3,test_p)

perf[perf>0]<-1

perf[perf<=0]<-0

 

posted @ 2017-09-11 16:58  aongao  阅读(259)  评论(0编辑  收藏  举报