Machine Learning Log: Titanic Survival Prediction (Titanic, sklearn, decision tree / random forest)
This is the first machine learning problem I've worked on.
Problem statement: we are given data about a set of Titanic passengers; each record contains
PassengerId => passenger ID
Pclass => cabin class (1st / 2nd / 3rd)
Name => passenger name
Sex => sex
Age => age
SibSp => number of siblings / spouses aboard
Parch => number of parents / children aboard
Ticket => ticket number
Fare => ticket fare
Cabin => cabin number
Embarked => port of embarkation
For part of the passengers we also know whether they survived.
The task is to predict whether each of the remaining passengers survived.
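Before building anything I checked which columns actually have missing values. A minimal sketch (train.csv here is the same training file loaded in the code further down; among the features used later, Age is the one with gaps):

import pandas as pd

# Minimal sketch: load the training file and count missing values per column.
train = pd.read_csv('train.csv')
print(train.shape)            # rows x columns
print(train.isnull().sum())   # per-column NaN counts (Age has gaps, hence it is dropped below)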
The approach here is a decision tree.
I'm just getting started, so the tree is very crude.
Information gain (computed with the entropy function ent) decides which attribute to split on.
Some passengers' Age values are missing, so I simply dropped the Age feature.
To keep attribute values discrete, each attribute is compared against its mean value and the samples are split into a below-mean branch and an at-or-above-mean branch.
Accuracy: 77.511%; I'll keep improving it.
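For reference, the split criterion used below is plain information gain: the entropy of the node minus the size-weighted entropy of the two halves produced by splitting at the attribute's mean. A small standalone sketch of that computation (entropy/info_gain and the toy labels here are only illustrative names and data, not part of the real script):

import math

def entropy(labels):
    # Shannon entropy of a list of 0/1 labels
    if not labels:
        return 0.0
    p1 = sum(labels) / len(labels)
    p0 = 1 - p1
    return sum(-p * math.log2(p) for p in (p0, p1) if p > 0)

def info_gain(labels, values, threshold):
    # gain of splitting by values < threshold vs. values >= threshold
    left = [y for y, v in zip(labels, values) if v < threshold]
    right = [y for y, v in zip(labels, values) if v >= threshold]
    weighted = (len(left) / len(labels)) * entropy(left) + (len(right) / len(labels)) * entropy(right)
    return entropy(labels) - weighted

# toy example: survival labels split by Pclass at its mean
labels = [0, 1, 1, 0, 1]
pclass = [3, 1, 2, 3, 1]
print(info_gain(labels, pclass, sum(pclass) / len(pclass)))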
import pandas as pd
import math

bestA = []  # best splitting attribute at each tree node
ye = []     # leaf marker: -1 = internal node, 0/1 = predicted class at a leaf
ping = []   # per-attribute mean, used as the discretization threshold
# Check whether every sample in nowD has identical values on all attributes in nowA
def allSame(nowD, nowA):
    for a in nowA:
        for j in range(len(nowD)):
            if nowD[j][a] != nowD[0][a]:
                return 0
    return 1
# Shannon entropy of the Survived labels (column 0) of nowD
def ent(nowD):
    if len(nowD) == 0:
        return 0
    flag = [0, 0]
    for i in range(len(nowD)):
        flag[nowD[i][0]] += 1
    p = [flag[0] / len(nowD), flag[1] / len(nowD)]
    # the small constant avoids log(0) when one class is absent
    return -p[0] * math.log(p[0] + 0.0001, 2) - p[1] * math.log(p[1] + 0.0001, 2)
# Information gain of splitting nowD on attribute a at its mean value ping[a]
def calcGain(nowD, a):
    tmpD = [[], []]
    for i in range(len(nowD)):
        if nowD[i][a] < ping[a]:
            tmpD[0].append(nowD[i])
        else:
            tmpD[1].append(nowD[i])
    return ent(nowD) - len(tmpD[0]) / len(nowD) * ent(tmpD[0]) - len(tmpD[1]) / len(nowD) * ent(tmpD[1])
# Recursively build the decision tree; node nowId's children are 2*nowId and 2*nowId+1
def build(nowId, nowD, nowA):
    global ye, bestA
    flag = [0, 0]
    tmpD = [[], []]
    for i in range(len(nowD)):
        flag[nowD[i][0]] += 1
    if flag[0] == 0:          # every sample survived
        ye[nowId] = 1
        return
    if flag[1] == 0:          # no sample survived
        ye[nowId] = 0
        return
    if len(nowA) == 0 or allSame(nowD, nowA):
        ye[nowId] = 1 if flag[1] > flag[0] else 0   # majority vote
        return
    # pick the attribute with the largest information gain
    bestA[nowId] = nowA[0]
    bestGain = calcGain(nowD, nowA[0])
    for i in range(1, len(nowA)):
        nowGain = calcGain(nowD, nowA[i])
        if nowGain > bestGain:
            bestA[nowId] = nowA[i]
            bestGain = nowGain
    # split the samples at the chosen attribute's mean value
    for i in range(len(nowD)):
        if nowD[i][bestA[nowId]] < ping[bestA[nowId]]:
            tmpD[0].append(nowD[i])
        else:
            tmpD[1].append(nowD[i])
    # build a fresh attribute list for the children instead of mutating the caller's
    subA = [a for a in nowA if a != bestA[nowId]]
    for i in [0, 1]:
        if len(tmpD[i]) == 0:
            ye[nowId * 2 + i] = 1 if flag[1] > flag[0] else 0
        else:
            build(nowId * 2 + i, tmpD[i], list(subA))
# Walk the tree with a single sample D and return the predicted class
def ask(nowId, D):
    if ye[nowId] != -1:
        return ye[nowId]
    if D[bestA[nowId]] < ping[bestA[nowId]]:
        return ask(nowId * 2, D)
    else:
        return ask(nowId * 2 + 1, D)
myTrain = pd.read_csv('train.csv')
myTest = pd.read_csv('test.csv')
D = []
A = [1, 2, 3, 4, 5]   # attribute indices in a D row: 1=Pclass, 2=Sex, 3=SibSp, 4=Parch, 5=Fare
PassengerId = []
Survived = []
# preallocate the (implicitly stored) tree arrays
ye = [-1] * 10000
bestA = [-1] * 10000
# read the training rows as [Survived, Pclass, Sex (male=1), SibSp, Parch, Fare]
for i in range(myTrain.shape[0]):
    D.append(
        [myTrain.values[i][1], myTrain.values[i][2], 1 if (myTrain.values[i][4] == 'male') else 0,
         myTrain.values[i][6], myTrain.values[i][7], myTrain.values[i][9]])
# mean of each attribute, used as the discretization threshold
for i in range(len(D[0])):
    total = 0
    for j in range(len(D)):
        total = total + D[j][i]
    ping.append(total / len(D))
build(1, D, A)
# predict the test set; columns 1, 3, 5, 6, 8 are Pclass, Sex, SibSp, Parch, Fare
for i in range(len(myTest)):
    PassengerId.append(myTest.values[i][0])
    Survived.append(ask(1, [0, myTest.values[i][1], 1 if (myTest.values[i][3] == 'male') else 0,
                            myTest.values[i][5], myTest.values[i][6], myTest.values[i][8]]))
myAns = pd.DataFrame({'PassengerId': PassengerId, 'Survived': Survived})
myAns.to_csv("myAns.csv", index=False, sep=',')
Here is also a random-forest version built with sklearn; the accuracy is about the same. This time the NaN ages are filled with the mean.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
y = train_data["Survived"]
features = ["Pclass", "Sex", "Age", "SibSp", "Parch"]   # features fed to the forest
x = pd.get_dummies(train_data[features])
# print(train_data["Age"].sum() / len(train_data["Age"]))  # ~23.79, but len() also counts
# the NaN rows; train_data["Age"].mean() (which skips NaN) would be the usual estimate
x = x.fillna(24)  # fill missing ages with (roughly) that mean
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=233)
model.fit(x, y)   # train the model
myIn = pd.get_dummies(test_data[features])
myIn = myIn.fillna(24)
predictions = model.predict(myIn)
myAns = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
myAns.to_csv('myAns.csv', index=False)
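To compare settings without submitting every time, a quick local estimate also works here. A minimal sketch with 5-fold cross-validation (the fold count and scoring choice are illustrative; x and y are the training matrices built above):

from sklearn.model_selection import cross_val_score

# Minimal sketch: estimate accuracy locally with 5-fold cross-validation
# using the same forest settings as above.
scores = cross_val_score(
    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=233),
    x, y, cv=5, scoring='accuracy')
print(scores.mean(), scores.std())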