"""Build a decision tree in Python: Titanic survival prediction.

The script loads the Titanic training CSV, cleans and encodes it, fits a
``DecisionTreeClassifier``, looks at how ``max_depth`` changes the scores,
and finally tunes the hyper-parameters with a grid search.
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict  # imported by the original script; unused below
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Location of the training data; handed to pandas unchanged.
DATA_PATH = r'file:///E:/学习/python/机器学习课件 9.20-9.21/titanic/train.csv'

# ---------------------------------------------------------------------------
# Load and inspect the data
# ---------------------------------------------------------------------------
# The first CSV column is used as the row index.
data = pd.read_csv(DATA_PATH, index_col=0)

print(data.head())
data.info()  # info() prints its summary itself
print(data.shape)

# ---------------------------------------------------------------------------
# Clean the data
# ---------------------------------------------------------------------------
# Drop the columns that have too many missing values or that were judged to
# be unrelated to the value being predicted.
data = data.drop(columns=["Cabin", "Name", "Ticket"])
data.info()
print(data.head())

# Fill the missing ages with the mean age.
# NOTE: the mean is computed over the whole data set, before the train/test
# split below, so rows that end up in the test split contribute to it.
data["Age"] = data["Age"].fillna(data["Age"].mean())

# Rows that still contain a missing value are few, so they are simply dropped.
data = data.dropna()
data.info()

# ---------------------------------------------------------------------------
# Encode the categorical columns as integers
# ---------------------------------------------------------------------------
# Each distinct value is replaced by an integer code, numbered in order of
# first appearance in the column (0, 1, ...).
for column in ("Sex", "Embarked"):
    codes, uniques = pd.factorize(data[column])
    labels = uniques.tolist()  # labels[code] is the original category value
    data[column] = codes

# ---------------------------------------------------------------------------
# Separate features from the label and split into train / test parts
# ---------------------------------------------------------------------------
# Column 0 is the label; every remaining column is a feature. The label must
# be the column that X leaves out, otherwise the model is asked to predict one
# of its own inputs.
# NOTE(review): column 0 is presumably the survival flag of the Kaggle file -
# confirm against the CSV header.
X = data.iloc[:, 1:]
Y = data.iloc[:, 0]

# The order of the four returned parts is fixed by train_test_split.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=420
)

# Renumber the index of every part from 0 after the split.
for part in (X_train, y_train, X_test, y_test):
    part.index = range(part.shape[0])

# ---------------------------------------------------------------------------
# Baseline model
# ---------------------------------------------------------------------------
clf = DecisionTreeClassifier(random_state=25)
clf = clf.fit(X_train, y_train)

# Scores on the training split and on the test split.
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

# Importance of each feature, paired with the feature's column name. The
# names come from X (not data) so that they line up with the importances.
print(clf.feature_importances_)
print(list(zip(X.columns, clf.feature_importances_)))

# Variance of the 10-fold cross-validation scores on the training split.
var = cross_val_score(clf, X_train, y_train, cv=10).var()
print(var)

# ---------------------------------------------------------------------------
# Effect of the tree depth on the fit
# ---------------------------------------------------------------------------
tr = []    # score on the training split, one entry per depth
te = []    # mean 10-fold cross-validation score on the training split
test = []  # score on the test split

# Depths 1 through 9.
depths = range(1, 10)
for depth in depths:
    clf = DecisionTreeClassifier(max_depth=depth, random_state=666)
    clf = clf.fit(X_train, y_train)
    tr.append(clf.score(X_train, y_train))
    te.append(cross_val_score(clf, X_train, y_train, cv=10).mean())
    test.append(clf.score(X_test, y_test))

print(len(test))
print(max(te))
print(len(tr))

# Switch the plotting style, then draw the three score curves against depth.
plt.style.use("ggplot")
plt.plot(depths, tr, color="red", label="train")
plt.plot(depths, te, color="blue", label="cross_val_score")
plt.plot(depths, test, color="green", label="test")
plt.xticks(depths)
plt.legend()
plt.show()

# ---------------------------------------------------------------------------
# Hyper-parameter tuning with a grid search
# ---------------------------------------------------------------------------
# Candidate values for min_impurity_decrease: 20 evenly spaced points in
# [0, 0.5].
gini_thresholds = np.linspace(0, 0.5, 20)
print(gini_thresholds)

# Every parameter to search, mapped to its candidate values.
parameters = {
    "splitter": ("best", "random"),
    "criterion": ("gini", "entropy"),
    "max_depth": range(1, 10),
    "min_samples_leaf": range(1, 50, 5),
    "min_impurity_decrease": gini_thresholds,
}

# The estimator is created without the searched parameters; the grid search
# supplies them.
clf = DecisionTreeClassifier(random_state=25)
GS = GridSearchCV(clf, parameters, cv=5, verbose=1)
GS.fit(X_train, y_train)

# Best parameter combination found, and its score.
print(GS.best_params_)
print(GS.best_score_)