数据科学技术与应用第五章机器学习建模分析
基于Keras建立深度神经网络模型,在bankpep数据集上训练神经网络分类模型,将训练模型的耗时以及模型性能,与XGBoost、SVM、朴素贝叶斯等方法进行比较。
import pandas,datetime,xgboost,numpy from sklearn import model_selection,preprocessing,metrics,tree,naive_bayes,svm from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier import matplotlib.pyplot as plt from keras.models import Sequential from keras.layers import Dense,Activation from keras.utils import np_utils from graphviz import Source from IPython.display import Image #请根据 bankpep.csv 保存位置适当调整代码 df=pandas.read_csv('data/bankpep.csv',index_col='id') seq=['married','car','save_act','current_act','mortgage','pep'] for feature in seq: df.loc[df[feature]=='YES',feature]=1 df.loc[df[feature] == 'NO', feature] = 0 df.loc[df['sex']=='FEMALE','sex']=1 df.loc[df['sex']=='MALE','sex']=0 dumm_region=pandas.get_dummies(df['region'],prefix='region') dumm_child=pandas.get_dummies(df['children'],prefix='children') df=df.drop(['region','children'],axis=1) df=df.join([dumm_region,dumm_child],how='outer') x=df.drop(['pep'],axis=1).values.astype(float) #x=preprocessing.scale(x) y=df['pep'].values.astype(int) x_train,x_test,y_train,y_test=model_selection.train_test_split(x,y,test_size=0.2,random_state=1) featureName=df.drop(['pep'],axis=1).columns.values className=['pep','no pep'] #tree print('Tree') start_time=datetime.datetime.now() clf_tree=tree.DecisionTreeClassifier() clf_tree.fit(x_train,y_train) pre_y_train_tree=clf_tree.predict(x_train) pre_y_test_tree=clf_tree.predict(x_test) print('train_tree') print(clf_tree.score(x_train,y_train)) print(metrics.classification_report(y_train,pre_y_train_tree)) print(metrics.confusion_matrix(y_train,pre_y_train_tree)) print('test_tree') tree_score=clf_tree.score(x_test,y_test) print(tree_score) print(metrics.classification_report(y_test,pre_y_test_tree)) print(metrics.confusion_matrix(y_test,pre_y_test_tree)) ''' graph_tree=Source(tree.export_graphviz(clf_tree,out_file=None,feature_names=featureName,class_names=className)) png_bytes=graph_tree.pipe(format='png') with open('mooc_5.2_tree.png','wb') as f: f.write(png_bytes) ''' end_time = datetime.datetime.now() time_tree=end_time-start_time print("time:",time_tree) #naive_bayes.MultinomialNB print('MultinomialNB') start_time=datetime.datetime.now() clf_MultinomialNB=naive_bayes.MultinomialNB() clf_MultinomialNB.fit(x_train,y_train) pre_y_train_MultinomialNB=clf_MultinomialNB.predict(x_train) pre_y_test_MultinomialNB=clf_MultinomialNB.predict(x_test) print('train_MultinomialNB') print(clf_MultinomialNB.score(x_train,y_train)) print(metrics.classification_report(y_train,pre_y_train_MultinomialNB)) print(metrics.confusion_matrix(y_train,pre_y_train_MultinomialNB)) print('test_MultinomialNB') MultinomialNB_score=clf_MultinomialNB.score(x_test,y_test) print(MultinomialNB_score) print(metrics.classification_report(y_test,pre_y_test_MultinomialNB)) print(metrics.confusion_matrix(y_test,pre_y_test_MultinomialNB)) end_time=datetime.datetime.now() time_MultinomialNB=end_time-start_time print("time:",time_MultinomialNB) #naive_bayes.GaussianNB print('GaussianNB') start_time=datetime.datetime.now() clf_GaussianNB=naive_bayes.GaussianNB() clf_GaussianNB.fit(x_train,y_train) pre_y_train_GaussianNB=clf_GaussianNB.predict(x_train) pre_y_test_GaussianNB=clf_GaussianNB.predict(x_test) print('train_GaussianNB') print(clf_GaussianNB.score(x_train,y_train)) print(metrics.classification_report(y_train,pre_y_train_GaussianNB)) print(metrics.confusion_matrix(y_train,pre_y_train_GaussianNB)) print('test_GaussianNB') GaussianNB_score=clf_GaussianNB.score(x_test,y_test) print(GaussianNB_score) print(metrics.classification_report(y_test,pre_y_test_GaussianNB)) print(metrics.confusion_matrix(y_test,pre_y_test_GaussianNB)) end_time=datetime.datetime.now() time_GaussianNB=end_time-start_time print("time:",time_GaussianNB) #naive_bayes.BernoulliNB print('BernoulliNB') start_time=datetime.datetime.now() clf_BernoulliNB=naive_bayes.BernoulliNB() clf_BernoulliNB.fit(x_train,y_train) pre_y_train_BernoulliNB=clf_BernoulliNB.predict(x_train) pre_y_test_BernoulliNB=clf_BernoulliNB.predict(x_test) print('train_BernoulliNB') print(clf_BernoulliNB.score(x_train,y_train)) print(metrics.classification_report(y_train,pre_y_train_BernoulliNB)) print(metrics.confusion_matrix(y_train,pre_y_train_BernoulliNB)) print('test_BernoulliNB') BernoulliNB_score=clf_BernoulliNB.score(x_test,y_test) print(BernoulliNB_score) print(metrics.classification_report(y_test,pre_y_test_BernoulliNB)) print(metrics.confusion_matrix(y_test,pre_y_test_BernoulliNB)) end_time=datetime.datetime.now() time_BernoulliNB=end_time-start_time print("time:",time_BernoulliNB) #SVM print('SVM') start_time=datetime.datetime.now() clf_SVM=svm.SVC() clf_SVM.fit(x_train,y_train) pre_y_train_SVM=clf_SVM.predict(x_train) pre_y_test_SVM=clf_SVM.predict(x_test) print('train_SVM') print(clf_SVM.score(x_train,y_train)) print(metrics.classification_report(y_train,pre_y_train_SVM)) print(metrics.confusion_matrix(y_train,pre_y_train_SVM)) print('test_SVM') SVM_score=clf_SVM.score(x_test,y_test) print(SVM_score) print(metrics.classification_report(y_test,pre_y_test_SVM)) print(metrics.confusion_matrix(y_test,pre_y_test_SVM)) end_time=datetime.datetime.now() time_SVM=end_time-start_time print("time:",time_SVM) #GBM print('GBM') start_time=datetime.datetime.now() clf_GBM=GradientBoostingClassifier() clf_GBM.fit(x_train,y_train) pre_y_train_GBM=clf_GBM.predict(x_train) pre_y_test_GBM=clf_GBM.predict(x_test) print('train_GBM') print(clf_GBM.score(x_train,y_train)) print(metrics.classification_report(y_train,pre_y_train_GBM)) print(metrics.confusion_matrix(y_train,pre_y_train_GBM)) print('test_GBM') GBM_score=clf_GBM.score(x_test,y_test) print(GBM_score) print(metrics.classification_report(y_test,pre_y_test_GBM)) print(metrics.confusion_matrix(y_test,pre_y_test_GBM)) end_time=datetime.datetime.now() time_GBM=end_time-start_time print("time:",time_GBM) #XGBoost print('XGBoost') start_time=datetime.datetime.now() clf_XGBoost=xgboost.XGBClassifier() clf_XGBoost.fit(x_train,y_train) pre_y_train_XGBoost=clf_XGBoost.predict(x_train) pre_y_test_XGBoost=clf_XGBoost.predict(x_test) print('train_XGBoost') print(clf_XGBoost.score(x_train,y_train)) print(metrics.classification_report(y_train,pre_y_train_XGBoost)) print(metrics.confusion_matrix(y_train,pre_y_train_XGBoost)) print('test_XGBoost') XGBoost_score=clf_XGBoost.score(x_test,y_test) print(XGBoost_score) print(metrics.classification_report(y_test,pre_y_test_XGBoost)) print(metrics.confusion_matrix(y_test,pre_y_test_XGBoost)) end_time=datetime.datetime.now() time_XGBoost=end_time-start_time print("time:",time_XGBoost) #RandomForestClassifier print('RFC') start_time=datetime.datetime.now() clf_RFC=RandomForestClassifier() clf_RFC.fit(x_train,y_train) pre_y_train_RFC=clf_RFC.predict(x_train) pre_y_test_RFC=clf_RFC.predict(x_test) print('train_RFC') print(clf_RFC.score(x_train,y_train)) print(metrics.classification_report(y_train,pre_y_train_RFC)) print(metrics.confusion_matrix(y_train,pre_y_train_RFC)) print('test_RFC') RFC_score=clf_RFC.score(x_test,y_test) print(RFC_score) print(metrics.classification_report(y_test,pre_y_test_RFC)) print(metrics.confusion_matrix(y_test,pre_y_test_RFC)) end_time=datetime.datetime.now() time_RFC=end_time-start_time print("time:",time_RFC) #Keras print('Keras') start_time=datetime.datetime.now() model=Sequential() model.add(Dense(units=16,input_shape=(16,))) model.add(Activation('relu')) model.add(Dense(100)) model.add(Activation('relu')) model.add(Dense(2)) model.add(Activation('softmax')) model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['binary_accuracy']) y_train_ohe=np_utils.to_categorical(y_train,2) y_test_ohe=np_utils.to_categorical(y_test,2) model.fit(x_train,y_train_ohe,epochs=25,batch_size=1,verbose=2,validation_data=(x_test,y_test_ohe)) loss,accuracy=model.evaluate(x_test,y_test_ohe) print(loss,accuracy) classes=model.predict(x_test,batch_size=1,verbose=2) Keras_score=loss end_time=datetime.datetime.now() time_Keras=end_time-start_time print("time:",time_Keras) #Matplotlib model=['tree','MultinomialNB','GaussianNB','BernoulliNB','SVM','GBM','XGBoost','RFC'] column=['Score','Time'] datas=[] for i in model: data=[] data.append(eval(i+"_score")) data.append(eval("time_"+i).total_seconds()) datas.append(data) df_Matplotlib=pandas.DataFrame(datas,columns=column,index=model) print(df_Matplotlib) print('Keras',loss,accuracy,time_Keras.total_seconds()) df_Matplotlib.plot() plt.grid() plt.show()
输出结果:
Score Time tree 0.775000 0.081810 MultinomialNB 0.666667 0.009974 GaussianNB 0.700000 0.008011 BernoulliNB 0.741667 0.009941 SVM 0.566667 0.027959 GBM 0.825000 0.100698 XGBoost 0.816667 0.153870 RFC 0.833333 0.282304 Keras 0.6881586909294128 0.550000011920929 13.049028
越努力越幸运!