数据科学第一战:Iris 数据集的 Python 数据分析
用 Python 具体实现机器学习算法,感觉并没有 Octave 方便。
幸好 Python 有专门的 scikit-learn 库可以直接用,所以以后的策略就是:用 Octave 学习和理解算法,
用 Python 调用库里的算法解决 Kaggle 问题。
1. Iris 数据集逻辑回归(Python 版),数据地址:https://www.kaggle.com/chuckyin/iris-datasets
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split

# Load the full Iris table; the 'Species' column is the class label.
df = pd.read_csv('iris.csv')
y = df['Species']

# Features are every column except the label.
# NOTE(review): the neural-network scripts in this post drop an 'Id' column
# from the Iris CSVs. If this file has one too it is presumably a row index
# rather than a measurement, so it is excluded here as well;
# errors='ignore' keeps this line working when the column is absent.
X = df.drop('Species', axis=1).drop(columns=['Id'], errors='ignore')

# Hold out a random 20% of the rows for evaluation.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = LogisticRegression()
clf.fit(x_train, y_train)

# Mean accuracy on the held-out rows.
score = clf.score(x_test, y_test)
print(score)
分类器的正确率只有93.3%(注意:上面的代码已经用 train_test_split 随机划出 20% 的数据作为测试集)。接下来改用事先划分好的两个文件:20% 的数据集为测试集(iris_test.csv),80% 的数据集为训练集(iris_train.csv)。
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression


# Training and test rows come from two separate CSV files.
df_train = pd.read_csv('iris_train.csv')
df_test = pd.read_csv('iris_test.csv')

# 'Species' is the class label; everything else is a feature.
# NOTE(review): the neural-network scripts in this post drop an 'Id' column
# from these same files. It is presumably a row index rather than a
# measurement, so it is excluded here too; errors='ignore' keeps the line
# working if the column is absent.
y_train = df_train['Species']
x_train = df_train.drop('Species', axis=1).drop(columns=['Id'], errors='ignore')
y_test = df_test['Species']
x_test = df_test.drop('Species', axis=1).drop(columns=['Id'], errors='ignore')

clf = LogisticRegression()
clf.fit(x_train, y_train)

# Mean accuracy on the test file.
score = clf.score(x_test, y_test)
print(score)
正确率只有66.7%,数据太少,只有150个样例,4个特征,接下来换神经网络试试
import numpy as np
import pandas as pd


def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), applied element-wise."""
    return 1 / (1 + np.exp(-z))


def sigmoidGradient(z):
    """Return the derivative of the sigmoid evaluated at z, element-wise."""
    s = sigmoid(z)  # evaluate the sigmoid once instead of twice
    return (1 - s) * s


def _forward(X, theta1, theta2):
    """Run one forward pass and return (a1, z2, a2, a3).

    a1 and a2 are the input and hidden activations with a leading column of
    ones (the bias unit) prepended; z2 is the hidden pre-activation; a3 is
    the output activation.
    """
    bias = np.ones([len(X), 1])
    a1 = np.hstack((bias, X))
    z2 = np.dot(a1, theta1)
    a2 = np.hstack((bias, sigmoid(z2)))
    a3 = sigmoid(np.dot(a2, theta2))
    return a1, z2, a2, a3


def predict(X, theta1, theta2):
    """Return the output-layer activations for every row of X."""
    return _forward(X, theta1, theta2)[3]


def convert(y):
    """One-hot encode the labels in y.

    Columns follow the sorted unique labels returned by np.unique, so the
    result has shape (len(y), number of distinct labels).
    """
    labels = np.asarray(y)
    classes, class_index = np.unique(labels, return_inverse=True)
    one_hot = np.zeros([len(labels), len(classes)])
    # Set a single 1 per row, at the column of that row's class.
    one_hot[np.arange(len(labels)), class_index] = 1
    return one_hot


def getData(fileName):
    """Read a CSV with 'Id' and 'Species' columns; return (X, y).

    X holds the remaining columns as a NumPy array and y is the one-hot
    encoding of 'Species'.
    """
    df = pd.read_csv(fileName)
    y = convert(df['Species'])
    X = df.drop(['Id', 'Species'], axis=1).values
    return X, y


def _train(X, y, theta1, theta2, n_iter=30000, report_every=10000):
    """Run n_iter batch gradient-descent steps, updating the thetas in place.

    Every report_every steps the mean absolute output error is printed.
    """
    m = len(X)
    # Inclusive upper bound, so the report also fires on the final step.
    for step in range(1, n_iter + 1):
        a1, z2, a2, a3 = _forward(X, theta1, theta2)

        # Back-propagation: the output error is used directly as delta3.
        delta3 = (a3 - y).T
        if step % report_every == 0:
            print("Error:" + str(np.mean(np.abs(delta3))))

        grad2 = np.dot(a2.T, delta3.T)
        # theta2[1:] skips the bias row: the bias unit has no incoming error.
        delta2 = np.dot(theta2[1:, ], delta3) * sigmoidGradient(z2.T)
        grad1 = np.dot(a1.T, delta2.T)

        theta1 -= grad1 / m
        theta2 -= grad2 / m
    return theta1, theta2


def main():
    """Train on iris_train.csv and print the accuracy on iris_test.csv."""
    X_train, y_train = getData('iris_train.csv')
    X_test, y_test = getData('iris_test.csv')

    n_hidden = 4
    np.random.seed(1)
    # Weights uniform in [-1, 1); the extra row holds the bias weights.
    # For the Iris data these are the shapes (5, 4) and (5, 3).
    theta1 = 2 * np.random.rand(X_train.shape[1] + 1, n_hidden) - 1
    theta2 = 2 * np.random.rand(n_hidden + 1, y_train.shape[1]) - 1

    _train(X_train, y_train, theta1, theta2)

    res = np.argmax(predict(X_test, theta1, theta2), axis=1)
    print(np.mean(np.argmax(y_test, axis=1) == res))


if __name__ == "__main__":
    main()
还是神经网络强大,100%的准确率,当然也是因为数据偏少。偏差值收敛,每迭代10000次输出的偏差值如下:
Error:0.013312372435
Error:0.0119592971635
Error:0.0115683980319
加上正则化,准确率虽然没有降低(也可能是数据少),但是偏差值发散,每迭代10000次输出的偏差值如下:
Error:0.0491758284265
Error:0.0457085051291
Error:0.0523013950821
import numpy as np
import pandas as pd


def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), applied element-wise."""
    return 1 / (1 + np.exp(-z))


def sigmoidGradient(z):
    """Return the derivative of the sigmoid evaluated at z, element-wise."""
    s = sigmoid(z)  # evaluate the sigmoid once instead of twice
    return (1 - s) * s


def _forward(X, theta1, theta2):
    """Run one forward pass and return (a1, a2, a3).

    a1 and a2 are the input and hidden activations with a leading column of
    ones (the bias unit) prepended; a3 is the output activation.
    """
    bias = np.ones([len(X), 1])
    a1 = np.hstack((bias, X))
    a2 = np.hstack((bias, sigmoid(np.dot(a1, theta1))))
    a3 = sigmoid(np.dot(a2, theta2))
    return a1, a2, a3


def predict(X, theta1, theta2):
    """Return the output-layer activations for every row of X."""
    return _forward(X, theta1, theta2)[2]


def convert(y):
    """One-hot encode the labels in y.

    Columns follow the sorted unique labels returned by np.unique, so the
    result has shape (len(y), number of distinct labels).
    """
    labels = np.asarray(y)
    classes, class_index = np.unique(labels, return_inverse=True)
    one_hot = np.zeros([len(labels), len(classes)])
    # Set a single 1 per row, at the column of that row's class.
    one_hot[np.arange(len(labels)), class_index] = 1
    return one_hot


def getData(fileName):
    """Read a CSV with 'Id' and 'Species' columns; return (X, y).

    X holds the remaining columns as a NumPy array and y is the one-hot
    encoding of 'Species'.
    """
    df = pd.read_csv(fileName)
    y = convert(df['Species'])
    X = df.drop(['Id', 'Species'], axis=1).values
    return X, y


def _train(X, y, theta1, theta2, lamb, n_iter=30000, report_every=10000):
    """Run n_iter regularized gradient-descent steps, updating thetas in place.

    lamb scales the L2 penalty added to every non-bias weight gradient.
    Every report_every steps the mean absolute output error is printed.
    """
    m = len(X)
    for step in range(1, n_iter + 1):
        a1, a2, a3 = _forward(X, theta1, theta2)

        # Back-propagation: the output error is used directly as delta3.
        delta3 = (a3 - y).T
        if step % report_every == 0:
            print("Error:" + str(np.mean(np.abs(delta3))))

        # Add the penalty to every row, then take it back off row 0 so the
        # bias weights stay unregularized.
        grad2 = np.dot(a2.T, delta3.T) + lamb * theta2
        grad2[0, :] -= lamb * theta2[0, :]

        # Sigmoid derivative written via the hidden activations a * (1 - a);
        # the [1:] slices skip the bias row/unit.
        hidden = a2.T[1:, ]
        delta2 = np.dot(theta2[1:, ], delta3) * (1 - hidden) * hidden

        grad1 = np.dot(a1.T, delta2.T) + lamb * theta1
        grad1[0, :] -= lamb * theta1[0, :]

        theta1 -= grad1 / m
        theta2 -= grad2 / m
    return theta1, theta2


def main():
    """Train on iris_train.csv and print the accuracy on iris_test.csv."""
    X_train, y_train = getData('iris_train.csv')
    X_test, y_test = getData('iris_test.csv')

    n_hidden = 4
    np.random.seed(1)
    # Weights uniform in [-1, 1); the extra row holds the bias weights.
    # For the Iris data these are the shapes (5, 4) and (5, 3).
    theta1 = 2 * np.random.rand(X_train.shape[1] + 1, n_hidden) - 1
    theta2 = 2 * np.random.rand(n_hidden + 1, y_train.shape[1]) - 1
    lamb = 0.1

    _train(X_train, y_train, theta1, theta2, lamb)

    res = np.argmax(predict(X_test, theta1, theta2), axis=1)
    print(np.mean(np.argmax(y_test, axis=1) == res))


if __name__ == "__main__":
    main()