数据科学第一战:Iris的Python数据分析

用Python具体实现机器学习算法感觉并没有Octave方便

幸好python有专门的scikit-learn库来直接用,所以以后的策略就是用Octave学习和理解算法

用python应用库算法解决Kaggle问题

1,Iris数据集逻辑回归Python版,数据地址:https://www.kaggle.com/chuckyin/iris-datasets

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split


# Logistic regression on the full Iris file with a random 80/20 split.
df = pd.read_csv('iris.csv')

# Target: the species label of every sample.
y = df['Species']
# Features: every column except the label. An 'Id' column is only a row
# number, not a measurement, so it is removed too when the file has one
# (errors='ignore' keeps this working if it does not) -- the same two columns
# getData() drops in the neural-network scripts further down.
# NOTE: the accuracy quoted in the surrounding text was measured with every
# non-label column (including any Id) used as a feature, so it may differ.
X = df.drop(['Id', 'Species'], axis=1, errors='ignore')

# Hold out a random 20% of the rows for testing. No random_state is given, so
# the split (and therefore the score) changes from run to run.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = LogisticRegression()
clf.fit(x_train, y_train)

# For a classifier, score() is the mean accuracy on the given samples.
score = clf.score(x_test, y_test)
print(score)

 分类器的正确率只有93.3%,这还是用train_test_split从整个数据集中随机划分出20%作为测试集的结果;接下来改用事先划分好的两个文件:20%的数据作为测试集(iris_test.csv),80%的数据作为训练集(iris_train.csv)

 

 import numpy as np
 import pandas as pd
 from sklearn.linear_model import LogisticRegression


 # Logistic regression on the pre-split train / test CSV files.
 df_train = pd.read_csv('iris_train.csv')
 df_test = pd.read_csv('iris_test.csv')

 # 'Species' is the label. 'Id' is only a row number, not a measurement, so it
 # must not be fed to the model as a feature; getData() in the neural-network
 # scripts drops the same two columns from these files.
 # NOTE: the accuracy quoted in the surrounding text was measured with Id
 # still among the features, so the number printed here may differ.
 y_train = df_train['Species']
 x_train = df_train.drop(['Id', 'Species'], axis=1)
 y_test = df_test['Species']
 x_test = df_test.drop(['Id', 'Species'], axis=1)

 clf = LogisticRegression()
 clf.fit(x_train, y_train)

 # For a classifier, score() is the mean accuracy on the given samples.
 score = clf.score(x_test, y_test)
 print(score)

正确率只有66.7%,数据太少,只有150个样例,4个特征,接下来换神经网络试试

import numpy as np
import pandas as pd


def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), applied elementwise."""
    return 1 / (1 + np.exp(-z))


def sigmoidGradient(z):
    """Return the derivative of the sigmoid at z: sigmoid(z) * (1 - sigmoid(z))."""
    s = sigmoid(z)
    return (1 - s) * s


def predict(X, theta1, theta2):
    """Forward-propagate X through the two-layer network.

    A column of ones (the bias unit) is prepended to the input and to the
    hidden activations before each weight matrix is applied.

    Returns the output-layer activations, one row per row of X.
    """
    m = len(X)

    a1 = np.hstack((np.ones([m, 1]), X))

    z2 = np.dot(a1, theta1)
    a2 = sigmoid(z2)
    a2 = np.hstack((np.ones([m, 1]), a2))

    z3 = np.dot(a2, theta2)
    return sigmoid(z3)


def convert(y):
    """One-hot encode a pandas label column.

    Columns follow the sorted unique labels returned by np.unique, so the
    result has one row per label and one column per distinct class.
    """
    labels = np.asarray(y.values)
    classes = np.unique(labels)
    # Broadcasting compares every label against every class in one step.
    return (labels[:, None] == classes[None, :]).astype(float)


def getData(fileName):
    """Read a CSV file and return (X, y).

    X holds every column except 'Id' and 'Species' as a NumPy array; y is the
    one-hot encoding of the 'Species' column.
    """
    df = pd.read_csv(fileName)
    y = convert(df['Species'])
    X = df.drop(['Id', 'Species'], axis=1).values
    return X, y


X_train, y_train = getData('iris_train.csv')
X_test, y_test = getData('iris_test.csv')

# Fixed seed so the random initial weights are reproducible.
np.random.seed(1)

# Weights drawn uniformly from [-1, 1). Row 0 of each matrix multiplies the
# bias unit: theta1 is (1 bias + 4 inputs) x 4 hidden units, theta2 is
# (1 bias + 4 hidden units) x 3 outputs.
theta1 = 2 * np.random.rand(5, 4) - 1
theta2 = 2 * np.random.rand(5, 3) - 1

# The input layer never changes during training, so build it once.
m = len(X_train)
a1 = np.hstack((np.ones([m, 1]), X_train))

# 30000 iterations, so the error is reported three times (at 10000, 20000 and
# 30000) -- matching the three values listed after this script and the loop
# bound used by the regularised version.
for iteration in range(1, 30001):

    # forward propagation
    z2 = a1.dot(theta1)
    a2 = sigmoid(z2)
    a2 = np.hstack((np.ones([m, 1]), a2))

    z3 = a2.dot(theta2)
    a3 = sigmoid(z3)

    # back propagation
    # Output error, transposed to (outputs x samples).
    delta3 = (a3 - y_train).T

    if iteration % 10000 == 0:
        print("Error:" + str(np.mean(np.abs(delta3))))

    grad2 = np.dot(a2.T, delta3.T)

    # theta2[1:, :] skips the bias row: no error flows back into the bias unit.
    delta2 = np.dot(theta2[1:, :], delta3) * sigmoidGradient(z2.T)

    grad1 = np.dot(a1.T, delta2.T)

    # Batch gradient-descent step with an implicit learning rate of 1.
    theta1 -= grad1 / m
    theta2 -= grad2 / m

# Predicted class = index of the largest output; compare with the true one-hot.
res = np.argmax(predict(X_test, theta1, theta2), axis=1)
print(np.mean(np.argmax(y_test, axis=1) == res))

还是神经网络强大,100%的准确率,当然也是因为数据偏少,偏差值收敛,每迭代10000次的偏差值

Error:0.013312372435
Error:0.0119592971635
Error:0.0115683980319

 

加上正则化,准确率虽然没有降低(也可能是数据少),但是偏差值发散,每迭代10000次的偏差值

Error:0.0491758284265
Error:0.0457085051291
Error:0.0523013950821

import numpy as np
import pandas as pd

def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), applied elementwise."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator


def sigmoidGradient(z):
    """Return the derivative of the sigmoid at z: sigmoid(z) * (1 - sigmoid(z))."""
    activation = sigmoid(z)
    return (1 - activation) * activation

def predict(X, theta1, theta2):
    """Forward-propagate X through the two-layer network.

    A column of ones (the bias unit) is prepended to the input and to the
    hidden activations before each weight matrix is applied.

    Returns the output-layer activations, one row per row of X.
    """
    sample_count = len(X)
    bias = np.ones([sample_count, 1])

    # input layer -> hidden layer
    inputs = np.hstack((bias, X))
    hidden = sigmoid(inputs.dot(theta1))

    # hidden layer -> output layer
    hidden = np.hstack((bias, hidden))
    return sigmoid(hidden.dot(theta2))


def convert(y):
    """One-hot encode a pandas label column.

    Args:
        y: a pandas Series (anything exposing ``.values``) of class labels.

    Returns:
        A float array with one row per label and one column per distinct
        class. Columns follow the sorted unique labels returned by
        ``np.unique``; entry [i, j] is 1 when label i equals class j, else 0.
    """
    labels = np.asarray(y.values)
    classes = np.unique(labels)
    # Broadcasting compares every label against every class in one vectorised
    # step instead of a nested Python loop over (label, class) pairs.
    return (labels[:, None] == classes[None, :]).astype(float)


def getData(fileName):
    """Read a CSV file and return (X, y).

    X holds every column except 'Id' and 'Species' as a NumPy array; y is the
    one-hot encoding of the 'Species' column produced by convert().
    """
    frame = pd.read_csv(fileName)
    labels = convert(frame['Species'])
    feature_frame = frame.drop(['Id', 'Species'], axis=1)
    return feature_frame.values, labels

# Train the two-layer network with L2 regularisation, then report test accuracy.
X_train, y_train = getData('iris_train.csv')
X_test, y_test = getData('iris_test.csv')

# Fixed seed so the random initial weights are reproducible.
np.random.seed(1)

# Weights drawn uniformly from [-1, 1). Row 0 of each matrix multiplies the
# bias unit: theta1 is (1 bias + 4 inputs) x 4 hidden units, theta2 is
# (1 bias + 4 hidden units) x 3 outputs.
theta1 = 2 * np.random.rand(5, 4) - 1
theta2 = 2 * np.random.rand(5, 3) - 1

# L2 regularisation strength.
lamb = 0.1

# The input layer never changes during training, so build it once instead of
# on every iteration.
m = len(X_train)
a1 = np.hstack((np.ones([m, 1]), X_train))

for iteration in range(1, 30001):

    # forward propagation
    z2 = a1.dot(theta1)
    a2 = sigmoid(z2)
    a2 = np.hstack((np.ones([m, 1]), a2))

    z3 = a2.dot(theta2)
    a3 = sigmoid(z3)

    # back propagation
    # Output error, transposed to (outputs x samples).
    delta3 = (a3 - y_train).T

    if iteration % 10000 == 0:
        print("Error:" + str(np.mean(np.abs(delta3))))

    # Add the penalty to the whole gradient, then take it back out of row 0 so
    # the bias weights are not regularised.
    grad2 = np.dot(a2.T, delta3.T) + lamb * theta2
    grad2[0, :] -= lamb * theta2[0, :]

    # Hidden-layer error. theta2[1:, :] and a2.T[1:, :] skip the bias unit;
    # (1 - a) * a is the sigmoid derivative written with the activations that
    # are already available.
    delta2 = np.dot(theta2[1:, :], delta3) * (1 - a2.T[1:, :]) * a2.T[1:, :]

    grad1 = np.dot(a1.T, delta2.T) + lamb * theta1
    grad1[0, :] -= lamb * theta1[0, :]

    # Batch gradient-descent step with an implicit learning rate of 1.
    theta1 -= grad1 / m
    theta2 -= grad2 / m


# Predicted class = index of the largest output; compare with the true one-hot.
res = np.argmax(predict(X_test, theta1, theta2), axis=1)

print(np.mean(np.argmax(y_test, axis=1) == res))

 

 
posted @ 2017-03-26 21:34    阅读(404)  评论(0)  编辑  收藏  举报