常见machine learning模型实现
一、感知机模型
二、线性回归(Linear Regression)
from numpy import *  # noqa: F401,F403 -- kept: other snippets in this file use bare NumPy names
import numpy as np


def loadData(filename):
    """Load a two-column, comma-separated data file for univariate regression.

    Each non-blank line is parsed as ``feature,target``. A constant ``1.0`` is
    prepended to every feature row so the first parameter acts as the
    intercept term.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(x, y)`` of plain lists: ``x`` holds ``[1.0, feature]`` rows
        and ``y`` holds the matching targets as floats.
    """
    x = []
    y = []
    # The context manager closes the file even when a line fails to parse.
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            lineData = line.split(',')
            x.append([1.0, float(lineData[0])])
            y.append(float(lineData[1]))
    return x, y


def h(theta, x):
    """Hypothesis function: the dot product of ``theta`` and ``x``.

    For two 1-D arrays this is a scalar prediction; for 2-D operands ``dot``
    performs matrix multiplication.
    """
    return np.dot(theta, x)


def batch_gradient_descent(alpha, theta, x, y):
    """Perform one batch gradient-descent step for least-squares regression.

    Args:
        alpha: Learning rate.
        theta: Current parameter vector, length ``n``.
        x: Design matrix with one sample per row, shape ``(m, n)``.
        y: Target vector, length ``m``.

    Returns:
        A new float array ``theta - alpha * gradient`` where the gradient is
        ``x.T.dot(x.dot(theta) - y) / m``. ``theta`` itself is not modified.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    theta = np.asarray(theta, dtype=float)
    m = x.shape[0]
    residual = x.dot(theta) - y
    gradient = x.T.dot(residual) / m
    # The step must start from the current theta; starting from zeros would
    # discard all progress made by earlier iterations.
    return theta - alpha * gradient


def normal_equation(x, y):
    """Solve least squares in closed form via the normal equations.

    Solves ``(x.T x) theta = x.T y`` directly instead of forming the explicit
    inverse of ``x.T x``. Raises ``numpy.linalg.LinAlgError`` when ``x.T x``
    is singular.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    return np.linalg.solve(x.T.dot(x), x.T.dot(y))


def cost_function(theta, x, y):
    """Return the squared-error cost: the sum of squared residuals over ``2 * m``."""
    m = x.shape[0]
    residual = x.dot(theta) - y
    return residual.dot(residual) / (2 * m)


def run(filename='ex1data1.txt', alpha=0.01, iterations=1000):
    """Fit the data file with gradient descent and with the normal equation.

    Prints the parameters found by both methods together with the cost
    recorded before every gradient-descent step.

    Args:
        filename: Data file passed to ``loadData``.
        alpha: Learning rate for gradient descent.
        iterations: Number of gradient-descent steps to take.
    """
    x, y = loadData(filename)
    x = np.array(x)
    y = np.array(y)  # 1-D target vector, one entry per sample
    theta = np.zeros(x.shape[1])
    costs = []
    for _ in range(iterations):
        costs.append(cost_function(theta, x, y))
        theta = batch_gradient_descent(alpha, theta, x, y)
    # Single-argument print calls behave the same on Python 2 and Python 3.
    print("batch gradient descent:\n")
    print("theta: {}".format(theta))
    print("cost:\n{}".format(costs))
    print("normal equation:\n")
    theta = normal_equation(x, y)
    print("theta: {}".format(theta))


if __name__ == "__main__":
    run()
三、Logistic Regression
def sigmoid(x): return 1.0/(1 + exp(-x)) def trainLogRegres(x,y,opts): m,n = x.shape alpha = opts["alpha"] maxIter = opts['maxIter'] weight = ones((n,1)) for k in range(maxIter): if opts['optimizeType'] == 'batchGraDescent': weight = weight - alpha * x.T * (sigmoid(x*weight) - y) elif opts['optimizeType'] == 'stocGraDescent': for i in range(m): weight = weight - alpha * x[i,:].T * (sigmoid(x[i,:] * weight) - y[i,0]) else: raise NameError('Not support optimize method type!') return weight def testLogRegres(weight,x,y): m,n = x.shape trueNum = 0 for i in range(m): predict = sigmoid(x[i,:] * weight)[0,0] > 0.5 if predict == bool(y[i,0]): trueNum += 1 accuracy = float(trueNum) / m return accuracy #x每行对应一个样本,y是列向量 def loadData(): x = [] y = [] f = open("testSet.txt") for line in f.readlines(): lineArr = line.strip().split() x.append([1.0, float(lineArr[0]), float(lineArr[1])]) y.append(float(lineArr[2])) return mat(x),mat(y).T if __name__ == '__main__': x,y = loadData() opts = {'alpha': 0.01, 'maxIter': 50, 'optimizeType': 'stocGraDescent'} weight = trainLogRegres(x,y,opts) accuracy = testLogRegres(weight,x,y) print "accuracy:",accuracy
四、SVM
五、kmeans
https://en.wikipedia.org/wiki/Latent_semantic_analysis