主成分分析(PCA)实现代码
摘自《机器学习实战》(Machine Learning in Action)第13章:
import numpy as np


def loadDataSet(fileName, delim='\t'):
    """Load a delimited text file of floats into a NumPy matrix.

    Each non-blank line is stripped, split on ``delim`` and every field is
    converted with ``float``.

    Args:
        fileName: Path of the text file to read.
        delim: Field separator used on each line (a TAB by default).

    Returns:
        A ``numpy.matrix`` with one row per non-blank line of the file.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    # The context manager closes the file even when a field fails to parse.
    with open(fileName) as fr:
        # Build real lists of floats: on Python 3 ``map`` is lazy and would
        # otherwise end up as opaque objects inside the matrix.
        rows = [
            [float(field) for field in line.strip().split(delim)]
            for line in fr
            if line.strip()  # skip blank lines (e.g. a trailing newline)
        ]
    return np.asmatrix(rows)


def pca(dataMat, topNfeat=999999):
    """Project data onto its leading principal components.

    Args:
        dataMat: 2-D data with one sample per row and one feature per
            column (a ``numpy.matrix`` or anything ``numpy.asmatrix``
            accepts).
        topNfeat: Number of principal components to keep. The default is
            large enough to keep every component for typical inputs.

    Returns:
        A tuple ``(lowDDataMat, reconMat)`` of ``numpy.matrix`` objects:
        ``lowDDataMat`` is the mean-centred data expressed in the kept
        components, and ``reconMat`` is that projection mapped back into
        the original feature space with the column means added back.
    """
    # Matrix semantics are required below: ``*`` must be a matrix product.
    dataMat = np.asmatrix(dataMat)

    meanVals = np.mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals

    # rowvar=False: columns are the variables, rows are the observations.
    covMat = np.cov(meanRemoved, rowvar=False)
    eigVals, eigVects = np.linalg.eig(np.asmatrix(covMat))

    # argsort is ascending; walking backwards from the end yields the
    # indices of the topNfeat largest eigenvalues, largest first.
    eigValInd = np.argsort(eigVals)[:-(topNfeat + 1):-1]
    redEigVects = eigVects[:, eigValInd]

    lowDDataMat = meanRemoved * redEigVects
    reconMat = (lowDDataMat * redEigVects.T) + meanVals
    return lowDDataMat, reconMat


if __name__ == "__main__":
    # Plotting is only needed for this demo, so matplotlib is imported here
    # and importing this module for pca() does not require it.
    import matplotlib
    import matplotlib.pyplot as plt

    dataMat = loadDataSet('testSet.txt')
    lowDMat, reconMat = pca(dataMat, 1)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Original samples as triangles.
    ax.scatter(
        np.asarray(dataMat[:, 0]).ravel(),
        np.asarray(dataMat[:, 1]).ravel(),
        marker='^',
        s=90,
    )
    # Samples reconstructed from the first principal component, in red.
    ax.scatter(
        np.asarray(reconMat[:, 0]).ravel(),
        np.asarray(reconMat[:, 1]).ravel(),
        marker='o',
        s=50,
        c='red',
    )
    plt.show()
输入数据格式: 文本文件,每行是两个以TAB键分隔的浮点数。
附运行结果: