机器学习13—PCA学习笔记
机器学习实战之PCA
test13.py
# -*- coding: utf-8 -*-
"""Demo script: project testSet.txt onto its first principal component and plot it.

Prints the reduced data, the reconstructed data and the eigen-decomposition
returned by ``pca.pca``, then shows a scatter plot of the original points
against their rank-1 reconstruction.
"""
import os
import sys


def main():
    """Run the PCA demo on ``testSet.txt`` and display the comparison plot."""
    # sys.path entries must be directories; the original appended the file
    # name "pca.py", which never helps the import system locate the module.
    sys.path.append(os.path.dirname(os.path.abspath(__file__)))

    # Imported here so the path fix above is in effect first and so that
    # importing this module has no side effects.
    import matplotlib.pyplot as plt
    import pca

    dataMat = pca.loadDataSet('testSet.txt')
    lowDMat, reconMat, eigVals, eigVects = pca.pca(dataMat, 1)

    print("lowDMat:")
    print(lowDMat)
    print("reconMat:")
    print(reconMat)
    print("eigVals:")
    print(eigVals)
    print("eigVects:")
    print(eigVects)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Triangles mark the original data points.
    # (.A1 is the matrix column flattened to a 1-D ndarray.)
    ax.scatter(dataMat[:, 0].A1, dataMat[:, 1].A1, marker='^', s=90)
    # Red circles mark the points reconstructed from the first principal
    # component.
    ax.scatter(reconMat[:, 0].A1, reconMat[:, 1].A1,
               marker='o', s=90, c='red')
    plt.show()
    print("over!!!")


if __name__ == "__main__":
    main()
pca.py
'''
Created on Jun 1, 2011

@author: Peter Harrington
'''
from numpy import *  # kept so names this module has always exposed remain available
import numpy as np


def loadDataSet(fileName, delim='\t'):
    """Load a delimited text file of numbers into a matrix.

    Each non-blank line is stripped, split on ``delim`` and every field is
    converted with ``float`` (so ``NaN`` fields are accepted).

    Args:
        fileName: path of the text file to read.
        delim: field separator; tab by default.

    Returns:
        A ``numpy.matrix`` with one row per non-blank line.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a field cannot be parsed as a float.
    """
    # The context manager closes the file even if parsing fails.
    with open(fileName) as fr:
        rows = [
            [float(field) for field in line.strip().split(delim)]
            for line in fr
            if line.strip()  # skip blank lines (e.g. a trailing newline)
        ]
    # asmatrix instead of mat: the ``mat`` alias was removed in NumPy 2.0.
    return np.asmatrix(rows)


def pca(dataMat, topNfeat=4096):
    """Project data onto its top principal components.

    Args:
        dataMat: samples in rows, features in columns (matrix or anything
            ``numpy.asmatrix`` accepts).
        topNfeat: maximum number of principal components to keep. Values
            larger than the number of features keep all of them.

    Returns:
        A tuple ``(lowDDataMat, reconMat, eigVals, eigVects)``:
        the mean-centred data projected onto the kept components, the data
        reconstructed from that projection in the original space, and the
        full (unsorted) eigenvalues and eigenvectors of the covariance
        matrix as returned by ``numpy.linalg.eig``.

    Raises:
        ValueError: if ``topNfeat`` is negative.
    """
    if topNfeat < 0:
        raise ValueError("topNfeat must be non-negative")

    dataMat = np.asmatrix(dataMat)
    meanVals = np.mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals  # centre every feature on zero
    covMat = np.cov(meanRemoved, rowvar=False)
    # Wrapping in a matrix makes eig return the eigenvectors as a matrix, so
    # the ``*`` operators below are matrix products.
    eigVals, eigVects = np.linalg.eig(np.asmatrix(covMat))
    # argsort is ascending; reverse it and keep the topNfeat largest.
    eigValInd = np.argsort(eigVals)[::-1][:topNfeat]
    redEigVects = eigVects[:, eigValInd]
    lowDDataMat = meanRemoved * redEigVects  # project into the reduced space
    reconMat = (lowDDataMat * redEigVects.T) + meanVals  # map back and un-centre
    return lowDDataMat, reconMat, eigVals, eigVects


def replaceNanWithMean(fileName='secom.data', delim=' '):
    """Load a data file and replace every NaN with its column's mean.

    The mean of each column is computed over its non-NaN entries only.

    Args:
        fileName: path of the data file; defaults to ``'secom.data'``.
        delim: field separator; a single space by default.

    Returns:
        A ``numpy.matrix`` of the loaded data with NaN entries filled in.
    """
    datMat = loadDataSet(fileName, delim)
    values = np.asarray(datMat)
    nanMask = np.isnan(values)
    colMeans = np.nanmean(values, axis=0)  # per-column mean ignoring NaN
    # Fill each NaN cell with the mean of the column it sits in.
    values[nanMask] = colMeans[np.nonzero(nanMask)[1]]
    return np.asmatrix(values)