PCA和Softmax分类比较——MNIST与人脸数据集
在PCA人脸识别中,三种距离度量(曼哈顿距离、欧氏距离、马氏距离)得到的正确率可达到100%。
作为对比,单独使用Softmax回归对人脸数据(40类×每类10张,每张92×112像素)分类,正确率为97%。
用PCA对MNIST手写数字(10类×每类500个样本,每个28×28像素)进行识别,也可以达到比较高的正确率:使用马氏距离、h=32时正确率为0.93(Softmax回归为0.85~0.89)。
# coding:utf8
# Softmax-regression baseline on the face images returned by pca.load_img(),
# used for comparison with the PCA nearest-class-mean classifier.
import numpy as np
import os
import sf
import pca

if __name__ == '__main__':
    img = pca.load_img()
    test = img
    # np.asmatrix is the function the removed np.mat alias pointed to
    # (np.mat no longer exists in NumPy 2.0); the matrix type is kept because
    # sf.SoftMax is opaque here and may rely on matrix semantics.
    print(np.asmatrix(img).shape)

    # Class ids 1..40, each repeated for 10 consecutive images.
    label = [a + 1 for a in range(40) for j in range(10)]

    # Shuffle a real list: on Python 3 a bare range() cannot be shuffled.
    index = list(range(400))
    np.random.shuffle(index)
    label_ = [label[i] for i in index]
    test_ = np.asmatrix([test[i] for i in index])

    softmax = sf.SoftMax(MAXT=200, step=0.03, landa=0.01)
    softmax.process_train(np.asmatrix(img), np.array(label), 40)
    # NOTE: test_ is only a shuffled copy of the training images, so the
    # figure reported below is accuracy on the training data itself.
    softmax.validate(test_, np.array(label_))
    # correctnum = 390, sumnum = 400, Accuracy:0.97
# coding:utf8
# PCA feature extraction plus nearest-class-mean classification with a
# selectable distance (Manhattan, squared Euclidean, Mahalanobis).
# Running the module as a script evaluates the three distances on MNIST.
import os
import sys

import numpy as np

try:
    import cv2
except ImportError:  # only load_img() needs OpenCV
    cv2 = None

try:
    import matplotlib.pyplot as plt
except ImportError:  # only the demo under __main__ plots
    plt = None

try:
    import cPickle  # Python 2
except ImportError:  # Python 3 has a single pickle module
    import pickle as cPickle

TYPE_NUM = 10  # number of classes (40 for the face set)
SAMPLE_NUM = 500  # training samples per class (10 for the face set)


def load_img(root='att_faces', type_num=40, sample_num=10):
    """Load the face images as flattened grayscale vectors.

    Reads ``<root>/s<i>/<j>.pgm`` for i in 1..type_num and j in
    1..sample_num, class by class.

    Args:
        root: directory that contains the ``s1``, ``s2``, ... folders.
        type_num: number of class folders to read.
        sample_num: number of images to read from each folder.

    Returns:
        A list of 1-D arrays (pixel values divided by 255.0), ordered by
        class and then by image number.

    Raises:
        ImportError: if OpenCV is not installed.
        IOError: if an image cannot be read.
    """
    if cv2 is None:
        raise ImportError('load_img requires OpenCV (cv2)')
    img = []
    for i in range(type_num):
        for j in range(sample_num):
            # os.path.join keeps the path valid outside Windows as well.
            path = os.path.join(root, 's' + str(i + 1), str(j + 1) + '.pgm')
            a = cv2.imread(path, 0)
            if a is None:  # imread signals failure by returning None
                raise IOError('cannot read image: %s' % path)
            img.append(a.flatten() / 255.0)
    return img


def _mahalanobis(diff, cov_inv):
    """Return sqrt(|d^T * cov_inv * d|) for the difference vector d."""
    d = np.asarray(diff, dtype=float).ravel()
    # abs() guards against the small negative values that an ill-conditioned
    # covariance produces as h grows.
    return float(np.sqrt(abs(np.dot(np.dot(d, cov_inv), d))))


def dis(A, B, dis_type=0, s=None):
    """Return the distance between vectors A and B.

    Args:
        A, B: vectors (arrays or 1 x n matrices).
        dis_type: 1 = squared Euclidean distance (no square root is taken),
            2 = Mahalanobis distance, anything else = Manhattan distance.
        s: covariance matrix; required when dis_type is 2.

    Raises:
        ValueError: if dis_type is 2 and s is None.
        numpy.linalg.LinAlgError: if s is singular.
    """
    diff = np.asarray(A) - np.asarray(B)
    if dis_type == 1:
        return np.sum(np.square(diff))
    if dis_type == 2:
        if s is None:
            raise ValueError('Mahalanobis distance needs the covariance s')
        cov_inv = np.linalg.inv(np.atleast_2d(np.asarray(s, dtype=float)))
        return _mahalanobis(diff, cov_inv)
    return np.sum(np.abs(diff))


def pca(data, h, dis_type=0, type_num=None, sample_num=None):
    """Project data onto its h leading singular directions.

    The rows of data must be grouped by class: sample_num rows of class 0,
    then sample_num rows of class 1, and so on. The data is not
    mean-centred before the decomposition.

    Args:
        data: (type_num * sample_num, dim) array-like.
        h: number of components to keep.
        dis_type: when 2, per-class covariances are also computed.
        type_num: number of classes; defaults to TYPE_NUM.
        sample_num: samples per class; defaults to SAMPLE_NUM.

    Returns:
        (fi, ym, c): fi is the (dim, h) projection basis, ym the
        (h, type_num) matrix of projected class means, and c a list of
        (h, h) class covariances (empty unless dis_type is 2).

    Raises:
        ValueError: if the row count is not type_num * sample_num or h is
            outside 1..min(dim, rows).
    """
    if type_num is None:
        type_num = TYPE_NUM
    if sample_num is None:
        sample_num = SAMPLE_NUM
    data = np.asarray(data, dtype=float)
    if data.ndim != 2 or data.shape[0] != type_num * sample_num:
        raise ValueError('data must have %d x %d rows, got shape %s'
                         % (type_num, sample_num, data.shape))

    # QR first, then SVD of the small triangular factor.
    q, r = np.linalg.qr(data.T)
    # Only the right singular vectors are used, so skip the full-size U.
    _, _, vt = np.linalg.svd(r.T, full_matrices=False)
    if not 1 <= h <= vt.shape[0]:
        raise ValueError('h must be in 1..%d, got %r' % (vt.shape[0], h))

    fi = np.dot(q, vt[:h].T)
    y = np.dot(fi.T, data.T)  # (h, type_num * sample_num)
    grouped = y.reshape(h, type_num, sample_num)
    ym = grouped.mean(axis=2)

    c = []
    if dis_type == 2:  # extra work only needed for the Mahalanobis distance
        for k in range(type_num):
            # np.cov treats rows as variables; atleast_2d covers h == 1,
            # where np.cov returns a 0-d array.
            c.append(np.atleast_2d(np.cov(grouped[:, k, :])))
    return fi, ym, c


def validate(fi, ym, test, label, dis_type=0, c=None):
    """Classify test samples by nearest class mean and report accuracy.

    The predicted class is the 0-based index of the nearest column of ym,
    so label must use 0-based class ids (as MNIST does).

    Args:
        fi: (dim, h) projection basis returned by pca().
        ym: (h, classes) projected class means returned by pca().
        test: (samples, dim) array-like of test vectors.
        label: sequence of true class ids, one per test row.
        dis_type: distance selector, see dis().
        c: list of per-class covariances; required when dis_type is 2.

    Returns:
        The fraction of correctly classified samples (also printed).

    Raises:
        ValueError: if dis_type is 2 and c does not cover every class.
        ZeroDivisionError: if test has no rows.
    """
    projected = np.asarray(np.dot(np.asarray(fi).T, np.asarray(test).T)).T
    centers = np.asarray(ym).T  # one row per class
    testnum = len(projected)

    inv_covs = None
    if dis_type == 2:
        if c is None or len(c) < len(centers):
            raise ValueError('c must hold one covariance per class')
        # Invert each class covariance once instead of once per sample.
        inv_covs = [np.linalg.inv(np.atleast_2d(np.asarray(cov, dtype=float)))
                    for cov in c[:len(centers)]]

    correctnum = 0
    for i, sample in enumerate(projected):
        if dis_type == 2:
            dd = [_mahalanobis(sample - center, cov_inv)
                  for center, cov_inv in zip(centers, inv_covs)]
        else:
            dd = [dis(sample, center, dis_type) for center in centers]
        if np.argmin(dd) == label[i]:
            correctnum += 1

    rate = float(correctnum) / testnum
    print("Correctnum = %d, Sumnum = %d Accuracy:%.2f"
          % (correctnum, testnum, rate))
    return rate


def _load_pickle(path):
    """Unpickle path, closing the file even if loading fails."""
    # SECURITY: pickle can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as f:
        if sys.version_info[0] >= 3:
            # latin1 lets Python 3 read NumPy arrays pickled by Python 2
            # (assumes mnist.pkl came from a Python 2 pickler).
            return cPickle.load(f, encoding='latin1')
        return cPickle.load(f)


def _balanced_subset(data, labels, type_num, sample_num):
    """Return the first sample_num rows of each class, grouped by class.

    The labels are returned as well; PCA does not need them, but other
    classifiers (e.g. the softmax comparison) do.
    """
    data = np.asarray(data)
    labels = np.asarray(labels)
    chunks = []
    for digit in range(type_num):
        rows = np.flatnonzero(labels == digit)[:sample_num]
        if len(rows) < sample_num:
            raise ValueError('class %d has only %d samples (< %d)'
                             % (digit, len(rows), sample_num))
        chunks.append(data[rows])
    return (np.concatenate(chunks),
            np.repeat(np.arange(type_num), sample_num))


def _main():
    """Plot MNIST accuracy against h for each of the three distances."""
    if plt is None:
        raise ImportError('the demo requires matplotlib')
    training_data, validation_data, _ = _load_pickle('mnist.pkl')
    data = np.array([np.reshape(x, 784) for x in training_data[0][:10000]])
    vdata = np.array([np.reshape(x, 784) for x in validation_data[0][:5000]])
    vlabel = validation_data[1][:5000]

    # Arrange the data as 10 classes x 500 samples, class after class.
    data_, _ = _balanced_subset(data, training_data[1][:10000],
                                TYPE_NUM, SAMPLE_NUM)

    x_ = [2 ** i for i in range(9)]
    d_ = ['Manhattan Distance', 'Euclidean Metric', 'Mahalanobis Distance']
    for j, title in enumerate(d_):
        y_ = []
        plt.figure()
        for h in x_:
            # Call the module's own functions directly: inside this module
            # the name ``pca`` is the function, not a module.
            fi, ym, c = pca(data_, h=h, dis_type=j)
            y_.append(validate(fi, ym, vdata, vlabel, dis_type=j, c=c))
        plt.ylim([0, 1.0])
        plt.plot(x_, y_)
        plt.scatter(x_, y_)
        plt.xlabel('h')
        plt.ylabel('Accuracy')
        plt.title(title)
    plt.show()


if __name__ == '__main__':
    _main()