Reducing the Dimensionality of Data with Neural Networks / A Fast Learning Algorithm for Deep Belief Nets
Annotated code by Hinton, author of the original deep learning papers
The Matlab example code comes in two parts, corresponding to two different papers:

1. Reducing the Dimensionality of Data with Neural Networks

   mnistdeepauto.m  backprop.m  rbmhidlinear.m

2. A Fast Learning Algorithm for Deep Belief Nets

   mnistclassify.m  backpropclassify.m

The remaining files are shared by both.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
mnistclassify.m
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

clear all
close all

maxepoch=50;    % number of pretraining epochs
numhid=500; numpen=500; numpen2=2000;

fprintf(1,'Converting Raw files into Matlab format \n');
converter;

fprintf(1,'Pretraining a deep autoencoder. \n');
fprintf(1,'The Science paper used 50 epochs. This uses %3i \n', maxepoch);

makebatches;  % split the data into minibatches
[numcases numdims numbatches]=size(batchdata);  % dimensions of batchdata:
%% numcases    number of cases in each batch
%% numdims     dimensionality of each data vector
%% numbatches  number of batches

fprintf(1,'Pretraining Layer 1 with RBM: %d-%d \n',numdims,numhid); % image input layer to the first hidden layer
restart=1;               % tell rbm.m to (re)initialize its parameters
rbm;                     % train this RBM
hidrecbiases=hidbiases;  % keep the hidden-unit biases
save mnistvhclassify vishid hidrecbiases visbiases;

fprintf(1,'\nPretraining Layer 2 with RBM: %d-%d \n',numhid,numpen); % first hidden layer to the second
batchdata=batchposhidprobs; % the previous RBM's hidden-layer outputs become this RBM's input
numhid=numpen;              % number of hidden units; the visible size is implied by the input data
restart=1;
rbm;
hidpen=vishid; penrecbiases=hidbiases; hidgenbiases=visbiases; % as above, keep the weights and biases
save mnisthpclassify hidpen penrecbiases hidgenbiases;

fprintf(1,'\nPretraining Layer 3 with RBM: %d-%d \n',numpen,numpen2); % second hidden layer to the third; same as above
batchdata=batchposhidprobs;
numhid=numpen2;
restart=1;
rbm;
hidpen2=vishid; penrecbiases2=hidbiases; hidgenbiases2=visbiases;
save mnisthp2classify hidpen2 penrecbiases2 hidgenbiases2;

backpropclassify;
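The three pretraining stages above are the same operation applied three times: train an RBM, keep its parameters, and feed its hidden probabilities to the next RBM. The sketch below restates that pattern as a loop. It is illustrative only, not part of Hinton's code; it assumes rbm.m's script interface exactly as used above (batchdata, numhid and restart as inputs; vishid, hidbiases, visbiases and batchposhidprobs left in the workspace), and layersizes, weights and biases are hypothetical names.

% Illustrative sketch (not in the original code): greedy layer-wise stacking as a loop.
layersizes = [500 500 2000];        % hypothetical: the hidden-layer sizes used above
weights = cell(1,numel(layersizes));
biases  = cell(1,numel(layersizes));
for layer = 1:numel(layersizes)
    numhid  = layersizes(layer);
    restart = 1;                    % ask rbm.m to reinitialize
    rbm;                            % trains one RBM on the current batchdata
    weights{layer} = vishid;        % keep this layer's parameters
    biases{layer}  = hidbiases;
    batchdata = batchposhidprobs;   % hidden probabilities become the next RBM's input
end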
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
backpropclassify.m
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
maxepoch=200;
fprintf(1,'\nTraining discriminative model on MNIST by minimizing cross entropy error. \n'); % minimize the cross-entropy
fprintf(1,'60 batches of 1000 cases each. \n');

load mnistvhclassify  % load the pretrained weights and biases of every layer
load mnisthpclassify
load mnisthp2classify

makebatches;  % split the data into minibatches
[numcases numdims numbatches]=size(batchdata);
N=numcases;   % number of cases in each batch

%%%% PREINITIALIZE WEIGHTS OF THE DISCRIMINATIVE MODEL%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

w1=[vishid; hidrecbiases];   % layer-1-to-layer-2 weights, with the layer-2 biases appended as the last row
w2=[hidpen; penrecbiases];   % likewise for the next layer
w3=[hidpen2; penrecbiases2]; % likewise
w_class = 0.1*randn(size(w3,2)+1,10); % random top-layer weights: (size(w3,2)+1) rows by 10 columns
%%%%%%%%%% END OF PREINITIALIZATION OF WEIGHTS %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

l1=size(w1,1)-1;      % number of units in each layer
l2=size(w2,1)-1;
l3=size(w3,1)-1;
l4=size(w_class,1)-1; % number of units in the top hidden layer
l5=10;                % number of label units
test_err=[];
train_err=[];


for epoch = 1:maxepoch

%%%%%%%%%%%%%%%%%%%% COMPUTE TRAINING MISCLASSIFICATION ERROR %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
err=0;
err_cr=0;
counter=0;
[numcases numdims numbatches]=size(batchdata);
%% numcases    number of cases in each batch
%% numdims     dimensionality of each data vector
%% numbatches  number of batches
N=numcases;
for batch = 1:numbatches
  data = [batchdata(:,:,batch)];      % one batch of data
  target = [batchtargets(:,:,batch)]; % the targets for this batch
  data = [data ones(N,1)];            % append a column of N ones so the bias rows of the weight matrices take effect
  w1probs = 1./(1 + exp(-data*w1)); w1probs = [w1probs ones(N,1)]; % sigmoid activations, layer by layer (cf. backprop)
  w2probs = 1./(1 + exp(-w1probs*w2)); w2probs = [w2probs ones(N,1)];
  w3probs = 1./(1 + exp(-w2probs*w3)); w3probs = [w3probs ones(N,1)];

  targetout = exp(w3probs*w_class); % unnormalized label outputs, N rows by 10 columns
  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  % Output processing for the label layer (Eq. 6.1): w3probs*w_class is the input to the label units.
  % Exactly one unit should end up active; it is selected via the probabilities computed below.
  % The 10 units form a "softmax" group.
  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  targetout = targetout./repmat(sum(targetout,2),1,10); % normalize each row so the 10 label outputs sum to 1

  [I J]=max(targetout,[],2); % per row: the largest predicted probability and its column index
  [I1 J1]=max(target,[],2);  % per row: the column index of the target label
  counter=counter+length(find(J==J1)); % count the correctly classified cases
  err_cr = err_cr- sum(sum( target(:,1:end).*log(targetout))) ; % accumulate the cross-entropy
end
train_err(epoch)=(numcases*numbatches-counter); % total number of misclassified training cases
train_crerr(epoch)=err_cr/numbatches;           % average cross-entropy per batch

%%%%%%%%%%%%%% END OF COMPUTING TRAINING MISCLASSIFICATION ERROR %%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%% COMPUTE TEST MISCLASSIFICATION ERROR %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
err=0;
err_cr=0;
counter=0;
[testnumcases testnumdims testnumbatches]=size(testbatchdata);

N=testnumcases;
for batch = 1:testnumbatches
  data = [testbatchdata(:,:,batch)];
  target = [testbatchtargets(:,:,batch)];
  data = [data ones(N,1)];
  w1probs = 1./(1 + exp(-data*w1)); w1probs = [w1probs ones(N,1)];
  w2probs = 1./(1 + exp(-w1probs*w2)); w2probs = [w2probs ones(N,1)];
  w3probs = 1./(1 + exp(-w2probs*w3)); w3probs = [w3probs ones(N,1)];
  targetout = exp(w3probs*w_class);
  targetout = targetout./repmat(sum(targetout,2),1,10);

  [I J]=max(targetout,[],2);
  [I1 J1]=max(target,[],2);
  counter=counter+length(find(J==J1));
  err_cr = err_cr- sum(sum( target(:,1:end).*log(targetout))) ;
end
test_err(epoch)=(testnumcases*testnumbatches-counter);
test_crerr(epoch)=err_cr/testnumbatches;
fprintf(1,'Before epoch %d Train # misclassified: %d (from %d). Test # misclassified: %d (from %d) \t \t \n',...
        epoch,train_err(epoch),numcases*numbatches,test_err(epoch),testnumcases*testnumbatches);

%%%%%%%%%%%%%% END OF COMPUTING TEST MISCLASSIFICATION ERROR %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

tt=0;
for batch = 1:numbatches/10
fprintf(1,'epoch %d batch %d\r',epoch,batch);

%%%%%%%%%%% COMBINE 10 MINIBATCHES INTO 1 LARGER MINIBATCH %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% combine 10 minibatches into one batch of 1000 cases, then fine-tune with conjugate gradients
tt=tt+1;
data=[];
targets=[];
for kk=1:10
  data=[data
        batchdata(:,:,(tt-1)*10+kk)];   % stack 10 minibatches vertically
  targets=[targets
        batchtargets(:,:,(tt-1)*10+kk)];
end

%%%%%%%%%%%%%%% PERFORM CONJUGATE GRADIENT WITH 3 LINESEARCHES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%
max_iter=3; % number of line searches

if epoch<6  % First update top-level weights holding other weights fixed.
  N = size(data,1);      % number of cases in the combined batch
  XX = [data ones(N,1)]; % append a column of ones for the biases
  w1probs = 1./(1 + exp(-XX*w1)); w1probs = [w1probs ones(N,1)];
  w2probs = 1./(1 + exp(-w1probs*w2)); w2probs = [w2probs ones(N,1)];
  w3probs = 1./(1 + exp(-w2probs*w3)); %w3probs = [w3probs ones(N,1)]; the ones column is appended inside CG_CLASSIFY_INIT

  VV = [w_class(:)']'; % unroll w_class into a single column; minimize.m requires a D-by-1 parameter vector
  Dim = [l4; l5];      % sizes of the top two layers: the 2000-unit hidden layer and the 10-unit label layer
  [X, fX] = minimize(VV,'CG_CLASSIFY_INIT',max_iter,Dim,w3probs,targets); % train only the top layer; see the function below
  % minimize is Carl Rasmussen's "minimize" code
  %%------------------ arguments ------------------%%
  % VV       the unrolled weight vector; must be a column (D by 1)
  % X        the optimized parameters for f = 'CG_CLASSIFY_INIT'
  % fX       the sequence of function values produced during the optimization
  % max_iter if positive, the number of line searches; if negative, the maximum number of function evaluations
  %%-----------------------------------------------%%
  w_class = reshape(X,l4+1,l5); % restore the weight-matrix shape

else % fine-tune the whole network
  VV = [w1(:)' w2(:)' w3(:)' w_class(:)']'; % unroll all weight matrices into one column vector
  Dim = [l1; l2; l3; l4; l5];               % the sizes of all layers, passed through to CG_CLASSIFY
  [X, fX] = minimize(VV,'CG_CLASSIFY',max_iter,Dim,data,targets);

  w1 = reshape(X(1:(l1+1)*l2),l1+1,l2); % restore w1
  xxx = (l1+1)*l2;                      % running offset into the unrolled vector
  w2 = reshape(X(xxx+1:xxx+(l2+1)*l3),l2+1,l3);
  xxx = xxx+(l2+1)*l3;
  w3 = reshape(X(xxx+1:xxx+(l3+1)*l4),l3+1,l4);
  xxx = xxx+(l3+1)*l4;
  w_class = reshape(X(xxx+1:xxx+(l4+1)*l5),l4+1,l5);

end
%%%%%%%%%%%%%%% END OF CONJUGATE GRADIENT WITH 3 LINESEARCHES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%

end

save mnistclassify_weights w1 w2 w3 w_class
save mnistclassify_error test_err test_crerr train_err train_crerr;

end
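One fragile spot in the listing above: targetout = exp(w3probs*w_class) can overflow to Inf when the label inputs are large, which turns the subsequent normalization into NaN. A standard remedy, not present in the original code, is to subtract each row's maximum before exponentiating; the normalized result is mathematically unchanged.

% Illustrative sketch (not in the original code): numerically stable softmax.
% exp(x-m)./sum(exp(x-m)) equals exp(x)./sum(exp(x)) for any per-row constant m.
logits = w3probs*w_class;                                    % N-by-10 label inputs
logits = logits - repmat(max(logits,[],2),1,size(logits,2)); % shift each row by its max
targetout = exp(logits);
targetout = targetout./repmat(sum(targetout,2),1,size(logits,2));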
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
rbm.m
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
epsilonw = 0.1;       % Learning rate for weights
epsilonvb = 0.1;      % Learning rate for biases of visible units
epsilonhb = 0.1;      % Learning rate for biases of hidden units
weightcost = 0.0002;
initialmomentum = 0.5;
finalmomentum = 0.9;

[numcases numdims numbatches]=size(batchdata);
%% numcases    number of cases in each batch
%% numdims     dimensionality of each data vector
%% numbatches  number of batches

if restart ==1,
  restart=0;
  epoch=1;

% Initializing symmetric weights and biases.
  vishid = 0.1*randn(numdims, numhid); % small random visible-to-hidden weights
  hidbiases = zeros(1,numhid);         % hidden-unit biases
  visbiases = zeros(1,numdims);        % visible-unit biases

  poshidprobs = zeros(numcases,numhid); % hidden probabilities in the positive phase
  neghidprobs = zeros(numcases,numhid); % hidden probabilities in the negative phase
  posprods = zeros(numdims,numhid);     % <v_i h_j> statistics from the positive phase
  negprods = zeros(numdims,numhid);     % <v_i h_j> statistics from the negative phase
  vishidinc = zeros(numdims,numhid);    % increments for the visible-to-hidden weights
  hidbiasinc = zeros(1,numhid);         % increments for the hidden biases
  visbiasinc = zeros(1,numdims);        % increments for the visible biases
  batchposhidprobs=zeros(numcases,numhid,numbatches); % the hidden probabilities of every batch, kept as the input of the next RBM
end

for epoch = epoch:maxepoch, % main training loop; prints the current epoch and batch
fprintf(1,'epoch %d\r',epoch);
errsum=0;                   % reset the accumulated reconstruction error
for batch = 1:numbatches,   % one minibatch at a time
fprintf(1,'epoch %d batch %d\r',epoch,batch);

%%%%%%%%% START POSITIVE PHASE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
data = batchdata(:,:,batch); % the current batch of visible data v
poshidprobs = 1./(1 + exp(-data*vishid - repmat(hidbiases,numcases,1))); % upward pass: p(h=1|v)
batchposhidprobs(:,:,batch)=poshidprobs; % keep the hidden probabilities; after the last epoch they become the next RBM's input
posprods = data' * poshidprobs; % positive statistics <v_i h_j> for contrastive divergence

poshidact = sum(poshidprobs); % hidden activities summed over the batch
posvisact = sum(data);        % visible activities summed over the batch

%%%%%%%%% END OF POSITIVE PHASE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
poshidstates = poshidprobs > rand(numcases,numhid); % sample binary hidden states (one Gibbs step)

%%%%%%%%% START NEGATIVE PHASE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
negdata = 1./(1 + exp(-poshidstates*vishid' - repmat(visbiases,numcases,1))); % downward pass: reconstruct v from h
neghidprobs = 1./(1 + exp(-negdata*vishid - repmat(hidbiases,numcases,1)));   % p(h=1|reconstruction)
negprods = negdata'*neghidprobs; % negative statistics <v_i h_j> for contrastive divergence

neghidact = sum(neghidprobs);
negvisact = sum(negdata);

%%%%%%%%% END OF NEGATIVE PHASE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
err= sum(sum( (data-negdata).^2 )); % squared reconstruction error for this batch
errsum = err + errsum;              % accumulated over all batches

if epoch>5, % raise the momentum after the first few epochs
  momentum=finalmomentum;
else
  momentum=initialmomentum;
end;

%%%%%%%%% UPDATE WEIGHTS AND BIASES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
vishidinc = momentum*vishidinc + ...
            epsilonw*( (posprods-negprods)/numcases - weightcost*vishid); % weight increment: CD gradient plus weight decay
visbiasinc = momentum*visbiasinc + (epsilonvb/numcases)*(posvisact-negvisact); % visible-bias increment
hidbiasinc = momentum*hidbiasinc + (epsilonhb/numcases)*(poshidact-neghidact); % hidden-bias increment

vishid = vishid + vishidinc;
visbiases = visbiases + visbiasinc;
hidbiases = hidbiases + hidbiasinc;

%%%%%%%%%%%%%%%% END OF UPDATES %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

end
fprintf(1, 'epoch %4i error %6.1f \n', epoch, errsum);
end;
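For reference, here is the CD-1 update performed by the loop above, distilled into a standalone function with momentum and weight decay omitted for brevity. It is a sketch rather than a drop-in replacement for rbm.m; cd1_update and lr are names introduced here.

% Illustrative sketch (not in the original code): one CD-1 parameter update.
function [W, vb, hb] = cd1_update(data, W, vb, hb, lr)
n = size(data,1);
hprob  = 1./(1 + exp(-data*W  - repmat(hb,n,1)));   % positive phase: p(h=1|v)
hstate = hprob > rand(size(hprob));                 % sample binary hidden states
vrecon = 1./(1 + exp(-hstate*W' - repmat(vb,n,1))); % negative phase: reconstruct v
hrecon = 1./(1 + exp(-vrecon*W  - repmat(hb,n,1))); % p(h=1|reconstruction)
W  = W  + lr*(data'*hprob - vrecon'*hrecon)/n;      % <v h>_data - <v h>_recon
vb = vb + lr*(mean(data,1)  - mean(vrecon,1));      % visible-bias update
hb = hb + lr*(mean(hprob,1) - mean(hrecon,1));      % hidden-bias update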
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
CG_CLASSIFY_INIT.M
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function [f, df] = CG_CLASSIFY_INIT(VV,Dim,w3probs,target); % CG training of the top two layers only
l1 = Dim(1);
l2 = Dim(2);
N = size(w3probs,1);
% Undo the unrolling (deconversion).
w_class = reshape(VV,l1+1,l2); % restore the weight matrix
w3probs = [w3probs ones(N,1)]; % append a column of ones for the biases

targetout = exp(w3probs*w_class); % label-layer outputs: a numcases-by-numlabels matrix
targetout = targetout./repmat(sum(targetout,2),1,10); % softmax normalization, exactly as in backpropclassify.m
f = -sum(sum( target(:,1:end).*log(targetout))) ; % the cross-entropy; only the target terms contribute

IO = (targetout-target(:,1:end)); % error signal at the output: for a softmax group trained with cross-entropy,
                                  % the derivative with respect to the unit inputs is simply (output - target);
                                  % no extra sigmoid-derivative factor appears because it cancels in the algebra
Ix_class=IO;
dw_class = w3probs'*Ix_class; % gradient of f with respect to w_class

df = [dw_class(:)']'; % unroll the gradient into a column, as minimize.m expects
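A quick way to verify that the f and df returned by CG_CLASSIFY_INIT are consistent is a finite-difference comparison on a small random problem. The snippet below is a sketch with made-up sizes (5 cases, 20 features, 10 labels); every variable name in it is introduced for the check.

% Illustrative sketch (not in the original code): finite-difference gradient check.
Dim = [20; 10];               % small stand-ins for l4 and l5, for a fast check
Xs  = rand(5,20);             % 5 made-up cases with 20 top-level features
T   = zeros(5,10);            % random one-hot targets
T(sub2ind(size(T),(1:5)',randi(10,5,1))) = 1;
VV  = 0.1*randn((20+1)*10,1); % unrolled random top-layer weights
[f, df] = CG_CLASSIFY_INIT(VV,Dim,Xs,T);
h = 1e-5; i = 1;              % probe one coordinate; loop over i to check more
e = zeros(size(VV)); e(i) = h;
fd = (CG_CLASSIFY_INIT(VV+e,Dim,Xs,T) - CG_CLASSIFY_INIT(VV-e,Dim,Xs,T))/(2*h);
fprintf('analytic %g vs numeric %g\n', df(i), fd);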
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
CG_CLASSIFY.M
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% This function fine-tunes all of the weights at once.
% Each step mirrors the annotated CG_CLASSIFY_INIT.m above.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function [f, df] = CG_CLASSIFY(VV,Dim,XX,target);


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
rbmhidlinear.m
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Identical to rbm.m except that the hidden units are linear rather than logistic.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
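For the autoencoder experiments, rbmhidlinear.m trains the top RBM with linear rather than logistic hidden units. As a hedged reconstruction (consult the published code for the exact constants), the lines inside the rbm.m training loop that change are roughly the ones below: the learning rates shrink, the hidden activations lose the sigmoid, and binary sampling is replaced by adding unit-variance Gaussian noise.

% Hedged sketch of the lines that differ from rbm.m in rbmhidlinear.m;
% everything else in the training loop stays the same.
epsilonw  = 0.001; epsilonvb = 0.001; epsilonhb = 0.001;       % much smaller steps for the linear layer
poshidprobs = (data*vishid) + repmat(hidbiases,numcases,1);    % linear hidden activations, no sigmoid
poshidstates = poshidprobs + randn(numcases,numhid);           % N(0,1) noise instead of binary sampling
neghidprobs = (negdata*vishid) + repmat(hidbiases,numcases,1); % linear again in the negative phase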