Deep Learning Tutorial (翻译) 之 Denoising Autoencoder
英文原文请参考http://www.deeplearning.net/tutorial/dA.html
自编码
一个自编码接受x作为输入,映射成隐藏层表示y:
$$y = s(Wx + b)$$
其中,s是非线性函数,如sigmoid。y通过一个decoder映射成与x有着相同shape的重构的z,通过相似的转换:
$$z = s(W'y + b')$$
z可以看作是在给定编码(code)y的条件下对x的预测。可选地,W'可以是W的转置,即 $W' = W^T$。目标是优化参数W,b,b'使得平均重构误差最小。
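重构误差取决于输入数据的合适的分布假设,可以使用传统的平方差 $L(x, z) = \|x - z\|^2$;如果输入是位变量(或位概率向量),可以使用交叉熵:
$$L_H(x, z) = -\sum_{k=1}^{d}\left[x_k \log z_k + (1 - x_k)\log(1 - z_k)\right]$$
其中d为输入的维数。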
编码y可以看作是数据的主成分。如果隐藏层是线性的并使用平方差来训练网络,那么k个隐藏层单元可以看作数据的k个主成分。如果隐藏层是非线性的,那么它就不同于PCA。
y可看作x的有损压缩,我们希望优化使得对训练数据和其他输入都有好的压缩效果,但是当测试样本和训练样本不符合同一分布,即相差较大时,效果不好。
我们使用Theano去实现一个自编码器,把它定义成一个类,就可用于构造堆积自编码。
隐藏层单元数比输入多的自编码器有可能只学到恒等函数(identity function)。为了避免这一点,并让隐藏层表示捕获关于输入的有用信息,一种方法是增加稀疏性,另一种是在从输入到重构的变换中增加随机性,这种技术在RBM中被使用到,同样在去噪自编码中使用到,下面介绍去噪自编码。(不太懂)
去噪自编码
In order to force the hidden layer to discover more robust features and prevent it from simply learning the identity, we train the autoencoder to reconstruct the input from a corrupted version of it.(感觉翻译成中文意思就变了)
捕获输入间的统计依赖
去噪自编码可以从多个角度进行理解:流形学习(manifold learning)角度,随机算子(stochastic operator)角度,自底向上的信息论角度,等等,所有这些在【Vincent08】有介绍。
在【Vincent08】,随机丢失处理随机地将一些输入变为0。去噪自编码试图从非丢失值中预测丢失值。Note how being able to predict any subset of variables from the rest is a sufficient condition for completely capturing the joint distribution between a set of variables (this is how Gibbs sampling works).
将自编码类转换成去噪自编码类,只需要在输入上增加随机丢失步骤。这里我们通过把输入中的一部分元素随机置为0来遮蔽输入(randomly masking entries of the input by making them zero)。
代码如下,完整代码请参考官方教程。
1 import numpy 2 import theano 3 import os 4 import sys 5 import timeit 6 from theano import tensor as T 7 from theano.tensor.shared_randomstreams import RandomStreams 8 from logistic_sgd import load_data 9 from utils import tile_raster_images 10 11 try: 12 import PIL.Image as Image 13 except ImportError: 14 import Image 15 16 class dA(object): 17 def __init__(self, numpy_rng, theano_rng=None, input = None, 18 n_visible = 784, n_hidden = 500, W=None, bhid=None, bvis=None): 19 ''' 20 :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams 21 :param theano_rng: Theano random generator; 22 ''' 23 self.n_visible = n_visible 24 self.n_hidden = n_hidden 25 if not theano_rng: 26 theano_rng = RandomStreams(numpy_rng.randint(2 ** 30)) 27 if not W: 28 initial_W = numpy.asarray( 29 numpy_rng.uniform( 30 low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)), 31 high=4 * numpy.sqrt(6. / (n_hidden + n_visible)), 32 size=(n_visible, n_hidden) 33 ), 34 dtype=theano.config.floatX 35 ) 36 W = theano.shared(value=initial_W, name='W', borrow = True) 37 if not bvis: 38 bvis = theano.shared( 39 value=numpy.zeros(n_visible,dtype=theano.config.floatX), 40 borrow = True 41 ) 42 if not bhid: 43 bhid = theano.shared( 44 value=numpy.zeros(n_hidden,dtype=theano.config.floatX), 45 borrow = True 46 ) 47 self.W = W 48 self.W_prime = self.W.T 49 self.b = bhid 50 self.b_prime = bvis 51 self.theano_rng = theano_rng 52 if input is None: 53 self.x = T.dmatrix(name='input') 54 else: 55 self.x = input 56 self.params = [self.W, self.b, self.b_prime] 57 58 def get_hidden_values(self, input): 59 return T.nnet.sigmoid(T.dot(input, self.W) + self.b) 60 def get_reconstructed_input(self, hidden): 61 return T.nnet.sigmoid(T.dot(hidden, self.W_prime) + self.b_prime) 62 def get_corrupted_input(self, input, corruption_level): 63 #binomial()函数为产生0,1的分布,这里是设置产生1的概率为p 64 #将输入样本每个像素点以corruption_level的概率被清0 65 return self.theano_rng.binomial(size=input.shape, n=1, 66 p=1-corruption_level, 67 
dtype=theano.config.floatX) * input 68 def get_cost_updates(self, corruption_level, learning_rate): 69 tilde_x = self.get_corrupted_input(self.x, corruption_level) 70 y = self.get_hidden_values(tilde_x) 71 z = self.get_reconstructed_input(y) 72 L = - T.sum(self.x * T.log(z) + (1 - self.x) * T.log(1-z), axis=1) 73 cost = T.mean(L) 74 75 gparams = T.grad(cost, self.params) 76 updates = [ 77 (param, param - learning_rate * gparam) 78 for param, gparam in zip(self.params, gparams) 79 ] 80 return (cost, updates) 81 82 def test_dA(learning_rate=0.1, training_epochs=15, 83 dataset='mnist.pkl.gz', 84 batch_size=20, output_folder='dA_plots'): 85 datasets = load_data(dataset) 86 train_set_x, train_set_y = datasets[0] 87 n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size 88 # allocate symbolic variables for the data 89 index = T.lscalar() # index to a [mini]batch 90 x = T.matrix('x') # the data is presented as rasterized images 91 if not os.path.isdir(output_folder): 92 os.makedirs(output_folder) 93 os.chdir(output_folder) 94 95 rng = numpy.random.RandomState(123) 96 theano_rng = RandomStreams(rng.randint(2 ** 30)) 97 da = dA( 98 numpy_rng=rng, 99 theano_rng=theano_rng, 100 input=x, 101 n_visible=28*28, 102 n_hidden=500 103 ) 104 cost, updates = da.get_cost_updates(corruption_level=0.,learning_rate=learning_rate) 105 train_da = theano.function( 106 inputs=[index], 107 outputs=cost, 108 updates=updates, 109 givens={ 110 x: train_set_x[index * batch_size: (index + 1) * batch_size] 111 } 112 ) 113 start_time = timeit.default_timer() 114 for epoch in range(training_epochs): 115 c = [] 116 for batch_index in range(n_train_batches): 117 c.append(train_da(batch_index)) 118 print('Training epoch %d, cost' %epoch, numpy.mean(c)) 119 end_time = timeit.default_timer() 120 training_time = (end_time - start_time) 121 print(('The no corruption code for file ' + 122 os.path.split(__file__)[1] + 123 ' ran for %.2fm' % ((training_time) / 60.)), file=sys.stderr) 124 
image = Image.fromarray( 125 tile_raster_images(X=da.W.get_value(borrow=True).T, 126 img_shape=(28, 28), tile_shape=(10, 10), 127 tile_spacing=(1, 1))) 128 image.save('filters_corruption_0.png') 129 130 os.chdir('../') 131 132 if __name__ == '__main__': 133 test_dA()
参考资料
1.官方教程
2.实用工具plotting samples and filters