Caffe学习 五 conv_layer与im2col
1.BaseConvolutionLayer & ConvolutionLayer
成员变量
注释引用自caffe代码阅读10:Caffe中卷积的实现细节(涉及到BaseConvolutionLayer、ConvolutionLayer、im2col等)-2016.4.3。
/// @brief The spatial dimensions of a filter kernel. // kernel的形状 = [kernel_h, kernel_w] Blob<int> kernel_shape_; /// @brief The spatial dimensions of the stride. // 步长形状 = [stride_h, stride_w] Blob<int> stride_; /// @brief The spatial dimensions of the padding. // pad的形状 = [pad_h, pad_w] Blob<int> pad_; /// @brief The spatial dimensions of the convolution input. // 卷积的输入形状 = [输入图像通道数, 输入图像h, 输入图像w] Blob<int> conv_input_shape_; /// @brief The spatial dimensions of the col_buffer. // col_buffer的形状 = [kernel_dim_, conv_out_spatial_dim_ ] vector<int> col_buffer_shape_; /// @brief The spatial dimensions of the output. // 输出的形状 vector<int> output_shape_; // 输入的形状 const vector<int>* bottom_shape_; // 空间轴个数 int num_spatial_axes_; // 输入度维度 = 输入图像通道数*输入图像的h*输入图像w int bottom_dim_; // 输出维度 = 输出通道数*输出h*输出w int top_dim_; // 输入图像的第几个轴是通道 int channel_axis_; // batchsize int num_; // 输入图像的通道数 int channels_; // 卷积组的大小 int group_; // 输出空间维度 = 卷积之后的图像长*卷积之后图像的宽 int out_spatial_dim_; // 使用卷积组用到的 int weight_offset_; // 卷积后的图像的通道数 int num_output_; // 是否启用偏置 bool bias_term_; // 是不是1x1卷积 bool is_1x1_; // 强制使用n维通用卷积 bool force_nd_im2col_; // conv_in_channels_ * conv_out_spatial_dim_ int num_kernels_im2col_; // num_kernels_col2im_ = reverse_dimensions() ? top_dim_ : bottom_dim_ int num_kernels_col2im_; // 卷积的输出通道数 ,在参数配置文件中设置 int conv_out_channels_; // 卷积的输入通道数 (即输入图像的通道数) int conv_in_channels_; // 卷积的输出的空间维度 = 卷积后图像h*卷积后图像w int conv_out_spatial_dim_; // 卷积核的维度 = 输入图像的维度*卷积核的h*卷积核的w int kernel_dim_; // 在使用gropu参数的时候使用的offset int col_offset_; int output_offset_; // im2col的时候使用的存储空间 Blob<Dtype> col_buffer_; // 将偏置扩展成矩阵的东东 Blob<Dtype> bias_multiplier_;
成员函数
主要的函数。
public:
  // Constructor: forwards the layer parameter to the Layer<Dtype> base class
  // and does nothing else.
  explicit BaseConvolutionLayer(const LayerParameter& param)
      : Layer<Dtype>(param) {}
  // Initialization hook; receives the bottom (input) and top (output) blobs.
  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  // Re-computes shapes; receives the bottom (input) and top (output) blobs.
  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
对输入进行前向传播,加上bias。调用了math_function里面的caffe_cpu_gemm的矩阵运算接口。
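GEMM的全称是General Matrix Matrix Multiply。其基本形式如下:$C = \alpha \cdot \mathrm{op}(A)\,\mathrm{op}(B) + \beta \cdot C$,其中 $\mathrm{op}(X)$ 表示矩阵 $X$ 本身或其转置。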
/**
 * @brief Forward convolution expressed as one matrix multiply per group.
 *
 * Unless the layer is flagged as 1x1, the input is first unfolded into
 * col_buffer_ via conv_im2col_cpu (this step can be skipped by the caller),
 * and the multiply reads from col_buffer_ instead of from `input`.
 *
 * @param input        Input data to convolve.
 * @param weights      Filter weights; group g starts at weight_offset_ * g.
 * @param output       Destination; group g starts at output_offset_ * g.
 * @param skip_im2col  When true, col_buffer_ is used as-is without re-running
 *                     im2col (presumably because an earlier call already
 *                     filled it — verify against callers).
 */
template <typename Dtype>
void BaseConvolutionLayer<Dtype>::forward_cpu_gemm(const Dtype* input,
    const Dtype* weights, Dtype* output, bool skip_im2col) {
  // A 1x1 kernel needs no unfolding, so the raw input doubles as the column
  // buffer in that case.
  const Dtype* col_buff = input;
  if (!is_1x1_) {
    if (!skip_im2col) {
      conv_im2col_cpu(input, col_buffer_.mutable_cpu_data());
    }
    col_buff = col_buffer_.cpu_data();
  }
  for (int g = 0; g < group_; ++g) {
    // Presumably (M, N, K) = (output channels per group, output spatial size,
    // kernel_dim_) with alpha = 1 and beta = 0, i.e. the output is
    // overwritten — confirm against caffe_cpu_gemm's signature.
    caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans,
        conv_out_channels_ / group_, conv_out_spatial_dim_, kernel_dim_,
        Dtype(1), weights + weight_offset_ * g, col_buff + col_offset_ * g,
        Dtype(0), output + output_offset_ * g);
  }
}

/**
 * @brief Adds the bias to `output` through a matrix multiply against
 *        bias_multiplier_.
 *
 * @param output  Buffer the bias is added into (the call passes 1 as the
 *                final scaling factor, presumably beta, so existing contents
 *                are kept and accumulated onto — confirm).
 * @param bias    Bias values, used as the left-hand operand of the multiply.
 */
template <typename Dtype>
void BaseConvolutionLayer<Dtype>::forward_cpu_bias(Dtype* output,
    const Dtype* bias) {
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,
      out_spatial_dim_, 1, Dtype(1), bias, bias_multiplier_.cpu_data(),
      Dtype(1), output);
}
im2col和col2im。
inline void conv_im2col_cpu(const Dtype* data, Dtype* col_buff) { if (!force_nd_im2col_ && num_spatial_axes_ == 2) { im2col_cpu(data, conv_in_channels_, conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2], kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1], pad_.cpu_data()[0], pad_.cpu_data()[1], stride_.cpu_data()[0], stride_.cpu_data()[1], dilation_.cpu_data()[0], dilation_.cpu_data()[1], col_buff); } else { im2col_nd_cpu(data, num_spatial_axes_, conv_input_shape_.cpu_data(), col_buffer_shape_.data(), kernel_shape_.cpu_data(), pad_.cpu_data(), stride_.cpu_data(), dilation_.cpu_data(), col_buff); } } inline void conv_col2im_cpu(const Dtype* col_buff, Dtype* data) { if (!force_nd_im2col_ && num_spatial_axes_ == 2) { col2im_cpu(col_buff, conv_in_channels_, conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2], kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1], pad_.cpu_data()[0], pad_.cpu_data()[1], stride_.cpu_data()[0], stride_.cpu_data()[1], dilation_.cpu_data()[0], dilation_.cpu_data()[1], data); } else { col2im_nd_cpu(col_buff, num_spatial_axes_, conv_input_shape_.cpu_data(), col_buffer_shape_.data(), kernel_shape_.cpu_data(), pad_.cpu_data(), stride_.cpu_data(), dilation_.cpu_data(), data); } }
前向传播调用forward_cpu_gemm和forward_cpu_bias。
/**
 * @brief CPU forward pass of the convolution layer.
 *
 * For every bottom/top pair and every one of the num_ samples in it, runs
 * forward_cpu_gemm and then, when bias_term_ is set, forward_cpu_bias on the
 * same output slice.
 *
 * @param bottom  Input blobs; sample n of a blob starts at n * bottom_dim_.
 * @param top     Output blobs; sample n of a blob starts at n * top_dim_.
 */
template <typename Dtype>
void ConvolutionLayer<Dtype>::Forward_cpu(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  // blobs_[0] holds the weights, blobs_[1] the bias.
  const Dtype* const filters = this->blobs_[0]->cpu_data();
  for (int blob_idx = 0; blob_idx < bottom.size(); ++blob_idx) {
    const Dtype* const in_base = bottom[blob_idx]->cpu_data();
    Dtype* const out_base = top[blob_idx]->mutable_cpu_data();
    for (int sample = 0; sample < this->num_; ++sample) {
      const Dtype* const sample_in = in_base + sample * this->bottom_dim_;
      Dtype* const sample_out = out_base + sample * this->top_dim_;
      this->forward_cpu_gemm(sample_in, filters, sample_out);
      if (!this->bias_term_) {
        continue;
      }
      const Dtype* const bias = this->blobs_[1]->cpu_data();
      this->forward_cpu_bias(sample_out, bias);
    }
  }
}
反向传播调用backward_cpu_bias,weight_cpu_gemm和backward_cpu_gemm。
/**
 * @brief CPU backward pass of the convolution layer.
 *
 * Per top/bottom pair it computes, as requested by the propagate flags:
 *  - the bias gradient (backward_cpu_bias),
 *  - the weight gradient (weight_cpu_gemm),
 *  - the gradient with respect to the bottom data (backward_cpu_gemm).
 *
 * @param top             Output blobs whose diffs are read.
 * @param propagate_down  Per-bottom flag: compute the gradient w.r.t. that
 *                        bottom blob.
 * @param bottom          Input blobs; their data is read and their diffs are
 *                        written.
 */
template <typename Dtype>
void ConvolutionLayer<Dtype>::Backward_cpu(
    const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
    const vector<Blob<Dtype>*>& bottom) {
  const Dtype* const filters = this->blobs_[0]->cpu_data();
  Dtype* const filters_grad = this->blobs_[0]->mutable_cpu_diff();
  for (int blob_idx = 0; blob_idx < top.size(); ++blob_idx) {
    const Dtype* const out_grad = top[blob_idx]->cpu_diff();
    const Dtype* const in_data = bottom[blob_idx]->cpu_data();
    // Fetched unconditionally, exactly as before, even when this bottom blob
    // does not need a gradient.
    Dtype* const in_grad = bottom[blob_idx]->mutable_cpu_diff();

    // Bias gradient, if necessary.
    if (this->bias_term_ && this->param_propagate_down_[1]) {
      Dtype* const bias_grad = this->blobs_[1]->mutable_cpu_diff();
      for (int sample = 0; sample < this->num_; ++sample) {
        this->backward_cpu_bias(bias_grad,
            out_grad + sample * this->top_dim_);
      }
    }

    // Nothing left to do for this pair when neither the weights nor the
    // bottom blob need a gradient.
    if (!this->param_propagate_down_[0] && !propagate_down[blob_idx]) {
      continue;
    }

    for (int sample = 0; sample < this->num_; ++sample) {
      const Dtype* const sample_out_grad = out_grad + sample * this->top_dim_;
      // Gradient w.r.t. weight. Note that diffs are accumulated.
      if (this->param_propagate_down_[0]) {
        this->weight_cpu_gemm(in_data + sample * this->bottom_dim_,
            sample_out_grad, filters_grad);
      }
      // Gradient w.r.t. bottom data, if necessary.
      if (propagate_down[blob_idx]) {
        this->backward_cpu_gemm(sample_out_grad, filters,
            in_grad + sample * this->bottom_dim_);
      }
    }
  }
}
bottom, weight, bias导数计算,向后传递。
/**
 * @brief Computes the gradient with respect to the layer input.
 *
 * Multiplies the transposed weights with the output gradient, one group at a
 * time, into a column buffer; unless the layer is 1x1 the column buffer is
 * then folded back into `input` with conv_col2im_cpu.
 *
 * @param output   Gradient at the layer output; group g starts at
 *                 output_offset_ * g.
 * @param weights  Filter weights; group g starts at weight_offset_ * g.
 * @param input    Destination for the input gradient.
 */
template <typename Dtype>
void BaseConvolutionLayer<Dtype>::backward_cpu_gemm(const Dtype* output,
    const Dtype* weights, Dtype* input) {
  // The scratch buffer is always requested; for a 1x1 layer the result is
  // written straight into `input` instead.
  Dtype* col_buff = col_buffer_.mutable_cpu_data();
  const bool needs_col2im = !is_1x1_;
  if (!needs_col2im) {
    col_buff = input;
  }
  for (int g = 0; g < group_; ++g) {
    const Dtype* const group_weights = weights + weight_offset_ * g;
    const Dtype* const group_out_grad = output + output_offset_ * g;
    Dtype* const group_cols = col_buff + col_offset_ * g;
    caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, kernel_dim_,
        conv_out_spatial_dim_, conv_out_channels_ / group_,
        (Dtype)1., group_weights, group_out_grad,
        (Dtype)0., group_cols);
  }
  if (needs_col2im) {
    conv_col2im_cpu(col_buff, input);
  }
}

/**
 * @brief Computes the weight gradient from the layer input and the output
 *        gradient.
 *
 * Unless the layer is 1x1, `input` is first unfolded into col_buffer_. The
 * multiply passes 1 as its last scaling factor (presumably beta, so results
 * are accumulated into `weights` rather than overwriting them — confirm
 * against caffe_cpu_gemm).
 *
 * @param input    Layer input data.
 * @param output   Gradient at the layer output; group g starts at
 *                 output_offset_ * g.
 * @param weights  Weight-gradient buffer; group g starts at
 *                 weight_offset_ * g.
 */
template <typename Dtype>
void BaseConvolutionLayer<Dtype>::weight_cpu_gemm(const Dtype* input,
    const Dtype* output, Dtype* weights) {
  const Dtype* col_buff = input;
  if (!is_1x1_) {
    conv_im2col_cpu(input, col_buffer_.mutable_cpu_data());
    col_buff = col_buffer_.cpu_data();
  }
  for (int g = 0; g < group_; ++g) {
    const Dtype* const group_out_grad = output + output_offset_ * g;
    const Dtype* const group_cols = col_buff + col_offset_ * g;
    Dtype* const group_weight_grad = weights + weight_offset_ * g;
    caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasTrans,
        conv_out_channels_ / group_, kernel_dim_, conv_out_spatial_dim_,
        (Dtype)1., group_out_grad, group_cols,
        (Dtype)1., group_weight_grad);
  }
}

/**
 * @brief Computes the bias gradient with a matrix-vector product of `input`
 *        (num_output_ x out_spatial_dim_) and bias_multiplier_.
 *
 * @param bias   Bias-gradient buffer (the call passes 1. as the final scaling
 *               factor, presumably so the result is accumulated — confirm).
 * @param input  Gradient at the layer output.
 */
template <typename Dtype>
void BaseConvolutionLayer<Dtype>::backward_cpu_bias(Dtype* bias,
    const Dtype* input) {
  caffe_cpu_gemv<Dtype>(CblasNoTrans, num_output_, out_spatial_dim_,
      1., input, bias_multiplier_.cpu_data(),
      1., bias);
}
2.im2col
在caffe中,卷积运算就是先对数据进行im2col操作,再进行内积运算(inner product)。这样做,比原始的卷积操作速度更快。
图片来自High Performance Convolutional Neural Networks for Document Processing,图示了caffe中卷积计算的原理。
图中上半部分是传统的卷积运算,下半部分是矩阵相乘的版本。
图中输入特征由3个通道组成,每个通道跟不同的卷积核做卷积运算,最后再将得到的3个通道的卷积结果相加,得到一份输出特征。
如果只有一个通道,运算和之前随笔中提到的如下算法接近。
def conv(a, v, full=0):
    """Slide the 2-D kernel ``v`` over the 2-D array ``a`` and sum the products.

    The kernel is applied as-is (it is not flipped), so each output value is
    ``sum(window * v)`` for the window of ``a`` at that position.

    Args:
        a: 2-D array-like input.
        v: 2-D array-like kernel.
        full: 0 (default) for "valid" mode, where only positions at which the
            kernel fits entirely inside ``a`` are produced; any truthy value
            for "full" mode, where ``a`` is first zero-padded by
            ``kernel size - 1`` on every side.

    Returns:
        A nested list (rows of NumPy scalars) with
        ``(ah - vh + 1) x (aw - vw + 1)`` entries, where ``ah, aw`` are the
        (possibly padded) input dimensions and ``vh, vw`` the kernel's.
    """
    ah, aw = np.shape(a)
    vh, vw = np.shape(v)
    # Was a Python 2 print statement, which is a syntax error on Python 3.
    print(ah, aw, vh, vw)
    if full:
        # Embed `a` in the middle of a zero canvas so the kernel can overhang
        # every edge by up to (kernel size - 1).
        padded = np.zeros((ah + 2 * vh - 2, aw + 2 * vw - 2))
        padded[vh - 1:vh - 1 + ah, vw - 1:vw - 1 + aw] = a
        a = padded
        ah, aw = np.shape(a)
    k = [[np.sum(np.multiply(a[i:i + vh, j:j + vw], v))
          for j in range(aw - vw + 1)]
         for i in range(ah - vh + 1)]
    return k
下图是在一个卷积层中将卷积操作展开的具体操作过程。
按照卷积核的大小取数据然后展开,在同一张图里的不同卷积核选取的逐行摆放,不同N的话,就在同一行后面继续拼接。
这里不同的N可以是多个通道,但是需要注意的是,同一行里面的每一段都应该对应原图中同一个位置的卷积窗口。
由于运算方式从卷积变成矩阵乘法,和全连接层的算法一样,求梯度便可以采用全连接的公式。
caffe中源码如下。
/**
 * @brief Cuts the image into sub-images the size of the convolution window
 *        and lays each one out as a column.
 *
 * Row c_col of the output corresponds to one (channel, kernel row, kernel
 * column) triple and holds height_col * width_col values, one per window
 * position. Positions that fall into the padding are written as zero.
 *
 * @param data_im   Input image, indexed as (channel * height + h) * width + w.
 * @param channels  Number of image channels.
 * @param height    Image height.
 * @param width     Image width.
 * @param kernel_h  Kernel height.
 * @param kernel_w  Kernel width.
 * @param pad_h     Padding added above and below the image.
 * @param pad_w     Padding added left and right of the image.
 * @param stride_h  Vertical step between windows.
 * @param stride_w  Horizontal step between windows.
 * @param data_col  Output, channels * kernel_h * kernel_w rows of
 *                  height_col * width_col values each.
 *
 * NOTE(review): conv_im2col_cpu elsewhere in this file calls im2col_cpu with
 * two extra dilation arguments that this signature does not accept — confirm
 * which Caffe version the callers target.
 */
template <typename Dtype>
void im2col_cpu(const Dtype* data_im, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    Dtype* data_col) {
  const int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
  const int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
  const int channels_col = channels * kernel_h * kernel_w;
  for (int c_col = 0; c_col < channels_col; ++c_col) {
    // Decompose the output row index into (image channel, kernel row,
    // kernel column).
    const int w_offset = c_col % kernel_w;
    const int h_offset = (c_col / kernel_w) % kernel_h;
    const int c_im = c_col / kernel_h / kernel_w;
    for (int h_col = 0; h_col < height_col; ++h_col) {
      // The source row does not depend on w_col, so it is computed once per
      // output row.
      const int h_im = h_col * stride_h - pad_h + h_offset;
      const bool row_inside = h_im >= 0 && h_im < height;
      for (int w_col = 0; w_col < width_col; ++w_col) {
        const int w_im = w_col * stride_w - pad_w + w_offset;
        const bool inside = row_inside && w_im >= 0 && w_im < width;
        data_col[(c_col * height_col + h_col) * width_col + w_col] =
            inside ? data_im[(c_im * height + h_im) * width + w_im]
                   : Dtype(0);
      }
    }
  }
}

/**
 * @brief Variant for inputs whose spatial dimension is not two-dimensional.
 *
 * Declaration only: the body is not part of this listing, so the declaration
 * is terminated with a semicolon to keep the file parseable.
 */
template <typename Dtype>
inline void im2col_nd_core_cpu(const Dtype* data_input, const bool im2col,
    const int num_spatial_axes, const int* im_shape, const int* col_shape,
    const int* kernel_shape, const int* pad, const int* stride,
    Dtype* data_output);

/**
 * @brief The reverse of im2col_cpu(): folds the column buffer back into an
 *        image.
 *
 * The image is first zeroed through caffe_set; every column entry that maps
 * to a position inside the image is then added to that position, so image
 * positions covered by several windows receive the sum of their entries.
 * Entries that map into the padding are dropped.
 *
 * @param data_col  Column buffer laid out as produced by im2col_cpu.
 * @param channels  Number of image channels.
 * @param height    Image height.
 * @param width     Image width.
 * @param kernel_h  Kernel height.
 * @param kernel_w  Kernel width.
 * @param pad_h     Padding above and below the image.
 * @param pad_w     Padding left and right of the image.
 * @param stride_h  Vertical step between windows.
 * @param stride_w  Horizontal step between windows.
 * @param data_im   Output image of height * width * channels values.
 */
template <typename Dtype>
void col2im_cpu(const Dtype* data_col, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    Dtype* data_im) {
  caffe_set(height * width * channels, Dtype(0), data_im);
  const int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
  const int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
  const int channels_col = channels * kernel_h * kernel_w;
  for (int c_col = 0; c_col < channels_col; ++c_col) {
    const int w_offset = c_col % kernel_w;
    const int h_offset = (c_col / kernel_w) % kernel_h;
    const int c_im = c_col / kernel_h / kernel_w;
    for (int h_col = 0; h_col < height_col; ++h_col) {
      const int h_im = h_col * stride_h - pad_h + h_offset;
      if (h_im < 0 || h_im >= height) {
        continue;  // The whole row of windows maps into the padding.
      }
      for (int w_col = 0; w_col < width_col; ++w_col) {
        const int w_im = w_col * stride_w - pad_w + w_offset;
        if (w_im >= 0 && w_im < width) {
          data_im[(c_im * height + h_im) * width + w_im] +=
              data_col[(c_col * height_col + h_col) * width_col + w_col];
        }
      }
    }
  }
}