nn.BatchNorm2d(128, eps=0.0001)

bn层又坑我一次!!!!
caffe转pytorch。 由于第二次加了一些网络,不知道从哪里复制的,直接是
self.p6_conv_bn = nn.BatchNorm2d(128)

然后跑前向对精度的时候死活不一样啊!!!!
然后开始了我查找问题的漫漫之旅!!足足花了我2h。
首先就是逐层对比输出,看从哪里开始不一样:比如conv3之前都一样,但是到了conv4就不一样了!
conv3输出的featuremap一样然后经过conv4就不一样了。 这里包括conv、bn、relu
那为什么之前一样的?
是不是因为权重不一样导致的,然后又去核对转权重脚本,没发现毛病。

没办法,然后想着对权重,caffe层的权重在哪里看呢?

比如卷积层,关键的是通过
if("conv_4" == this->name())
来确定是我们需要关注的层。然后可以输出这层的featuremap的输出;
权重可以通过vector<shared_ptr<Blob>> blob_learn = this->blobs();来输出!

/// GPU forward pass of the convolution layer.
///
/// For every bottom/top blob pair, each of the `num_` samples is convolved
/// with the layer weights (`blobs_[0]`) and, when `bias_term_` is set, the
/// bias (`blobs_[1]`) is added. `Quantize_gpu` is invoked once before and
/// once after the computation.
///
/// @param bottom input blobs
/// @param top    output blobs, one per input blob
template <typename Ftype, typename Btype>
void ConvolutionLayer<Ftype, Btype>::Forward_gpu(const vector<Blob*>& bottom,
      const vector<Blob*>& top) {
  this->Quantize_gpu(bottom, top);
  const Ftype* weight = this->blobs_[0]->template gpu_data<Ftype>();
  for (int blob_idx = 0; blob_idx < bottom.size(); ++blob_idx) {
    const Ftype* input = bottom[blob_idx]->gpu_data<Ftype>();
    Ftype* output = top[blob_idx]->mutable_gpu_data<Ftype>();
    for (int sample = 0; sample < this->num_; ++sample) {
      // Per-sample views into the flat input / output buffers.
      const Ftype* sample_in = input + sample * this->bottom_dim_;
      Ftype* sample_out = output + sample * this->top_dim_;
      this->forward_gpu_gemm(sample_in, weight, sample_out);
      if (!this->bias_term_) {
        continue;
      }
      const Ftype* bias = this->blobs_[1]->template gpu_data<Ftype>();
      this->forward_gpu_bias(sample_out, bias);
    }
  }
  this->Quantize_gpu(bottom, top);

  // Debugging aid (disabled): when this layer is "conv_4", dump its input
  // feature map and its weight blob to text files so they can be compared
  // with the PyTorch side.
  // See https://www.cnblogs.com/yanghailin/p/17028147.html for save_data_to_txt.
  //
  //  if("conv_4" == this->name())
  //  {
  ////    string path_conv_depth = "/media/algo/data_1/everyday/20230106/conv_4.txt";
  ////    bottom[0]->save_data_to_txt(path_conv_depth);
  //
  //    vector<shared_ptr<Blob>> blob_learn = this->blobs();
  //    string shape_1 = blob_learn[0]->shape_string();
  //    blob_learn[0]->save_data_to_txt("/media/algo/data_1/everyday/20230106/222/weight/conv_4_caffe.txt");
  //    int a=0;
  //  }
}

同样的bn层BatchNormLayer

// Debugging excerpt of the BatchNorm layer's GPU forward pass. The `...`
// lines stand for the original body, which is elided here; the function's
// closing brace is likewise not shown in this excerpt.
template<typename Ftype, typename Btype>
void BatchNormLayer<Ftype, Btype>::Forward_gpu(const vector<Blob*>& bottom, const vector<Blob*>& top) {
...
...
...

  // Only inspect the layer whose name is exactly "conv_4/bn".
  if("conv_4/bn" == this->name())
  {
//    string path_conv_depth = "/media/algo/data_1/everyday/20230106/conv_4.txt";
//    bottom[0]->save_data_to_txt(path_conv_depth);

    // Fetch the layer's blobs and record the shape string of blobs 0..4 so
    // they can be inspected while debugging.
    // NOTE(review): presumably 0 = mean, 1 = variance, 2 = moving-average
    // factor, 3 = scale, 4 = bias (this is how the export script reads a
    // 5-blob layer) -- confirm against the layer implementation.
    vector<shared_ptr<Blob>> blob_learn = this->blobs();
    string shape_0 = blob_learn[0]->shape_string();
      string shape_1 = blob_learn[1]->shape_string();
      string shape_2 = blob_learn[2]->shape_string();
      string shape_3 = blob_learn[3]->shape_string();
      string shape_4 = blob_learn[4]->shape_string();
//    blob_learn[0]->save_data_to_txt("/media/algo/data_1/everyday/20230106/222/weight/conv_4.txt");
    // Presumably a no-op statement to hang a breakpoint on -- confirm.
    int a=0;
  }

至于pytorch,我是打断点看权重的。

caffe里面的bn层如下写:

# BatchNorm layer applied in place: bottom and top are both "conv4_1".
layer {
  name: "conv4_1/bn"
  type: "BatchNorm"
  bottom: "conv4_1"
  top: "conv4_1"
  batch_norm_param {
    moving_average_fraction: 0.995
    # The PyTorch counterpart must be built with the same eps,
    # i.e. nn.BatchNorm2d(..., eps=0.0001).
    eps: 0.0001
    # NOTE(review): presumably makes the layer carry its own scale and bias
    # blobs (5 blobs in total) -- confirm against the Caffe fork in use.
    scale_bias: true
  }
}

caffe权重提取到本地pkl文件。

def net_prediction(net, save_weight_pkl_path="./weights202212.pkl"):
    """Export every learnable parameter of a Caffe net to a pickle file.

    Each entry of ``net.params`` is classified by its number of blobs:

    * 1 blob  -> ``{'weight'}`` (e.g. a deconvolution layer without bias)
    * 2 blobs -> ``{'weight', 'bias'}`` (convolution / fully connected)
    * 5 blobs -> BatchNorm with scale and bias:
      ``{'running_mean', 'running_var', 'weight', 'bias'}``

    A human-readable summary of the exported shapes is printed and also
    written to ``keys.txt`` in the current working directory.

    Args:
        net: object exposing ``params``, a mapping from layer name to a
            sequence of blobs, each with a ``data`` array.
        save_weight_pkl_path: destination of the pickle file (protocol 2).

    Raises:
        RuntimeError: if a layer has a blob count other than 1, 2 or 5.
    """
    # Debugging aid: list feature-map and parameter shapes of the net.
    # print("=================feature map===================")
    # for layer_name, blob in net.blobs.iteritems():
    #     print(layer_name + '\t' + str(blob.data.shape))
    # print("=================weights===================")
    # for layer_name, blob in net.params.iteritems():
    #     for i in range(len(blob)):
    #         print(layer_name + ' idx=  ' + str(i) + '\t' + str(blob[i].data.shape))

    name_weights = {}
    # The context manager closes keys.txt even when an unsupported layer
    # aborts the export with an exception.
    with open('keys.txt', 'w') as keys:
        keys.write('generated by VPDNet-Caffe/convert_to_pkl.py\n\n')
        for param_name in net.params.keys():
            layer_params = net.params[param_name]
            num_blobs = len(layer_params)

            if num_blobs == 1:
                # Weight only.
                entries = [('weight', layer_params[0].data)]
            elif num_blobs == 2:
                # Weight and bias.
                entries = [
                    ('weight', layer_params[0].data),
                    ('bias', layer_params[1].data),
                ]
            elif num_blobs == 5:
                # BatchNorm: mean, variance, moving-average factor, scale, bias.
                # Blob 2 (the moving-average factor) is deliberately NOT used:
                # it has been observed to be 0 in some caffemodels, so the
                # stored statistics are exported as they are (divided by 1).
                scale_factor = 1
                entries = [
                    ('running_var', layer_params[1].data / scale_factor),
                    ('running_mean', layer_params[0].data / scale_factor),
                    ('weight', layer_params[3].data),
                    ('bias', layer_params[4].data),
                ]
            else:
                raise RuntimeError(
                    "layer %r has %d parameter blobs; only layers with "
                    "1, 2 or 5 blobs are supported\n" % (param_name, num_blobs))

            name_weights[param_name] = dict(entries)

            # Same summary goes to stdout and to keys.txt.
            summary = '%s:\n' % param_name
            for key, array in entries:
                summary += '\t%s (%s)\n' % (str(array.shape), key)
            print(summary[:-1])
            keys.write(summary)

    # Protocol 2 keeps the pickle loadable from Python 2 as well.
    with open(save_weight_pkl_path, 'wb') as f:
        pkl.dump(name_weights, f, protocol=2)


    #################################################################################################

moving_average_fraction: 0.995 这个值保存在caffe bn层的第三个参数里;但是实际跑推理的时候,好像是直接读取prototxt里面的值,又好像直接当作1来用?因为遇到过caffemodel里面bn层第三个参数是0的情况。

pytorch按照下面加载:

    # Map the parameters exported from Caffe onto the PyTorch state_dict keys.
    # A conv layer contributes weight and bias; its BatchNorm layer contributes
    # the running statistics plus the learned scale (weight) and shift (bias).
    state_dict_vd = {}
    state_dict_vd['vd_conv11_1.weight'] = from_numpy(name_weights['vd_conv11_1']['weight'])
    state_dict_vd['vd_conv11_1.bias'] = from_numpy(name_weights['vd_conv11_1']['bias'])
    state_dict_vd['vd_conv11_1_bn.running_var'] = from_numpy(name_weights['vd_conv11_1_bn']['running_var'])
    state_dict_vd['vd_conv11_1_bn.running_mean'] = from_numpy(name_weights['vd_conv11_1_bn']['running_mean'])
    state_dict_vd['vd_conv11_1_bn.weight'] = from_numpy(name_weights['vd_conv11_1_bn']['weight'])
    state_dict_vd['vd_conv11_1_bn.bias'] = from_numpy(name_weights['vd_conv11_1_bn']['bias'])

说了这么多,但是pytorch的参数要和caffe对齐啊!!!
self.conv_4_bn = nn.BatchNorm2d(128, eps=0.0001)

一开始直接是self.conv_4_bn = nn.BatchNorm2d(128),导致精度不一样,花费了我好久才找到问题!!!

posted @ 2023-01-06 15:35  无左无右  阅读(141)  评论(0)  编辑  收藏  举报