nn.BatchNorm2d(128, eps=0.0001)
bn层又坑我一次!!!!
caffe转pytorch。 由于第二次加了一些网络,不知道从哪里复制的,直接是
self.p6_conv_bn = nn.BatchNorm2d(128)
然后跑前向对精度的时候死活不一样啊!!!!
然后开始了我查找问题的漫漫之旅!!足足花了我2h。
首先逐层对比两边的输出,看是从哪一层开始不一致:conv3及之前的输出都一样,但是到conv4就不一样了!
conv3输出的featuremap一样然后经过conv4就不一样了。 这里包括conv、bn、relu
那为什么之前一样的?
是不是因为权重不一样导致的,然后又去核对转权重脚本,没发现毛病。
没办法,然后想着对权重,caffe层的权重在哪里看呢?
比如卷积层,关键的是通过
if("conv_4" == this->name())
来确定是我们需要关注的层。然后可以输出这层的featuremap的输出;
权重可以通过 vector<shared_ptr<Blob>> blob_learn = this->blobs(); 取出来,比如下面卷积层的前向函数(见末尾注释掉的调试代码):
/// GPU forward pass of the convolution layer.
///
/// For every bottom/top blob pair, each of the `num_` samples is pushed through
/// forward_gpu_gemm() and, when `bias_term_` is set, through forward_gpu_bias().
/// Quantize_gpu() is invoked once before and once after the convolution itself.
///
/// @param bottom input blobs
/// @param top    output blobs, indexed in parallel with `bottom`
template <typename Ftype, typename Btype>
void ConvolutionLayer<Ftype, Btype>::Forward_gpu(const vector<Blob*>& bottom,
    const vector<Blob*>& top) {
  this->Quantize_gpu(bottom, top);

  // blobs_[0] is used as the filter weights for every bottom blob.
  const Ftype* filters = this->blobs_[0]->template gpu_data<Ftype>();

  for (int blob_idx = 0; blob_idx < bottom.size(); ++blob_idx) {
    const Ftype* input = bottom[blob_idx]->gpu_data<Ftype>();
    Ftype* output = top[blob_idx]->mutable_gpu_data<Ftype>();

    for (int sample = 0; sample < this->num_; ++sample) {
      const Ftype* sample_in = input + sample * this->bottom_dim_;
      Ftype* sample_out = output + sample * this->top_dim_;

      this->forward_gpu_gemm(sample_in, filters, sample_out);

      if (!this->bias_term_) {
        continue;
      }
      // blobs_[1] is the bias, added on top of the GEMM result.
      const Ftype* bias_values = this->blobs_[1]->template gpu_data<Ftype>();
      this->forward_gpu_bias(sample_out, bias_values);
    }
  }

  this->Quantize_gpu(bottom, top);

  // Debug aid (disabled): dump the input feature map and the learned weights of
  // one specific layer, selected by name, so they can be diffed against PyTorch.
  // NOTE(review): save_data_to_txt() looks like a custom Blob helper described at
  // https://www.cnblogs.com/yanghailin/p/17028147.html -- confirm it exists in this tree.
  //
  // if ("conv_4" == this->name())
  // {
  //   // string path_conv_depth = "/media/algo/data_1/everyday/20230106/conv_4.txt";
  //   // bottom[0]->save_data_to_txt(path_conv_depth);
  //
  //   vector<shared_ptr<Blob>> blob_learn = this->blobs();
  //   string shape_1 = blob_learn[0]->shape_string();
  //   blob_learn[0]->save_data_to_txt("/media/algo/data_1/everyday/20230106/222/weight/conv_4_caffe.txt");
  //   int a=0;
  // }
}
同样的bn层BatchNormLayer
// Excerpt of the BatchNorm GPU forward pass: the real computation is elided ("...")
// and only the debugging hook added for the investigation is shown. The closing
// brace of the function is not part of this excerpt.
template<typename Ftype, typename Btype>
void BatchNormLayer<Ftype, Btype>::Forward_gpu(const vector<Blob*>& bottom, const vector<Blob*>& top) {
...
...
...
// Debug hook: only fires for the layer whose name is exactly "conv_4/bn".
if("conv_4/bn" == this->name())
{
// Disabled: dump the layer input to a text file.
// string path_conv_depth = "/media/algo/data_1/everyday/20230106/conv_4.txt";
// bottom[0]->save_data_to_txt(path_conv_depth);
// Fetch the layer's learned blobs and record the shape string of each of the five.
// NOTE(review): presumably [0]=running mean, [1]=running variance, [2]=moving-average
// factor, [3]=scale (weight), [4]=bias -- confirm against the layer implementation.
vector<shared_ptr<Blob>> blob_learn = this->blobs();
string shape_0 = blob_learn[0]->shape_string();
string shape_1 = blob_learn[1]->shape_string();
string shape_2 = blob_learn[2]->shape_string();
string shape_3 = blob_learn[3]->shape_string();
string shape_4 = blob_learn[4]->shape_string();
// Disabled: dump the first blob to a text file.
// blob_learn[0]->save_data_to_txt("/media/algo/data_1/everyday/20230106/222/weight/conv_4.txt");
// Unused local; presumably just a convenient line for a debugger breakpoint.
int a=0;
}
至于pytorch,我是打断点看权重的。
caffe里面的bn层如下写:
# BatchNorm layer as declared in the Caffe prototxt; it runs in place
# (bottom and top are the same blob, "conv4_1").
layer {
name: "conv4_1/bn"
type: "BatchNorm"
bottom: "conv4_1"
top: "conv4_1"
batch_norm_param {
moving_average_fraction: 0.995
# The PyTorch counterpart must be given the same epsilon explicitly,
# e.g. nn.BatchNorm2d(128, eps=0.0001), otherwise the outputs differ.
eps: 0.0001
# NOTE(review): presumably makes this layer carry its own learnable scale and
# bias (which would explain the 5 parameter blobs) -- confirm.
scale_bias: true
}
}
caffe权重提取到本地pkl文件。
def net_prediction(net, save_weight_pkl_path="./weights202212.pkl"):
    """Export the parameters of every layer in a Caffe net to a pickle file.

    Layers are classified purely by how many parameter blobs they carry:

    * 1 blob  -> stored as ``weight`` (the original author notes these are
      the deconvolution layers of the model).
    * 2 blobs -> stored as ``weight`` and ``bias`` (convolution / fully
      connected layers).
    * 5 blobs -> BatchNorm: blob 0 -> ``running_mean``, blob 1 ->
      ``running_var``, blob 3 -> ``weight``, blob 4 -> ``bias``. Blob 2 is
      intentionally not used (see ``bn_scale_factor`` below).

    A human-readable summary of the exported shapes is printed and also
    written to ``keys.txt`` in the current working directory.

    Args:
        net: object exposing ``params``, a mapping from layer name to a
            sequence of blobs that each expose a ``data`` array.
        save_weight_pkl_path: destination of the pickled
            ``{layer_name: {param_name: array}}`` dictionary.

    Returns:
        None. The results are the two files written to disk.

    Raises:
        RuntimeError: if a layer has a blob count other than 1, 2 or 5.
            ``keys.txt`` is closed and no pickle file is written in that case.
    """
    # Legacy note carried over from the original signature line: "img0 [128,384,3]".
    #
    # Handy when debugging (Python 2 pycaffe API): list feature-map and
    # parameter shapes.
    #   for layer_name, blob in net.blobs.iteritems():
    #       print(layer_name + '\t' + str(blob.data.shape))
    #   for layer_name, blob in net.params.iteritems():
    #       for i in range(len(blob)):
    #           print(layer_name + ' idx= ' + str(i) + '\t' + str(blob[i].data.shape))

    # The running statistics are divided by a constant 1 instead of by blob 2.
    # The original author observed caffemodels whose blob 2 is 0, so dividing
    # by it is not safe; kept as a named constant so the choice is explicit.
    bn_scale_factor = 1

    name_weights = {}

    # The context manager guarantees keys.txt is closed even when an
    # unsupported layer raises below.
    with open('keys.txt', 'w') as keys:
        keys.write('generated by VPDNet-Caffe/convert_to_pkl.py\n\n')

        def report(line):
            # Emit one summary line both to stdout and to keys.txt.
            print(line)
            keys.write(line + '\n')

        for param_name in net.params.keys():
            layer_params = net.params[param_name]
            entry = {}
            name_weights[param_name] = entry
            blob_count = len(layer_params)

            if blob_count == 1:
                # Weight only.
                weight = layer_params[0].data
                entry['weight'] = weight
                report('%s:\n\t%s (weight)' % (param_name, weight.shape))
            elif blob_count == 2:
                # Weight followed by bias.
                weight = layer_params[0].data
                bias = layer_params[1].data
                entry['weight'] = weight
                entry['bias'] = bias
                report('%s:\n\t%s (weight)' % (param_name, weight.shape))
                report('\t%s (bias)' % str(bias.shape))
            elif blob_count == 5:
                # BatchNorm with five blobs; blob 2 is deliberately ignored.
                running_mean = layer_params[0].data
                running_var = layer_params[1].data
                entry['running_mean'] = running_mean / bn_scale_factor
                entry['running_var'] = running_var / bn_scale_factor
                entry['weight'] = layer_params[3].data
                entry['bias'] = layer_params[4].data
                report('%s:\n\t%s (running_var)' % (param_name, running_var.shape))
                report('\t%s (running_mean)' % str(running_mean.shape))
                # Scale and bias shapes go to keys.txt only, not to stdout.
                keys.write('\t%s (weight)\n' % str(layer_params[3].data.shape))
                keys.write('\t%s (bias)\n' % str(layer_params[4].data.shape))
            else:
                # Any blob count other than 1, 2 or 5 is unsupported; the
                # message text is kept as in the original script.
                raise RuntimeError("还有参数个数超过3个的层,别漏了兄dei!!!\n")

    # Protocol 2 keeps the pickle loadable from Python 2 as well.
    with open(save_weight_pkl_path, 'wb') as f:
        pkl.dump(name_weights, f, protocol=2)
#################################################################################################
moving_average_fraction: 0.995 这个值保存在caffe bn层的第三个参数里,但是实际跑推理的时候好像是直接读取prototxt里面的值,又好像是直接按1处理?因为遇到过caffemodel里面bn层第三个参数是0的情况。
pytorch按照下面加载:
# Assemble the PyTorch state dict for vd_conv11_1 and its BatchNorm layer.
# Each row is (PyTorch state-dict key, Caffe layer name, exported field name).
state_dict_vd = {
    target_key: from_numpy(name_weights[caffe_layer][field])
    for target_key, caffe_layer, field in (
        ('vd_conv11_1.weight', 'vd_conv11_1', 'weight'),
        ('vd_conv11_1.bias', 'vd_conv11_1', 'bias'),
        ('vd_conv11_1_bn.running_var', 'vd_conv11_1_bn', 'running_var'),
        ('vd_conv11_1_bn.running_mean', 'vd_conv11_1_bn', 'running_mean'),
        ('vd_conv11_1_bn.weight', 'vd_conv11_1_bn', 'weight'),
        ('vd_conv11_1_bn.bias', 'vd_conv11_1_bn', 'bias'),
    )
}
说了这么多,但是pytorch的参数要和caffe对齐啊!!!
self.conv_4_bn = nn.BatchNorm2d(128, eps=0.0001)
一开始直接是self.conv_4_bn = nn.BatchNorm2d(128),用的是pytorch默认的eps=1e-5,和caffe prototxt里的eps: 0.0001不一致,导致精度不一样,花费了我好久才找到问题!!!