TensorFlow Faster RCNN source walkthrough (TFFRCNN) (04): common layer handling in network.py (decorators and more)
This post is part of a series of code-reading notes on the CharlesShang/TFFRCNN source on GitHub.
Unless stated otherwise, this post assumes the test phase, i.e. the VGGnet_test network is being built. The default padding mode in the code is SAME (as opposed to VALID): DEFAULT_PADDING = 'SAME'.
1. The two decorators (traced through the execution of conv1_1)
@layer
def conv(self, input, k_h, k_w, c_o, s_h, s_w, name, biased=True, relu=True,
         padding=DEFAULT_PADDING, trainable=True):
Note that @layer is a decorator; the line above is equivalent to conv = layer(conv).
@include_original
def layer(op):

which in turn is equivalent to layer = include_original(layer).
def include_original(dec):
    """ Meta decorator, which make the original function callable (via f._original() )"""
    def meta_decorator(f):
        decorated = dec(f)
        decorated._original = f
        return decorated
    return meta_decorator
At first glance it is confusing that the decorator include_original wraps the decorator layer; per the comment, include_original keeps the original function callable. To answer the questions this raises: self is the instance of the Network subclass being built (here VGGnet_test), because conv and the other layer ops are methods of Network, so the decorated layer_decorated is called as a bound method; and layer_decorated returns self at the end precisely so that calls can be chained, as in self.feed('data').conv(...).conv(...).
Therefore, calling conv(3, 3, 64, 1, 1, name='conv1_1', trainable=False) in VGGnet_test.py is equivalent to layer(conv)(3, 3, 64, 1, 1, name='conv1_1', trainable=False).
layer(conv) returns the function layer_decorated with op bound to conv, so the call amounts to layer_decorated(3, 3, 64, 1, 1, name='conv1_1', trainable=False),
where args is (3, 3, 64, 1, 1) and kwargs is {'name': 'conv1_1', 'trainable': False}.
@include_original
def layer(op):
    def layer_decorated(self, *args, **kwargs):
        # Automatically set a name if not provided.
        # dict.setdefault: if 'name' is already a key, return its value (e.g. conv1_1);
        # otherwise insert the default self.get_unique_name(op.__name__) and return it
        name = kwargs.setdefault('name', self.get_unique_name(op.__name__))
        # Figure out the layer inputs.
        # In VGGnet_test.py, self.feed('data') is called before conv1_1, so self.inputs is non-empty;
        # at that point self.inputs holds a single element, self.layers['data'] = self.data
        # = tf.placeholder(tf.float32, shape=[None, None, None, 3]), i.e. the rescaled image data
        if len(self.inputs) == 0:    # no input
            raise RuntimeError('No input variables found for layer %s.' % name)
        elif len(self.inputs) == 1:  # single input
            layer_input = self.inputs[0]
        else:                        # multiple inputs
            layer_input = list(self.inputs)
        # Perform the operation (e.g. conv) and get the output.
        layer_output = op(self, layer_input, *args, **kwargs)
        # Add to layer LUT: record this layer's output in the self.layers dict.
        self.layers[name] = layer_output
        # This output is now the input for the next layer:
        # feed() pushes it into self.inputs for the next layer to consume.
        self.feed(layer_output)
        # Return self for chained calls.
        return self
    return layer_decorated
Note that self.inputs is a list that feed() fills with the input for the next layer, while self.layers is a dict recording every layer's output; also note the case of layers with multiple inputs.
Looking again at the include_original decorator (the "meta decorator" shown above): it keeps the undecorated function reachable by attaching it to the decorated one as the attribute _original. Since layer is decorated with include_original, every @layer-decorated op gets this treatment, so conv._original, for example, is the raw conv function and can be called directly, bypassing the layer bookkeeping.
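To make the mechanism concrete, here is a minimal standalone sketch of the same meta-decorator pattern (shout and greet are invented names, not from the repo):

def include_original(dec):
    def meta_decorator(f):
        decorated = dec(f)
        decorated._original = f
        return decorated
    return meta_decorator

@include_original
def shout(func):                 # plays the role of layer
    def wrapper(*args, **kwargs):
        return func(*args, **kwargs).upper()
    return wrapper

@shout
def greet(name):                 # plays the role of conv
    return 'hello %s' % name

print(greet('tf'))               # HELLO TF  (decorated behaviour)
print(greet._original('tf'))     # hello tf  (the raw function, via _original)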
2. The Network new-style class
class Network(object):
    def __init__(self, inputs, trainable=True):  # constructor
        self.inputs = []
        self.layers = dict(inputs)
        self.trainable = trainable
        self.setup()
self.inputs is a list filled by feed() with the next layer's input; self.layers is a dict recording each layer's output. All the layer operations are defined inside this class and are covered in section 3.
-------------------------------- the remaining 7 functions (besides the layer operations) -----------------------------------
def setup(self): raises an exception unless the class has been subclassed and the method overridden (it is called by __init__; every concrete network such as VGGnet_test overrides it).
def setup(self):
    raise NotImplementedError('Must be subclassed.')
def load(self, data_path, session, ignore_missing=False): loads a pretrained .npy model, such as an ImageNet snapshot
def load(self, data_path, session, ignore_missing=False):
    # load per-layer parameters from e.g. an ImageNet-pretrained model
    data_dict = np.load(data_path).item()  # unwrap the saved dict
    for key in data_dict:
        with tf.variable_scope(key, reuse=True):
            for subkey in data_dict[key]:  # data_dict[key] is itself a dict with 'weights'/'biases' keys
                try:
                    var = tf.get_variable(subkey)
                    session.run(var.assign(data_dict[key][subkey]))
                    print "assign pretrain model " + subkey + " to " + key
                except ValueError:
                    print "ignore " + key
                    if not ignore_missing:
                        raise
About the pieces that looked obscure here: np.save stores a Python dict inside a 0-d object ndarray, and .item() unwraps it back into a dict. tf.variable_scope(key, reuse=True) re-enters the already-built scope so that tf.get_variable returns the existing variable instead of creating a new one, and session.run(var.assign(...)) executes the assignment op that copies the pretrained value into the variable. load() is called by train_model() in train.py.
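As a hedged illustration of the .npy layout load() expects (the file name and layer key below are invented), a dict of dicts survives a np.save/np.load round trip as a 0-d object array that .item() unwraps:

import numpy as np

params = {'conv1_1': {'weights': np.zeros((3, 3, 3, 64), np.float32),
                      'biases':  np.zeros((64,), np.float32)}}
np.save('demo_params.npy', params)             # stored wrapped in a 0-d object array

data_dict = np.load('demo_params.npy').item()  # .item() unwraps the Python dict
                                               # (newer NumPy needs allow_pickle=True)
print(data_dict['conv1_1']['weights'].shape)   # (3, 3, 3, 64)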
def feed(self, *args): collects the previous layer's output as the next layer's input; called by the layer decorator and by the network definition files (e.g. VGGnet_test.py)
# self.layers is a dict, self.inputs a list
def feed(self, *args):
    assert len(args) != 0  # raise if called with no arguments
    self.inputs = []
    for layer in args:
        if isinstance(layer, basestring):  # is it a str/unicode layer name?
            try:
                layer = self.layers[layer]  # replace the name by the recorded output
                print layer
            except KeyError:
                print self.layers.keys()
                raise KeyError('Unknown layer name fed: %s' % layer)
        self.inputs.append(layer)  # the fetched output becomes the next layer's input
    return self
def get_output(self, layer): fetches one layer's output from the self.layers dict; called by test.py and others
def get_output(self, layer):
    try:
        layer = self.layers[layer]
    except KeyError:
        print self.layers.keys()
        raise KeyError('Unknown layer name fed: %s' % layer)
    return layer
def get_unique_name(self, prefix): generates a unique layer name automatically when none was given (e.g. if two keys in self.layers already start with conv, the new layer is named conv_3; a worked example follows below). Since every layer in the network files specifies name explicitly, this function is in practice never exercised; it is called from the layer decorator.
def get_unique_name(self, prefix):
    # count the existing layers whose name starts with prefix (e.g. conv)
    # and emit names like conv_1, conv_2, ...
    id = sum(t.startswith(prefix) for t, _ in self.layers.items()) + 1
    return '%s_%d' % (prefix, id)
name = kwargs.setdefault('name', self.get_unique_name(op.__name__))  # op is conv, max_pool, etc.
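A quick worked example of the naming rule (the layers dict below is invented):

layers = {'conv_1': None, 'conv_2': None, 'pool_1': None}
prefix = 'conv'
ident = sum(t.startswith(prefix) for t in layers.keys()) + 1
print('%s_%d' % (prefix, ident))   # conv_3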
def make_var(self, name, shape, initializer=None, trainable=True, regularizer=None): creates a variable the TensorFlow way; called by the layer operations (e.g. conv())
def make_var(self, name, shape, initializer=None, trainable=True, regularizer=None):
    # create a new variable through the tf.get_variable mechanism
    return tf.get_variable(name, shape, initializer=initializer, trainable=trainable,
                           regularizer=regularizer)
def validate_padding(self, padding): allows only SAME or VALID padding, raising otherwise; called by every layer that takes a padding argument (e.g. conv(), upconv())
def validate_padding(self, padding):
    # only SAME and VALID are legal padding modes
    assert padding in ('SAME', 'VALID')
3. Common network layers
----------------------------------- convolution -----------------------------------------
def conv(...)
@layer
# e.g. self.conv(3, 3, 64, 1, 1, name='conv1_1', trainable=False) in VGGnet_test.py
# executes layer_output = op(self, layer_input, *args, **kwargs) inside the layer decorator,
# which amounts to layer_output = conv(self, layer_input, 3, 3, 64, 1, 1, name='conv1_1',
#                                      trainable=False, biased=True, relu=True, padding=DEFAULT_PADDING)
def conv(self, input, k_h, k_w, c_o, s_h, s_w, name, biased=True, relu=True,
         padding=DEFAULT_PADDING, trainable=True):
    """ contribution by miraclebiu, and biased option"""
    self.validate_padding(padding)  # only SAME or VALID allowed
    # input is [batch, in_height, in_width, in_channels]
    c_i = input.get_shape()[-1]  # number of input channels / feature maps
    # the stride is [1, s_h, s_w, 1]: the first and last entries must be 1
    # (stride over the batch and over the depth respectively)
    # lambda used below; i is the input, k the kernel (described by 4 parameters:
    # height, width, input channels, output channels)
    convolve = lambda i, k: tf.nn.conv2d(i, k, [1, s_h, s_w, 1], padding=padding)
    with tf.variable_scope(name) as scope:  # scoped variable naming
        # init_weights = tf.truncated_normal_initializer(0.0, stddev=0.001)  # normal init (commented out)
        init_weights = tf.contrib.layers.variance_scaling_initializer(factor=0.01, mode='FAN_AVG',
                                                                      uniform=False)
        init_biases = tf.constant_initializer(0.0)
        # create the variables the TensorFlow way
        kernel = self.make_var('weights', [k_h, k_w, c_i, c_o], init_weights, trainable, \
                               regularizer=self.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY))  # 0.0005
        if biased:
            biases = self.make_var('biases', [c_o], init_biases, trainable)
            conv = convolve(input, kernel)           # convolution result
            if relu:
                bias = tf.nn.bias_add(conv, biases)  # add the bias
                return tf.nn.relu(bias)              # then relu
            return tf.nn.bias_add(conv, biases)
        else:
            conv = convolve(input, kernel)           # convolution result
            if relu:
                return tf.nn.relu(conv)
            return conv
cfg.TRAIN.WEIGHT_DECAY = 0.0005
The kernel is described by 4 parameters (height, width, input channels, output channels); the TensorFlow convolution op is tf.nn.conv2d(i, k, [1, s_h, s_w, 1], padding=padding). Called by VGGnet_train.py, VGGnet_test.py, etc. The output spatial size follows the standard TF padding rules, as sketched below.
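For reference, a plain-Python sketch of the TF padding arithmetic (the helper name is ours, not from the repo):

import math

def conv_out_size(in_size, k, s, padding):
    if padding == 'SAME':
        return int(math.ceil(in_size / float(s)))        # ceil(H / s), independent of k
    return int(math.ceil((in_size - k + 1) / float(s)))  # 'VALID': ceil((H - k + 1) / s)

print(conv_out_size(224, 3, 1, 'SAME'))    # 224 -> conv1_1 keeps the spatial size
print(conv_out_size(224, 2, 2, 'VALID'))   # 112 -> like a 2x2/2 pooling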
------------------------------------- transposed convolution (deconvolution) -----------------------------------------------
def upconv(...)
@layer
def upconv(self, input, shape, c_o, ksize=4, stride=2, name='upconv', biased=False, relu=True,
           padding=DEFAULT_PADDING, trainable=True):
    """ up-conv"""
    self.validate_padding(padding)     # only SAME or VALID allowed
    c_in = input.get_shape()[3].value  # number of input feature-map channels
    in_shape = tf.shape(input)         # dynamic input shape
    if shape is None:
        # h = ((in_shape[1] - 1) * stride) + 1
        # w = ((in_shape[2] - 1) * stride) + 1
        h = ((in_shape[1]) * stride)
        w = ((in_shape[2]) * stride)
        new_shape = [in_shape[0], h, w, c_o]  # with SAME padding, H_out = H_in * stride
    else:
        new_shape = [in_shape[0], shape[1], shape[2], c_o]
    output_shape = tf.stack(new_shape)        # pack the 4 scalars into one 1-D shape tensor
    filter_shape = [ksize, ksize, c_o, c_in]  # kernel layout is [k, k, c_out, c_in] here
    with tf.variable_scope(name) as scope:
        # init_weights = tf.truncated_normal_initializer(0.0, stddev=0.01)
        init_weights = tf.contrib.layers.variance_scaling_initializer(factor=0.01, mode='FAN_AVG',
                                                                      uniform=False)
        filters = self.make_var('weights', filter_shape, init_weights, trainable, \
                                regularizer=self.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY))
        deconv = tf.nn.conv2d_transpose(input, filters, output_shape,
                                        strides=[1, stride, stride, 1],
                                        padding=DEFAULT_PADDING, name=scope.name)
        # coz de-conv losses shape info, use reshape to re-gain shape
        deconv = tf.reshape(deconv, new_shape)
        if biased:
            init_biases = tf.constant_initializer(0.0)
            biases = self.make_var('biases', [c_o], init_biases, trainable)
            if relu:
                bias = tf.nn.bias_add(deconv, biases)  # add the bias
                return tf.nn.relu(bias)                # relu, then return
            return tf.nn.bias_add(deconv, biases)
        else:
            if relu:
                return tf.nn.relu(deconv)
            return deconv
The TensorFlow transposed-convolution op is tf.nn.conv2d_transpose(...).
Answering the questions raised above: with padding='SAME' and stride s, conv2d_transpose produces an output of exactly in_size * s per spatial dimension (the commented-out (in - 1) * s + 1 variant corresponds to VALID-style arithmetic); tf.stack here simply packs the four scalar dimensions into a single 1-D shape tensor, so output_shape is just the tensor form of new_shape; and the kernel shape is [ksize, ksize, c_o, c_in], i.e. output channels come before input channels, the reverse of conv2d. Not called in this network. A shape sketch follows.
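A small graph-construction sketch (variable names are ours) showing both points at once — tf.stack building the dynamic output shape and the H_out = H_in * stride rule for SAME padding:

import tensorflow as tf

# tf.stack packs a list of scalar tensors into one 1-D tensor (here, a dynamic shape);
# with padding='SAME' and stride 2, conv2d_transpose doubles each spatial dimension.
x = tf.placeholder(tf.float32, [1, None, None, 64])
in_shape = tf.shape(x)
out_shape = tf.stack([in_shape[0], in_shape[1] * 2, in_shape[2] * 2, 32])
filters = tf.get_variable('up_w', [4, 4, 32, 64])   # [k, k, c_out, c_in]
y = tf.nn.conv2d_transpose(x, filters, out_shape, strides=[1, 2, 2, 1], padding='SAME')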
------------------------------------------------- max / average pooling ---------------------------------------------
def max_pool(...)
def avg_pool(...)
@layer
def max_pool(self, input, k_h, k_w, s_h, s_w, name, padding=DEFAULT_PADDING):
    self.validate_padding(padding)
    return tf.nn.max_pool(input, ksize=[1, k_h, k_w, 1], strides=[1, s_h, s_w, 1],
                          padding=padding, name=name)

@layer
def avg_pool(self, input, k_h, k_w, s_h, s_w, name, padding=DEFAULT_PADDING):
    self.validate_padding(padding)
    return tf.nn.avg_pool(input, ksize=[1, k_h, k_w, 1], strides=[1, s_h, s_w, 1],
                          padding=padding, name=name)
The TensorFlow pooling ops are tf.nn.max_pool(...) and tf.nn.avg_pool(...); called by VGGnet_train.py, VGGnet_test.py, etc.
------------------------------------------------- nonlinearities ------------------------------------------------------
def relu(...)
@layer
def relu(self, input, name):
    return tf.nn.relu(input, name=name)
def lrn(...)
@layer
def lrn(self, input, radius, alpha, beta, name, bias=1.0):
return tf.nn.local_response_normalization(input,
depth_radius=radius,
alpha=alpha,
beta=beta,
bias=bias,
name=name)
lrn, like dropout and data augmentation, is applied after the relu activation as a trick to curb overfitting; its full name is local response normalization. Not called in this network. Its formula is given below.
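For reference, the usual LRN formula (AlexNet notation; it matches the semantics of tf.nn.local_response_normalization with r = depth_radius, k = bias, N = number of channels, and the sum clipped to valid channel indices):

b_{x,y}^{i} = a_{x,y}^{i} \Big/ \Bigl(k + \alpha \sum_{j=\max(0,\,i-r)}^{\min(N-1,\,i+r)} \bigl(a_{x,y}^{j}\bigr)^{2}\Bigr)^{\beta}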
def batch_normalization(...)
@layer
def batch_normalization(self, input, name, relu=True, is_training=False):
    """contribution by miraclebiu"""
    if relu:
        temp_layer = tf.contrib.layers.batch_norm(input, scale=True, center=True,
                                                  is_training=is_training, scope=name)
        return tf.nn.relu(temp_layer)
    else:
        return tf.contrib.layers.batch_norm(input, scale=True, center=True,
                                            is_training=is_training, scope=name)
batch_normalization normalizes each channel with batch statistics and then applies a learned scale and shift (the transform is written out below); not called in this network.
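For reference, the batch-normalization transform in standard notation (scale=True / center=True mean that γ and β are learned; is_training selects batch statistics versus moving averages):

\mu_{\mathcal{B}} = \frac{1}{m}\sum_{i=1}^{m} x_i,\qquad
\sigma_{\mathcal{B}}^{2} = \frac{1}{m}\sum_{i=1}^{m}\bigl(x_i-\mu_{\mathcal{B}}\bigr)^{2},\qquad
\hat{x} = \frac{x-\mu_{\mathcal{B}}}{\sqrt{\sigma_{\mathcal{B}}^{2}+\epsilon}},\qquad
y = \gamma\,\hat{x} + \beta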
-------------------------------------------------roi_pool------------------------------------------------------
def roi_pool(...)
# e.g. self.roi_pool(7, 7, 1.0 / 16, name='pool_5') in VGGnet_test.py,
# preceded by self.feed('conv5_3', 'rois'), i.e. the input is the list ['conv5_3', 'rois']
@layer
def roi_pool(self, input, pooled_height, pooled_width, spatial_scale, name):
    # only use the first input
    if isinstance(input[0], tuple):
        input[0] = input[0][0]
    if isinstance(input[1], tuple):
        input[1] = input[1][0]
    print input
    return roi_pool_op.roi_pool(input[0], input[1],
                                pooled_height,
                                pooled_width,
                                spatial_scale,
                                name=name)[0]
Taking VGGnet_test.py as the example, this layer's input is the list [conv5_3, rois]; the work is done by roi_pool_op.roi_pool(...) (declared in roi_pooling_layer/roi_pooling_op.py and implemented by the compiled roi_pooling.so).
As for why isinstance(input[i], tuple) is checked: some upstream layers return a tuple of several tensors (e.g. the roi-data layer returns rois, labels and the bbox weights together), in which case only the first element is the tensor to pool over. Called by VGGnet_train.py, VGGnet_test.py, etc. A numpy illustration of the pooling arithmetic follows.
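The real kernel lives in roi_pooling.so; purely as an illustration of the arithmetic (not the repo's implementation — function name and details are ours), a single-channel numpy sketch that maps one ROI from image coordinates to feature-map coordinates via spatial_scale, then max-pools a fixed 7x7 grid:

import numpy as np

def roi_pool_single(feat, roi, pooled_h=7, pooled_w=7, spatial_scale=1.0 / 16):
    _, x1, y1, x2, y2 = roi                             # [batch_idx, x1, y1, x2, y2]
    x1, y1 = int(round(x1 * spatial_scale)), int(round(y1 * spatial_scale))
    x2, y2 = int(round(x2 * spatial_scale)), int(round(y2 * spatial_scale))
    h, w = max(y2 - y1 + 1, 1), max(x2 - x1 + 1, 1)
    out = np.zeros((pooled_h, pooled_w), feat.dtype)
    for i in range(pooled_h):                           # split the ROI into a pooled_h x pooled_w grid
        for j in range(pooled_w):
            ys = y1 + int(np.floor(i * h / float(pooled_h)))
            ye = y1 + int(np.ceil((i + 1) * h / float(pooled_h)))
            xs = x1 + int(np.floor(j * w / float(pooled_w)))
            xe = x1 + int(np.ceil((j + 1) * w / float(pooled_w)))
            out[i, j] = feat[ys:ye, xs:xe].max()        # max-pool each grid cell
    return out

feat = np.random.rand(38, 50)                           # e.g. conv5_3 for a ~600x800 image
print(roi_pool_single(feat, [0, 0, 0, 320, 240]).shape) # (7, 7)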
-------------------------------------------------psroi_pool------------------------------------------------------
def psroi_pool(...)
@layer
def psroi_pool(self, input, output_dim, group_size, spatial_scale, name):
    """contribution by miraclebiu"""
    # only use the first input
    if isinstance(input[0], tuple):
        input[0] = input[0][0]
    if isinstance(input[1], tuple):
        input[1] = input[1][0]
    return psroi_pooling_op.psroi_pool(input[0], input[1],
                                       output_dim=output_dim,
                                       group_size=group_size,
                                       spatial_scale=spatial_scale,
                                       name=name)[0]
The work is done by psroi_pooling_op.psroi_pool(...) (declared in psroi_pooling_layer/psroi_pooling_op.py and implemented by psroi_pooling.so); used by R-FCN.
-------------------------------------------------proposal_layer---------------------------------------------------------
def proposal_layer(...)
# (self.feed('rpn_cls_prob_reshape', 'rpn_bbox_pred', 'im_info')
#      .proposal_layer(_feat_stride, anchor_scales, 'TEST', name='rois'))
# where _feat_stride = [16, ] and anchor_scales = cfg.ANCHOR_SCALES = [8, 16, 32]
@layer
# cfg_key is 'TEST' or 'TRAIN'
def proposal_layer(self, input, _feat_stride, anchor_scales, cfg_key, name):
    if isinstance(input[0], tuple):
        input[0] = input[0][0]
    # input[0] shape is (1, H, W, Ax2)
    # rpn_rois <- (1 x H x W x A, 5) [0, x1, y1, x2, y2]
    # returns the blob made of proposals
    return tf.reshape(tf.py_func(proposal_layer_py, \
                                 [input[0], input[1], input[2], cfg_key, _feat_stride, anchor_scales], \
                                 [tf.float32]),
                      [-1, 5], name=name)
What actually runs is proposal_layer(rpn_cls_prob_reshape, rpn_bbox_pred, im_info, cfg_key, _feat_stride=[16,], anchor_scales=[8, 16, 32]) in rpn_msr/proposal_layer_tf.py; proposal_layer_py points at that function simply because of the import alias "from ..rpn_msr.proposal_layer_tf import proposal_layer as proposal_layer_py". Its job (in TRAIN or TEST mode) is to pick the after_nms_topN proposals by score and reshape them to [-1, 5]. Called by VGGnet_train.py, VGGnet_test.py, etc.
tf.py_func() receives tensors, converts them to numpy arrays, feeds them to the wrapped Python function, then converts that function's numpy outputs back to tensors, as in the demo below.
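A minimal tf.py_func demo (TF1 API; the wrapped function and values are invented):

import numpy as np
import tensorflow as tf

# The tensor x is converted to a numpy array, processed by the plain-Python
# function, and the numpy result is wrapped back as a tensor.
def times_two(a):
    return (a * 2).astype(np.float32)

x = tf.constant([1.0, 2.0, 3.0])
y = tf.py_func(times_two, [x], tf.float32)

with tf.Session() as sess:
    print(sess.run(y))   # [2. 4. 6.]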
def proposal_layer(rpn_cls_prob_reshape, rpn_bbox_pred, im_info, cfg_key,
                   _feat_stride=[16, ], anchor_scales=[8, 16, 32]):
    """
    Parameters
    ----------
    rpn_cls_prob_reshape: (1 , H , W , Ax2) outputs of RPN, prob of bg or fg
                          NOTICE: the old version is ordered by (1, H, W, 2, A) !!!!
    rpn_bbox_pred: (1 , H , W , Ax4), rgs boxes output of RPN
    im_info: a list of [image_height, image_width, scale_ratios]
    cfg_key: 'TRAIN' or 'TEST'
    _feat_stride: the downsampling ratio of feature map to the original input image
    anchor_scales: the scales to the basic_anchor (basic anchor is [16, 16])
    ----------
    Returns
    ----------
    rpn_rois : (1 x H x W x A, 5) e.g. [0, x1, y1, x2, y2]

    # Algorithm:
    #
    # for each (H, W) location i
    #   generate A anchor boxes centered on cell i
    #   apply predicted bbox deltas at cell i to each of the A anchors
    # clip predicted boxes to image
    # remove predicted boxes with either height or width < threshold
    # sort all (proposal, score) pairs by score from highest to lowest
    # take top pre_nms_topN proposals before NMS
    # apply NMS with threshold 0.7 to remaining proposals
    # take after_nms_topN proposals after NMS
    # return the top proposals (-> RoIs top, scores top)
    # layer_params = yaml.load(self.param_str_)
    """
-----------------------------------------------anchor_target_layer----------------------------------------------------
def anchor_target_layer(...)
# called only during training, e.g. in VGGnet_train.py:
# self.feed('rpn_cls_score', 'gt_boxes', 'gt_ishard', 'dontcare_areas', 'im_info')
#     .anchor_target_layer(_feat_stride, anchor_scales, name='rpn-data')
@layer
def anchor_target_layer(self, input, _feat_stride, anchor_scales, name):
    if isinstance(input[0], tuple):
        input[0] = input[0][0]
    with tf.variable_scope(name) as scope:
        rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights = \
            tf.py_func(anchor_target_layer_py,
                       [input[0], input[1], input[2], input[3], input[4], _feat_stride, anchor_scales],
                       [tf.float32, tf.float32, tf.float32, tf.float32])
        # rpn_labels holds the label (-1, 0 or 1) of every anchor
        # rpn_bbox_targets holds the 4 regression targets per anchor (the transform used to
        # train the RPN; see bbox_transform.py)
        # convert back to tf.Tensor; tf.cast converts the dtype
        rpn_labels = tf.convert_to_tensor(tf.cast(rpn_labels, tf.int32),
                                          name='rpn_labels')  # shape is (1 x H x W x A, 2)
        rpn_bbox_targets = tf.convert_to_tensor(rpn_bbox_targets,
                                                name='rpn_bbox_targets')  # shape is (1 x H x W x A, 4)
        rpn_bbox_inside_weights = tf.convert_to_tensor(rpn_bbox_inside_weights,
                                                       name='rpn_bbox_inside_weights')  # shape is (1 x H x W x A, 4)
        rpn_bbox_outside_weights = tf.convert_to_tensor(rpn_bbox_outside_weights,
                                                        name='rpn_bbox_outside_weights')  # shape is (1 x H x W x A, 4)
        return rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights
Called only when training the RPN. What actually runs is anchor_target_layer(rpn_cls_score, gt_boxes, gt_ishard, dontcare_areas, im_info, _feat_stride=[16,], anchor_scales=[4, 8, 16, 32]) in rpn_msr/anchor_target_layer_tf.py; note the signature's default anchor_scales differs from what the network passes, cfg.ANCHOR_SCALES = [8, 16, 32]. Its job is to assign anchors to ground-truth boxes, producing each anchor's class label (0/1, with -1 meaning "don't care") and its regression targets; the dontcare_areas parameter is explained in the docstring below. Per that docstring, rpn_bbox_inside_weights is a per-coordinate weighting (driven by hyperparameters in cfg) on the regression loss, and rpn_bbox_outside_weights is meant to balance foreground against background, since their counts can differ greatly. Called by VGGnet_train.py.
from ..rpn_msr.anchor_target_layer_tf import anchor_target_layer as anchor_target_layer_py
def anchor_target_layer(rpn_cls_score, gt_boxes, gt_ishard, dontcare_areas, im_info,
                        _feat_stride=[16, ], anchor_scales=[4, 8, 16, 32]):
    """
    Assign anchors to ground-truth targets. Produces anchor classification
    labels and bounding-box regression targets.
    Parameters
    ----------
    rpn_cls_score: (1, H, W, Ax2) bg/fg scores of previous conv layer
    gt_boxes: (G, 5) vstack of [x1, y1, x2, y2, class]
    gt_ishard: (G, 1), 1 or 0 indicates difficult or not
    dontcare_areas: (D, 4), some areas may contain small objs but no labelling. D may be 0
    im_info: a list of [image_height, image_width, scale_ratios]
    _feat_stride: the downsampling ratio of feature map to the original input image
    anchor_scales: the scales to the basic_anchor (basic anchor is [16, 16])
    ----------
    Returns
    ----------
    rpn_labels : (HxWxA, 1), for each anchor, 0 denotes bg, 1 fg, -1 dontcare
    rpn_bbox_targets: (HxWxA, 4), distances of the anchors to the gt_boxes
                      (may contain some transform) that are the regression objectives
                      # already analysed in the bbox_transform.py post
    rpn_bbox_inside_weights: (HxWxA, 4) weights of each box, mainly accepts hyper params in cfg
    rpn_bbox_outside_weights: (HxWxA, 4) used to balance the fg/bg,
                      because the numbers of bgs and fgs may be significantly different
    """
-----------------------------------------------proposal_target_layer----------------------------------------------------
def proposal_target_layer(...)
# called only during training, e.g. in VGGnet_train.py:
# (self.feed('rpn_rois', 'gt_boxes', 'gt_ishard', 'dontcare_areas')
#      .proposal_target_layer(n_classes, name='roi-data'))
# where n_classes = cfg.NCLASSES = 21
@layer
def proposal_target_layer(self, input, classes, name):
    if isinstance(input[0], tuple):
        input[0] = input[0][0]
    with tf.variable_scope(name) as scope:
        # inputs: 'rpn_rois' (not yet class-specific), 'gt_boxes', 'gt_ishard', 'dontcare_areas'
        rois, labels, bbox_targets, bbox_inside_weights, bbox_outside_weights \
            = tf.py_func(proposal_target_layer_py,
                         [input[0], input[1], input[2], input[3], classes],
                         [tf.float32, tf.float32, tf.float32, tf.float32, tf.float32])
        # rois <- (1 x H x W x A, 5) e.g. [0, x1, y1, x2, y2]
        # rois = tf.convert_to_tensor(rois, name='rois')
        rois = tf.reshape(rois, [-1, 5], name='rois')  # goes to roi_pooling
        labels = tf.convert_to_tensor(tf.cast(labels, tf.int32), name='labels')  # goes to FRCNN loss
        bbox_targets = tf.convert_to_tensor(bbox_targets, name='bbox_targets')   # goes to FRCNN loss
        bbox_inside_weights = tf.convert_to_tensor(bbox_inside_weights, name='bbox_inside_weights')
        bbox_outside_weights = tf.convert_to_tensor(bbox_outside_weights, name='bbox_outside_weights')
        self.layers['rois'] = rois  # note this key line: rois is stored in self.layers for later lookup
        return rois, labels, bbox_targets, bbox_inside_weights, bbox_outside_weights
from ..rpn_msr.proposal_target_layer_tf import proposal_target_layer as proposal_target_layer_py
Called only when training the Fast R-CNN head. What actually runs is proposal_target_layer(rpn_rois, gt_boxes, gt_ishard, dontcare_areas, _num_classes) in rpn_msr/proposal_target_layer_tf.py (matching the import alias above). Its job is to assign the RPN-produced rois to ground-truth boxes, yielding per-roi class labels and regression targets; the number of rois (the first dimension of rpn_rois) is well below 1 x H x W x A after the proposal layer's top-N/NMS filtering. bbox_inside_weights and bbox_outside_weights play the same masking/balancing roles as their RPN counterparts. Called by VGGnet_train.py.
-----------------------------------------------reshape_layer----------------------------------------------------
def reshape_layer(...)
@layer
def reshape_layer(self, input, d, name):
    input_shape = tf.shape(input)
    if name == 'rpn_cls_prob_reshape':
        # tf.reshape(input, shape) reshapes the tensor to the given shape
        # tf.transpose(input, perm) permutes the dimensions according to perm
        return tf.transpose(tf.reshape(tf.transpose(input, [0, 3, 1, 2]),
                                       [input_shape[0],
                                        int(d),
                                        tf.cast(tf.cast(input_shape[1], tf.float32) / tf.cast(d, tf.float32)
                                                * tf.cast(input_shape[3], tf.float32), tf.int32),
                                        input_shape[2]]),
                            [0, 2, 3, 1], name=name)
    else:
        return tf.transpose(tf.reshape(tf.transpose(input, [0, 3, 1, 2]),
                                       [input_shape[0],
                                        int(d),
                                        tf.cast(tf.cast(input_shape[1], tf.float32)
                                                * (tf.cast(input_shape[3], tf.float32) / tf.cast(d, tf.float32)),
                                                tf.int32),
                                        input_shape[2]]),
                            [0, 2, 3, 1], name=name)
Not seen called in this version; it reshapes tensor dimensions. tf.reshape(input, shape) casts the tensor into the given shape, and tf.transpose(input, perm) reorders the dimensions as specified by perm.
-----------------------------------------spatial_reshape_layer--------------------------------
def spatial_reshape_layer(...)
@layer
def spatial_reshape_layer(self, input, d, name):
    input_shape = tf.shape(input)
    # transpose: (1, H, W, A x d) -> (1, H, WxA, d)
    return tf.reshape(input,
                      [input_shape[0],
                       input_shape[1],
                       -1,
                       int(d)])
Called in both training and testing (VGGnet_train.py and VGGnet_test.py) to reshape (1, H, W, Axd) into (1, H, WxA, d), so that e.g. softmax can act on a last dimension of size d = 2.
-----------------------------------------------fc---------------------------------------------------
def fc(...)
# .roi_pool(7, 7, 1.0 / 16, name='pool_5')
# .fc(4096, name='fc6')
# .fc(4096, name='fc7')
# .fc(n_classes, relu=False, name='cls_score')
@layer
def fc(self, input, num_out, name, relu=True, trainable=True):
    with tf.variable_scope(name) as scope:
        # only use the first input
        if isinstance(input, tuple):
            input = input[0]
        input_shape = input.get_shape()  # static tensor shape
        if input_shape.ndims == 4:       # rank of the shape
            dim = 1
            for d in input_shape[1:].as_list():  # as_list() turns the shape into a Python list
                dim *= d                         # number of neurons in the previous layer
            feed_in = tf.reshape(tf.transpose(input, [0, 3, 1, 2]), [-1, dim])
        else:
            feed_in, dim = (input, int(input_shape[-1]))
        # the two cases use a different stddev for the weight initializer
        if name == 'bbox_pred':
            init_weights = tf.truncated_normal_initializer(0.0, stddev=0.001)
            init_biases = tf.constant_initializer(0.0)
        else:
            init_weights = tf.truncated_normal_initializer(0.0, stddev=0.01)
            init_biases = tf.constant_initializer(0.0)
        weights = self.make_var('weights', [dim, num_out], init_weights, trainable, \
                                regularizer=self.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY))
        biases = self.make_var('biases', [num_out], init_biases, trainable)
        op = tf.nn.relu_layer if relu else tf.nn.xw_plus_b  # fully connected matrix multiply
        fc = op(feed_in, weights, biases, name=scope.name)
        return fc
Called in both training and testing. Although fc() attaches initializers even at test time, an initializer only defines the value a variable would receive from its init op; when testing, the trained weights and biases are restored afterwards (via net.load() or a tf.train.Saver restore), overwriting the initialization. Called by VGGnet_train.py and VGGnet_test.py. The sketch below spells out what the two ops compute.
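What tf.nn.xw_plus_b and tf.nn.relu_layer compute, written out in numpy (shapes are illustrative; 25088 = 7*7*512, the flattened pool_5):

import numpy as np

x = np.random.rand(1, 25088).astype(np.float32)
W = (np.random.rand(25088, 4096).astype(np.float32) - 0.5) * 0.01
b = np.zeros(4096, np.float32)

xw_plus_b  = x.dot(W) + b                  # tf.nn.xw_plus_b
relu_layer = np.maximum(xw_plus_b, 0.0)    # tf.nn.relu_layer = relu(xw_plus_b)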
-------------------------------------------softmax---------------------------------------------------
def softmax(...)
# (self.feed('conv5_3', 'rois')
#      .roi_pool(7, 7, 1.0 / 16, name='pool_5')
#      .fc(4096, name='fc6')
#      .fc(4096, name='fc7')
#      .fc(n_classes, relu=False, name='cls_score')
#      .softmax(name='cls_prob'))
@layer
def softmax(self, input, name):
    input_shape = tf.shape(input)
    if name == 'rpn_cls_prob':  # this branch is never taken in the VGGnet files
        return tf.reshape(tf.nn.softmax(tf.reshape(input, [-1, input_shape[3]])),
                          [-1, input_shape[1], input_shape[2], input_shape[3]], name=name)
    else:
        return tf.nn.softmax(input, name=name)
The TensorFlow softmax op is tf.nn.softmax(...); called by VGGnet_train.py and VGGnet_test.py.
def spatial_softmax(self, input, name)
# shape is (1, H, W, Ax2) -> (1, H, WxA, 2)
# (self.feed('rpn_cls_score')
#      .spatial_reshape_layer(2, name='rpn_cls_score_reshape')
#      .spatial_softmax(name='rpn_cls_prob'))
@layer
# effectively the first branch of def softmax(...)
def spatial_softmax(self, input, name):
    input_shape = tf.shape(input)
    # d = input.get_shape()[-1]
    return tf.reshape(tf.nn.softmax(tf.reshape(input, [-1, input_shape[3]])),
                      [-1, input_shape[1], input_shape[2], input_shape[3]], name=name)
Called by VGGnet_train.py and VGGnet_test.py.
-------------------------------------------dropout-------------------------------------------------
@layer
def dropout(self, input, keep_prob, name):
    return tf.nn.dropout(input, keep_prob, name=name)
The TensorFlow dropout op is tf.nn.dropout(...); called only by VGGnet_train.py, not by VGGnet_test.py.
--------------------------------------------l2_regularizer----------------------------------------------
def l2_regularizer(...)
# note: no @layer decorator here
def l2_regularizer(self, weight_decay=0.0005, scope=None):
    def regularizer(tensor):
        # tf.variable_scope scopes the names of all variables, both those from
        # tf.get_variable and those from tf.Variable;
        # tf.name_scope only scopes the names of tf.Variable variables
        with tf.name_scope(scope, default_name='l2_regularizer', values=[tensor]):
            l2_weight = tf.convert_to_tensor(weight_decay,
                                             dtype=tensor.dtype.base_dtype,
                                             name='weight_decay')
            return tf.multiply(l2_weight, tf.nn.l2_loss(tensor), name='value')
    return regularizer
Called by conv(...) and others, e.g.:
kernel = self.make_var('weights', [k_h, k_w, c_i, c_o], init_weights, trainable, \
                       regularizer=self.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY))  # 0.0005
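Note that tf.nn.l2_loss(t) computes sum(t**2)/2, so each regularized variable contributes weight_decay * sum(w**2) / 2 to the tf.GraphKeys.REGULARIZATION_LOSSES collection, which build_loss() later adds to the total loss via tf.add_n. A tiny check:

import tensorflow as tf

w = tf.constant([1.0, 2.0, 3.0])
with tf.Session() as sess:
    print(sess.run(tf.nn.l2_loss(w)))   # (1 + 4 + 9) / 2 = 7.0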
--------------------------------------------smooth_l1_dist----------------------------------------------
def smooth_l1_dist(...)
# note: no @layer decorator here
def smooth_l1_dist(self, deltas, sigma2=9.0, name='smooth_l1_dist'):
    with tf.name_scope(name=name) as scope:
        deltas_abs = tf.abs(deltas)
        # tf.less compares elementwise (x < y) and returns a boolean tensor
        smoothL1_sign = tf.cast(tf.less(deltas_abs, 1.0 / sigma2), tf.float32)
        return tf.square(deltas) * 0.5 * sigma2 * smoothL1_sign + \
               (deltas_abs - 0.5 / sigma2) * tf.abs(smoothL1_sign - 1)
Called by build_loss(); the distance it implements is written out below.
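Written out, with sigma2 = σ² = 9.0 (i.e. σ = 3, as in the Faster R-CNN RPN loss), the code above computes exactly:

\mathrm{smooth}_{L1}(x) = \begin{cases} 0.5\,\sigma^{2} x^{2} & \text{if } |x| < 1/\sigma^{2} \\ |x| - 0.5/\sigma^{2} & \text{otherwise} \end{cases}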
4. Less common network layers
-----------------------------------------------concat----------------------------------------------
def concat(...)
@layer
def concat(self, inputs, axis, name):
    return tf.concat(axis=axis, values=inputs, name=name)
tf.concat() concatenates the tensors in the values list along the axis dimension; not seen called.
-------------------------------------------add---------------------------------------------------
def add(...)
@layer
def add(self, input, name):
    """contribution by miraclebiu"""
    return tf.add(input[0], input[1], name=name)
Adds the two input tensors elementwise (input[0] + input[1]); not seen called.
-------------------------------------------negation---------------------------------------------------
def negation(...): multiplies input by -1
def negation(self, input, name):
    """ simply multiplies -1 to the tensor"""
    return tf.multiply(input, -1.0, name=name)
-------------------------------------- layers used by PVAnet ---------------------------
def bn_scale_combo(...)
def pva_negation_block(...)
def pva_negation_block_v2(...)
def pva_inception_res_stack(...)
def pva_inception_res_block(...)
(Signatures only; the bodies are omitted here.)

@layer
def bn_scale_combo(self, input, c_in, name, relu=True):
    """ PVA net BN -> Scale -> Relu"""

@layer
def pva_negation_block(self, input, k_h, k_w, c_o, s_h, s_w, name, biased=True,
                       padding=DEFAULT_PADDING, trainable=True, scale=True, negation=True):
    """ for PVA net, Conv -> BN -> Neg -> Concat -> Scale -> Relu"""

@layer
def pva_negation_block_v2(self, input, k_h, k_w, c_o, s_h, s_w, c_in, name, biased=True,
                          padding=DEFAULT_PADDING, trainable=True, scale=True, negation=True):
    """ for PVA net, BN -> [Neg -> Concat ->] Scale -> Relu -> Conv"""

@layer
def pva_inception_res_stack(self, input, c_in, name, block_start=False, type='a'):

@layer
def pva_inception_res_block(self, input, name, name_prefix='conv4_', type='a'):
    """build inception block"""
-------------------------------------------scale---------------------------------------------------
def scale(...): multiplies input by alpha and adds beta (a learned per-channel affine transform)
@layer
def scale(self, input, c_in, name):
    with tf.variable_scope(name) as scope:
        alpha = tf.get_variable('alpha', shape=[c_in, ], dtype=tf.float32,
                                initializer=tf.constant_initializer(1.0), trainable=True,
                                regularizer=self.l2_regularizer(0.00001))
        beta = tf.get_variable('beta', shape=[c_in, ], dtype=tf.float32,
                               initializer=tf.constant_initializer(0.0), trainable=True,
                               regularizer=self.l2_regularizer(0.00001))
        return tf.add(tf.multiply(input, alpha), beta)
5. The Faster RCNN loss function, build_loss(...) (how train.py calls it, and how the TensorFlow machinery then runs it, deserves a follow-up)
Take rpn-data[0], the per-anchor labels, and drop the anchors labelled -1 (the "don't care" anchors, per the anchor_target_layer docstring), leaving N anchors; compute the mean RPN cross-entropy classification loss over them. Then
take rpn-data[1], rpn-data[2] and rpn-data[3] to compute the mean RPN regression loss over the same N anchors; rpn_bbox_inside_weights acts as a per-coordinate weighting/mask inside the loss, while rpn_bbox_outside_weights is fetched but not used here. Then,
similarly, take the roi-data outputs and compute the mean RCNN classification and regression losses over the R proposals, where both bbox_inside_weights and bbox_outside_weights enter the RCNN regression loss; finally sum everything into the network's total loss and return it.
loss, cross_entropy, loss_box, rpn_cross_entropy, rpn_loss_box are, respectively: the total network loss, the mean RCNN classification loss, the mean RCNN regression loss, the mean RPN classification loss and the mean RPN regression loss; see the formula below.
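In formula form (the λ term is added only when cfg.TRAIN.WEIGHT_DECAY > 0, via the REGULARIZATION_LOSSES collection):

L = \underbrace{L_{cls}^{RCNN} + L_{box}^{RCNN}}_{\text{cross\_entropy} + \text{loss\_box}}
  + \underbrace{L_{cls}^{RPN} + L_{box}^{RPN}}_{\text{rpn\_cross\_entropy} + \text{rpn\_loss\_box}}
  + \sum_{w} \frac{\lambda}{2}\lVert w \rVert_2^{2},
\qquad \lambda = \text{cfg.TRAIN.WEIGHT\_DECAY} = 0.0005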
Other notes:
Line 6: is the anchor count here really smaller than HxWxA? At line 6 the tensor still has exactly HxWxA rows; the trimming to N anchors happens at line 13 via rpn_keep. The anchor preprocessing in rpn_msr and train.py still deserves a closer look.
Line 16: tf.nn.sparse_softmax_cross_entropy_with_logits() computes the softmax cross-entropy directly from unnormalized logits and integer labels.
Line 30: the smooth_l1_dist() function, i.e. the smooth-L1 distance used as the regression loss (see section 3).
Line 91: when cfg.TRAIN.WEIGHT_DECAY > 0, the collected L2 regularization losses are added to the total loss (weight decay).
 1 # note: no @layer decorator; ohem = online hard example mining
 2 def build_loss(self, ohem=False):
 3     # RPN
 4     # classification loss
 5     # the anchor count here should be smaller than HxWxA ??? see the rpn_msr code!
 6     rpn_cls_score = tf.reshape(self.get_output('rpn_cls_score_reshape'), [-1, 2])  # shape (HxWxA, 2)
 7     # the rpn-data layer is defined in VGGnet_train.py; it has 4 outputs, and the other 3 are used below for the RPN box loss
 8     rpn_label = tf.reshape(self.get_output('rpn-data')[0], [-1])  # shape (HxWxA)
 9     # ignore_label(-1): -1 marks the anchors to ignore ("don't care")
10     fg_keep = tf.equal(rpn_label, 1)  # tf.equal() compares elementwise: True where equal, else False
11     rpn_keep = tf.where(tf.not_equal(rpn_label, -1))
12     # drop the anchors whose label is -1 (ignored)
13     rpn_cls_score = tf.reshape(tf.gather(rpn_cls_score, rpn_keep), [-1, 2])  # shape (N, 2)
14     rpn_label = tf.reshape(tf.gather(rpn_label, rpn_keep), [-1])  # tf.gather() picks the rows at the given indices
15     # cross entropy
16     rpn_cross_entropy_n = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=rpn_cls_score, labels=rpn_label)
17     # mean RPN classification loss
18     rpn_cross_entropy = tf.reduce_mean(rpn_cross_entropy_n)  # mean of the tensor along the given axis
19     # box loss
20     rpn_bbox_pred = self.get_output('rpn_bbox_pred')  # shape (1, H, W, Ax4)
21     rpn_bbox_targets = self.get_output('rpn-data')[1]
22     rpn_bbox_inside_weights = self.get_output('rpn-data')[2]
23     rpn_bbox_outside_weights = self.get_output('rpn-data')[3]
24     rpn_bbox_pred = tf.reshape(tf.gather(tf.reshape(rpn_bbox_pred, [-1, 4]), rpn_keep), [-1, 4])  # shape (N, 4)
25     rpn_bbox_targets = tf.reshape(tf.gather(tf.reshape(rpn_bbox_targets, [-1, 4]), rpn_keep), [-1, 4])
26     rpn_bbox_inside_weights = tf.reshape(tf.gather(tf.reshape(rpn_bbox_inside_weights, [-1, 4]), rpn_keep), [-1, 4])
27     # rpn_bbox_outside_weights is gathered but never used afterwards
28     rpn_bbox_outside_weights = tf.reshape(tf.gather(tf.reshape(rpn_bbox_outside_weights, [-1, 4]), rpn_keep), [-1, 4])
29     # the N kept anchors enter rpn_loss_box; tf.reduce_sum() sums a tensor's elements along the given axis
30     rpn_loss_box_n = tf.reduce_sum(self.smooth_l1_dist(
31         rpn_bbox_inside_weights * (rpn_bbox_pred - rpn_bbox_targets)), axis=[1])
32     if ohem:
33         # strategy: keeps all the positive samples
34         fg_ = tf.equal(rpn_label, 1)
35         bg_ = tf.equal(rpn_label, 0)
36         pos_inds = tf.where(fg_)
37         neg_inds = tf.where(bg_)
38         rpn_cross_entropy_n_pos = tf.reshape(tf.gather(rpn_cross_entropy_n, pos_inds), [-1])
39         rpn_cross_entropy_n_neg = tf.reshape(tf.gather(rpn_cross_entropy_n, neg_inds), [-1])
40         top_k = tf.cast(tf.minimum(tf.shape(rpn_cross_entropy_n_neg)[0], 300), tf.int32)
41         rpn_cross_entropy_n_neg, _ = tf.nn.top_k(rpn_cross_entropy_n_neg, k=top_k)
42         rpn_cross_entropy = tf.reduce_sum(rpn_cross_entropy_n_neg) / (tf.reduce_sum(tf.cast(bg_, tf.float32)) + 1.0) \
43             + tf.reduce_sum(rpn_cross_entropy_n_pos) / (tf.reduce_sum(tf.cast(fg_, tf.float32)) + 1.0)
44
45         rpn_loss_box_n = tf.reshape(tf.gather(rpn_loss_box_n, pos_inds), [-1])
46         # rpn_cross_entropy_n = tf.concat(0, (rpn_cross_entropy_n_pos, rpn_cross_entropy_n_neg))
47
48     # rpn_loss_box = 1 * tf.reduce_mean(rpn_loss_box_n)
49     # mean RPN regression loss
50     rpn_loss_box = tf.reduce_sum(rpn_loss_box_n) / (tf.reduce_sum(tf.cast(fg_keep, tf.float32)) + 1.0)
51
52     # R-CNN
53     # classification loss; note that roi-data bundles 5 outputs (rois plus the 4 used below), indexed like rpn-data!
54     cls_score = self.get_output('cls_score')  # (R, C+1)
55     label = tf.reshape(self.get_output('roi-data')[1], [-1])  # (R)
56     # cross-entropy loss over the R proposals
57     cross_entropy_n = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=cls_score, labels=label)
58
59     # bounding box regression L1 loss
60     bbox_pred = self.get_output('bbox_pred')  # (R, (C+1)x4)
61     bbox_targets = self.get_output('roi-data')[2]  # (R, (C+1)x4)
62     # each element is {0, 1}, represents background (0), objects (1)
63     bbox_inside_weights = self.get_output('roi-data')[3]  # (R, (C+1)x4)
64     bbox_outside_weights = self.get_output('roi-data')[4]  # (R, (C+1)x4)
65
66     loss_box_n = tf.reduce_sum( \
67         bbox_outside_weights * self.smooth_l1_dist(bbox_inside_weights * (bbox_pred - bbox_targets)), \
68         axis=[1])
69
70
71     # loss_n = loss_box_n + cross_entropy_n
72     # loss_n = tf.reshape(loss_n, [-1])
73
74     # if ohem:
75     #     # top_k = 100
76     #     top_k = tf.minimum(tf.shape(loss_n)[0] / 2, 500)
77     #     loss_n, top_k_indices = tf.nn.top_k(loss_n, k=top_k, sorted=False)
78     #     loss_box_n = tf.gather(loss_box_n, top_k_indices)
79     #     cross_entropy_n = tf.gather(cross_entropy_n, top_k_indices)
80
81     # mean RCNN box loss
82     loss_box = tf.reduce_mean(loss_box_n)
83     # mean RCNN classification loss
84     cross_entropy = tf.reduce_mean(cross_entropy_n)
85
86     # total loss of the whole network (RPN + RCNN)
87     loss = cross_entropy + loss_box + rpn_cross_entropy + rpn_loss_box
88
89     # effect of the regularizer?
90     # add regularizer
91     if cfg.TRAIN.WEIGHT_DECAY > 0:  # 0.0005
92         regularization_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
93         loss = tf.add_n(regularization_losses) + loss
94     # respectively: total loss, mean RCNN cls loss, mean RCNN box loss, mean RPN cls loss, mean RPN box loss
95     return loss, cross_entropy, loss_box, rpn_cross_entropy, rpn_loss_box
6. Miscellaneous
How is data fed in through sess.run? What mechanism connects the feed_dict keys to the corresponding members of the VGGnet_test class (which only defines placeholders)?
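A minimal sketch of that mechanism (names are illustrative, not the repo's): the placeholder objects themselves are the feed_dict keys, and sess.run substitutes each supplied numpy array for its placeholder when evaluating the fetched tensors.

import numpy as np
import tensorflow as tf

data = tf.placeholder(tf.float32, shape=[None, None, None, 3], name='data')
mean = tf.reduce_mean(data)   # stands in for the network built on top of the placeholder

with tf.Session() as sess:
    im = np.zeros((1, 600, 800, 3), np.float32)
    print(sess.run(mean, feed_dict={data: im}))   # 0.0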