TensorFlow Faster R-CNN Source Code Walkthrough (TFFRCNN) (02) test.py (without RPN) (including complex ndarray slicing and more)

This blog is part of a series of code notes on the CharlesShang/TFFRCNN source on GitHub.


 

1. When the rois are not obtained through the RPN, the following changes are needed:

demo() in demo.py must be called with a boxes argument

im_detect() must be called with a boxes argument, otherwise boxes defaults to None

cfg.TEST.HAS_RPN must be changed to False

In _get_blobs(im, boxes), change cfg.TEST.SCALES_BASE in the call _get_rois_blob(rois, cfg.TEST.SCALES_BASE) to im_scale_factors

Modify the network file VGGnet_test.py: change self.im_info to self.rois and its placeholder shape to [None, 5], change every 'im_info' in the self.layers dict to 'rois', and delete the RPN-related layers from the network.

import tensorflow as tf
from .network import Network
from ..fast_rcnn.config import cfg


class VGGnet_test(Network):
    def __init__(self, trainable=True):
        self.inputs = []
        self.data = tf.placeholder(tf.float32, shape=[None, None, None, 3])
        self.rois = tf.placeholder(tf.float32, shape=[None, 5])
        self.keep_prob = tf.placeholder(tf.float32)
        self.layers = dict({'data': self.data, 'rois': self.rois})
        self.trainable = trainable
        self.setup()

    def setup(self):
        # n_classes = 21
        n_classes = cfg.NCLASSES
        # anchor_scales = [8, 16, 32]
        anchor_scales = cfg.ANCHOR_SCALES
        _feat_stride = [16, ]

        (self.feed('data')
         .conv(3, 3, 64, 1, 1, name='conv1_1', trainable=False)
         .conv(3, 3, 64, 1, 1, name='conv1_2', trainable=False)
         .max_pool(2, 2, 2, 2, padding='VALID', name='pool1')
         .conv(3, 3, 128, 1, 1, name='conv2_1', trainable=False)
         .conv(3, 3, 128, 1, 1, name='conv2_2', trainable=False)
         .max_pool(2, 2, 2, 2, padding='VALID', name='pool2')
         .conv(3, 3, 256, 1, 1, name='conv3_1')
         .conv(3, 3, 256, 1, 1, name='conv3_2')
         .conv(3, 3, 256, 1, 1, name='conv3_3')
         .max_pool(2, 2, 2, 2, padding='VALID', name='pool3')
         .conv(3, 3, 512, 1, 1, name='conv4_1')
         .conv(3, 3, 512, 1, 1, name='conv4_2')
         .conv(3, 3, 512, 1, 1, name='conv4_3')
         .max_pool(2, 2, 2, 2, padding='VALID', name='pool4')
         .conv(3, 3, 512, 1, 1, name='conv5_1')
         .conv(3, 3, 512, 1, 1, name='conv5_2')
         .conv(3, 3, 512, 1, 1, name='conv5_3'))

        # (self.feed('conv5_3')
        #  .conv(3, 3, 512, 1, 1, name='rpn_conv/3x3')
        #  .conv(1, 1, len(anchor_scales) * 3 * 2, 1, 1, padding='VALID', relu=False, name='rpn_cls_score'))
        #
        # (self.feed('rpn_conv/3x3')
        #  .conv(1, 1, len(anchor_scales) * 3 * 4, 1, 1, padding='VALID', relu=False, name='rpn_bbox_pred'))
        #
        # #  shape is (1, H, W, Ax2) -> (1, H, WxA, 2)
        # (self.feed('rpn_cls_score')
        #  .spatial_reshape_layer(2, name='rpn_cls_score_reshape')
        #  .spatial_softmax(name='rpn_cls_prob'))
        #
        # # shape is (1, H, WxA, 2) -> (1, H, W, Ax2)
        # (self.feed('rpn_cls_prob')
        #  .spatial_reshape_layer(len(anchor_scales) * 3 * 2, name='rpn_cls_prob_reshape'))
        #
        # (self.feed('rpn_cls_prob_reshape', 'rpn_bbox_pred', 'im_info')
        #  .proposal_layer(_feat_stride, anchor_scales, 'TEST', name='rois'))

        (self.feed('conv5_3', 'rois')
         .roi_pool(7, 7, 1.0 / 16, name='pool_5')
         .fc(4096, name='fc6')
         .fc(4096, name='fc7')
         .fc(n_classes, relu=False, name='cls_score')
         .softmax(name='cls_prob'))

        (self.feed('fc7')
         .fc(n_classes * 4, relu=False, name='bbox_pred'))

2. Execution order of im_detect(sess, net, im, boxes=None) in test.py (the parameter im must be in BGR order); it is called by test_net(sess, net, imdb, weights_filename, max_per_image=300, thresh=0.05, vis=False)

def im_detect(sess, net, im, boxes=None):
    """Detect object classes in an image given object proposals.
    Arguments:
        net (caffe.Net): Fast R-CNN network to use
        im (ndarray): color image to test (in BGR order)
        boxes (ndarray): R x 4 array of object proposals
    Returns:
        scores (ndarray): R x K array of object class scores (K includes
            background as object category 0)
        boxes (ndarray): R x (4*K) array of predicted bounding boxes
    """

_get_blobs(im, boxes) builds the blobs data dict (with data and rois fields, both already scaled) and the array of image scale factors im_scales (without an image pyramid, the default, the array holds a single element). im_detect then deduplicates blobs['rois'] (identifying proposals that map to the same position on the backbone feature map), updating both blobs['rois'] (scaled by the scale factor, R2 x 5, with the first column being the all-zero level) and boxes (the originally passed-in boxes, unscaled, R2 x 4)

---> Forward pass: build the feed_dict (net.data gets blobs['data'], net.rois gets blobs['rois'], net.keep_prob is 1.0, a dropout-related value; these correspond to the inputs of the network file VGGnet_test.py) and run the session with it

---> Call net.get_output() (network.py) to fetch cls_score (the score used when cfg.TEST.SVM is True, i.e. without softmax), cls_prob (with cfg.TEST.SVM False by default, the final softmaxed score), bbox_pred (the box regression deltas), and rois, with shapes R2 x n_classes, R2 x n_classes, R2 x (4*n_classes), and R2 x 5 respectively (R2 <= R), matching the corresponding layer outputs in VGGnet_test.py. Since no RPN is used, the rois fetched here are identical to the rois fed into the network

---> With cfg.TEST.BBOX_REG True by default, call bbox_transform_inv(boxes, box_deltas) (in bbox_transform.py) to regress boxes into pred_boxes (a sketch of this transform follows at the end of this section)

---> Call _clip_boxes(pred_boxes, im.shape) to clip out-of-bounds (regressed) pred_boxes to the image boundary, yielding the updated pred_boxes

---> Update scores and pred_boxes and return them: scores goes from R2 x n_classes back to R x n_classes, and pred_boxes from R2 x (4*n_classes) back to R x (4*n_classes) (R is the number of boxes originally passed into im_detect(...)). This means original boxes that map to the same feature-map position are regressed into the same box with the same score, so the returned scores and pred_boxes contain duplicate rows (still in the original box order; see np.unique() below). I do not understand why it was designed this way; returning only the unique ones seems better.

    # When mapping from image ROIs to feature map ROIs, there's some aliasing
    # (some distinct image ROIs get mapped to the same feature ROI).
    # Here, we identify duplicate feature ROIs, so we only compute features
    # on the unique subset.
    if cfg.DEDUP_BOXES > 0 and not cfg.TEST.HAS_RPN:   # cfg.DEDUP_BOXES defaults to 1/16
        v = np.array([1, 1e3, 1e6, 1e9, 1e12])
        hashes = np.round(blobs['rois'] * cfg.DEDUP_BOXES).dot(v)
        _, index, inv_index = np.unique(hashes, return_index=True,
                                        return_inverse=True)
        blobs['rois'] = blobs['rois'][index, :]
        boxes = boxes[index, :]
    cls_score, cls_prob, bbox_pred, rois = \
        sess.run([net.get_output('cls_score'), net.get_output('cls_prob'), net.get_output('bbox_pred'),net.get_output('rois')],\
                 feed_dict=feed_dict)
    if cfg.DEDUP_BOXES > 0 and not cfg.TEST.HAS_RPN:
        # Map scores and predictions back to the original set of boxes
        scores = scores[inv_index, :]
        pred_boxes = pred_boxes[inv_index, :]
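To make the hashing concrete: two distinct image rois that fall into the same 16x16 feature-map cell after scaling and rounding get identical hashes, which np.unique then collapses. A small demo with made-up roi values (the first column is the pyramid level):

# -*- coding:utf-8 -*-
# a quick demo of the dedup hash (roi values are made up)

import numpy as np

rois = np.array([
    [0, 33, 47, 65, 81],
    [0, 35, 44, 70, 78]
], dtype=np.float32)
v = np.array([1, 1e3, 1e6, 1e9, 1e12])
hashes = np.round(rois * 0.0625).dot(v)  # 0.0625 = 1/16 = cfg.DEDUP_BOXES
# both entries are identical -> duplicate rois on the feature map
print(hashes)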
# -*- coding:utf-8 -*-
# Author: WUJiang
# a quick test of np.unique

import numpy as np

hashes = np.array([
     [4],
     [1],
     [6],
     [7],
     [1]
    ])
_, index, inv_index = np.unique(hashes, return_index=True,
                                return_inverse=True)

print(inv_index)
# _          [1 4 6 7]     <class 'numpy.ndarray'>
# index      [1 0 2 3]     <class 'numpy.ndarray'>
# inv_index  [1 0 2 3 0]   <class 'numpy.ndarray'>

_ is the deduplicated, sorted array; index gives, for each element of _, the index of its first occurrence in the original array; inv_index gives, for each element of the original array, its index in _.
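For the box-regression step referenced above, here is a minimal NumPy sketch of what bbox_transform_inv(boxes, deltas) computes, following the standard Fast R-CNN parameterization (a paraphrase of bbox_transform.py, not a verbatim quote):

# -*- coding:utf-8 -*-
# a sketch of bbox_transform_inv: apply regression deltas to boxes

import numpy as np

def bbox_transform_inv_sketch(boxes, deltas):
    # boxes: R x 4 (x1, y1, x2, y2); deltas: R x (4*K) regression targets
    widths = boxes[:, 2] - boxes[:, 0] + 1.0
    heights = boxes[:, 3] - boxes[:, 1] + 1.0
    ctr_x = boxes[:, 0] + 0.5 * widths
    ctr_y = boxes[:, 1] + 0.5 * heights

    dx, dy = deltas[:, 0::4], deltas[:, 1::4]
    dw, dh = deltas[:, 2::4], deltas[:, 3::4]

    # shift the center by (dx*w, dy*h), scale the size by exp(dw), exp(dh)
    pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis]
    pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis]
    pred_w = np.exp(dw) * widths[:, np.newaxis]
    pred_h = np.exp(dh) * heights[:, np.newaxis]

    pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype)
    pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w  # x1
    pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h  # y1
    pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w  # x2
    pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h  # y2
    return pred_boxes

boxes = np.array([[10., 10., 49., 29.]])
deltas = np.array([[0., 0., 0., 0.]])
# [[10. 10. 50. 30.]]  (the +1 width convention shifts x2/y2 by one)
print(bbox_transform_inv_sketch(boxes, deltas))

Note the 0::4 / 1::4 / 2::4 / 3::4 slicing: one column per class for each of the four coordinates; the same pattern appears again in _clip_boxes below.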

3. Execution logic of _get_blobs(im, rois) (rois is the passed-in boxes); called by im_detect(sess, net, im, boxes=None)

The function calls _get_image_blob(im) to obtain blobs['data'] (shape [None, None, None, 3], matching the data placeholder in VGGnet_test.py) and the scale-factor array im_scale_factors (i.e. im_scales), then calls _get_rois_blob(rois, im_scale_factors) to obtain blobs['rois'] (R x 5) scaled by the image scale factor, and finally returns blobs and im_scale_factors. Both helpers are described below:

def _get_image_blob(im):
    """Converts an image into a network input.
    Arguments:
        im (ndarray): a color image in BGR order
    Returns:
        blob (ndarray): a data blob holding an image pyramid
        im_scale_factors (list): list of image scales (relative to im) used
            in the image pyramid  # this docstring is wrong: it is an ndarray, not a list
    """

im ---> subtract the pixel means (image normalization) ---> choose the scale from the long side (1000) / short side (600) (see the sketch below) ---> resize the image with bilinear interpolation: im = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale, interpolation=cv2.INTER_LINEAR) --->

(In blob.py) blob = im_list_to_blob(processed_ims) converts the (pyramid-processed) image list into the blob ndarray (i.e. blobs['data'], shape [len(ims), max_shape[0], max_shape[1], 3]; the first dimension is the number of pyramid images for this image, 1 by default since no image pyramid is used, and the remaining three dimensions are the image height, width, and channels), then returns blob and np.array(im_scale_factors), the scale-factor array (without an image pyramid, processed_ims and im_scale_factors each hold a single element)
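As referenced above, a minimal sketch of how im_scale is chosen (assuming the standard scheme: scale the short side to 600 and cap the long side at 1000; the function name is illustrative):

# -*- coding:utf-8 -*-
# a sketch of the scale-factor computation in _get_image_blob

import numpy as np

def compute_im_scale(im_shape, target_size=600, max_size=1000):
    # scale the short side to target_size, but cap the long side at max_size
    im_size_min = np.min(im_shape[0:2])
    im_size_max = np.max(im_shape[0:2])
    im_scale = float(target_size) / float(im_size_min)
    if np.round(im_scale * im_size_max) > max_size:
        im_scale = float(max_size) / float(im_size_max)
    return im_scale

print(compute_im_scale((375, 500)))   # 1.6 (600 / 375)
print(compute_im_scale((375, 1242)))  # ~0.805 (1000 / 1242, the long-side cap kicks in)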


def _get_rois_blob(im_rois, im_scale_factors):
    """Converts RoIs into network inputs.
    Arguments:
        im_rois (ndarray): R x 4 matrix of RoIs in original image coordinates
        im_scale_factors (list): scale factors as returned by _get_image_blob   # this docstring is wrong: it is an ndarray, not a list
    Returns:
        blob (ndarray): R x 5 matrix of RoIs in the image pyramid
    """
# im_list_to_blob() in blob.py
def im_list_to_blob(ims):
    # build the network blob input from a list of images
    """Convert a list of images into a network input.
    Assumes images are already prepared (means subtracted, BGR order, ...).
    """
    max_shape = np.array([im.shape for im in ims]).max(axis=0)  # the element-wise maximal image shape: max_shape[0], max_shape[1]
    num_images = len(ims)
    # build the blob input with shape (num_images, max_shape[0], max_shape[1], 3)
    blob = np.zeros((num_images, max_shape[0], max_shape[1], 3),
                    dtype=np.float32)
    for i in xrange(num_images):
        im = ims[i]
        blob[i, 0:im.shape[0], 0:im.shape[1], :] = im
    return blob

_get_rois_blob(...) calls _project_im_rois(im_rois, im_scale_factors) to get rois and levels, then concatenates levels and rois into rois_blob (i.e. blobs['rois'], shape R x 5 where 5 = 1 level + 4 coordinates; without an image pyramid all levels are 0), as sketched below.
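A minimal sketch of that concatenation (assuming, as in the Fast R-CNN convention, that _project_im_rois returns levels as an R x 1 column):

# -*- coding:utf-8 -*-
# a sketch of building rois_blob from levels and scaled rois

import numpy as np

rois = np.array([
    [0.6, 1.2, 2.4, 3.0],
    [1.2, 1.8, 4.2, 4.8]
])                                     # scaled R x 4 rois
levels = np.zeros((rois.shape[0], 1))  # all 0 without an image pyramid

rois_blob = np.hstack((levels, rois)).astype(np.float32)  # R x 5: (level, x1, y1, x2, y2)
"""
[[0.  0.6 1.2 2.4 3. ]
 [0.  1.2 1.8 4.2 4.8]]
"""
print(rois_blob)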

def _project_im_rois(im_rois, scales):
    """Project image RoIs into the image pyramid built by _get_image_blob.
    Arguments:
        im_rois (ndarray): R x 4 matrix of RoIs in original image coordinates
        scales (list): scale factors as returned by _get_image_blob  # this docstring is wrong: it is an ndarray, not a list
    Returns:
        rois (ndarray): R x 4 matrix of projected RoI coordinates
        levels (list): image pyramid levels used by each projected RoI  # this docstring is wrong: it is an ndarray, not a list
    """

rois = im_rois * scales[levels] scales the rois by the scale factor; rois and levels are then returned

# -*- coding:utf-8 -*-
# Author: WUJiang
# a quick test of the roi scaling

import numpy as np

im_rois = np.array([
    [1, 2, 4, 5],
    [2, 3, 7, 8]
])
scale = np.array([0.6])
levels = np.array([
    [0],
    [0]
])
"""
[[0.6]
 [0.6]]
"""
print(scale[levels])
rois = im_rois * scale[levels]
"""
[[0.6 1.2 2.4 3. ]
 [1.2 1.8 4.2 4.8]]
"""
print(rois)

4. Execution logic of test_net(sess, net, imdb, weights_filename, max_per_image=300 i.e. keep the top-300 proposals, thresh=0.05 i.e. the score threshold, vis=False); called by test_net.py

def test_net(sess, net, imdb, weights_filename , max_per_image=300, thresh=0.05, vis=False): # 被test_net.py调用
    # weights_filename is the model file name without its extension, e.g. vggnet...70000
    """Test a Fast R-CNN network on an image database."""

Build the all_boxes list, where all_boxes[cls_ind (class index)][im_id (image index)] holds the R x 5 detections of that class in that image, i.e. R x (x1, y1, x2, y2, score) (the dataset has num_images images, num_images = len(imdb.image_index))

---> Call get_output_dir(imdb, weights_filename) (in config.py) to get the partial absolute path of the dataset's detection-result pkl (detections.pkl still has to be appended)

---> Build the timer dict _t (with im_detect and misc fields, each holding a Timer object)

---> (Without RPN) roidb = imdb.roidb (its exact shape and content are unclear; presumably it stores the roi information of each image, where the rois include both detections and ground truth) ---> build the full path of the result pkl file det_file (e.g. output/faster_rcnn_voc_vgg/voc_2007_test/VGGnet_fast_rcnn_iter_6020/detections.pkl)

---> Loop i over every image: box_proposals = roidb[i]['boxes'][roidb[i]['gt_classes'] == 0] drops the gt rois so that only the non-gt (i.e. actually detected) proposals are evaluated (the rois may come from the training or val split). I first read gt_classes here not as the class of the matching gt roi but as a flag for whether the roi is a gt roi (1 meaning gt?); that reading is wrong, see the meaning of the gt_classes field in pascal_voc.py.

    for i in xrange(num_images):
        # filter out any ground truth boxes
        if cfg.TEST.HAS_RPN:
            box_proposals = None
        else:
            # The roidb may contain ground-truth rois (for example, if the roidb
            # comes from the training or val split). We only want to evaluate
            # detection on the *non*-ground-truth rois. We select those the rois
            # that have the gt_classes field set to 0, which means there's no
            # ground truth.
            box_proposals = roidb[i]['boxes'][roidb[i]['gt_classes'] == 0]

        im = cv2.imread(imdb.image_path_at(i))
        _t['im_detect'].tic()
        scores, boxes = im_detect(sess, net, im, box_proposals)
        detect_time = _t['im_detect'].toc(average=False)

---> Read the image im ---> call im_detect(sess, net, im, box_proposals) to get scores and boxes; subtracting the before/after timestamps gives the elapsed time detect_time

---> For each image, loop j over the classes from 1 to imdb.num_classes (0 is background); take the boxes of the class whose scores exceed the thresh threshold and stack them into cls_dets (None, 5), run per-class NMS (nms is a C-compiled .so), update cls_dets, and store cls_dets into its slot all_boxes[j][i] (a sketch of this loop follows after the all_boxes demo below)

---> Each image keeps at most max_per_image (300 by default, i.e. top-300) highest-scoring boxes across all classes, updating all_boxes[j][i]; everything after im_detect() is counted towards the NMS time nms_time (i.e. _t['misc']) ---> print the test information

---> After the two loops finish, write the all_boxes detections into det_file with cPickle ---> call imdb.evaluate_detections(all_boxes, output_dir) (in lib/datasets/pascal_voc.py) to evaluate detection accuracy (e.g. compute AP) and save the per-class detection results

# -*- coding:utf-8 -*-
# Author: WUJiang
# a quick test of the all_boxes structure

# all_boxes[cls][image] = N x 5 array of detections in (x1, y1, x2, y2, score)
all_boxes = [[[] for _ in range(5)]  # num_images
             for _ in range(4)]  # num_classes
"""
[
 [[], [], [], [], []],
 [[], [], [], [], []],
 [[], [], [], [], []],
 [[], [], [], [], []]
]
"""
print(all_boxes)
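As referenced above, a sketch of the per-class threshold + NMS loop inside test_net (modeled on the standard Fast R-CNN test loop; it assumes the repo context, i.e. nms, cfg, imdb, and the scores/boxes returned by im_detect for image i, so treat it as an illustration rather than a verbatim quote):

# inside the loop over images, after scores, boxes = im_detect(...)
for j in xrange(1, imdb.num_classes):          # class 0 is background
    inds = np.where(scores[:, j] > thresh)[0]  # keep boxes scoring above thresh
    cls_scores = scores[inds, j]
    cls_boxes = boxes[inds, j*4:(j+1)*4]       # the 4 coordinate columns of class j
    cls_dets = np.hstack((cls_boxes, cls_scores[:, np.newaxis])) \
        .astype(np.float32, copy=False)        # (None, 5): x1, y1, x2, y2, score
    keep = nms(cls_dets, cfg.TEST.NMS)         # per-class NMS
    all_boxes[j][i] = cls_dets[keep, :]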

5. Other functions

_clip_boxes(boxes, im_shape)

def _clip_boxes(boxes, im_shape):
    """Clip boxes to image boundaries."""
    # x1 >= 0
    boxes[:, 0::4] = np.maximum(boxes[:, 0::4], 0) 
    # y1 >= 0
    boxes[:, 1::4] = np.maximum(boxes[:, 1::4], 0)
    # x2 < im_shape[1]
    boxes[:, 2::4] = np.minimum(boxes[:, 2::4], im_shape[1] - 1)
    # y2 < im_shape[0]
    boxes[:, 3::4] = np.minimum(boxes[:, 3::4], im_shape[0] - 1)
    return boxes
# -*- coding:utf-8 -*-
# Author: WUJiang
# a quick test of the 0::4 slicing

import numpy as np

boxes = np.array([
    [1, 2, 5, 7],
    [2, 4, 8, 9]
                 ]
)
"""
[[1]
 [2]]
"""
print(boxes[:, 0::4])
# [1 2]
print(boxes[:, 0])
# -*- coding:utf-8 -*-
# Author: WUJiang
# another 0::4 slicing test
import numpy as np

a = np.array([
    [0, 1, 2, 3, 4, 5, 6, 7],
    [1, 2, 3, 4, 5, 6, 7, 8],
    [2, 3, 4, 5, 6, 7, 8, 9]
])
"""
[[0 4]
 [1 5]
 [2 6]]
"""
b = a[:, 0::4]   # stride of 4 along the second axis
print(b)

_rescale_boxes(boxes, inds, scales)  (never called in this codebase)

def _rescale_boxes(boxes, inds, scales):
    """Rescale boxes according to image rescaling."""
    for i in xrange(boxes.shape[0]):  # xrange does not materialize the sequence; like a generator it yields one value at a time
        boxes[i,:] = boxes[i,:] / scales[int(inds[i])]
    return boxes

vis_detections(im, class_name, dets, thresh=0.8) visualizes the detections by drawing the box rectangles (demo.py reimplements it; I did not quite get why 10 is compared against dets.shape[0] here, presumably just to draw at most the first 10 detections)

Called by test_net(sess, net, imdb, weights_filename, max_per_image=300, thresh=0.05, vis=False)

def vis_detections(im, class_name, dets, thresh=0.8):
    """Visual debugging of detections."""
    import matplotlib.pyplot as plt 
    #im = im[:, :, (2, 1, 0)]
    for i in xrange(np.minimum(10, dets.shape[0])):   # dets is bbox coordinates concatenated with the score; at most 10 boxes are drawn
        bbox = dets[i, :4] 
        score = dets[i, -1] 
        if score > thresh:
            #plt.cla()
            #plt.imshow(im)
            plt.gca().add_patch(
                plt.Rectangle((bbox[0], bbox[1]),
                              bbox[2] - bbox[0],
                              bbox[3] - bbox[1], fill=False,
                              edgecolor='g', linewidth=3)
                )
            plt.gca().text(bbox[0], bbox[1] - 2,
                 '{:s} {:.3f}'.format(class_name, score),
                 bbox=dict(facecolor='blue', alpha=0.5),
                 fontsize=14, color='white')

            plt.title('{}  {:.3f}'.format(class_name, score))
    #plt.show()

apply_nms(all_boxes, thresh)  (never called in this codebase)

def apply_nms(all_boxes, thresh):
    """Apply non-maximum suppression to all predicted boxes output by the
    test_net method.
    """
    num_classes = len(all_boxes)
    num_images = len(all_boxes[0])
    nms_boxes = [[[] for _ in xrange(num_images)]
                 for _ in xrange(num_classes)]
    for cls_ind in xrange(num_classes):
        for im_ind in xrange(num_images):
            dets = all_boxes[cls_ind][im_ind]
            if dets == []:   # note: fragile when dets is an ndarray; len(dets) == 0 would be safer
                continue

            x1 = dets[:, 0]
            y1 = dets[:, 1]
            x2 = dets[:, 2]
            y2 = dets[:, 3]
            scores = dets[:, 4]
            inds = np.where((x2 > x1) & (y2 > y1) & (scores > cfg.TEST.DET_THRESHOLD))[0]
            dets = dets[inds,:]
            if dets == []:
                continue

            keep = nms(dets, thresh)  # the core step
            if len(keep) == 0:
                continue
            nms_boxes[cls_ind][im_ind] = dets[keep, :].copy()
    return nms_boxes

For the all_boxes argument, see the test_net(sess, net, imdb, weights_filename, max_per_image, thresh, vis) function above.
