Annotated pytorch_ssd source code

box_utils.py

# -*- coding: utf-8 -*-
import torch


def point_form(boxes):
    """ Convert prior_boxes to (xmin, ymin, xmax, ymax)
    representation for comparison to point form ground truth data.
    Args:
        boxes: (tensor) center-size default boxes from priorbox layers.
    Return:
        boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes.
    """
    return torch.cat((boxes[:, :2] - boxes[:, 2:]/2,     # xmin, ymin
                     boxes[:, :2] + boxes[:, 2:]/2), 1)  # xmax, ymax
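
# Quick sanity check (illustrative values, not part of the original file):
# _prior = torch.tensor([[0.5, 0.5, 0.2, 0.3]])     # center-size: cx, cy, w, h
# point_form(_prior)  # -> tensor([[0.4000, 0.3500, 0.6000, 0.6500]])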


def center_size(boxes):
    """ Convert prior_boxes to (cx, cy, w, h)
    representation for comparison to center-size form ground truth data.
    Args:
        boxes: (tensor) point_form boxes
    Return:
        boxes: (tensor) Converted (cx, cy, w, h) form of boxes.
    """
    return torch.cat(((boxes[:, 2:] + boxes[:, :2])/2,  # cx, cy
                      boxes[:, 2:] - boxes[:, :2]), 1)  # w, h
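
# center_size inverts point_form (illustrative check):
# _b = torch.tensor([[0.4, 0.35, 0.6, 0.65]])       # xmin, ymin, xmax, ymax
# center_size(_b)  # -> tensor([[0.5000, 0.5000, 0.2000, 0.3000]])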


def intersect(box_a, box_b):
    """ We resize both tensors to [A,B,2] without new malloc:
    [A,2] -> [A,1,2] -> [A,B,2]
    [B,2] -> [1,B,2] -> [A,B,2]
    Then we compute the area of intersect between box_a and box_b.
    Args:
      box_a: (tensor) bounding boxes, Shape: [A,4].
      box_b: (tensor) bounding boxes, Shape: [B,4].
    Return:
      (tensor) intersection area, Shape: [A,B].
    """
    A = box_a.size(0)
    B = box_b.size(0)

    max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2),
                       box_b[:, 2:].unsqueeze(0).expand(A, B, 2))
    min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2),
                       box_b[:, :2].unsqueeze(0).expand(A, B, 2))

    inter = torch.clamp((max_xy - min_xy), min=0)
    return inter[:, :, 0] * inter[:, :, 1]
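
# Example of the broadcasted intersection (illustrative values):
# _a = torch.tensor([[0., 0., 2., 2.]])                     # A = 1 box
# _b = torch.tensor([[1., 1., 3., 3.], [4., 4., 5., 5.]])   # B = 2 boxes
# intersect(_a, _b)  # -> tensor([[1., 0.]]): 1x1 overlap with _b[0], none with _b[1]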


def jaccard(box_a, box_b):
    """Compute the jaccard overlap of two sets of boxes.  The jaccard overlap
    is simply the intersection over union of two boxes.  Here we operate on
    ground truth boxes and default boxes.
    E.g.:
        A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)
    Args:
        box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4]
        box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4]
    Return:
        jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)]
    """
    inter = intersect(box_a, box_b)
    area_a = ((box_a[:, 2]-box_a[:, 0]) *
              (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter)  # [A,B]
    area_b = ((box_b[:, 2]-box_b[:, 0]) *
              (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter)  # [A,B]
    union = area_a + area_b - inter
    return inter / union  # [A,B]
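
# With _a and _b from the intersect example above (areas 4, 4 and 1; inter 1 and 0):
# jaccard(_a, _b)  # -> tensor([[0.1429, 0.0000]]), since 1 / (4 + 4 - 1) = 1/7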

# match(self.threshold, truths, defaults, self.variance, labels,
#                   loc_t, conf_t, idx)

def match(threshold, truths, priors, variances, labels, loc_t, conf_t, idx):
    """Match each prior box with the ground truth box of the highest jaccard
    overlap, encode the bounding boxes, then return the matched indices
    corresponding to both confidence and location preds.
    Args:
        threshold: (float) The overlap threshold used when matching boxes.
        truths: (tensor) Ground truth boxes, Shape: [num_obj, 4].
        priors: (tensor) Prior boxes from priorbox layers, Shape: [n_priors,4].
        variances: (list[float]) Variances corresponding to the prior box coords,
            e.g. [0.1, 0.2].
        labels: (tensor) All the class labels for the image, Shape: [num_obj].
        loc_t: (tensor) Tensor to be filled w/ encoded location targets.
        conf_t: (tensor) Tensor to be filled w/ matched indices for conf preds.
        idx: (int) current batch index
    Return:
        The matched indices corresponding to 1)location and 2)confidence preds.
    """
    # jaccard index: truths [num_obj, 4], point_form(priors) [8732, 4]
    overlaps = jaccard(
        truths,
        point_form(priors)
    )  # overlaps [num_obj, 8732]
    # (Bipartite Matching)
    # best prior for each ground truth: both [num_obj, 1]
    best_prior_overlap, best_prior_idx = overlaps.max(1, keepdim=True)
    # best ground truth for each prior: both [1, 8732]
    best_truth_overlap, best_truth_idx = overlaps.max(0, keepdim=True)
    best_truth_idx.squeeze_(0)      # [8732] which truth each prior overlaps most
    best_truth_overlap.squeeze_(0)  # [8732] the value of that best IoU, per prior
    best_prior_idx.squeeze_(1)      # [num_obj] which prior each truth overlaps most
    best_prior_overlap.squeeze_(1)  # [num_obj] the value of that best IoU, per truth
    best_truth_overlap.index_fill_(0, best_prior_idx, 2)  # ensure best prior
    # TODO refactor: index  best_prior_idx with long tensor
    # ensure every gt matches with its prior of max overlap
    for j in range(best_prior_idx.size(0)):
        best_truth_idx[best_prior_idx[j]] = j
    matches = truths[best_truth_idx]          # [num_priors,4]: the gt box assigned to each prior
    conf = labels[best_truth_idx] + 1         # [num_priors]: its class label; +1 shifts past background
    conf[best_truth_overlap < threshold] = 0  # label as background
    loc = encode(matches, priors, variances)
    loc_t[idx] = loc    # [num_priors,4] encoded offsets to learn
    conf_t[idx] = conf  # [num_priors] top class label for each prior
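
# A minimal sketch of match() on toy data (hypothetical values; num_obj=1, two priors):
# _truths = torch.tensor([[0.1, 0.1, 0.4, 0.4]])
# _labels = torch.tensor([0])                        # class index before the +1 shift
# _priors = torch.tensor([[0.25, 0.25, 0.3, 0.3],    # center-size; equals the gt box
#                         [0.70, 0.70, 0.2, 0.2]])   # far from the gt box
# _loc_t, _conf_t = torch.zeros(1, 2, 4), torch.zeros(1, 2).long()
# match(0.5, _truths, _priors, [0.1, 0.2], _labels, _loc_t, _conf_t, 0)
# _conf_t[0]  # -> tensor([1, 0]): prior 0 positive for class 0 (+1 shift), prior 1 background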


def encode(matched, priors, variances):
    """Encode the variances from the priorbox layers into the ground truth boxes
    we have matched (based on jaccard overlap) with the prior boxes.
    Args:
        matched: (tensor) Coords of ground truth for each prior in point-form
            Shape: [num_priors, 4].
        priors: (tensor) Prior boxes in center-offset form
            Shape: [num_priors,4].
        variances: (list[float]) Variances of priorboxes
    Return:
        encoded boxes (tensor), Shape: [num_priors, 4]
    """

    # dist b/t match center and prior's center
    g_cxcy = (matched[:, :2] + matched[:, 2:])/2 - priors[:, :2]
    # encode variance
    g_cxcy /= (variances[0] * priors[:, 2:])
    # match wh / prior wh
    g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:]
    g_wh = torch.log(g_wh) / variances[1]
    # return target for smooth_l1_loss
    return torch.cat([g_cxcy, g_wh], 1)  # [num_priors,4]


# Adapted from https://github.com/Hakuyume/chainer-ssd
def decode(loc, priors, variances):
    """Decode locations from predictions using priors to undo
    the encoding we did for offset regression at train time.
    Args:
        loc (tensor): location predictions for loc layers,
            Shape: [num_priors,4]
        priors (tensor): Prior boxes in center-offset form.
            Shape: [num_priors,4].
        variances: (list[float]) Variances of priorboxes
    Return:
        decoded bounding box predictions
    """

    boxes = torch.cat((
        priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:],
        priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1)
    boxes[:, :2] -= boxes[:, 2:] / 2
    boxes[:, 2:] += boxes[:, :2]
    return boxes
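
# encode and decode invert each other (illustrative check):
# _matched = torch.tensor([[0.1, 0.1, 0.4, 0.4]])   # ground truth, point form
# _priors  = torch.tensor([[0.3, 0.3, 0.2, 0.2]])   # prior, center-size form
# _v = [0.1, 0.2]
# decode(encode(_matched, _priors, _v), _priors, _v)  # -> tensor([[0.1000, 0.1000, 0.4000, 0.4000]])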


def log_sum_exp(x):
    """Utility function for computing log_sum_exp while determining
    This will be used to determine unaveraged confidence loss across
    all examples in a batch.
    Args:
        x (Variable(tensor)): conf_preds from conf layers
    """
    x_max = x.data.max()

    bb = torch.exp(x-x_max) #[26196, 21]

    cc = torch.sum(torch.exp(x-x_max), 1, keepdim=True)#[26196, 1]

    dd = torch.log(torch.sum(torch.exp(x-x_max), 1, keepdim=True))#[26196, 1]

    return torch.log(torch.sum(torch.exp(x-x_max), 1, keepdim=True)) + x_max
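
# Subtracting x_max is what keeps this numerically stable (illustrative check):
# torch.log(torch.exp(torch.tensor([[1000., 1000.]])).sum(1, keepdim=True))  # -> tensor([[inf]])
# log_sum_exp(torch.tensor([[1000., 1000.]]))  # -> tensor([[1000.6931]]) = 1000 + log(2)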


# Original author: Francisco Massa:
# https://github.com/fmassa/object-detection.torch
# Ported to PyTorch by Max deGroot (02/01/2017)

'''nms(): non-maximum suppression, run separately for each class.'''
def nms(boxes, scores, overlap=0.5, top_k=200):
    # Args: box coordinates, per-box class scores for this class,
    # the NMS IoU threshold, and the maximum number of boxes to consider.
    '''(1) Build the keep tensor: zeros, one slot per candidate box (the boxes of
    this class whose class confidence exceeded the score threshold).'''
    keep = scores.new(scores.size(0)).zero_().long()

    if boxes.numel() == 0:
        return keep

    '''(2) Compute the area of every candidate box.'''
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    area = torch.mul(x2 - x1, y2 - y1)

    '''(3) Get the indices of the top_k boxes with the highest class scores.'''
    v, idx = scores.sort(0)  # sort scores ascending; idx holds the box indices in that order
    idx = idx[-top_k:]  # indices of the top-k largest vals
    xx1 = boxes.new()
    yy1 = boxes.new()
    xx2 = boxes.new()
    yy2 = boxes.new()
    '''(4) Write the indices of the boxes that survive NMS into keep.'''
    count = 0
    while idx.numel() > 0:
        '''1. Take the box with the highest remaining score and record its index.'''
        i = idx[-1]  # index of current largest val
        keep[count] = i
        count += 1

        if idx.size(0) == 1:
            break
        '''2. Indices of the remaining boxes.'''
        idx = idx[:-1]  # remove kept element from view
        '''3. Compute the IoU between each remaining box and the kept box.'''
        # Detach everything first (added by the author); otherwise:
        # RuntimeError: index_select(): functions with out=... arguments don't support
        # automatic differentiation, but one of the arguments requires grad.
        idx = idx.detach()
        x1, y1, x2, y2 = x1.detach(), y1.detach(), x2.detach(), y2.detach()
        torch.index_select(x1, 0, idx, out=xx1)
        torch.index_select(y1, 0, idx, out=yy1)
        torch.index_select(x2, 0, idx, out=xx2)
        torch.index_select(y2, 0, idx, out=yy2)
        # clip the remaining boxes to the kept box: element-wise max of the
        # top-left corners, element-wise min of the bottom-right corners
        xx1 = torch.clamp(xx1, min=x1[i])
        yy1 = torch.clamp(yy1, min=y1[i])
        xx2 = torch.clamp(xx2, max=x2[i])
        yy2 = torch.clamp(yy2, max=y2[i])
        w = torch.clamp(xx2 - xx1, min=0.0)  # negative width/height means no overlap
        h = torch.clamp(yy2 - yy1, min=0.0)
        inter = w * h
        # IoU = i / (area(a) + area(b) - i)
        area = area.detach()
        rem_areas = torch.index_select(area, 0, idx)  # areas of the remaining boxes
        union = (rem_areas - inter) + area[i]
        IoU = inter / union
        '''4. Keep only the boxes whose IoU with the kept box is at most the NMS threshold.'''
        idx = idx[IoU.le(overlap)]
    return keep, count
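
# Example usage (illustrative values; box 1 nearly duplicates box 0):
# _boxes  = torch.tensor([[0., 0., 1., 1.], [0., 0., 1.05, 1.], [2., 2., 3., 3.]])
# _scores = torch.tensor([0.9, 0.8, 0.7])
# _keep, _count = nms(_boxes, _scores, overlap=0.5, top_k=200)
# _keep[:_count]  # -> tensor([0, 2]): box 1 is suppressed (IoU with box 0 ~ 0.95)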

multibox_loss.py

# -*- coding: utf-8 -*-
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from data import coco as cfg
from ..box_utils import match, log_sum_exp

# criterion = MultiBoxLoss(cfg['num_classes'], 0.5, True, 0, True, 3, 0.5,
#                          False, args.cuda)

class MultiBoxLoss(nn.Module):
    """SSD Weighted Loss Function
    Compute Targets:
        1) Produce Confidence Target Indices by matching  ground truth boxes
           with (default) 'priorboxes' that have jaccard index > threshold parameter
           (default threshold: 0.5).
        2) Produce localization target by 'encoding' variance into offsets of ground
           truth boxes and their matched  'priorboxes'.
        3) Hard negative mining to filter the excessive number of negative examples
           that comes with using a large number of default bounding boxes.
           (default negative:positive ratio 3:1)
    Objective Loss:
        L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N
        Where, Lconf is the CrossEntropy Loss and Lloc is the SmoothL1 Loss
        weighted by α which is set to 1 by cross val.
        Args:
            c: class confidences,
            l: predicted boxes,
            g: ground truth boxes
            N: number of matched default boxes
        See: https://arxiv.org/pdf/1512.02325.pdf for more details.
    """

    def __init__(self, num_classes, overlap_thresh, prior_for_matching,
                 bkg_label, neg_mining, neg_pos, neg_overlap, encode_target,
                 use_gpu=True):
        super(MultiBoxLoss, self).__init__()
        self.use_gpu = use_gpu
        self.num_classes = num_classes
        self.threshold = overlap_thresh
        self.background_label = bkg_label
        self.encode_target = encode_target
        self.use_prior_for_matching = prior_for_matching
        self.do_neg_mining = neg_mining
        self.negpos_ratio = neg_pos
        self.neg_overlap = neg_overlap
        self.variance = cfg['variance']

    def forward(self, predictions, targets):
        """Multibox Loss
        Args:
            predictions (tuple): A tuple containing loc preds, conf preds,
            and prior boxes from SSD net.
                conf shape: torch.size(batch_size,num_priors,num_classes)
                loc shape: torch.size(batch_size,num_priors,4)
                priors shape: torch.size(num_priors,4)

            targets (tensor): Ground truth boxes and labels for a batch,
                shape: [batch_size,num_objs,5] (last idx is the label).

                loc_data [3,8732,4]
                conf_data [3,8732,21]
                priors [8732,4]
        """
        #loc_data [b,8732,4]   conf_data[b,8732,21]  priors[8732,4]
        loc_data, conf_data, priors = predictions
        num = loc_data.size(0)#batchsize
        priors = priors[:loc_data.size(1), :]#[8732,4]
        num_priors = (priors.size(0))#8732
        num_classes = self.num_classes#21

        # match priors (default boxes) and ground truth boxes
        loc_t = torch.Tensor(num, num_priors, 4) #[b,8732,4]
        conf_t = torch.LongTensor(num, num_priors) #[b,8732]
        for idx in range(num):
            truths = targets[idx][:, :-1].data #[2, 4]
            labels = targets[idx][:, -1].data #[2]
            defaults = priors.data #[8732, 4]
            match(self.threshold, truths, defaults, self.variance, labels,
                  loc_t, conf_t, idx)
        if self.use_gpu:
            loc_t = loc_t.cuda()
            conf_t = conf_t.cuda()
        # wrap targets
        loc_t = Variable(loc_t, requires_grad=False) #[3,8732,4]
        conf_t = Variable(conf_t, requires_grad=False) #[3,8732]

        pos = conf_t > 0  # pos [3,8732] boolean mask: True where the prior is a positive
        num_pos = pos.sum(dim=1, keepdim=True)  # [3,1], e.g. [23],[4],[9] positives per image

        # Localization Loss (Smooth L1)
        # Shape: [batch,num_priors,4]  #pos.unsqueeze(pos.dim()) [3,8732,1]
        #pos_idx [3,8732,4]
        pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data)
        # boolean indexing flattens the result: loc_data[pos_idx] and loc_t[pos_idx]
        # are 1-D tensors of length (total positive priors) * 4

        loc_p = loc_data[pos_idx].view(-1, 4)
        loc_t = loc_t[pos_idx].view(-1, 4)
        #loss_l tensor(14.0165, grad_fn=<SmoothL1LossBackward>)
        loss_l = F.smooth_l1_loss(loc_p, loc_t, size_average=False)


        # Compute max conf across batch for hard negative mining
        # conf_data [3,8732,21] -> batch_conf [3*8732,21] = [26196,21]
        batch_conf = conf_data.view(-1, self.num_classes)
        # log_sum_exp(batch_conf) is [26196,1]; batch_conf.gather(1, conf_t.view(-1,1))
        # picks each box's logit for its target class, also [26196,1].
        # A plain F.cross_entropy(batch_conf, conf_t.view(-1)) would reduce to a single
        # scalar; we need a per-box loss so hard negative mining can pick the negatives
        # with the largest loss.  (see https://zhuanlan.zhihu.com/p/153535799)
        loss_c = log_sum_exp(batch_conf) - batch_conf.gather(1, conf_t.view(-1, 1))

        # Hard Negative Mining
        # loss_c [3,8732]
        loss_c = loss_c.view(num, -1)
        loss_c[pos] = 0  # zero out the positives: only negatives compete in mining
        # double sort: loss_idx [3,8732], idx_rank [3,8732]
        _, loss_idx = loss_c.sort(1, descending=True)
        _, idx_rank = loss_idx.sort(1)
        num_pos = pos.long().sum(1, keepdim=True)  # [3,1], e.g. [23],[4],[9]
        num_neg = torch.clamp(self.negpos_ratio*num_pos, max=pos.size(1)-1)  # [3,1], e.g. [69],[12],[27]
        # neg [3,8732] True/False: marks the num_neg priors with the largest conf loss
        neg = idx_rank < num_neg.expand_as(idx_rank)
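        # Worked example of the double-sort rank trick (hypothetical values):
        #   loss_c   = [[0.1, 0.9, 0.5, 0.3]]
        #   loss_idx = [[1, 2, 3, 0]]   # prior indices sorted by descending loss
        #   idx_rank = [[3, 0, 1, 2]]   # rank of each prior; prior 1 has the largest loss
        # With num_neg = 2, `idx_rank < num_neg` selects priors 1 and 2, i.e. exactly
        # the two highest-loss negatives.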
        # Confidence Loss Including Positive and Negative Examples
        pos_idx = pos.unsqueeze(2).expand_as(conf_data)  # pos [3,8732] -> [3,8732,21]
        neg_idx = neg.unsqueeze(2).expand_as(conf_data)  # neg [3,8732] -> [3,8732,21]
        # pos_idx and neg_idx are boolean masks of the same shape; adding them acts as
        # a logical OR: an entry is selected if it is True in either mask.
        # conf_p [144,21]: 144 = total True count, e.g. 69+12+27 negatives + 23+4+9 positives
        conf_p = conf_data[(pos_idx+neg_idx).gt(0)].view(-1, self.num_classes)
        # pos [3,8732], neg [3,8732], conf_t [3,8732] -> targets_weighted [144]
        targets_weighted = conf_t[(pos+neg).gt(0)]
        # loss_c tensor(58.0656, grad_fn=<NllLossBackward>)
        loss_c = F.cross_entropy(conf_p, targets_weighted, size_average=False)

        # Sum of losses: L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N

        N = num_pos.data.sum()  # e.g. N = 36 = 23 + 4 + 9, the total number of positive priors
        loss_l /= N
        loss_c /= N
        return loss_l, loss_c
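
A minimal driver (hedged sketch: random tensors stand in for real network outputs,
and cfg['variance'] comes from the repo's data config, typically [0.1, 0.2]):

# loc/conf shaped as the SSD heads emit them, for a batch of 2 images
loc_data  = torch.randn(2, 8732, 4)
conf_data = torch.randn(2, 8732, 21)
priors    = torch.rand(8732, 4)          # center-size form, values in (0, 1)
# one ground-truth box per image; the last column is the class label
targets = [torch.tensor([[0.1, 0.1, 0.4, 0.4, 6.]]),
           torch.tensor([[0.2, 0.2, 0.6, 0.6, 2.]])]
criterion = MultiBoxLoss(21, 0.5, True, 0, True, 3, 0.5, False, use_gpu=False)
loss_l, loss_c = criterion((loc_data, conf_data, priors), targets)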
