u版yolov3详解 --->> 前向推理部分

推理是detect.py脚本。
一张图像首先经过class LoadImages: 类处理。
经过def letterbox(img, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True, stride=32)函数
处理成最长边为640、短边按同样比例缩放后再补边(padding)到32的倍数的图像。
原图是1280x720,按比例缩放后是640x360,再补边到32的倍数后是640x384 (32*12=384,上下各补12个像素)
具体letterbox分析在这里
https://www.cnblogs.com/yanghailin/p/15338637.html

后面就是正常的转通道,归一化处理等。

    img = img[:, :, ::-1].transpose(2, 0, 1)  # BGR to RGB, to 3x416x416
    img = np.ascontiguousarray(img)

后面就是推理:

        img = torch.from_numpy(img).to(device)
        img = img.half() if half else img.float()  # uint8 to fp16/32
        img /= 255.0  # 0 - 255 to 0.0 - 1.0
        if img.ndimension() == 3:
            img = img.unsqueeze(0)

        # Inference
        t1 = time_synchronized()
        pred = model(img, augment=opt.augment)[0]

        # Apply NMS
        pred = non_max_suppression(pred, opt.conf_thres, opt.iou_thres, opt.classes, opt.agnostic_nms,
                                   max_det=opt.max_det)
        t2 = time_synchronized()

这里需要注意的是:
pred = model(img, augment=opt.augment)[0]
这句就开始推理,因为是推理模式,在网络的最后一层设定了推理模式走不同的逻辑:

class Detect(nn.Module):
    """Detection head: one 1x1 conv per feature level, plus box decoding at inference time.

    In training mode ``forward`` returns the reshaped raw maps only. In eval
    mode it additionally decodes every prediction and returns
    ``(decoded, raw_maps)``.
    """

    stride = None  # per-level strides; filled in by the code that builds the model
    onnx_dynamic = False  # ONNX export flag: when True the grid is rebuilt on every call

    # ch holds the input channels of each level, e.g. [256, 512, 1024] (author's example)
    def __init__(self, nc=80, anchors=(), ch=(), inplace=True):  # detection layer
        super(Detect, self).__init__()
        self.nc = nc  # number of classes (80 by default)
        self.no = nc + 5  # outputs per anchor: 4 box values + 1 objectness + nc classes (85 for nc=80)
        self.nl = len(anchors)  # number of detection levels (3 in the author's run)
        self.na = len(anchors[0]) // 2  # anchors per level: each anchor is a (w, h) pair
        self.grid = [torch.zeros(1)] * self.nl  # placeholder grids, rebuilt lazily in forward()

        anchor_pairs = torch.tensor(anchors).float().view(self.nl, -1, 2)  # (nl, na, 2), e.g. [3, 3, 2]
        self.register_buffer('anchors', anchor_pairs)  # shape (nl, na, 2)
        # anchor_grid is an independent clone taken here, so a later in-place change
        # to `anchors` does not affect it.
        self.register_buffer('anchor_grid', anchor_pairs.clone().view(self.nl, 1, -1, 1, 1, 2))  # (nl,1,na,1,1,2)

        out_channels = self.no * self.na
        self.m = nn.ModuleList(nn.Conv2d(c_in, out_channels, 1) for c_in in ch)  # output convs
        self.inplace = inplace  # use in-place slice assignment when decoding

    def forward(self, x):
        """Run the head over the list of feature maps ``x`` (the list is modified in place).

        Returns ``x`` when training, otherwise ``(torch.cat(decoded, 1), x)``
        where the first item has shape (bs, total_predictions, no).
        """
        decoded = []  # per-level inference outputs
        for i in range(self.nl):
            feat = self.m[i](x[i])  # 1x1 conv
            bs, _, ny, nx = feat.shape
            # (bs, na*no, ny, nx) -> (bs, na, ny, nx, no),
            # e.g. (bs,255,20,20) -> (bs,3,20,20,85); the author's three levels
            # gave (bs,3,80,80,85), (bs,3,40,40,85) and (bs,3,20,20,85).
            feat = feat.view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
            x[i] = feat

            if self.training:
                continue

            # Inference only from here on: (re)build the cell-index grid when the
            # map size changed, e.g. [1,1,80,80,2], [1,1,40,40,2], [1,1,20,20,2].
            if self.onnx_dynamic or self.grid[i].shape[2:4] != feat.shape[2:4]:
                self.grid[i] = self._make_grid(nx, ny).to(feat.device)

            y = feat.sigmoid()  # e.g. [1,3,80,80,85]
            # xy: offset inside the cell plus the cell index, multiplied by the
            # stride, i.e. expressed in pixels of the network-input image.
            xy = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]
            if self.inplace:  # the path taken in the author's run
                # wh: a relative factor times anchor_grid. anchor_grid was cloned
                # from the anchors at construction time, so the decoded wh is
                # also in network-input pixels. The author notes that for
                # training the anchors are instead scaled down to each feature
                # map, matching the targets (`t = targets * gain`).
                wh = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]
                y[..., 0:2] = xy
                y[..., 2:4] = wh
            else:  # for YOLOv5 on AWS Inferentia https://github.com/ultralytics/yolov5/pull/2953
                wh = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i].view(1, self.na, 1, 1, 2)
                y = torch.cat((xy, wh, y[..., 4:]), -1)
            decoded.append(y.view(bs, -1, self.no))  # flatten to (bs, na*ny*nx, no)

        if self.training:
            return x
        return torch.cat(decoded, 1), x

    @staticmethod
    def _make_grid(nx=20, ny=20):
        """Return a float tensor of shape (1, 1, ny, nx, 2) whose last axis is each cell's (x, y) index."""
        rows, cols = torch.arange(ny), torch.arange(nx)
        yv, xv = torch.meshgrid([rows, cols])  # each (ny, nx)
        cell_xy = torch.stack((xv, yv), 2)  # (ny, nx, 2)
        return cell_xy.view((1, 1, ny, nx, 2)).float()

这里说一下,anchor缩放代码是在哪里:
在yolo.py的Model类里面:
先是跑了一次前向,得到每层feature map的大小size,再用送进前向之前的原尺寸除以size。
这样就知道了缩放比例,然后每个anchor分别除以这个比例!
所以,训练的时候都是gt缩放到3个不同比例,对应的anchor也是。然后在3个不同比例下计算loss的。

class Model(nn.Module):
    """Network built from a config dict or YAML file, ending in a ``Detect`` head.

    When the last layer is a ``Detect`` instance, ``__init__`` measures the
    stride of every detection level with a dummy forward pass and divides the
    head's anchors by those strides.
    """

    def __init__(self, cfg='yolov3.yaml', ch=3, nc=None, anchors=None):  # model, input channels, number of classes
        """Build the model.

        Args:
            cfg: a model dict, or a path to a ``*.yaml`` file describing the model.
            ch: input channels, used when the config has no 'ch' entry.
            nc: if given and different from the config value, overrides 'nc'.
            anchors: if truthy, overrides the config's 'anchors' after ``round()``.
        """
        super(Model, self).__init__()
        if isinstance(cfg, dict):
            self.yaml = cfg  # model dict
        else:  # is *.yaml
            import yaml  # for torch hub
            self.yaml_file = Path(cfg).name
            with open(cfg) as f:
                self.yaml = yaml.safe_load(f)  # model dict

        # Define model
        ch = self.yaml['ch'] = self.yaml.get('ch', ch)  # input channels
        if nc and nc != self.yaml['nc']:
            logger.info(f"Overriding model.yaml nc={self.yaml['nc']} with nc={nc}")
            self.yaml['nc'] = nc  # override yaml value
        if anchors:
            logger.info(f'Overriding model.yaml anchors with anchors={anchors}')
            # NOTE(review): round() implies `anchors` is a scalar here, not a list of
            # anchor boxes - confirm against the caller.
            self.yaml['anchors'] = round(anchors)  # override yaml value
        self.model, self.save = parse_model(deepcopy(self.yaml), ch=[ch])  # model, savelist
        self.names = [str(i) for i in range(self.yaml['nc'])]  # default names
        self.inplace = self.yaml.get('inplace', True)

        # Build strides, anchors
        m = self.model[-1]  # Detect()
        if isinstance(m, Detect):
            s = 256  # 2x min stride
            m.inplace = self.inplace

            # Run a dummy s x s input through the network and divide s by the size
            # of each returned map to get the per-level stride, e.g. [8, 16, 32]
            # in the author's run.
            m.stride = torch.tensor([s / x.shape[-2] for x in self.forward(torch.zeros(1, ch, s, s))])  # forward

            # m.stride.view(-1, 1, 1) has shape [3, 1, 1] and m.anchors has shape
            # [3, 3, 2], so the division below broadcasts one stride per level.
            # m.anchors before the division, as printed by the author:
            #   [[ 10.,  13.], [ 16.,  30.], [ 33.,  23.]]
            #   [[ 30.,  61.], [ 62.,  45.], [ 59., 119.]]
            #   [[116.,  90.], [156., 198.], [373., 326.]]
            m.anchors /= m.stride.view(-1, 1, 1)  # anchors are now in feature-map units
            check_anchor_order(m)
            self.stride = m.stride
            self._initialize_biases()  # only run once

        # Init weights, biases
        initialize_weights(self)
        self.info()
        logger.info('')

    def forward(self, x, augment=False, profile=False):
        """Dispatch to ``forward_augment`` when ``augment`` is truthy, else to ``forward_once``."""
        if augment:
            return self.forward_augment(x)  # augmented inference, None
        else:
            return self.forward_once(x, profile)  # single-scale inference, train

然后就是nms:

def non_max_suppression(prediction, conf_thres=0.25, iou_thres=0.45, classes=None, agnostic=False, multi_label=False,
                        labels=(), max_det=300):
    """Runs Non-Maximum Suppression (NMS) on inference results

    Args:
        prediction: tensor indexed as [image, candidate, 5 + nc]; columns 0-3
            are a box in (center x, center y, width, height) form, column 4 is
            the objectness score and the remaining columns are class scores.
        conf_thres: minimum objectness, and minimum final confidence, to keep.
        iou_thres: IoU threshold passed to ``torchvision.ops.nms``.
        classes: optional iterable of class indices to keep.
        agnostic: if True, boxes of different classes suppress each other.
        multi_label: if True (and nc > 1), a box may yield one detection per
            class whose confidence exceeds ``conf_thres``.
        labels: optional per-image a-priori labels appended to the candidates.
        max_det: maximum number of detections kept per image.

    Returns:
         list of detections, on (n,6) tensor per image [xyxy, conf, cls]
    """

    nc = prediction.shape[2] - 5  # number of classes
    xc = prediction[..., 4] > conf_thres  # candidates

    # Checks
    assert 0 <= conf_thres <= 1, f'Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0'
    assert 0 <= iou_thres <= 1, f'Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0'

    # Settings
    min_wh, max_wh = 2, 4096  # (pixels) minimum and maximum box width and height
    max_nms = 30000  # maximum number of boxes into torchvision.ops.nms()
    time_limit = 10.0  # seconds to quit after
    redundant = True  # require redundant detections
    multi_label &= nc > 1  # multiple labels per box (adds 0.5ms/img)
    merge = False  # use merge-NMS

    t = time.time()
    # Images that yield no detections keep this empty (0, 6) placeholder.
    output = [torch.zeros((0, 6), device=prediction.device)] * prediction.shape[0]
    for xi, x in enumerate(prediction):  # image index, image inference
        # Apply constraints
        # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0  # width-height
        x = x[xc[xi]]  # confidence

        # Cat apriori labels if autolabelling
        if labels and len(labels[xi]):
            lb = labels[xi]
            v = torch.zeros((len(lb), nc + 5), device=x.device)
            v[:, :4] = lb[:, 1:5]  # box
            v[:, 4] = 1.0  # conf
            v[range(len(lb)), lb[:, 0].long() + 5] = 1.0  # cls
            x = torch.cat((x, v), 0)

        # If none remain process next image
        if not x.shape[0]:
            continue

        # Compute conf
        x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf

        # Box (center x, center y, width, height) to (x1, y1, x2, y2)
        box = xywh2xyxy(x[:, :4])

        # Detections matrix nx6 (xyxy, conf, cls)
        if multi_label:
            i, j = (x[:, 5:] > conf_thres).nonzero(as_tuple=False).T
            x = torch.cat((box[i], x[i, j + 5, None], j[:, None].float()), 1)
        else:  # best class only
            conf, j = x[:, 5:].max(1, keepdim=True)
            x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_thres]

        # Filter by class
        if classes is not None:
            x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]

        # Apply finite constraint
        # if not torch.isfinite(x).all():
        #     x = x[torch.isfinite(x).all(1)]

        # Check shape
        n = x.shape[0]  # number of boxes
        if not n:  # no boxes
            continue
        elif n > max_nms:  # excess boxes
            x = x[x[:, 4].argsort(descending=True)[:max_nms]]  # sort by confidence

        # Batched NMS: shifting every box by class_index * max_wh keeps boxes of
        # different classes from overlapping, so one nms() call works per class.
        c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
        boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
        i = torchvision.ops.nms(boxes, scores, iou_thres)  # NMS
        if i.shape[0] > max_det:  # limit detections
            i = i[:max_det]
        if merge and (1 < n < 3E3):  # Merge NMS (boxes merged using weighted mean)
            # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
            iou = box_iou(boxes[i], boxes) > iou_thres  # iou matrix
            weights = iou * scores[None]  # box weights
            x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True)  # merged boxes
            if redundant:
                i = i[iou.sum(1) > 1]  # require redundancy

        output[xi] = x[i]
        if (time.time() - t) > time_limit:
            print(f'WARNING: NMS time limit {time_limit}s exceeded')
            break  # time limit exceeded; remaining images keep their empty placeholder

    return output

这段代码需要注意的是:

 # Compute conf
x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf

这是条件概率:每个类别的得分还需要乘以该框是目标的概率(obj_conf),才是最终的置信度。
然后坐标转为xyxy格式
box = xywh2xyxy(x[:, :4])

nms得到结果:
pred = non_max_suppression(pred, opt.conf_thres, opt.iou_thres, opt.classes, opt.agnostic_nms,
max_det=opt.max_det)
pred是一个list,每张图像对应一个[n,6]的tensor。这里这张图的结果shape是[5,6],表示有5个目标,每一行是x1,y1,x2,y2,score,cls

这里需要注意的是:pred预测出来的坐标都是相对于网络输入图的,也就是一开始经过缩放、补边到32的倍数(图像居中)之后的那张图。
现在需要把坐标映射回原图。
det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round()
img是经过letterbox缩放补边并归一化后送入网络的图,im0是原图

# scale_coords(img.shape[2:], det[:, :4], im0.shape)
def scale_coords(img1_shape, coords, img0_shape, ratio_pad=None):
    """Rescale xyxy boxes from img1_shape to img0_shape.

    ``coords`` is modified in place and also returned. Shapes are indexed as
    [0] = height and [1] = width. When ``ratio_pad`` is given, its
    ``[0][0]`` entry is used as the gain and its ``[1]`` entry as the
    (x, y) padding instead of deriving them from the two shapes.
    """
    if ratio_pad is not None:
        gain = ratio_pad[0][0]
        pad = ratio_pad[1]
    else:  # calculate from img0_shape
        h_ratio = img1_shape[0] / img0_shape[0]
        w_ratio = img1_shape[1] / img0_shape[1]
        gain = min(h_ratio, w_ratio)  # gain = old / new
        # Half of the border left over on each axis, as (x, y) padding.
        # For 1280x720 resized into 640x384 this is (0, 12).
        pad_x = (img1_shape[1] - img0_shape[1] * gain) / 2
        pad_y = (img1_shape[0] - img0_shape[0] * gain) / 2
        pad = pad_x, pad_y

    coords[:, [0, 2]] -= pad[0]  # remove x padding from x1, x2
    coords[:, [1, 3]] -= pad[1]  # remove y padding from y1, y2 (12 in the example)
    coords[:, :4] /= gain  # undo the resize
    clip_coords(coords, img0_shape)
    return coords

恩,然后就得到了原图目标坐标、类别、得分这些信息了。
然后就可以画图显示出来。
https://www.cnblogs.com/yanghailin/p/14894318.html

总结一下:

1.需要注意的是,推理的时候并没有把图像统一到固定尺寸640*640,而是把最长边固定为640,然后短边按照系数缩放,并且整成32的倍数。
def letterbox(img, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True, stride=32):这个函数就是完成上面的操作。

2.网络部分
推理走自己的逻辑分支,

if self.inplace:##走这里
    y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy  这里是映射到原图大小
    y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh 

这两句很重要。把框映射到网络输入图(letterbox之后的图,不是最初的原图)的像素尺度上:中心点xy乘以该层的stride(缩放系数),
wh乘以按输入图像素大小设定的anchor(anchor_grid)。
对照着训练时候的代码:

pxy = ps[:, :2].sigmoid() * 2. - 0.5  #[95,2]
pwh = (ps[:, 2:4].sigmoid() * 2) ** 2 * anchors[i] #[95,2]
pbox = torch.cat((pxy, pwh), 1)  # predicted box  #[95,4]

这里都是相对坐标,都一样,不影响loss的计算。这里的anchors是相对于每层feature map大小的,上面有分析。

3. 最后就是由于有缩放和补边操作,需要再映射到原图。scale_coords这段代码一开始没怎么看得懂,其实就是:先减去补边pad,再除以缩放系数gain,最后用clip_coords把坐标裁剪到原图范围内。
posted @ 2021-09-27 16:32  无左无右  阅读(1341)  评论(0)  编辑  收藏  举报