A detailed look at u-version (ultralytics) YOLOv3 --->> the forward-inference part
Inference lives in the detect.py script.
An image is first handled by the class LoadImages: loader,
then passed through def letterbox(img, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True, stride=32),
which resizes it so that the longest side is 640 and the shorter side is padded up to a multiple of 32.
For example, an original 1280x720 image comes out of letterbox as 640x384 (32*12=384).
A detailed analysis of letterbox is here:
https://www.cnblogs.com/yanghailin/p/15338637.html
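To make the shape arithmetic concrete, here is a minimal standalone sketch of what letterbox computes for this example (only the shape math; the actual function also performs the resize and fills the padding with the given color):

import numpy as np

w0, h0 = 1280, 720                      # original size
new_long, stride = 640, 32

r = new_long / max(w0, h0)              # scale factor, 0.5 here
w, h = round(w0 * r), round(h0 * r)     # 640, 360
# with auto=True the short side is padded up to the nearest multiple of the stride
pad_h = int(np.ceil(h / stride)) * stride - h  # 384 - 360 = 24 px (12 top + 12 bottom)
print(w, h + pad_h)                     # 640 384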
After that come the usual channel reordering and normalization steps:
img = img[:, :, ::-1].transpose(2, 0, 1) # BGR to RGB, to 3x416x416
img = np.ascontiguousarray(img)
Then comes the inference itself:
img = torch.from_numpy(img).to(device)
img = img.half() if half else img.float()  # uint8 to fp16/32
img /= 255.0  # 0 - 255 to 0.0 - 1.0
if img.ndimension() == 3:
    img = img.unsqueeze(0)

# Inference
t1 = time_synchronized()
pred = model(img, augment=opt.augment)[0]

# Apply NMS
pred = non_max_suppression(pred, opt.conf_thres, opt.iou_thres, opt.classes, opt.agnostic_nms,
                           max_det=opt.max_det)
t2 = time_synchronized()
The line to pay attention to here is:
pred = model(img, augment=opt.augment)[0]
This is where inference starts. Because the model is in inference mode, the last layer of the network (the Detect head) takes a different branch than in training:
class Detect(nn.Module):
    stride = None  # strides computed during build
    onnx_dynamic = False  # ONNX export parameter

    # ch = [256, 512, 1024]
    def __init__(self, nc=80, anchors=(), ch=(), inplace=True):  # detection layer
        super(Detect, self).__init__()
        self.nc = nc  # 80, number of classes
        self.no = nc + 5  # 85, number of outputs per anchor
        self.nl = len(anchors)  # 3, number of detection layers
        self.na = len(anchors[0]) // 2  # 3, number of anchors
        self.grid = [torch.zeros(1)] * self.nl  # init grid
        a = torch.tensor(anchors).float().view(self.nl, -1, 2)  # [3,3,2]
        # a_tmp = a.clone().view(self.nl, 1, -1, 1, 1, 2)
        self.register_buffer('anchors', a)  # shape(nl,na,2)
        self.register_buffer('anchor_grid', a.clone().view(self.nl, 1, -1, 1, 1, 2))  # shape(nl,1,na,1,1,2)
        self.m = nn.ModuleList(nn.Conv2d(x, self.no * self.na, 1) for x in ch)  # output conv
        self.inplace = inplace  # use in-place ops (e.g. slice assignment)

    def forward(self, x):
        # x = x.copy()  # for profiling
        z = []  # inference output
        for i in range(self.nl):  # nl=3
            x[i] = self.m[i](x[i])  # conv
            bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
            x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
            # x(bs,3,80,80,85)
            # x(bs,3,40,40,85)
            # x(bs,3,20,20,85)
            # self.na=3  self.no=85

            if not self.training:  # inference
                if self.grid[i].shape[2:4] != x[i].shape[2:4] or self.onnx_dynamic:
                    self.grid[i] = self._make_grid(nx, ny).to(x[i].device)
                    # [1,1,80,80,2]  [1,1,40,40,2]  [1,1,20,20,2]

                y = x[i].sigmoid()  # [1,3,80,80,85]
                if self.inplace:  ## this is the branch taken here
                    y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy, mapped to the network-input image scale
                    y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
                    ## Note that the resulting wh is also already at the network-input image scale,
                    ## because self.anchor_grid[i] holds anchors relative to the input image!
                    ## During training, by contrast, each of the 3 anchor groups is scaled down onto
                    ## its feature map (strides 32/16/8), and the gt boxes are scaled onto each
                    ## feature map as well --> t = targets * gain
                    ## Since the network learns relative offsets, multiplying by the stride and the
                    ## input-scale anchors here directly yields positions on the input image.
                else:  # for YOLOv5 on AWS Inferentia https://github.com/ultralytics/yolov5/pull/2953
                    xy = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy
                    wh = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i].view(1, self.na, 1, 1, 2)  # wh
                    y = torch.cat((xy, wh, y[..., 4:]), -1)
                z.append(y.view(bs, -1, self.no))  ## self.no=85

        # aa, bb = torch.cat(z, 1), x
        return x if self.training else (torch.cat(z, 1), x)

    @staticmethod
    def _make_grid(nx=20, ny=20):
        yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)])
        # yv [20,20]  xv [20,20]
        # tmp_0 = torch.stack((xv, yv), 2)  # [20,20,2]
        # tmp_1 = torch.stack((xv, yv), 2).view((1, 1, ny, nx, 2)).float()  # [1,1,20,20,2]
        return torch.stack((xv, yv), 2).view((1, 1, ny, nx, 2)).float()
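To see what the inference decode actually does, here is a standalone numeric sketch with made-up values for a single cell/anchor on the stride-32 level:

import torch

stride = 32.0
anchor_wh = torch.tensor([116., 90.])  # a stride-32 anchor, already at input-image scale
grid_cell = torch.tensor([10., 7.])    # (x, y) index of the cell on the 20x20 feature map

t_xy = torch.tensor([0.3, -0.2])       # raw network outputs for this cell/anchor
t_wh = torch.tensor([0.1, 0.5])

xy = (t_xy.sigmoid() * 2. - 0.5 + grid_cell) * stride  # box center on the input image
wh = (t_wh.sigmoid() * 2.) ** 2 * anchor_wh            # box w/h on the input image
print(xy, wh)  # center lands near cell (10,7)*32 = (320, 224)

Because sigmoid()*2-0.5 ranges over (-0.5, 1.5), the center can shift up to half a cell outside its grid cell, and (sigmoid()*2)**2 lets wh range from 0 to 4x the anchor size.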
A quick note on where the anchor scaling happens:
in the Model class in yolo.py.
It first runs one forward pass to get the size of each feature map, then divides the network-input size by each of those sizes.
That gives the stride (scale factor) of each level, and each anchor is divided by its stride!
So during training the gt boxes are scaled to the 3 feature-map scales, and so are the corresponding anchors; the loss is then computed at those 3 scales.
class Model(nn.Module):
    def __init__(self, cfg='yolov3.yaml', ch=3, nc=None, anchors=None):  # model, input channels, number of classes
        super(Model, self).__init__()
        if isinstance(cfg, dict):
            self.yaml = cfg  # model dict
        else:  # is *.yaml
            import yaml  # for torch hub
            self.yaml_file = Path(cfg).name
            with open(cfg) as f:
                self.yaml = yaml.safe_load(f)  # model dict

        # Define model
        ch = self.yaml['ch'] = self.yaml.get('ch', ch)  # input channels
        if nc and nc != self.yaml['nc']:
            logger.info(f"Overriding model.yaml nc={self.yaml['nc']} with nc={nc}")
            self.yaml['nc'] = nc  # override yaml value
        if anchors:
            logger.info(f'Overriding model.yaml anchors with anchors={anchors}')
            self.yaml['anchors'] = round(anchors)  # override yaml value
        self.model, self.save = parse_model(deepcopy(self.yaml), ch=[ch])  # model, savelist
        self.names = [str(i) for i in range(self.yaml['nc'])]  # default names
        self.inplace = self.yaml.get('inplace', True)
        # logger.info([x.shape for x in self.forward(torch.zeros(1, ch, 64, 64))])

        # Build strides, anchors
        m = self.model[-1]  # Detect()
        if isinstance(m, Detect):
            s = 256  # 2x min stride
            m.inplace = self.inplace
            # tmp111 = self.forward(torch.zeros(1, ch, s, s))
            # value [8, 16, 32]
            m.stride = torch.tensor([s / x.shape[-2] for x in self.forward(torch.zeros(1, ch, s, s))])  # forward
            tmp12 = m.stride.view(-1, 1, 1)  # shape [3,1,1]
            '''
            m.anchors shape [3,3,2]
            tensor([[[ 10.,  13.],
                     [ 16.,  30.],
                     [ 33.,  23.]],
                    [[ 30.,  61.],
                     [ 62.,  45.],
                     [ 59., 119.]],
                    [[116.,  90.],
                     [156., 198.],
                     [373., 326.]]])
            '''
            m.anchors /= m.stride.view(-1, 1, 1)
            check_anchor_order(m)
            self.stride = m.stride
            self._initialize_biases()  # only run once
            # logger.info('Strides: %s' % m.stride.tolist())

        # Init weights, biases
        initialize_weights(self)
        self.info()
        logger.info('')

    def forward(self, x, augment=False, profile=False):
        if augment:
            return self.forward_augment(x)  # augmented inference, None
        else:
            # aa = self.forward_once(x, profile)
            return self.forward_once(x, profile)  # single-scale inference, train
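To make the anchor scaling concrete, here is a small standalone sketch (not repo code) that reproduces what m.anchors /= m.stride.view(-1, 1, 1) does with the default COCO anchors:

import torch

# default COCO anchors, one row per detection layer (stride 8 / 16 / 32)
anchors = torch.tensor([[10., 13., 16., 30., 33., 23.],
                        [30., 61., 62., 45., 59., 119.],
                        [116., 90., 156., 198., 373., 326.]]).view(3, -1, 2)  # [3,3,2]
stride = torch.tensor([8., 16., 32.])

anchors_fm = anchors / stride.view(-1, 1, 1)  # anchors in feature-map units
print(anchors_fm[0, 0])  # [10,13] / 8 -> tensor([1.2500, 1.6250])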
Then comes NMS:
def non_max_suppression(prediction, conf_thres=0.25, iou_thres=0.45, classes=None, agnostic=False, multi_label=False,
                        labels=(), max_det=300):
    """Runs Non-Maximum Suppression (NMS) on inference results
    Returns:
         list of detections, on (n,6) tensor per image [xyxy, conf, cls]
    """

    nc = prediction.shape[2] - 5  # number of classes
    xc = prediction[..., 4] > conf_thres  # candidates

    # Checks
    assert 0 <= conf_thres <= 1, f'Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0'
    assert 0 <= iou_thres <= 1, f'Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0'

    # Settings
    min_wh, max_wh = 2, 4096  # (pixels) minimum and maximum box width and height
    max_nms = 30000  # maximum number of boxes into torchvision.ops.nms()
    time_limit = 10.0  # seconds to quit after
    redundant = True  # require redundant detections
    multi_label &= nc > 1  # multiple labels per box (adds 0.5ms/img)
    merge = False  # use merge-NMS

    t = time.time()
    output = [torch.zeros((0, 6), device=prediction.device)] * prediction.shape[0]
    for xi, x in enumerate(prediction):  # image index, image inference
        # Apply constraints
        # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0  # width-height
        x = x[xc[xi]]  # confidence

        # Cat apriori labels if autolabelling
        if labels and len(labels[xi]):
            l = labels[xi]
            v = torch.zeros((len(l), nc + 5), device=x.device)
            v[:, :4] = l[:, 1:5]  # box
            v[:, 4] = 1.0  # conf
            v[range(len(l)), l[:, 0].long() + 5] = 1.0  # cls
            x = torch.cat((x, v), 0)

        # If none remain process next image
        if not x.shape[0]:
            continue

        # Compute conf
        x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf

        # Box (center x, center y, width, height) to (x1, y1, x2, y2)
        box = xywh2xyxy(x[:, :4])

        # Detections matrix nx6 (xyxy, conf, cls)
        if multi_label:
            i, j = (x[:, 5:] > conf_thres).nonzero(as_tuple=False).T
            x = torch.cat((box[i], x[i, j + 5, None], j[:, None].float()), 1)
        else:  # best class only
            conf, j = x[:, 5:].max(1, keepdim=True)
            x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_thres]

        # Filter by class
        if classes is not None:
            x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]

        # Apply finite constraint
        # if not torch.isfinite(x).all():
        #     x = x[torch.isfinite(x).all(1)]

        # Check shape
        n = x.shape[0]  # number of boxes
        if not n:  # no boxes
            continue
        elif n > max_nms:  # excess boxes
            x = x[x[:, 4].argsort(descending=True)[:max_nms]]  # sort by confidence

        # Batched NMS
        c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
        boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
        i = torchvision.ops.nms(boxes, scores, iou_thres)  # NMS
        if i.shape[0] > max_det:  # limit detections
            i = i[:max_det]
        if merge and (1 < n < 3E3):  # Merge NMS (boxes merged using weighted mean)
            # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
            iou = box_iou(boxes[i], boxes) > iou_thres  # iou matrix
            weights = iou * scores[None]  # box weights
            x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True)  # merged boxes
            if redundant:
                i = i[iou.sum(1) > 1]  # require redundancy

        output[xi] = x[i]
        if (time.time() - t) > time_limit:
            print(f'WARNING: NMS time limit {time_limit}s exceeded')
            break  # time limit exceeded

    return output
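One detail worth calling out in the code above is the batched-NMS trick: instead of looping over classes, each box is offset by its class index times max_wh, so boxes of different classes can never overlap, and a single torchvision.ops.nms call effectively performs per-class NMS. A minimal standalone sketch of the idea:

import torch
import torchvision

# two perfectly overlapping boxes (xyxy) with different classes
boxes = torch.tensor([[10., 10., 50., 50.],
                      [10., 10., 50., 50.]])
scores = torch.tensor([0.9, 0.8])
cls = torch.tensor([[0.], [1.]])

max_wh = 4096
offset_boxes = boxes + cls * max_wh  # the class-1 box is moved far away from the class-0 box
keep = torchvision.ops.nms(offset_boxes, scores, 0.45)
print(keep)  # tensor([0, 1]): both survive, since after the offset they no longer overlap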
The part of this code worth highlighting:
# Compute conf
x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf
This is a conditional probability: each class probability still has to be multiplied by the objectness probability, i.e. the probability that the box contains an object at all.
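A quick numeric illustration with made-up values:

import torch

obj_conf = torch.tensor([[0.9]])            # P(object)
cls_conf = torch.tensor([[0.8, 0.1, 0.1]])  # P(class | object)
print(cls_conf * obj_conf)                  # P(class) = tensor([[0.72, 0.09, 0.09]])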
Then the box coordinates are converted to xyxy format:
box = xywh2xyxy(x[:, :4])
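For reference, xywh2xyxy just converts (center x, center y, w, h) boxes to corner coordinates; a minimal version equivalent to the one in utils/general.py:

import torch

def xywh2xyxy(x):
    # convert nx4 boxes from [cx, cy, w, h] to [x1, y1, x2, y2] (top-left, bottom-right)
    y = x.clone()
    y[:, 0] = x[:, 0] - x[:, 2] / 2  # top left x
    y[:, 1] = x[:, 1] - x[:, 3] / 2  # top left y
    y[:, 2] = x[:, 0] + x[:, 2] / 2  # bottom right x
    y[:, 3] = x[:, 1] + x[:, 3] / 2  # bottom right y
    return y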
NMS then produces the result:
pred = non_max_suppression(pred, opt.conf_thres, opt.iou_thres, opt.classes, opt.agnostic_nms,
                           max_det=opt.max_det)
In this example pred has shape [5,6]: 5 detected objects, each encoded as x,y,x,y,score,cls.
Note that the coordinates in pred are relative to the letterboxed image, i.e. the image that was resized at the very beginning, padded to a multiple of 32 and centered.
They now have to be mapped back to the original image:
det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round()
img is the preprocessed (letterboxed, normalized) image; im0 is the original image.
# scale_coords(img.shape[2:], det[:, :4], im0.shape)
def scale_coords(img1_shape, coords, img0_shape, ratio_pad=None):
    # Rescale coords (xyxy) from img1_shape to img0_shape
    if ratio_pad is None:  # calculate from img0_shape
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])  # gain = old / new
        pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2  # wh padding (0, 12 here)
    else:
        gain = ratio_pad[0][0]
        pad = ratio_pad[1]

    coords[:, [0, 2]] -= pad[0]  # x padding
    coords[:, [1, 3]] -= pad[1]  # y padding (12 here)
    coords[:, :4] /= gain
    clip_coords(coords, img0_shape)
    return coords
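Plugging the 1280x720 example from the beginning into this function makes it easy to follow (a worked sketch, assuming the shapes used above):

# img1_shape = (384, 640)   letterboxed image (h, w)
# img0_shape = (720, 1280)  original image (h, w)
gain = min(384 / 720, 640 / 1280)   # min(0.533..., 0.5) = 0.5
pad_x = (640 - 1280 * 0.5) / 2      # (640 - 640) / 2 = 0   -> no horizontal padding
pad_y = (384 - 720 * 0.5) / 2       # (384 - 360) / 2 = 12  -> 12 px padding top and bottom
# so mapping back means: subtract 12 from y1/y2, subtract 0 from x1/x2, then divide by 0.5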
And with that we have the object coordinates, class labels and scores on the original image,
which can then be drawn and displayed:
https://www.cnblogs.com/yanghailin/p/14894318.html
To summarize:
1. Note that at inference time the image is not resized to a fixed 640*640; instead the longest side is fixed to 640, the short side is scaled by the same factor and then padded up to a multiple of 32.
def letterbox(img, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True, stride=32): is the function that performs this.
2. The network part.
Inference takes its own branch:
if self.inplace:  ## this is the branch taken here
    y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy, mapped to the input-image scale
    y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
These two lines are the key ones: they map the boxes back to the network-input image. The center xy is multiplied by the stride,
and the wh is multiplied by anchors that are defined at the input-image scale.
Compare with the corresponding training code:
pxy = ps[:, :2].sigmoid() * 2. - 0.5  # [95,2]
pwh = (ps[:, 2:4].sigmoid() * 2) ** 2 * anchors[i]  # [95,2]
pbox = torch.cat((pxy, pwh), 1)  # predicted box  [95,4]
During training everything stays in relative (feature-map) coordinates; since predictions and gt live in the same coordinate system, this does not affect the loss. The anchors here are relative to each feature map, as analyzed above.
3. Finally, because letterbox adds padding, the boxes still have to be mapped back onto the original image. scale_coords does this by subtracting the padding and dividing by the scale factor, exactly as in the worked example above.