Yolov8-Source-Code-Analysis-37

Yolov8 Source Code Analysis (Part 37)

.\yolov8\ultralytics\nn\modules\conv.py

# Ultralytics YOLO 🚀, AGPL-3.0 license
"""Convolution modules."""

import math

import numpy as np
import torch
import torch.nn as nn

__all__ = (
    "Conv",
    "Conv2",
    "LightConv",
    "DWConv",
    "DWConvTranspose2d",
    "ConvTranspose",
    "Focus",
    "GhostConv",
    "ChannelAttention",
    "SpatialAttention",
    "CBAM",
    "Concat",
    "RepConv",
)


def autopad(k, p=None, d=1):  # kernel, padding, dilation
    """Pad to 'same' shape outputs."""
    # Compute the effective (dilated) kernel size when dilation > 1
    if d > 1:
        k = d * (k - 1) + 1 if isinstance(k, int) else [d * (x - 1) + 1 for x in k]
    # Automatically compute the padding if it is not specified
    if p is None:
        p = k // 2 if isinstance(k, int) else [x // 2 for x in k]
    return p

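# A minimal sanity check of autopad() (illustrative sketch, not part of the source): for k=3, d=1 the
# 'same' padding is 1; for k=3, d=2 the dilated kernel spans 5 pixels, so the padding becomes 2.
for k, d in [(1, 1), (3, 1), (5, 1), (3, 2), ((3, 5), 1)]:
    print(f"k={k}, d={d} -> padding={autopad(k, d=d)}")  # prints 0, 1, 2, 2 and [1, 2]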

class Conv(nn.Module):
    """Standard convolution with args(ch_in, ch_out, kernel, stride, padding, groups, dilation, activation)."""

    default_act = nn.SiLU()  # default activation is SiLU

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True):
        """Initialize Conv layer with given arguments including activation."""
        super().__init__()
        # Convolution layer with the given hyperparameters
        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False)
        # Batch normalization layer
        self.bn = nn.BatchNorm2d(c2)
        # Activation function, SiLU by default
        self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity()

    def forward(self, x):
        """Apply convolution, batch normalization and activation to input tensor."""
        return self.act(self.bn(self.conv(x)))

    def forward_fuse(self, x):
        """Perform transposed convolution of 2D data."""
        return self.act(self.conv(x))

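# Usage sketch (not part of the source): a Conv block with stride 2 halves the spatial size while
# autopad keeps 'same'-style padding; the block is Conv2d + BatchNorm2d + SiLU.
import torch

conv_demo = Conv(c1=3, c2=16, k=3, s=2)
print(conv_demo(torch.randn(1, 3, 64, 64)).shape)  # torch.Size([1, 16, 32, 32])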

class Conv2(Conv):
    """Simplified RepConv module with Conv fusing."""

    def __init__(self, c1, c2, k=3, s=1, p=None, g=1, d=1, act=True):
        """Initialize Conv layer with given arguments including activation."""
        super().__init__(c1, c2, k, s, p, g=g, d=d, act=act)
        # Add an extra parallel 1x1 convolution branch
        self.cv2 = nn.Conv2d(c1, c2, 1, s, autopad(1, p, d), groups=g, dilation=d, bias=False)

    def forward(self, x):
        """Apply convolution, batch normalization and activation to input tensor."""
        # Sum the outputs of the parallel 3x3 and 1x1 convolution branches
        return self.act(self.bn(self.conv(x) + self.cv2(x)))

    def forward_fuse(self, x):
        """Apply fused convolution, batch normalization and activation to input tensor."""
        return self.act(self.bn(self.conv(x)))

    def fuse_convs(self):
        """Fuse parallel convolutions."""
        # Merge the 1x1 kernel into the centre of the 3x3 kernel and update the weights
        w = torch.zeros_like(self.conv.weight.data)
        i = [x // 2 for x in w.shape[2:]]
        w[:, :, i[0] : i[0] + 1, i[1] : i[1] + 1] = self.cv2.weight.data.clone()
        self.conv.weight.data += w
        # Remove the cv2 attribute and switch forward to the fused version
        self.__delattr__("cv2")
        self.forward = self.forward_fuse

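# Equivalence sketch for Conv2.fuse_convs() (not part of the source): the 1x1 kernel is added into the
# centre of the 3x3 kernel, so the fused single-branch path must match the two-branch path. eval() is
# used so BatchNorm relies on its running statistics.
import torch

m2 = Conv2(8, 8, k=3).eval()
x2 = torch.randn(1, 8, 16, 16)
y_before = m2(x2)                 # parallel 3x3 + 1x1 branches
m2.fuse_convs()                   # fold the 1x1 kernel into the 3x3 kernel, drop cv2
y_after = m2(x2)                  # single fused convolution path
print(torch.allclose(y_before, y_after, atol=1e-6))  # True up to floating-point tolerance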

class LightConv(nn.Module):
    """
    Light convolution with args(ch_in, ch_out, kernel).

    https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py
    """

    def __init__(self, c1, c2, k=1, act=nn.ReLU()):
        """Initialize Conv layer with given arguments including activation."""
        # Call the parent constructor to initialize the module
        super().__init__()
        # First convolution: 1x1 pointwise conv without activation
        self.conv1 = Conv(c1, c2, 1, act=False)
        # Second convolution: depth-wise conv with kernel size k and the given activation
        self.conv2 = DWConv(c2, c2, k, act=act)

    def forward(self, x):
        """Apply 2 convolutions to input tensor."""
        # Pass the input through conv1 (1x1) and then conv2 (depth-wise)
        return self.conv2(self.conv1(x))


class DWConv(Conv):
    """Depth-wise convolution."""

    def __init__(self, c1, c2, k=1, s=1, d=1, act=True):  # ch_in, ch_out, kernel, stride, dilation, activation
        """Initialize Depth-wise convolution with given parameters."""
        super().__init__(c1, c2, k, s, g=math.gcd(c1, c2), d=d, act=act)

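# Sketch (not part of the source): DWConv sets groups = gcd(c1, c2), so with c1 == c2 every channel
# is filtered independently, i.e. a true depth-wise convolution.
import math
import torch

dw = DWConv(16, 16, k=3)
print(dw.conv.groups == math.gcd(16, 16))        # True -> 16 groups
print(dw(torch.randn(1, 16, 32, 32)).shape)      # torch.Size([1, 16, 32, 32])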

class DWConvTranspose2d(nn.ConvTranspose2d):
    """Depth-wise transpose convolution."""

    def __init__(self, c1, c2, k=1, s=1, p1=0, p2=0):  # ch_in, ch_out, kernel, stride, padding, padding_out
        """Initialize DWConvTranspose2d class with given parameters."""
        super().__init__(c1, c2, k, s, p1, p2, groups=math.gcd(c1, c2))


class ConvTranspose(nn.Module):
    """Convolution transpose 2d layer."""

    default_act = nn.SiLU()  # default activation

    def __init__(self, c1, c2, k=2, s=2, p=0, bn=True, act=True):
        """Initialize ConvTranspose2d layer with batch normalization and activation function."""
        super().__init__()
        self.conv_transpose = nn.ConvTranspose2d(c1, c2, k, s, p, bias=not bn)
        self.bn = nn.BatchNorm2d(c2) if bn else nn.Identity()
        self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity()

    def forward(self, x):
        """Applies transposed convolutions, batch normalization and activation to input."""
        return self.act(self.bn(self.conv_transpose(x)))

    def forward_fuse(self, x):
        """Applies activation and convolution transpose operation to input."""
        return self.act(self.conv_transpose(x))


class Focus(nn.Module):
    """Focus wh information into c-space."""

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):
        """Initializes Focus object with user defined channel, convolution, padding, group and activation values."""
        super().__init__()
        self.conv = Conv(c1 * 4, c2, k, s, p, g, act=act)
        # self.contract = Contract(gain=2)

    def forward(self, x):
        """
        Applies convolution to concatenated tensor and returns the output.

        Input shape is (b,c,w,h) and output shape is (b,4c,w/2,h/2).
        """
        return self.conv(torch.cat((x[..., ::2, ::2], x[..., 1::2, ::2], x[..., ::2, 1::2], x[..., 1::2, 1::2]), 1))
        # return self.conv(self.contract(x))

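# Shape sketch for Focus (not part of the source): the four pixel-parity slices rearrange
# (b, c, h, w) into (b, 4c, h/2, w/2) before the convolution maps the result to c2 channels.
import torch

focus = Focus(c1=3, c2=32, k=3)
print(focus(torch.randn(1, 3, 64, 64)).shape)  # torch.Size([1, 32, 32, 32]); the slices form (1, 12, 32, 32)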

class GhostConv(nn.Module):
    """Ghost Convolution https://github.com/huawei-noah/ghostnet."""

    def __init__(self, c1, c2, k=1, s=1, g=1, act=True):
        """Initializes the GhostConv object with input channels, output channels, kernel size, stride, groups and
        activation.
        """
        super().__init__()
        c_ = c2 // 2  # hidden channels
        self.cv1 = Conv(c1, c_, k, s, None, g, act=act)
        self.cv2 = Conv(c_, c_, 5, 1, None, c_, act=act)

    def forward(self, x):
        """Forward propagation through a Ghost Bottleneck layer with skip connection."""
        y = self.cv1(x)
        return torch.cat((y, self.cv2(y)), 1)

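# Sketch (not part of the source): GhostConv creates half of the output channels with a regular
# convolution (cv1) and the other half with a cheap 5x5 depth-wise convolution (cv2), which keeps the
# parameter count below that of an equivalent plain Conv.
import torch

ghost, plain = GhostConv(64, 128, k=1), Conv(64, 128, k=1)
count = lambda m: sum(p.numel() for p in m.parameters())
print(count(ghost), count(plain))                    # GhostConv uses noticeably fewer parameters
print(ghost(torch.randn(1, 64, 32, 32)).shape)       # torch.Size([1, 128, 32, 32])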

class RepConv(nn.Module):
    """
    RepConv is a basic rep-style block, including training and deploy states.

    This module is used in RT-DETR.
    Based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py
    """

    default_act = nn.SiLU()  # default activation is SiLU
    
    def __init__(self, c1, c2, k=3, s=1, p=1, g=1, d=1, act=True, bn=False, deploy=False):
        """使用给定的输入、输出和可选的激活函数初始化轻量卷积层。"""
        super().__init__()
        assert k == 3 and p == 1
        self.g = g
        self.c1 = c1
        self.c2 = c2
        self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity()
    
        # 如果启用了批归一化(bn=True)且满足条件,初始化批归一化层
        self.bn = nn.BatchNorm2d(num_features=c1) if bn and c2 == c1 and s == 1 else None
        # 初始化第一个卷积层,使用自定义的 Conv 类
        self.conv1 = Conv(c1, c2, k, s, p=p, g=g, act=False)
        # 初始化第二个卷积层,使用自定义的 Conv 类
        self.conv2 = Conv(c1, c2, 1, s, p=(p - k // 2), g=g, act=False)
    
    def forward_fuse(self, x):
        """前向传播过程。"""
        return self.act(self.conv(x))
    
    def forward(self, x):
        """前向传播过程。"""
        # 如果未使用批归一化,id_out 为 0;否则,id_out 为经过批归一化的 x
        id_out = 0 if self.bn is None else self.bn(x)
        # 返回第一个卷积层、第二个卷积层和可能的批归一化的叠加结果
        return self.act(self.conv1(x) + self.conv2(x) + id_out)
    
    def get_equivalent_kernel_bias(self):
        """通过将 3x3 卷积核、1x1 卷积核和身份卷积核及其偏置相加,返回等效的卷积核和偏置。"""
        # 获取第一个卷积层的等效卷积核和偏置
        kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1)
        # 获取第二个卷积层的等效卷积核和偏置
        kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2)
        # 获取批归一化层的等效卷积核和偏置
        kernelid, biasid = self._fuse_bn_tensor(self.bn)
        # 返回相加后的等效卷积核和偏置
        return kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid
    
    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
        """将 1x1 卷积核填充为 3x3 卷积核。"""
        # 如果 1x1 卷积核为 None,则返回 0
        if kernel1x1 is None:
            return 0
        else:
            # 使用 torch.nn.functional.pad 函数对 1x1 卷积核进行填充
            return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1])
    def _fuse_bn_tensor(self, branch):
        """Generates appropriate kernels and biases for convolution by fusing branches of the neural network."""
        # If the branch is empty, return zero kernel and bias
        if branch is None:
            return 0, 0
        # Branch is a Conv block (convolution followed by BatchNorm)
        if isinstance(branch, Conv):
            # Convolution kernel and the BatchNorm statistics/parameters of this branch
            kernel = branch.conv.weight
            running_mean = branch.bn.running_mean
            running_var = branch.bn.running_var
            gamma = branch.bn.weight
            beta = branch.bn.bias
            eps = branch.bn.eps
        # Branch is a plain nn.BatchNorm2d (identity branch)
        elif isinstance(branch, nn.BatchNorm2d):
            # Lazily create an identity kernel (1 at the centre of each channel's own filter) as id_tensor
            if not hasattr(self, "id_tensor"):
                input_dim = self.c1 // self.g
                kernel_value = np.zeros((self.c1, input_dim, 3, 3), dtype=np.float32)
                for i in range(self.c1):
                    kernel_value[i, i % input_dim, 1, 1] = 1
                self.id_tensor = torch.from_numpy(kernel_value).to(branch.weight.device)
            # Use the cached identity kernel together with this BatchNorm's statistics/parameters
            kernel = self.id_tensor
            running_mean = branch.running_mean
            running_var = branch.running_var
            gamma = branch.weight
            beta = branch.bias
            eps = branch.eps
        # Standard deviation of the BatchNorm statistics
        std = (running_var + eps).sqrt()
        # Per-channel scale factor folded into the convolution kernel
        t = (gamma / std).reshape(-1, 1, 1, 1)
        # Return the BN-fused kernel and bias
        return kernel * t, beta - running_mean * gamma / std

    def fuse_convs(self):
        """Combines two convolution layers into a single layer and removes unused attributes from the class."""
        # If a fused conv attribute already exists, nothing to do
        if hasattr(self, "conv"):
            return
        # Compute the equivalent kernel and bias of the three branches
        kernel, bias = self.get_equivalent_kernel_bias()
        # Create the single fused convolution layer with matching hyperparameters
        self.conv = nn.Conv2d(
            in_channels=self.conv1.conv.in_channels,
            out_channels=self.conv1.conv.out_channels,
            kernel_size=self.conv1.conv.kernel_size,
            stride=self.conv1.conv.stride,
            padding=self.conv1.conv.padding,
            dilation=self.conv1.conv.dilation,
            groups=self.conv1.conv.groups,
            bias=True,
        ).requires_grad_(False)
        # Copy the fused kernel and bias into the new convolution layer
        self.conv.weight.data = kernel
        self.conv.bias.data = bias
        # Detach all parameters so they no longer require gradients
        for para in self.parameters():
            para.detach_()
        # Remove the attributes that are no longer needed
        self.__delattr__("conv1")
        self.__delattr__("conv2")
        if hasattr(self, "nm"):
            self.__delattr__("nm")
        if hasattr(self, "bn"):
            self.__delattr__("bn")
        if hasattr(self, "id_tensor"):
            self.__delattr__("id_tensor")
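
# Equivalence sketch for RepConv (not part of the source): get_equivalent_kernel_bias() folds the 3x3
# branch, the zero-padded 1x1 branch and the optional BN identity branch into a single 3x3 convolution,
# so the deploy-time path should reproduce the training-time output. eval() keeps BatchNorm deterministic.
import torch

rep = RepConv(32, 32, k=3, s=1, bn=True).eval()   # bn=True enables the identity BatchNorm branch
xr = torch.randn(1, 32, 20, 20)
y_train_form = rep(xr)                            # 3x3 branch + 1x1 branch + identity branch
rep.fuse_convs()                                  # collapse everything into one 3x3 conv with bias
y_deploy_form = rep.forward_fuse(xr)
print(torch.allclose(y_train_form, y_deploy_form, atol=1e-5))  # True within numerical tolerance

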
class ChannelAttention(nn.Module):
    """Channel-attention module https://github.com/open-mmlab/mmdetection/tree/v3.0.0rc1/configs/rtmdet."""

    def __init__(self, channels: int) -> None:
        """Initializes the class and sets the basic configurations and instance variables required."""
        super().__init__()
        # Adaptive average pooling to (1, 1): global average pooling over the spatial dimensions
        self.pool = nn.AdaptiveAvgPool2d(1)
        # 1x1 convolution mixing channel information, keeping the same number of channels
        self.fc = nn.Conv2d(channels, channels, 1, 1, 0, bias=True)
        # Sigmoid activation producing per-channel gating weights
        self.act = nn.Sigmoid()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Applies channel attention: global average pooling, 1x1 convolution and sigmoid gating, then reweights the input."""
        # Pool x globally, pass it through the 1x1 conv and sigmoid, then scale x by the resulting channel weights
        return x * self.act(self.fc(self.pool(x)))


class SpatialAttention(nn.Module):
    """Spatial-attention module."""

    def __init__(self, kernel_size=7):
        """Initialize Spatial-attention module with kernel size argument."""
        super().__init__()
        # Kernel size must be 3 or 7
        assert kernel_size in {3, 7}, "kernel size must be 3 or 7"
        # Convolution over the channel-pooled maps used to compute spatial attention
        padding = 3 if kernel_size == 7 else 1
        self.cv1 = nn.Conv2d(2, 1, kernel_size, padding=padding, bias=False)
        # Sigmoid activation producing the spatial gating map
        self.act = nn.Sigmoid()

    def forward(self, x):
        """Apply spatial attention to the input for feature recalibration."""
        # Concatenate channel-wise mean and max maps, convolve, apply sigmoid, then reweight x spatially
        return x * self.act(self.cv1(torch.cat([torch.mean(x, 1, keepdim=True), torch.max(x, 1, keepdim=True)[0]], 1)))


class CBAM(nn.Module):
    """Convolutional Block Attention Module."""

    def __init__(self, c1, kernel_size=7):
        """Initialize CBAM with given input channel (c1) and kernel size."""
        super().__init__()
        # Create the channel-attention and spatial-attention modules
        self.channel_attention = ChannelAttention(c1)
        self.spatial_attention = SpatialAttention(kernel_size)

    def forward(self, x):
        """Applies channel attention followed by spatial attention to the input."""
        # Reweight the features first per channel, then per spatial location
        return self.spatial_attention(self.channel_attention(x))

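# Usage sketch (not part of the source): CBAM re-weights a feature map per channel and then per
# spatial location; the tensor shape is unchanged.
import torch

cbam = CBAM(c1=64, kernel_size=7)
print(cbam(torch.randn(2, 64, 40, 40)).shape)  # torch.Size([2, 64, 40, 40])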

class Concat(nn.Module):
    """Concatenate a list of tensors along dimension."""

    def __init__(self, dimension=1):
        """Concatenates a list of tensors along a specified dimension."""
        super().__init__()
        # Dimension along which the tensors will be concatenated
        self.d = dimension

    def forward(self, x):
        """Concatenate the list of input tensors along the stored dimension."""
        # Concatenate the tensors in x along dimension self.d
        return torch.cat(x, self.d)

.\yolov8\ultralytics\nn\modules\head.py

# Ultralytics YOLO 🚀, AGPL-3.0 license
"""Model head modules."""

import copy  # copy module from the standard library
import math  # math module from the standard library

import torch  # PyTorch
import torch.nn as nn  # PyTorch neural network module
from torch.nn.init import constant_, xavier_uniform_  # constant and Xavier-uniform initializers

from ultralytics.utils.tal import TORCH_1_10, dist2bbox, dist2rbox, make_anchors  # Ultralytics task-aligned utilities

from .block import DFL, BNContrastiveHead, ContrastiveHead, Proto  # building blocks from this package
from .conv import Conv  # convolution module from this package
from .transformer import MLP, DeformableTransformerDecoder, DeformableTransformerDecoderLayer  # transformer modules from this package
from .utils import bias_init_with_prob, linear_init  # utility functions from this package

__all__ = "Detect", "Segment", "Pose", "Classify", "OBB", "RTDETRDecoder", "v10Detect"  # public export list


class Detect(nn.Module):
    """YOLOv8 Detect head for detection models."""

    dynamic = False  # force grid reconstruction
    export = False  # export mode
    end2end = False  # end-to-end mode
    max_det = 300  # maximum number of detections
    shape = None  # input shape, filled in at inference time
    anchors = torch.empty(0)  # anchors, initialized as an empty tensor
    strides = torch.empty(0)  # strides, initialized as an empty tensor

    def __init__(self, nc=80, ch=()):
        """Initializes the YOLOv8 detection layer with specified number of classes and channels."""
        super().__init__()
        self.nc = nc  # number of classes
        self.nl = len(ch)  # number of detection layers
        self.reg_max = 16  # DFL channels (ch[0] // 16 to scale to 4/8/12/16/20)
        self.no = nc + self.reg_max * 4  # number of outputs per anchor
        self.stride = torch.zeros(self.nl)  # strides computed during build
        c2, c3 = max((16, ch[0] // 4, self.reg_max * 4)), max(ch[0], min(self.nc, 100))  # channels
        self.cv2 = nn.ModuleList(
            nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch
        )  # box regression branch (cv2), one per detection layer
        self.cv3 = nn.ModuleList(nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, self.nc, 1)) for x in ch)  # classification branch (cv3), one per detection layer
        self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity()  # use DFL when reg_max > 1, otherwise identity

        if self.end2end:
            self.one2one_cv2 = copy.deepcopy(self.cv2)  # deep copy of cv2 for the one-to-one head in end-to-end mode
            self.one2one_cv3 = copy.deepcopy(self.cv3)  # deep copy of cv3 for the one-to-one head in end-to-end mode

    def forward(self, x):
        """Concatenates and returns predicted bounding boxes and class probabilities."""
        if self.end2end:
            return self.forward_end2end(x)  # use the end-to-end forward pass in end-to-end mode

        for i in range(self.nl):
            x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)  # box and class predictions for each level
        if self.training:  # training path
            return x
        y = self._inference(x)  # inference path
        return y if self.export else (y, x)  # return only y when exporting, otherwise (y, x)

    def forward_end2end(self, x):
        """
        Performs forward pass of the v10Detect module.

        Args:
            x (tensor): Input tensor.

        Returns:
            (dict, tensor): If not in training mode, returns a dictionary containing the outputs of both one2many and one2one detections.
                           If in training mode, returns a dictionary containing the outputs of one2many and one2one detections separately.
        """
        # Detach input tensors for one2one module
        x_detach = [xi.detach() for xi in x]
        
        # Compute one2one detections for each level
        one2one = [
            torch.cat((self.one2one_cv2[i](x_detach[i]), self.one2one_cv3[i](x_detach[i])), 1) for i in range(self.nl)
        ]
        
        # Compute one2many detections for each level
        for i in range(self.nl):
            x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)
        
        if self.training:  # Training path
            # Return outputs separately if in training mode
            return {"one2many": x, "one2one": one2one}

        # Inference path
        y = self._inference(one2one)
        y = self.postprocess(y.permute(0, 2, 1), self.max_det, self.nc)
        
        # Return outputs based on export flag
        return y if self.export else (y, {"one2many": x, "one2one": one2one})

    def _inference(self, x):
        """Decode predicted bounding boxes and class probabilities based on multiple-level feature maps."""
        # Obtain shape of the input tensor (BCHW)
        shape = x[0].shape
        # Concatenate predictions across different levels
        x_cat = torch.cat([xi.view(shape[0], self.no, -1) for xi in x], 2)
        
        # Adjust anchors and strides if dynamic or shape changes
        if self.dynamic or self.shape != shape:
            self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5))
            self.shape = shape
        
        # Split predictions into bounding box and class probability predictions
        if self.export and self.format in {"saved_model", "pb", "tflite", "edgetpu", "tfjs"}:
            box = x_cat[:, : self.reg_max * 4]
            cls = x_cat[:, self.reg_max * 4 :]
        else:
            box, cls = x_cat.split((self.reg_max * 4, self.nc), 1)
        
        # Adjust bounding boxes for export formats tflite and edgetpu
        if self.export and self.format in {"tflite", "edgetpu"}:
            grid_h = shape[2]
            grid_w = shape[3]
            grid_size = torch.tensor([grid_w, grid_h, grid_w, grid_h], device=box.device).reshape(1, 4, 1)
            norm = self.strides / (self.stride[0] * grid_size)
            dbox = self.decode_bboxes(self.dfl(box) * norm, self.anchors.unsqueeze(0) * norm[:, :2])
        else:
            dbox = self.decode_bboxes(self.dfl(box), self.anchors.unsqueeze(0)) * self.strides
        
        # Apply sigmoid function to class predictions
        return torch.cat((dbox, cls.sigmoid()), 1)

    # Initialize the biases of the Detect() head; note that the strides must already be available
    def bias_init(self):
        """Initialize Detect() biases, WARNING: requires stride availability."""
        m = self  # self.model[-1]  # Detect() module
        # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1
        # ncf = math.log(0.6 / (m.nc - 0.999999)) if cf is None else torch.log(cf / cf.sum())  # nominal class frequency
        for a, b, s in zip(m.cv2, m.cv3, m.stride):  # from
            a[-1].bias.data[:] = 1.0  # box
            b[-1].bias.data[: m.nc] = math.log(5 / m.nc / (640 / s) ** 2)  # cls (.01 objects, 80 classes, 640 img)
        if self.end2end:
            for a, b, s in zip(m.one2one_cv2, m.one2one_cv3, m.stride):  # from
                a[-1].bias.data[:] = 1.0  # box
                b[-1].bias.data[: m.nc] = math.log(5 / m.nc / (640 / s) ** 2)  # cls (.01 objects, 80 classes, 640 img)

    # Decode bounding boxes
    def decode_bboxes(self, bboxes, anchors):
        """Decode bounding boxes."""
        return dist2bbox(bboxes, anchors, xywh=not self.end2end, dim=1)

    # Post-process the raw predictions from a YOLOv10 model
    @staticmethod
    def postprocess(preds: torch.Tensor, max_det: int, nc: int = 80):
        """
        Post-processes the predictions obtained from a YOLOv10 model.

        Args:
            preds (torch.Tensor): The predictions obtained from the model. It should have a shape of (batch_size, num_boxes, 4 + num_classes).
            max_det (int): The maximum number of detections to keep.
            nc (int, optional): The number of classes. Defaults to 80.

        Returns:
            (torch.Tensor): The post-processed predictions with shape (batch_size, max_det, 6),
                including bounding boxes, scores and cls.
        """
        assert 4 + nc == preds.shape[-1]
        boxes, scores = preds.split([4, nc], dim=-1)
        max_scores = scores.amax(dim=-1)
        max_scores, index = torch.topk(max_scores, min(max_det, max_scores.shape[1]), axis=-1)
        index = index.unsqueeze(-1)
        boxes = torch.gather(boxes, dim=1, index=index.repeat(1, 1, boxes.shape[-1]))
        scores = torch.gather(scores, dim=1, index=index.repeat(1, 1, scores.shape[-1]))

        # NOTE: simplify but result slightly lower mAP
        # scores, labels = scores.max(dim=-1)
        # return torch.cat([boxes, scores.unsqueeze(-1), labels.unsqueeze(-1)], dim=-1)

        scores, index = torch.topk(scores.flatten(1), max_det, axis=-1)
        labels = index % nc
        index = index // nc
        boxes = boxes.gather(dim=1, index=index.unsqueeze(-1).repeat(1, 1, boxes.shape[-1]))

        return torch.cat([boxes, scores.unsqueeze(-1), labels.unsqueeze(-1).to(boxes.dtype)], dim=-1)
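
# Shape sketch for Detect.postprocess (not part of the source, random values only): a
# (batch, num_anchors, 4 + nc) prediction tensor is reduced to (batch, max_det, 6) rows containing
# the 4 box values, the confidence score and the class index.
import torch

nc, max_det = 80, 300
preds = torch.rand(2, 8400, 4 + nc)
print(Detect.postprocess(preds, max_det, nc).shape)  # torch.Size([2, 300, 6])

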
class Segment(Detect):
    """YOLOv8 Segment head for segmentation models."""

    def __init__(self, nc=80, nm=32, npr=256, ch=()):
        """Initialize the YOLO model attributes such as the number of masks, prototypes, and the convolution layers."""
        super().__init__(nc, ch)
        self.nm = nm  # number of masks
        self.npr = npr  # number of protos
        self.proto = Proto(ch[0], self.npr, self.nm)  # protos

        c4 = max(ch[0] // 4, self.nm)
        self.cv4 = nn.ModuleList(nn.Sequential(Conv(x, c4, 3), Conv(c4, c4, 3), nn.Conv2d(c4, self.nm, 1)) for x in ch)

    def forward(self, x):
        """Return model outputs and mask coefficients if training, otherwise return outputs and mask coefficients."""
        p = self.proto(x[0])  # mask protos
        bs = p.shape[0]  # batch size

        mc = torch.cat([self.cv4[i](x[i]).view(bs, self.nm, -1) for i in range(self.nl)], 2)  # mask coefficients
        x = Detect.forward(self, x)
        if self.training:
            return x, mc, p
        return (torch.cat([x, mc], 1), p) if self.export else (torch.cat([x[0], mc], 1), (x[1], mc, p))


class OBB(Detect):
    """YOLOv8 OBB detection head for detection with rotation models."""

    def __init__(self, nc=80, ne=1, ch=()):
        """Initialize OBB with number of classes `nc` and layer channels `ch`."""
        super().__init__(nc, ch)
        self.ne = ne  # number of extra parameters

        c4 = max(ch[0] // 4, self.ne)
        self.cv4 = nn.ModuleList(nn.Sequential(Conv(x, c4, 3), Conv(c4, c4, 3), nn.Conv2d(c4, self.ne, 1)) for x in ch)

    def forward(self, x):
        """Concatenates and returns predicted bounding boxes and class probabilities."""
        bs = x[0].shape[0]  # batch size
        angle = torch.cat([self.cv4[i](x[i]).view(bs, self.ne, -1) for i in range(self.nl)], 2)  # OBB theta logits
        # NOTE: set `angle` as an attribute so that `decode_bboxes` could use it.
        angle = (angle.sigmoid() - 0.25) * math.pi  # [-pi/4, 3pi/4]
        # angle = angle.sigmoid() * math.pi / 2  # [0, pi/2]
        if not self.training:
            self.angle = angle  # Store the adjusted angles for use in bbox decoding
        x = Detect.forward(self, x)
        if self.training:
            return x, angle
        return torch.cat([x, angle], 1) if self.export else (torch.cat([x[0], angle], 1), (x[1], angle))

    def decode_bboxes(self, bboxes, anchors):
        """Decode rotated bounding boxes using stored `angle` attribute."""
        return dist2rbox(bboxes, self.angle, anchors, dim=1)


class Pose(Detect):
    """YOLOv8 Pose head for keypoints models."""
    def __init__(self, nc=80, kpt_shape=(17, 3), ch=()):
        """
        Initialize YOLO network with default parameters and Convolutional Layers.
        """
        super().__init__(nc, ch)
        self.kpt_shape = kpt_shape  # number of keypoints, number of dims (2 for x,y or 3 for x,y,visible)
        self.nk = kpt_shape[0] * kpt_shape[1]  # number of keypoints total

        # Calculate c4 as the maximum of the first channel's size divided by 4 or number of keypoints
        c4 = max(ch[0] // 4, self.nk)
        # Initialize cv4 as a list of convolutional layers for each channel in ch
        self.cv4 = nn.ModuleList(nn.Sequential(Conv(x, c4, 3), Conv(c4, c4, 3), nn.Conv2d(c4, self.nk, 1)) for x in ch)

    def forward(self, x):
        """
        Perform forward pass through YOLO model and return predictions.
        """
        bs = x[0].shape[0]  # batch size
        # Perform convolution operations on each input x[i] and concatenate results
        kpt = torch.cat([self.cv4[i](x[i]).view(bs, self.nk, -1) for i in range(self.nl)], -1)  # (bs, 17*3, h*w)
        x = Detect.forward(self, x)
        if self.training:
            return x, kpt
        # Decode keypoints and concatenate with x if not in export mode
        pred_kpt = self.kpts_decode(bs, kpt)
        return torch.cat([x, pred_kpt], 1) if self.export else (torch.cat([x[0], pred_kpt], 1), (x[1], kpt))

    def kpts_decode(self, bs, kpts):
        """
        Decodes keypoints.
        """
        ndim = self.kpt_shape[1]
        if self.export:  # required for TFLite export to avoid 'PLACEHOLDER_FOR_GREATER_OP_CODES' bug
            # Reshape kpts to match self.kpt_shape and compute absolute positions based on anchors and strides
            y = kpts.view(bs, *self.kpt_shape, -1)
            a = (y[:, :, :2] * 2.0 + (self.anchors - 0.5)) * self.strides
            if ndim == 3:
                a = torch.cat((a, y[:, :, 2:3].sigmoid()), 2)
            return a.view(bs, self.nk, -1)
        else:
            y = kpts.clone()
            if ndim == 3:
                # Apply sigmoid function to the third dimension of y
                y[:, 2::3] = y[:, 2::3].sigmoid()  # sigmoid (WARNING: inplace .sigmoid_() Apple MPS bug)
            # Compute absolute positions for x and y coordinates based on anchors and strides
            y[:, 0::ndim] = (y[:, 0::ndim] * 2.0 + (self.anchors[0] - 0.5)) * self.strides
            y[:, 1::ndim] = (y[:, 1::ndim] * 2.0 + (self.anchors[1] - 0.5)) * self.strides
            return y


class Classify(nn.Module):
    """YOLOv8 classification head, i.e. x(b,c1,20,20) to x(b,c2)."""

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1):
        """
        Initializes YOLOv8 classification head with specified input and output channels, kernel size, stride,
        padding, and groups.
        """
        super().__init__()
        c_ = 1280  # efficientnet_b0 size

        # Convolution from c1 to c_ channels with the given kernel size k, stride s, padding p and groups g
        self.conv = Conv(c1, c_, k, s, p, g)

        # Adaptive average pooling reducing the feature map to x(b, c_, 1, 1)
        self.pool = nn.AdaptiveAvgPool2d(1)

        # Dropout layer with probability 0.0, applied in place
        self.drop = nn.Dropout(p=0.0, inplace=True)

        # Linear layer mapping c_ features to c2 output classes
        self.linear = nn.Linear(c_, c2)  # to x(b,c2)

    def forward(self, x):
        """Performs a forward pass of the YOLO model on input image data."""
        if isinstance(x, list):
            # If the input is a list, concatenate its tensors along dimension 1
            x = torch.cat(x, 1)

        # Forward pass: conv -> pool -> dropout -> linear, then softmax at inference time (not during training)
        x = self.linear(self.drop(self.pool(self.conv(x)).flatten(1)))
        return x if self.training else x.softmax(1)

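# Usage sketch (not part of the source): the Classify head maps a (b, c1, 20, 20) feature map to
# (b, c2) class probabilities in eval mode (softmax is only applied outside of training).
import torch

cls_head = Classify(c1=256, c2=10).eval()
probs = cls_head(torch.randn(4, 256, 20, 20))
print(probs.shape, float(probs[0].sum()))  # torch.Size([4, 10]) and each row sums to ~1.0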

class WorldDetect(Detect):
    """Head for integrating YOLOv8 detection models with semantic understanding from text embeddings."""

    def __init__(self, nc=80, embed=512, with_bn=False, ch=()):
        """
        Initialize YOLOv8 detection layer with nc classes and layer channels ch.
        """
        super().__init__(nc, ch)

        # c3 is the larger of ch[0] and min(self.nc, 100)
        c3 = max(ch[0], min(self.nc, 100))

        # ModuleList of per-level embedding branches, each Conv -> Conv -> nn.Conv2d projecting to embed channels
        self.cv3 = nn.ModuleList(nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, embed, 1)) for x in ch)

        # ModuleList of contrastive heads; use the BatchNorm variant when with_bn is True
        self.cv4 = nn.ModuleList(BNContrastiveHead(embed) if with_bn else ContrastiveHead() for _ in ch)

    def forward(self, x, text):
        """Concatenates and returns predicted bounding boxes and class probabilities."""
        # Process each detection level
        for i in range(self.nl):
            # Run the box branch on x[i] and the text-conditioned contrastive head on the embedding branch, then concatenate
            x[i] = torch.cat((self.cv2[i](x[i]), self.cv4[i](self.cv3[i](x[i]), text)), 1)
        if self.training:
            # Return the processed feature maps directly during training
            return x

        # Inference path
        shape = x[0].shape  # BCHW
        # Concatenate the per-level predictions along dimension 2
        x_cat = torch.cat([xi.view(shape[0], self.nc + self.reg_max * 4, -1) for xi in x], 2)
        if self.dynamic or self.shape != shape:
            # Regenerate anchors and strides when in dynamic mode or when the input shape changes
            self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5))
            self.shape = shape

        if self.export and self.format in {"saved_model", "pb", "tflite", "edgetpu", "tfjs"}:  # avoid TF FlexSplitV ops
            # In export mode for these TensorFlow formats, slice boxes and class scores separately
            box = x_cat[:, : self.reg_max * 4]
            cls = x_cat[:, self.reg_max * 4 :]
        else:
            # Otherwise split into box (reg_max * 4) and class (nc) predictions
            box, cls = x_cat.split((self.reg_max * 4, self.nc), 1)

        if self.export and self.format in {"tflite", "edgetpu"}:
            # Precompute the normalization factor to increase numerical stability
            # See https://github.com/ultralytics/ultralytics/issues/7371
            grid_h = shape[2]
            grid_w = shape[3]
            grid_size = torch.tensor([grid_w, grid_h, grid_w, grid_h], device=box.device).reshape(1, 4, 1)
            norm = self.strides / (self.stride[0] * grid_size)
            # Decode the boxes and multiply by the normalization factor
            dbox = self.decode_bboxes(self.dfl(box) * norm, self.anchors.unsqueeze(0) * norm[:, :2])
        else:
            # Otherwise decode the boxes and multiply by the strides
            dbox = self.decode_bboxes(self.dfl(box), self.anchors.unsqueeze(0)) * self.strides

        # Concatenate the decoded boxes with the sigmoid-activated class scores
        y = torch.cat((dbox, cls.sigmoid()), 1)
        # Return only y in export mode, otherwise return y together with x
        return y if self.export else (y, x)

    def bias_init(self):
        """Initialize Detect() biases, WARNING: requires stride availability."""
        m = self  # self.model[-1]  # Detect() module
        # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1
        # ncf = math.log(0.6 / (m.nc - 0.999999)) if cf is None else torch.log(cf / cf.sum())  # nominal class frequency
        for a, b, s in zip(m.cv2, m.cv3, m.stride):  # from
            # Initialize the box-branch bias
            a[-1].bias.data[:] = 1.0  # box
            # Class-branch bias initialization (kept commented out in the source)
            # b[-1].bias.data[:] = math.log(5 / m.nc / (640 / s) ** 2)  # cls (.01 objects, 80 classes, 640 img)


# Real-Time Deformable Transformer Decoder (RTDETRDecoder) class for object detection
class RTDETRDecoder(nn.Module):
    """
    Real-Time Deformable Transformer Decoder (RTDETRDecoder) module for object detection.
    
    This decoder module utilizes Transformer architecture along with deformable convolutions to predict bounding boxes
    and class labels for objects in an image. It integrates features from multiple layers and runs through a series of
    Transformer decoder layers to output the final predictions.
    """

    # Export-mode flag, False by default (not in export mode)
    export = False  # export mode

    def __init__(
        self,
        nc=80,  # number of classes, default 80
        ch=(512, 1024, 2048),  # backbone feature channels as a tuple of three values
        hd=256,  # hidden dimension, default 256
        nq=300,  # number of queries, default 300
        ndp=4,  # number of decoder points, default 4
        nh=8,  # number of attention heads, default 8
        ndl=6,  # number of decoder layers, default 6
        d_ffn=1024,  # dimension of the feed-forward network, default 1024
        dropout=0.0,  # dropout probability, default 0.0 (no dropout)
        act=nn.ReLU(),  # activation function, default ReLU
        eval_idx=-1,  # evaluation index, default -1
        # Training args
        nd=100,  # number of denoising queries, default 100
        label_noise_ratio=0.5,  # label noise ratio, default 0.5
        box_noise_scale=1.0,  # box noise scale, default 1.0
        learnt_init_query=False,  # whether to learn the initial query embeddings, default False
    ):
        """
        Initializes the RTDETRDecoder module with the given parameters.

        Args:
            nc (int): Number of classes. Default is 80.
            ch (tuple): Channels in the backbone feature maps. Default is (512, 1024, 2048).
            hd (int): Dimension of hidden layers. Default is 256.
            nq (int): Number of query points. Default is 300.
            ndp (int): Number of decoder points. Default is 4.
            nh (int): Number of heads in multi-head attention. Default is 8.
            ndl (int): Number of decoder layers. Default is 6.
            d_ffn (int): Dimension of the feed-forward networks. Default is 1024.
            dropout (float): Dropout rate. Default is 0.
            act (nn.Module): Activation function. Default is nn.ReLU.
            eval_idx (int): Evaluation index. Default is -1.
            nd (int): Number of denoising. Default is 100.
            label_noise_ratio (float): Label noise ratio. Default is 0.5.
            box_noise_scale (float): Box noise scale. Default is 1.0.
            learnt_init_query (bool): Whether to learn initial query embeddings. Default is False.
        """
        # Initialize the superclass (nn.Module) to inherit its methods and attributes
        super().__init__()
        
        # Set the hidden dimension attribute
        self.hidden_dim = hd
        
        # Set the number of attention heads attribute
        self.nhead = nh
        
        # Determine the number of levels in the backbone feature maps
        self.nl = len(ch)  # num level
        
        # Set the number of classes attribute
        self.nc = nc
        
        # Set the number of query points attribute
        self.num_queries = nq
        
        # Set the number of decoder layers attribute
        self.num_decoder_layers = ndl

        # Backbone feature projection
        # Create a list of nn.Sequential modules for projecting each backbone feature map to hd dimensions
        self.input_proj = nn.ModuleList(nn.Sequential(nn.Conv2d(x, hd, 1, bias=False), nn.BatchNorm2d(hd)) for x in ch)
        # NOTE: simplified version but it's not consistent with .pt weights.
        # self.input_proj = nn.ModuleList(Conv(x, hd, act=False) for x in ch)

        # Transformer module
        # Initialize the transformer decoder layer with specified parameters
        decoder_layer = DeformableTransformerDecoderLayer(hd, nh, d_ffn, dropout, act, self.nl, ndp)
        # Initialize the transformer decoder module using the created decoder layer
        self.decoder = DeformableTransformerDecoder(hd, decoder_layer, ndl, eval_idx)

        # Denoising part
        # Initialize an embedding layer for denoising classes
        self.denoising_class_embed = nn.Embedding(nc, hd)
        # Set the number of denoising iterations attribute
        self.num_denoising = nd
        # Set the label noise ratio attribute
        self.label_noise_ratio = label_noise_ratio
        # Set the box noise scale attribute
        self.box_noise_scale = box_noise_scale

        # Decoder embedding
        # Initialize query embeddings if specified to be learned
        self.learnt_init_query = learnt_init_query
        if learnt_init_query:
            self.tgt_embed = nn.Embedding(nq, hd)
        # Initialize a multi-layer perceptron for query position encoding
        self.query_pos_head = MLP(4, 2 * hd, hd, num_layers=2)

        # Encoder head
        # Sequentially apply linear transformation and layer normalization for encoder output
        self.enc_output = nn.Sequential(nn.Linear(hd, hd), nn.LayerNorm(hd))
        # Linear layer for predicting scores in encoder
        self.enc_score_head = nn.Linear(hd, nc)
        # Multi-layer perceptron for bounding box prediction in encoder
        self.enc_bbox_head = MLP(hd, hd, 4, num_layers=3)

        # Decoder head
        # Create a list of linear layers for predicting scores in each decoder layer
        self.dec_score_head = nn.ModuleList([nn.Linear(hd, nc) for _ in range(ndl)])
        # Create a list of multi-layer perceptrons for bounding box prediction in each decoder layer
        self.dec_bbox_head = nn.ModuleList([MLP(hd, hd, 4, num_layers=3) for _ in range(ndl)])

        # Initialize parameters for the module
        self._reset_parameters()

    def forward(self, x, batch=None):
        """
        Runs the forward pass of the module, returning bounding box and classification scores for the input.
        """
        from ultralytics.models.utils.ops import get_cdn_group

        # Input projection and embedding
        feats, shapes = self._get_encoder_input(x)

        # Prepare denoising training
        dn_embed, dn_bbox, attn_mask, dn_meta = get_cdn_group(
            batch,
            self.nc,
            self.num_queries,
            self.denoising_class_embed.weight,
            self.num_denoising,
            self.label_noise_ratio,
            self.box_noise_scale,
            self.training,
        )

        embed, refer_bbox, enc_bboxes, enc_scores = self._get_decoder_input(feats, shapes, dn_embed, dn_bbox)

        # Decoder
        dec_bboxes, dec_scores = self.decoder(
            embed,
            refer_bbox,
            feats,
            shapes,
            self.dec_bbox_head,
            self.dec_score_head,
            self.query_pos_head,
            attn_mask=attn_mask,
        )
        x = dec_bboxes, dec_scores, enc_bboxes, enc_scores, dn_meta
        if self.training:
            return x
        # (bs, 300, 4+nc)
        y = torch.cat((dec_bboxes.squeeze(0), dec_scores.squeeze(0).sigmoid()), -1)
        return y if self.export else (y, x)

    def _generate_anchors(self, shapes, grid_size=0.05, dtype=torch.float32, device="cpu", eps=1e-2):
        """
        Generates anchor bounding boxes for given shapes with specific grid size and validates them.
        """
        anchors = []
        for i, (h, w) in enumerate(shapes):
            sy = torch.arange(end=h, dtype=dtype, device=device)
            sx = torch.arange(end=w, dtype=dtype, device=device)
            grid_y, grid_x = torch.meshgrid(sy, sx, indexing="ij") if TORCH_1_10 else torch.meshgrid(sy, sx)
            grid_xy = torch.stack([grid_x, grid_y], -1)  # (h, w, 2)

            valid_WH = torch.tensor([w, h], dtype=dtype, device=device)
            grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH  # (1, h, w, 2)
            wh = torch.ones_like(grid_xy, dtype=dtype, device=device) * grid_size * (2.0**i)
            anchors.append(torch.cat([grid_xy, wh], -1).view(-1, h * w, 4))  # (1, h*w, 4)

        anchors = torch.cat(anchors, 1)  # (1, h*w*nl, 4)
        valid_mask = ((anchors > eps) & (anchors < 1 - eps)).all(-1, keepdim=True)  # 1, h*w*nl, 1
        anchors = torch.log(anchors / (1 - anchors))
        anchors = anchors.masked_fill(~valid_mask, float("inf"))
        return anchors, valid_mask
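
# Sketch (not part of the source): the anchors above are stored as logits (inverse sigmoid), so the
# bbox head can predict additive offsets and a later sigmoid() maps the sum back into (0, 1).
import torch

xy = torch.tensor([[0.25, 0.50]])       # a normalized anchor centre inside the image
logit = torch.log(xy / (1 - xy))        # same transform as in _generate_anchors
print(torch.sigmoid(logit))             # tensor([[0.2500, 0.5000]]) -> the round trip recovers the anchor
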
    def _get_encoder_input(self, x):
        """Processes and returns encoder inputs by getting projection features from input and concatenating them."""
        # Project each backbone feature map to the hidden dimension
        x = [self.input_proj[i](feat) for i, feat in enumerate(x)]
        # Build the encoder inputs
        feats = []
        shapes = []
        for feat in x:
            h, w = feat.shape[2:]
            # Flatten and transpose each feature map for the encoder: [b, c, h, w] -> [b, h*w, c]
            feats.append(feat.flatten(2).permute(0, 2, 1))
            # Record the height and width of this level
            shapes.append([h, w])

        # Concatenate all levels: [b, h*w, c]
        feats = torch.cat(feats, 1)
        return feats, shapes

    def _get_decoder_input(self, feats, shapes, dn_embed=None, dn_bbox=None):
        """Generates and prepares the input required for the decoder from the provided features and shapes."""
        bs = feats.shape[0]
        # Prepare the decoder inputs
        anchors, valid_mask = self._generate_anchors(shapes, dtype=feats.dtype, device=feats.device)
        features = self.enc_output(valid_mask * feats)  # bs, h*w, 256

        enc_outputs_scores = self.enc_score_head(features)  # (bs, h*w, nc)

        # Query selection
        # Indices of the top num_queries scores per sample (bs, num_queries)
        topk_ind = torch.topk(enc_outputs_scores.max(-1).values, self.num_queries, dim=1).indices.view(-1)
        # Tensor of batch indices matching topk_ind (bs, num_queries)
        batch_ind = torch.arange(end=bs, dtype=topk_ind.dtype).unsqueeze(-1).repeat(1, self.num_queries).view(-1)

        # Features selected by topk_ind (bs, num_queries, 256)
        top_k_features = features[batch_ind, topk_ind].view(bs, self.num_queries, -1)
        # Anchors selected by topk_ind (bs, num_queries, 4)
        top_k_anchors = anchors[:, topk_ind].view(bs, self.num_queries, -1)

        # Dynamic anchors + static content
        refer_bbox = self.enc_bbox_head(top_k_features) + top_k_anchors

        # Apply sigmoid to the encoder bounding boxes
        enc_bboxes = refer_bbox.sigmoid()
        if dn_bbox is not None:
            # If denoising boxes dn_bbox are provided, prepend them to refer_bbox
            refer_bbox = torch.cat([dn_bbox, refer_bbox], 1)
        enc_scores = enc_outputs_scores[batch_ind, topk_ind].view(bs, self.num_queries, -1)

        # Target embeddings for the decoder queries
        embeddings = self.tgt_embed.weight.unsqueeze(0).repeat(bs, 1, 1) if self.learnt_init_query else top_k_features
        if self.training:
            # During training, detach refer_bbox (and the embeddings when they are not learnt) from the graph
            refer_bbox = refer_bbox.detach()
            if not self.learnt_init_query:
                embeddings = embeddings.detach()
        if dn_embed is not None:
            # If denoising embeddings dn_embed are provided, prepend them to the query embeddings
            embeddings = torch.cat([dn_embed, embeddings], 1)

        return embeddings, refer_bbox, enc_bboxes, enc_scores

    # TODO
    def _reset_parameters(self):
        """
        Initializes or resets the parameters of the model's various components with predefined weights and biases.
        """
        # Class and bbox head init
        bias_cls = bias_init_with_prob(0.01) / 80 * self.nc
        # NOTE: the weight initialization in `linear_init` would cause NaN when training with custom datasets.

        # Set the bias of the encoder classification head
        constant_(self.enc_score_head.bias, bias_cls)

        # Initialize the last layer of the encoder bbox head with zero weight and bias
        constant_(self.enc_bbox_head.layers[-1].weight, 0.0)
        constant_(self.enc_bbox_head.layers[-1].bias, 0.0)

        # Initialize every decoder score head and bbox head
        for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head):
            # Set the bias of the decoder classification head
            constant_(cls_.bias, bias_cls)
            # Initialize the last layer of the decoder bbox head with zero weight and bias
            constant_(reg_.layers[-1].weight, 0.0)
            constant_(reg_.layers[-1].bias, 0.0)

        # Initialize the encoder output head
        linear_init(self.enc_output[0])
        xavier_uniform_(self.enc_output[0].weight)

        # Xavier-uniform initialization of the target embeddings when initial queries are learnt
        if self.learnt_init_query:
            xavier_uniform_(self.tgt_embed.weight)

        # Xavier-uniform initialization of the query position head weights
        xavier_uniform_(self.query_pos_head.layers[0].weight)
        xavier_uniform_(self.query_pos_head.layers[1].weight)

        # Xavier-uniform initialization of the first layer of each input projection block
        for layer in self.input_proj:
            xavier_uniform_(layer[0].weight)


class v10Detect(Detect):
    """
    v10 Detection head from https://arxiv.org/pdf/2405.14458

    Args:
        nc (int): Number of classes.
        ch (tuple): Tuple of channel sizes.

    Attributes:
        max_det (int): Maximum number of detections.

    Methods:
        __init__(self, nc=80, ch=()): Initializes the v10Detect object.
        forward(self, x): Performs forward pass of the v10Detect module.
        bias_init(self): Initializes biases of the Detect module.

    """

    def __init__(self, nc=80, ch=()):
        """初始化v10Detect对象,设置类的属性和输入参数"""
        # 调用父类的初始化方法,设置类的属性nc和ch
        super().__init__(nc, ch)
        # 根据输入通道数,计算第一个卷积层的输出通道数c3
        c3 = max(ch[0], min(self.nc, 100))  # channels
        # 创建一个ModuleList,包含多个Sequential模块,每个模块用于不同的通道数x
        self.cv3 = nn.ModuleList(
            nn.Sequential(
                nn.Sequential(Conv(x, x, 3, g=x), Conv(x, c3, 1)),
                nn.Sequential(Conv(c3, c3, 3, g=c3), Conv(c3, c3, 1)),
                nn.Conv2d(c3, self.nc, 1),
            )
            for x in ch
        )
        # Deep copy of self.cv3 used as the one-to-one classification branch
        self.one2one_cv3 = copy.deepcopy(self.cv3)

.\yolov8\ultralytics\nn\modules\transformer.py

# Import the required libraries and modules
"""Transformer modules."""

import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.init import constant_, xavier_uniform_

# Import custom modules and helper functions
from .conv import Conv
from .utils import _get_clones, inverse_sigmoid, multi_scale_deformable_attn_pytorch

__all__ = (
    "TransformerEncoderLayer",
    "TransformerLayer",
    "TransformerBlock",
    "MLPBlock",
    "LayerNorm2d",
    "AIFI",
    "DeformableTransformerDecoder",
    "DeformableTransformerDecoderLayer",
    "MSDeformAttn",
    "MLP",
)

# Transformer encoder layer class
class TransformerEncoderLayer(nn.Module):
    """Defines a single layer of the transformer encoder."""

    def __init__(self, c1, cm=2048, num_heads=8, dropout=0.0, act=nn.GELU(), normalize_before=False):
        """Initialize the TransformerEncoderLayer with specified parameters."""
        super().__init__()

        # Check that the required PyTorch version is available
        from ...utils.torch_utils import TORCH_1_9
        if not TORCH_1_9:
            raise ModuleNotFoundError(
                "TransformerEncoderLayer() requires torch>=1.9 to use nn.MultiheadAttention(batch_first=True)."
            )

        # Multi-head attention
        self.ma = nn.MultiheadAttention(c1, num_heads, dropout=dropout, batch_first=True)

        # Feed-forward network
        self.fc1 = nn.Linear(c1, cm)  # first fully connected layer
        self.fc2 = nn.Linear(cm, c1)  # second fully connected layer

        # Layer normalization
        self.norm1 = nn.LayerNorm(c1)
        self.norm2 = nn.LayerNorm(c1)

        # Dropout layers
        self.dropout = nn.Dropout(dropout)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        # Activation function
        self.act = act

        # Whether to apply layer normalization before the attention/FFN (pre-norm) instead of after (post-norm)
        self.normalize_before = normalize_before

    @staticmethod
    def with_pos_embed(tensor, pos=None):
        """Add position embeddings to the tensor if provided."""
        return tensor if pos is None else tensor + pos

    def forward_post(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
        """Performs forward pass with post-normalization."""
        # Add position embeddings to queries and keys
        q = k = self.with_pos_embed(src, pos)

        # Multi-head self-attention
        src2 = self.ma(q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]

        # Residual connection and dropout
        src = src + self.dropout1(src2)

        # Layer normalization
        src = self.norm1(src)

        # Second stage: feed-forward network with activation
        src2 = self.fc2(self.dropout(self.act(self.fc1(src))))

        # Residual connection and dropout
        src = src + self.dropout2(src2)

        # Final layer normalization
        return self.norm2(src)

    def forward_pre(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
        """Performs forward pass with pre-normalization."""
        # Layer normalization first (pre-norm)
        src2 = self.norm1(src)

        # Add position embeddings to queries and keys
        q = k = self.with_pos_embed(src2, pos)

        # Multi-head self-attention
        src2 = self.ma(q, k, value=src2, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]

        # Residual connection and dropout
        src = src + self.dropout1(src2)

        # Second layer normalization
        src2 = self.norm2(src)

        # Second stage: feed-forward network with activation
        src2 = self.fc2(self.dropout(self.act(self.fc1(src2))))

        # Residual connection
        return src + self.dropout2(src2)

    # Forward-propagates the input through this encoder layer
    def forward(self, src, src_mask=None, src_key_padding_mask=None, pos=None):
        """Forward propagates the input through the encoder module."""
        # Use the pre-normalization path when normalize_before is set
        if self.normalize_before:
            return self.forward_pre(src, src_mask, src_key_padding_mask, pos)
        # Otherwise use the post-normalization path
        return self.forward_post(src, src_mask, src_key_padding_mask, pos)


class AIFI(TransformerEncoderLayer):
    """Defines the AIFI transformer layer."""

    def __init__(self, c1, cm=2048, num_heads=8, dropout=0, act=nn.GELU(), normalize_before=False):
        """Initialize the AIFI instance with specified parameters."""
        super().__init__(c1, cm, num_heads, dropout, act, normalize_before)

    def forward(self, x):
        """Forward pass for the AIFI transformer layer."""
        c, h, w = x.shape[1:]
        # Build the 2D sine-cosine position embedding
        pos_embed = self.build_2d_sincos_position_embedding(w, h, c)
        # Flatten the input to [B, HxW, C] and run the transformer encoder layer
        x = super().forward(x.flatten(2).permute(0, 2, 1), pos=pos_embed.to(device=x.device, dtype=x.dtype))
        # Reshape back to the original [B, C, H, W] layout and make the tensor contiguous
        return x.permute(0, 2, 1).view([-1, c, h, w]).contiguous()

    @staticmethod
    def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.0):
        """Builds 2D sine-cosine position embedding."""
        assert embed_dim % 4 == 0, "Embed dimension must be divisible by 4 for 2D sin-cos position embedding"
        # Create the grid coordinates
        grid_w = torch.arange(w, dtype=torch.float32)
        grid_h = torch.arange(h, dtype=torch.float32)
        grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij")
        pos_dim = embed_dim // 4
        # Compute the frequency factors
        omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
        omega = 1.0 / (temperature**omega)

        # Outer products of the grid coordinates with the frequencies
        out_w = grid_w.flatten()[..., None] @ omega[None]
        out_h = grid_h.flatten()[..., None] @ omega[None]

        # Concatenate the sine and cosine encodings
        return torch.cat([torch.sin(out_w), torch.cos(out_w), torch.sin(out_h), torch.cos(out_h)], 1)[None]
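# Shape sketch (not part of the source): for a w x h grid the static method above returns a
# (1, w*h, embed_dim) embedding made of four sin/cos groups of embed_dim // 4 channels each.
print(AIFI.build_2d_sincos_position_embedding(w=20, h=20, embed_dim=256).shape)  # torch.Size([1, 400, 256])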


class TransformerLayer(nn.Module):
    """Transformer layer https://arxiv.org/abs/2010.11929 (LayerNorm layers removed for better performance)."""

    def __init__(self, c, num_heads):
        """Initializes a self-attention mechanism using linear transformations and multi-head attention."""
        super().__init__()
        # Linear projections for query, key and value
        self.q = nn.Linear(c, c, bias=False)
        self.k = nn.Linear(c, c, bias=False)
        self.v = nn.Linear(c, c, bias=False)
        # Multi-head attention
        self.ma = nn.MultiheadAttention(embed_dim=c, num_heads=num_heads)
        # Two linear layers forming the feed-forward part
        self.fc1 = nn.Linear(c, c, bias=False)
        self.fc2 = nn.Linear(c, c, bias=False)

    def forward(self, x):
        """Apply a transformer block to the input x and return the output."""
        # Self-attention with a residual connection
        x = self.ma(self.q(x), self.k(x), self.v(x))[0] + x
        # Two linear layers with a residual connection
        return self.fc2(self.fc1(x)) + x


class TransformerBlock(nn.Module):
    """Vision Transformer https://arxiv.org/abs/2010.11929."""

    def __init__(self, c1, c2, num_heads, num_layers):
        """Initialize a Transformer module with position embedding and specified number of heads and layers."""
        super().__init__()
        self.conv = None
        # Add a convolution layer when the input and output channel counts differ
        if c1 != c2:
            self.conv = Conv(c1, c2)
        # Learnable position embedding
        self.linear = nn.Linear(c2, c2)
        # Sequence of stacked Transformer layers
        self.tr = nn.Sequential(*(TransformerLayer(c2, num_heads) for _ in range(num_layers)))
        self.c2 = c2

    def forward(self, x):
        """Forward propagates the input through the bottleneck module."""
        # If a convolution layer exists, pass x through it first
        if self.conv is not None:
            x = self.conv(x)
        # Shape of x: batch size b, channels _, width w, height h
        b, _, w, h = x.shape
        # Flatten the spatial dimensions and permute to (sequence, batch, channels)
        p = x.flatten(2).permute(2, 0, 1)
        # Add the learnable position embedding, run the Transformer layers,
        # then permute back and reshape to (b, self.c2, w, h)
        return self.tr(p + self.linear(p)).permute(1, 2, 0).reshape(b, self.c2, w, h)


# Implements a single block of a multi-layer perceptron (MLP)
class MLPBlock(nn.Module):
    """Implements a single block of a multi-layer perceptron."""

    def __init__(self, embedding_dim, mlp_dim, act=nn.GELU):
        """Initialize the MLPBlock with specified embedding dimension, MLP dimension, and activation function."""
        super().__init__()
        # First linear layer mapping embedding_dim to mlp_dim
        self.lin1 = nn.Linear(embedding_dim, mlp_dim)
        # Second linear layer mapping mlp_dim back to embedding_dim
        self.lin2 = nn.Linear(mlp_dim, embedding_dim)
        # Activation function, GELU by default
        self.act = act()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass for the MLPBlock."""
        # Apply the first linear layer and activation, then the second linear layer
        return self.lin2(self.act(self.lin1(x)))


# Implements a simple multi-layer perceptron (MLP), also called a feed-forward network (FFN)
class MLP(nn.Module):
    """Implements a simple multi-layer perceptron (also called FFN)."""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        """Initialize the MLP with specified input, hidden, output dimensions and number of layers."""
        super().__init__()
        self.num_layers = num_layers
        # Build the list of linear layers from the number of layers and the per-layer dimensions
        h = [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))

    def forward(self, x):
        """Forward pass for the entire MLP."""
        # Propagate layer by layer, applying ReLU to all but the last layer
        for i, layer in enumerate(self.layers):
            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
        return x


# 2D layer normalization module inspired by the Detectron2 and ConvNeXt implementations
class LayerNorm2d(nn.Module):
    """
    2D Layer Normalization module inspired by Detectron2 and ConvNeXt implementations.

    Original implementations in
    https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py
    and
    https://github.com/facebookresearch/ConvNeXt/blob/main/models/convnext.py.
    """

    def __init__(self, num_channels, eps=1e-6):
        """Initialize LayerNorm2d with the given parameters."""
        super().__init__()
        # Learnable scale (weight) and shift (bias) parameters of the normalization layer
        self.weight = nn.Parameter(torch.ones(num_channels))
        self.bias = nn.Parameter(torch.zeros(num_channels))
        self.eps = eps

    def forward(self, x):
        """Perform forward pass for 2D layer normalization."""
        # 对输入进行二维的层归一化计算
        u = x.mean(1, keepdim=True)
        s = (x - u).pow(2).mean(1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.eps)
        return self.weight[:, None, None] * x + self.bias[:, None, None]
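
Editor's note: the forward pass above normalizes over the channel dimension at every spatial position. A minimal numerical check (it reuses torch and F as already imported in this module):

# Editor's sketch: LayerNorm2d should match F.layer_norm applied to the
# channels-last view of the same tensor.
ln = LayerNorm2d(64)
x = torch.randn(2, 64, 8, 8)
ref = F.layer_norm(x.permute(0, 2, 3, 1), (64,), ln.weight, ln.bias, ln.eps)
assert torch.allclose(ln(x), ref.permute(0, 3, 1, 2), atol=1e-5)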


# 实现基于Deformable-DETR和PaddleDetection实现的多尺度可变形注意力模块
class MSDeformAttn(nn.Module):
    """
    Multiscale Deformable Attention Module based on Deformable-DETR and PaddleDetection implementations.

    https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/modules/ms_deform_attn.py
    """
    def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
        """Initialize MSDeformAttn with the given parameters."""
        # 调用父类的初始化方法
        super().__init__()
        # 检查是否满足 d_model 可以被 n_heads 整除的条件,否则抛出错误
        if d_model % n_heads != 0:
            raise ValueError(f"d_model must be divisible by n_heads, but got {d_model} and {n_heads}")
        # 计算每个头部的维度
        _d_per_head = d_model // n_heads
        # 断言 d_model 必须能够被 n_heads 整除,用于检查计算的正确性
        assert _d_per_head * n_heads == d_model, "`d_model` must be divisible by `n_heads`"

        # 设置 im2col 操作的步长
        self.im2col_step = 64

        # 初始化各个模型参数
        self.d_model = d_model
        self.n_levels = n_levels
        self.n_heads = n_heads
        self.n_points = n_points

        # 创建用于偏移量的线性层,输入维度为 d_model,输出维度为 n_heads * n_levels * n_points * 2
        self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
        # 创建用于注意力权重的线性层,输入维度为 d_model,输出维度为 n_heads * n_levels * n_points
        self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
        # 创建用于值投影的线性层,输入维度和输出维度都是 d_model
        self.value_proj = nn.Linear(d_model, d_model)
        # 创建用于输出投影的线性层,输入维度和输出维度都是 d_model
        self.output_proj = nn.Linear(d_model, d_model)

        # 调用内部方法,初始化模型参数
        self._reset_parameters()

    def _reset_parameters(self):
        """Reset module parameters."""
        # 将 sampling_offsets 的权重初始化为常数 0.0
        constant_(self.sampling_offsets.weight.data, 0.0)
        # 生成一组角度 thetas,用于初始化采样网格
        thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
        # 将初始化的网格归一化,并重复以适应不同的 levels 和 points
        grid_init = (
            (grid_init / grid_init.abs().max(-1, keepdim=True)[0])
            .view(self.n_heads, 1, 1, 2)
            .repeat(1, self.n_levels, self.n_points, 1)
        )
        # 根据点的索引加权初始化网格的不同部分
        for i in range(self.n_points):
            grid_init[:, :, i, :] *= i + 1
        # 使用无梯度的方式将初始化后的网格作为偏置参数赋给 sampling_offsets
        with torch.no_grad():
            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
        # 将 attention_weights 的权重初始化为常数 0.0
        constant_(self.attention_weights.weight.data, 0.0)
        # 将 attention_weights 的偏置初始化为常数 0.0
        constant_(self.attention_weights.bias.data, 0.0)
        # 使用 xavier_uniform 方法初始化 value_proj 的权重
        xavier_uniform_(self.value_proj.weight.data)
        # 将 value_proj 的偏置初始化为常数 0.0
        constant_(self.value_proj.bias.data, 0.0)
        # 使用 xavier_uniform 方法初始化 output_proj 的权重
        xavier_uniform_(self.output_proj.weight.data)
        # 将 output_proj 的偏置初始化为常数 0.0
        constant_(self.output_proj.bias.data, 0.0)
    def forward(self, query, refer_bbox, value, value_shapes, value_mask=None):
        """
        Perform forward pass for multiscale deformable attention.

        https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py

        Args:
            query (torch.Tensor): [bs, query_length, C] 输入的查询张量,形状为 [批大小, 查询长度, 通道数]
            refer_bbox (torch.Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0),
                bottom-right (1, 1), including padding area 参考边界框张量,形状为 [批大小, 查询长度, 层级数, 2],表示区域范围在 [0, 1] 之间,左上角为 (0,0),右下角为 (1,1),包含填充区域
            value (torch.Tensor): [bs, value_length, C] 输入的值张量,形状为 [批大小, 值长度, 通道数]
            value_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] 不同层级的值张量形状列表
            value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements 值张量的掩码,形状为 [批大小, 值长度],True 表示非填充元素,False 表示填充元素

        Returns:
            output (Tensor): [bs, Length_{query}, C] 输出张量,形状为 [批大小, 查询长度, 通道数]
        """
        bs, len_q = query.shape[:2]  # 获取批大小和查询长度
        len_v = value.shape[1]  # 获取值张量的长度
        assert sum(s[0] * s[1] for s in value_shapes) == len_v  # 确保所有层级的值张量形状乘积等于值张量长度

        value = self.value_proj(value)  # 使用值投影函数处理值张量

        if value_mask is not None:
            value = value.masked_fill(value_mask[..., None], float(0))  # 根据值张量的掩码进行填充

        value = value.view(bs, len_v, self.n_heads, self.d_model // self.n_heads)  # 调整值张量的形状为 [批大小, 值长度, 头数, 模型维度//头数]

        sampling_offsets = self.sampling_offsets(query).view(bs, len_q, self.n_heads, self.n_levels, self.n_points, 2)
        # 计算采样偏移量,形状为 [批大小, 查询长度, 头数, 层级数, 采样点数, 2]

        attention_weights = self.attention_weights(query).view(bs, len_q, self.n_heads, self.n_levels * self.n_points)
        # 计算注意力权重,形状为 [批大小, 查询长度, 头数, 层级数 * 采样点数]

        attention_weights = F.softmax(attention_weights, -1).view(bs, len_q, self.n_heads, self.n_levels, self.n_points)
        # 对注意力权重进行 softmax 归一化,形状为 [批大小, 查询长度, 头数, 层级数, 采样点数]

        num_points = refer_bbox.shape[-1]  # 获取参考边界框张量的最后一个维度大小

        if num_points == 2:
            offset_normalizer = torch.as_tensor(value_shapes, dtype=query.dtype, device=query.device).flip(-1)
            add = sampling_offsets / offset_normalizer[None, None, None, :, None, :]
            sampling_locations = refer_bbox[:, :, None, :, None, :] + add
            # Add the normalized offsets to the 2-coordinate references; broadcasting gives
            # sampling locations of shape [bs, query_length, n_heads, n_levels, n_points, 2]

        elif num_points == 4:
            add = sampling_offsets / self.n_points * refer_bbox[:, :, None, :, None, 2:] * 0.5
            sampling_locations = refer_bbox[:, :, None, :, None, :2] + add
            # For 4-coordinate references (cx, cy, w, h), scale the offsets by half the box size;
            # broadcasting again gives sampling locations of shape [bs, query_length, n_heads, n_levels, n_points, 2]

        else:
            raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {num_points}.")
            # 如果参考边界框张量的最后一个维度不是 2 或 4,则引发值错误异常

        output = multi_scale_deformable_attn_pytorch(value, value_shapes, sampling_locations, attention_weights)
        # 使用多尺度可变形注意力函数计算输出结果

        return self.output_proj(output)
        # 对输出结果进行投影处理并返回
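
Editor-added shape sketch for the forward pass above (sizes are illustrative; two square feature levels keep the arithmetic easy to follow):

# Two feature levels, 16x16 and 8x8, flattened into one "value" sequence of
# length 320; refer_bbox supplies one normalized (x, y) reference per level.
attn = MSDeformAttn(d_model=256, n_levels=2, n_heads=8, n_points=4)
bs, len_q = 2, 100
value_shapes = [(16, 16), (8, 8)]
query = torch.randn(bs, len_q, 256)
value = torch.randn(bs, sum(h * w for h, w in value_shapes), 256)
refer_bbox = torch.rand(bs, len_q, 2, 2)            # [bs, len_q, n_levels, 2]
out = attn(query, refer_bbox, value, value_shapes)  # -> (2, 100, 256)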


class DeformableTransformerDecoderLayer(nn.Module):
    """
    Deformable Transformer Decoder Layer inspired by PaddleDetection and Deformable-DETR implementations.

    https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py
    https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/deformable_transformer.py
    """

    def __init__(self, d_model=256, n_heads=8, d_ffn=1024, dropout=0.0, act=nn.ReLU(), n_levels=4, n_points=4):
        """Initialize the DeformableTransformerDecoderLayer with the given parameters."""
        super().__init__()

        # Self attention
        self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)  # 创建自注意力层
        self.dropout1 = nn.Dropout(dropout)  # 定义第一层dropout
        self.norm1 = nn.LayerNorm(d_model)  # 定义第一层Layer Normalization

        # Cross attention
        self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points)  # 创建交叉注意力层
        self.dropout2 = nn.Dropout(dropout)  # 定义第二层dropout
        self.norm2 = nn.LayerNorm(d_model)  # 定义第二层Layer Normalization

        # FFN
        self.linear1 = nn.Linear(d_model, d_ffn)  # 第一层线性变换
        self.act = act  # 激活函数
        self.dropout3 = nn.Dropout(dropout)  # 定义第三层dropout
        self.linear2 = nn.Linear(d_ffn, d_model)  # 第二层线性变换
        self.dropout4 = nn.Dropout(dropout)  # 定义第四层dropout
        self.norm3 = nn.LayerNorm(d_model)  # 定义第三层Layer Normalization

    @staticmethod
    def with_pos_embed(tensor, pos):
        """Add positional embeddings to the input tensor, if provided."""
        return tensor if pos is None else tensor + pos  # 如果提供了位置编码,则将其添加到输入张量中

    def forward_ffn(self, tgt):
        """Perform forward pass through the Feed-Forward Network part of the layer."""
        tgt2 = self.linear2(self.dropout3(self.act(self.linear1(tgt))))  # 前向传播过程中的前馈网络部分
        tgt = tgt + self.dropout4(tgt2)  # 加上残差连接和最后一层dropout
        return self.norm3(tgt)  # 应用Layer Normalization

    def forward(self, embed, refer_bbox, feats, shapes, padding_mask=None, attn_mask=None, query_pos=None):
        """Perform the forward pass through the entire decoder layer."""

        # Self attention
        q = k = self.with_pos_embed(embed, query_pos)  # 添加位置编码后的查询和键
        tgt = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), embed.transpose(0, 1), attn_mask=attn_mask)[0].transpose(0, 1)  # 自注意力机制
        embed = embed + self.dropout1(tgt)  # 加上残差连接和第一层dropout
        embed = self.norm1(embed)  # 应用Layer Normalization

        # Cross attention
        tgt = self.cross_attn(
            self.with_pos_embed(embed, query_pos), refer_bbox.unsqueeze(2), feats, shapes, padding_mask
        )  # 交叉注意力机制
        embed = embed + self.dropout2(tgt)  # 加上残差连接和第二层dropout
        embed = self.norm2(embed)  # 应用Layer Normalization

        # FFN
        return self.forward_ffn(embed)  # 前向传播过程中的前馈网络


class DeformableTransformerDecoder(nn.Module):
    """Implementation of Deformable Transformer Decoder based on PaddleDetection and Deformable-DETR implementations."""

    def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1):
        """Initialize the DeformableTransformerDecoder with the given parameters."""
        # 调用父类初始化方法
        super().__init__()
        # 使用 _get_clones 函数复制 decoder_layer,构建层列表
        self.layers = _get_clones(decoder_layer, num_layers)
        # 记录解码器层数
        self.num_layers = num_layers
        # 记录隐藏层维度
        self.hidden_dim = hidden_dim
        # 设置评估索引,如果未指定则为最后一层
        self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx

    def forward(
        self,
        embed,  # 解码器嵌入
        refer_bbox,  # 锚框
        feats,  # 图像特征
        shapes,  # 特征形状
        bbox_head,
        score_head,
        pos_mlp,
        attn_mask=None,
        padding_mask=None,
    ):
        """Perform the forward pass through the entire decoder."""
        # 初始化输出为解码器嵌入
        output = embed
        # 初始化解码器生成的边界框和类别
        dec_bboxes = []
        dec_cls = []
        # 初始化最后细化的参考边界框为 None
        last_refined_bbox = None
        # 对参考边界框进行 sigmoid 操作
        refer_bbox = refer_bbox.sigmoid()
        # 遍历所有层进行前向传播
        for i, layer in enumerate(self.layers):
            # 在当前层应用解码器操作
            output = layer(output, refer_bbox, feats, shapes, padding_mask, attn_mask, pos_mlp(refer_bbox))

            # 预测边界框
            bbox = bbox_head[i](output)
            # 计算细化的边界框
            refined_bbox = torch.sigmoid(bbox + inverse_sigmoid(refer_bbox))

            # 如果处于训练阶段
            if self.training:
                # 记录类别预测结果
                dec_cls.append(score_head[i](output))
                # 如果是第一层,直接记录细化后的边界框
                if i == 0:
                    dec_bboxes.append(refined_bbox)
                else:
                    # 否则记录上一次细化后的边界框
                    dec_bboxes.append(torch.sigmoid(bbox + inverse_sigmoid(last_refined_bbox)))
            # 如果处于评估阶段且达到指定的评估层次
            elif i == self.eval_idx:
                # 记录类别预测结果
                dec_cls.append(score_head[i](output))
                # 记录细化后的边界框
                dec_bboxes.append(refined_bbox)
                break

            # 更新上一次细化后的边界框为当前细化后的边界框
            last_refined_bbox = refined_bbox
            # 更新参考边界框为当前细化后的边界框的分离版本(在训练阶段)
            refer_bbox = refined_bbox.detach() if self.training else refined_bbox

        # 返回堆叠的边界框和类别预测结果
        return torch.stack(dec_bboxes), torch.stack(dec_cls)
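
Editor's note on the refinement step above: each layer predicts a box update in logit space, so the layer-i box is refined_bbox = sigmoid(bbox_head[i](output) + inverse_sigmoid(refer_bbox)). Working in logits keeps every intermediate box inside [0, 1], and detaching the refined box before it becomes the next layer's reference (during training) stops gradients from flowing back through the whole refinement chain.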

.\yolov8\ultralytics\nn\modules\utils.py

# 导入必要的库和模块
import copy  # 导入copy模块,用于深拷贝对象
import math  # 导入math模块,提供数学函数

import numpy as np  # 导入NumPy库,用于科学计算
import torch  # 导入PyTorch库
import torch.nn as nn  # 导入PyTorch的神经网络模块
import torch.nn.functional as F  # 导入PyTorch的函数库
from torch.nn.init import uniform_  # 从PyTorch的初始化模块中导入uniform_函数

__all__ = "multi_scale_deformable_attn_pytorch", "inverse_sigmoid"  # 定义模块的公开接口

def _get_clones(module, n):
    """Create a list of cloned modules from the given module."""
    return nn.ModuleList([copy.deepcopy(module) for _ in range(n)])

def bias_init_with_prob(prior_prob=0.01):
    """Initialize conv/fc bias value according to a given probability value."""
    return float(-np.log((1 - prior_prob) / prior_prob))  # 返回根据先验概率初始化的偏置值
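
Editor's note: with the default prior_prob=0.01 this evaluates to -log(0.99 / 0.01) ≈ -4.595, so a sigmoid applied to logits initialized with this bias starts out predicting roughly 1% positives.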

def linear_init(module):
    """Initialize the weights and biases of a linear module."""
    bound = 1 / math.sqrt(module.weight.shape[0])
    uniform_(module.weight, -bound, bound)  # 初始化线性模块的权重
    if hasattr(module, "bias") and module.bias is not None:
        uniform_(module.bias, -bound, bound)  # 如果模块具有偏置项,则初始化偏置项

def inverse_sigmoid(x, eps=1e-5):
    """Calculate the inverse sigmoid function for a tensor."""
    x = x.clamp(min=0, max=1)  # 将输入张量限制在区间 [0, 1]
    x1 = x.clamp(min=eps)  # 将输入张量在最小值eps处截断
    x2 = (1 - x).clamp(min=eps)  # 将 1-x 在最小值eps处截断
    return torch.log(x1 / x2)  # 返回对数的差值,计算逆sigmoid函数的值
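
Editor-added check: inverse_sigmoid is the clamped logit function, so it inverts torch.sigmoid for values comfortably inside (0, 1).

p = torch.tensor([0.1, 0.5, 0.9])
assert torch.allclose(torch.sigmoid(inverse_sigmoid(p)), p, atol=1e-4)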

def multi_scale_deformable_attn_pytorch(
    value: torch.Tensor,
    value_spatial_shapes: torch.Tensor,
    sampling_locations: torch.Tensor,
    attention_weights: torch.Tensor,
) -> torch.Tensor:
    """
    Multiscale deformable attention.

    https://github.com/IDEA-Research/detrex/blob/main/detrex/layers/multi_scale_deform_attn.py
    """
    bs, _, num_heads, embed_dims = value.shape  # 获取输入张量value的形状信息
    _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape  # 获取采样位置张量sampling_locations的形状信息
    value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)  # 根据value_spatial_shapes切分value张量
    sampling_grids = 2 * sampling_locations - 1  # 计算采样网格的位置
    sampling_value_list = []  # 初始化采样值列表
    for level, (H_, W_) in enumerate(value_spatial_shapes):
        # 将value_list[level]展平并转置,然后重塑为(bs*num_heads, embed_dims, H_, W_)
        value_l_ = value_list[level].flatten(2).transpose(1, 2).reshape(bs * num_heads, embed_dims, H_, W_)
        # 将sampling_grids[:, :, :, level]转置并展平,得到(bs*num_heads, num_queries, num_points, 2)
        sampling_grid_l_ = sampling_grids[:, :, :, level].transpose(1, 2).flatten(0, 1)
        # 使用双线性插值对value_l_进行采样,得到采样值sampling_value_l_
        sampling_value_l_ = F.grid_sample(
            value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False
        )
        sampling_value_list.append(sampling_value_l_)  # 将采样值添加到列表中
    # 将attention_weights转置和重塑,得到形状为(bs*num_heads, 1, num_queries, num_levels*num_points)的张量
    attention_weights = attention_weights.transpose(1, 2).reshape(
        bs * num_heads, 1, num_queries, num_levels * num_points
    )
    # 计算加权平均后的输出张量
    output = (
        # 将采样值列表按照指定维度堆叠成张量,并展开至倒数第二维
        (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights)
        # 沿着最后一维度求和,得到加权平均值
        .sum(-1)
        # 将结果重新调整形状为(bs, num_heads * embed_dims, num_queries)
        .view(bs, num_heads * embed_dims, num_queries)
    )
    # 调换第一和第二维度,并使得张量的存储顺序连续化
    return output.transpose(1, 2).contiguous()
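
Editor-added shape sketch for the pure-PyTorch attention above (this is the fallback used by MSDeformAttn; values are illustrative, and the weights only need to be non-negative and normalized over levels and points):

bs, num_heads, embed_dims, num_queries = 2, 8, 32, 100
shapes = [(16, 16), (8, 8)]                      # n_levels = 2
value = torch.randn(bs, sum(h * w for h, w in shapes), num_heads, embed_dims)
locs = torch.rand(bs, num_queries, num_heads, len(shapes), 4, 2)   # n_points = 4
w = torch.rand(bs, num_queries, num_heads, len(shapes), 4)
w = w / w.sum(dim=(-2, -1), keepdim=True)
out = multi_scale_deformable_attn_pytorch(value, shapes, locs, w)
assert out.shape == (bs, num_queries, num_heads * embed_dims)      # (2, 100, 256)
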

.\yolov8\ultralytics\nn\modules\__init__.py

# 导入模块和类别,用于Ultralytics YOLO框架的神经网络模块
"""
Ultralytics modules.

Example:
    Visualize a module with Netron.
    ```python
    from ultralytics.nn.modules import *
    import torch
    import os

    x = torch.ones(1, 128, 40, 40)
    m = Conv(128, 128)
    f = f'{m._get_name()}.onnx'
    torch.onnx.export(m, x, f)
    os.system(f'onnxslim {f} {f} && open {f}')  # pip install onnxslim
    ```
"""

from .block import (
    C1,             # 导入模块中的类别 C1
    C2,             # 导入模块中的类别 C2
    C3,             # 导入模块中的类别 C3
    C3TR,           # 导入模块中的类别 C3TR
    CIB,            # 导入模块中的类别 CIB
    DFL,            # 导入模块中的类别 DFL
    ELAN1,          # 导入模块中的类别 ELAN1
    PSA,            # 导入模块中的类别 PSA
    SPP,            # 导入模块中的类别 SPP
    SPPELAN,        # 导入模块中的类别 SPPELAN
    SPPF,           # 导入模块中的类别 SPPF
    AConv,          # 导入模块中的类别 AConv
    ADown,          # 导入模块中的类别 ADown
    Attention,      # 导入模块中的类别 Attention
    BNContrastiveHead,  # 导入模块中的类别 BNContrastiveHead
    Bottleneck,     # 导入模块中的类别 Bottleneck
    BottleneckCSP,  # 导入模块中的类别 BottleneckCSP
    C2f,            # 导入模块中的类别 C2f
    C2fAttn,        # 导入模块中的类别 C2fAttn
    C2fCIB,         # 导入模块中的类别 C2fCIB
    C3Ghost,        # 导入模块中的类别 C3Ghost
    C3x,            # 导入模块中的类别 C3x
    CBFuse,         # 导入模块中的类别 CBFuse
    CBLinear,       # 导入模块中的类别 CBLinear
    ContrastiveHead,    # 导入模块中的类别 ContrastiveHead
    GhostBottleneck,    # 导入模块中的类别 GhostBottleneck
    HGBlock,        # 导入模块中的类别 HGBlock
    HGStem,         # 导入模块中的类别 HGStem
    ImagePoolingAttn,   # 导入模块中的类别 ImagePoolingAttn
    Proto,          # 导入模块中的类别 Proto
    RepC3,          # 导入模块中的类别 RepC3
    RepNCSPELAN4,   # 导入模块中的类别 RepNCSPELAN4
    RepVGGDW,       # 导入模块中的类别 RepVGGDW
    ResNetLayer,    # 导入模块中的类别 ResNetLayer
    SCDown,         # 导入模块中的类别 SCDown
)

from .conv import (
    CBAM,           # 导入模块中的类别 CBAM
    ChannelAttention,   # 导入模块中的类别 ChannelAttention
    Concat,         # 导入模块中的类别 Concat
    Conv,           # 导入模块中的类别 Conv
    Conv2,          # 导入模块中的类别 Conv2
    ConvTranspose,  # 导入模块中的类别 ConvTranspose
    DWConv,         # 导入模块中的类别 DWConv
    DWConvTranspose2d,  # 导入模块中的类别 DWConvTranspose2d
    Focus,          # 导入模块中的类别 Focus
    GhostConv,      # 导入模块中的类别 GhostConv
    LightConv,      # 导入模块中的类别 LightConv
    RepConv,        # 导入模块中的类别 RepConv
    SpatialAttention,   # 导入模块中的类别 SpatialAttention
)

from .head import (
    OBB,            # 导入模块中的类别 OBB
    Classify,       # 导入模块中的类别 Classify
    Detect,         # 导入模块中的类别 Detect
    Pose,           # 导入模块中的类别 Pose
    RTDETRDecoder,  # 导入模块中的类别 RTDETRDecoder
    Segment,        # 导入模块中的类别 Segment
    WorldDetect,    # 导入模块中的类别 WorldDetect
    v10Detect,      # 导入模块中的类别 v10Detect
)

from .transformer import (
    AIFI,           # 导入模块中的类别 AIFI
    MLP,            # 导入模块中的类别 MLP
    DeformableTransformerDecoder,   # 导入模块中的类别 DeformableTransformerDecoder
    DeformableTransformerDecoderLayer,   # 导入模块中的类别 DeformableTransformerDecoderLayer
    LayerNorm2d,    # 导入模块中的类别 LayerNorm2d
    MLPBlock,       # 导入模块中的类别 MLPBlock
    MSDeformAttn,   # 导入模块中的类别 MSDeformAttn
    TransformerBlock,   # 导入模块中的类别 TransformerBlock
    TransformerEncoderLayer,    # 导入模块中的类别 TransformerEncoderLayer
    TransformerLayer,   # 导入模块中的类别 TransformerLayer
)

__all__ = (
    "Conv",         # 将 Conv 加入到模块的公开 API 列表中
    "Conv2",        # 将 Conv2 加入到模块的公开 API 列表中
    "LightConv",    # 将 LightConv 加入到模块的公开 API 列表中
    "RepConv",      # 将 RepConv 加入到模块的公开 API 列表中
    "DWConv",       # 将 DWConv 加入到模块的公开 API 列表中
    "DWConvTranspose2d",    # 将 DWConvTranspose2d 加入到模块的公开 API 列表中
    "ConvTranspose",    # 将 ConvTranspose 加入到模块的公开 API 列表中
    "Focus",        # 将 Focus 加入到模块的公开 API 列表中
    "GhostConv",    # 将 GhostConv 加入到模块的公开 API 列表中
    "ChannelAttention", # 将 ChannelAttention 加入到模块的公开 API 列表中
    "SpatialAttention",    # 将 SpatialAttention 加入到模块的公开 API 列表中
    "CBAM",         # 将 CBAM 加入到模块的公开 API 列表中
    "Concat",       # 将 Concat 加入到模块的公开 API 列表中
    "TransformerLayer", # 将 TransformerLayer 加入到模块的公开 API 列表中
    "TransformerBlock", # 将 TransformerBlock 加入到模块的公开 API 列表中
    "MLPBlock",     # 将 MLPBlock 加入到模块的公开 API 列表中
    "LayerNorm2d",  # 将 LayerNorm2d 加入到模块的公开 API 列表中
    "DFL",          # 将 DFL 加入到模块的公开 API 列表中
    "HGBlock",      # 将 HGBlock 加入到模块的公开 API 列表中
    "HGStem",       # 将 HGStem 加入到模块的公开 API 列表中

.\yolov8\ultralytics\nn\tasks.py

# 导入必要的库和模块
import contextlib
from copy import deepcopy
from pathlib import Path

import torch  # 导入PyTorch库
import torch.nn as nn  # 导入PyTorch神经网络模块

# 从Ultralytics.nn.modules中导入多个自定义模块
from ultralytics.nn.modules import (
    AIFI,
    C1,
    C2,
    C3,
    C3TR,
    ELAN1,
    OBB,
    PSA,
    SPP,
    SPPELAN,
    SPPF,
    AConv,
    ADown,
    Bottleneck,
    BottleneckCSP,
    C2f,
    C2fAttn,
    C2fCIB,
    C3Ghost,
    C3x,
    CBFuse,
    CBLinear,
    Classify,
    Concat,
    Conv,
    Conv2,
    ConvTranspose,
    Detect,
    DWConv,
    DWConvTranspose2d,
    Focus,
    GhostBottleneck,
    GhostConv,
    HGBlock,
    HGStem,
    ImagePoolingAttn,
    Pose,
    RepC3,
    RepConv,
    RepNCSPELAN4,
    RepVGGDW,
    ResNetLayer,
    RTDETRDecoder,
    SCDown,
    Segment,
    WorldDetect,
    v10Detect,
)

# 从Ultralytics.utils中导入各种工具和函数
from ultralytics.utils import DEFAULT_CFG_DICT, DEFAULT_CFG_KEYS, LOGGER, colorstr, emojis, yaml_load
from ultralytics.utils.checks import check_requirements, check_suffix, check_yaml
from ultralytics.utils.loss import (
    E2EDetectLoss,
    v8ClassificationLoss,
    v8DetectionLoss,
    v8OBBLoss,
    v8PoseLoss,
    v8SegmentationLoss,
)
from ultralytics.utils.ops import make_divisible
from ultralytics.utils.plotting import feature_visualization
from ultralytics.utils.torch_utils import (
    fuse_conv_and_bn,
    fuse_deconv_and_bn,
    initialize_weights,
    intersect_dicts,
    model_info,
    scale_img,
    time_sync,
)

try:
    import thop
except ImportError:
    thop = None

# 定义一个基础模型类,作为Ultralytics YOLO系列模型的基类
class BaseModel(nn.Module):
    """The BaseModel class serves as a base class for all the models in the Ultralytics YOLO family."""

    def forward(self, x, *args, **kwargs):
        """
        模型的前向传播方法,对单个尺度进行处理。包装了 `_forward_once` 方法。

        Args:
            x (torch.Tensor | dict): 输入的图像张量或包含图像张量和gt标签的字典。

        Returns:
            (torch.Tensor): 网络的输出。
        """
        if isinstance(x, dict):  # 对训练和验证过程中的情况进行处理
            return self.loss(x, *args, **kwargs)
        return self.predict(x, *args, **kwargs)

    def predict(self, x, profile=False, visualize=False, augment=False, embed=None):
        """
        对网络进行前向传播。

        Args:
            x (torch.Tensor): 输入到模型的张量。
            profile (bool): 如果为True,打印每层的计算时间,默认为False。
            visualize (bool): 如果为True,保存模型的特征图,默认为False。
            augment (bool): 在预测过程中进行图像增强,默认为False。
            embed (list, optional): 要返回的特征向量或嵌入列表。

        Returns:
            (torch.Tensor): 模型的最后输出。
        """
        if augment:
            return self._predict_augment(x)
        return self._predict_once(x, profile, visualize, embed)
    # 执行一次模型的前向传播
    def _predict_once(self, x, profile=False, visualize=False, embed=None):
        """
        Perform a forward pass through the network.

        Args:
            x (torch.Tensor): The input tensor to the model.
            profile (bool): Print the computation time of each layer if True, defaults to False.
            visualize (bool): Save the feature maps of the model if True, defaults to False.
            embed (list, optional): A list of feature vectors/embeddings to return.

        Returns:
            (torch.Tensor): The last output of the model.
        """
        y, dt, embeddings = [], [], []  # outputs
        
        # 遍历模型的每一层
        for m in self.model:
            # 如果当前层不是从前一层得到的
            if m.f != -1:  # if not from previous layer
                x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]  # from earlier layers
            
            # 如果需要进行性能分析
            if profile:
                self._profile_one_layer(m, x, dt)
            
            # 执行当前层的计算
            x = m(x)  # run
            
            # 保存当前层的输出
            y.append(x if m.i in self.save else None)  # save output
            
            # 如果需要可视化特征图
            if visualize:
                feature_visualization(x, m.type, m.i, save_dir=visualize)
            
            # 如果需要返回特定层的嵌入向量
            if embed and m.i in embed:
                embeddings.append(nn.functional.adaptive_avg_pool2d(x, (1, 1)).squeeze(-1).squeeze(-1))  # flatten
                # 如果当前层是要返回的最大嵌入层,则直接返回嵌入向量
                if m.i == max(embed):
                    return torch.unbind(torch.cat(embeddings, 1), dim=0)
        
        # 返回模型最后的输出
        return x
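
Editor's note: only the outputs whose layer index m.i appears in self.save (the savelist produced by parse_model from every `f` reference in the YAML) are kept in y; all other slots hold None. This bounds memory while still letting later layers with multi-input `f` values (e.g. a Concat layer with f = [-1, 6]) fetch the cached feature maps they need.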

    # 执行输入图像 x 的增强操作,并返回增强后的推理结果
    def _predict_augment(self, x):
        """Perform augmentations on input image x and return augmented inference."""
        LOGGER.warning(
            f"WARNING ⚠️ {self.__class__.__name__} does not support 'augment=True' prediction. "
            f"Reverting to single-scale prediction."
        )
        return self._predict_once(x)

    # 对模型的单个层进行计算时间和 FLOPs 的性能分析,并将结果添加到提供的列表中
    def _profile_one_layer(self, m, x, dt):
        """
        Profile the computation time and FLOPs of a single layer of the model on a given input. Appends the results to
        the provided list.

        Args:
            m (nn.Module): The layer to be profiled.
            x (torch.Tensor): The input data to the layer.
            dt (list): A list to store the computation time of the layer.

        Returns:
            None
        """
        c = m == self.model[-1] and isinstance(x, list)  # is final layer list, copy input as inplace fix
        
        # 计算该层的 FLOPs
        flops = thop.profile(m, inputs=[x.copy() if c else x], verbose=False)[0] / 1e9 * 2 if thop else 0  # GFLOPs
        
        # 开始计时
        t = time_sync()
        for _ in range(10):
            m(x.copy() if c else x)
        
        # 计算执行时间并记录
        dt.append((time_sync() - t) * 100)
        
        # 如果是模型的第一层,输出性能分析的表头信息
        if m == self.model[0]:
            LOGGER.info(f"{'time (ms)':>10s} {'GFLOPs':>10s} {'params':>10s}  module")
        
        # 输出当前层的性能分析结果
        LOGGER.info(f"{dt[-1]:10.2f} {flops:10.2f} {m.np:10.0f}  {m.type}")
        
        # 如果是最后一层且输出为列表形式,则输出总计信息
        if c:
            LOGGER.info(f"{sum(dt):10.2f} {'-':>10s} {'-':>10s}  Total")
    def fuse(self, verbose=True):
        """
        Fuse the `Conv2d()` and `BatchNorm2d()` layers of the model into a single layer, in order to improve the
        computation efficiency.

        Returns:
            (nn.Module): The fused model is returned.
        """
        # 如果模型尚未融合
        if not self.is_fused():
            # 遍历模型的所有模块
            for m in self.model.modules():
                # 如果当前模块是 Conv、Conv2 或 DWConv,并且有 bn 属性
                if isinstance(m, (Conv, Conv2, DWConv)) and hasattr(m, "bn"):
                    # 如果当前模块是 Conv2 类型,则执行融合卷积操作
                    if isinstance(m, Conv2):
                        m.fuse_convs()
                    # 融合卷积层和批归一化层
                    m.conv = fuse_conv_and_bn(m.conv, m.bn)  # 更新卷积层
                    delattr(m, "bn")  # 移除批归一化层
                    m.forward = m.forward_fuse  # 更新前向传播方法
                # 如果当前模块是 ConvTranspose 并且有 bn 属性
                if isinstance(m, ConvTranspose) and hasattr(m, "bn"):
                    # 融合反卷积层和批归一化层
                    m.conv_transpose = fuse_deconv_and_bn(m.conv_transpose, m.bn)
                    delattr(m, "bn")  # 移除批归一化层
                    m.forward = m.forward_fuse  # 更新前向传播方法
                # 如果当前模块是 RepConv 类型
                if isinstance(m, RepConv):
                    # 执行重复卷积融合操作
                    m.fuse_convs()
                    m.forward = m.forward_fuse  # 更新前向传播方法
                # 如果当前模块是 RepVGGDW 类型
                if isinstance(m, RepVGGDW):
                    # 执行重复 VGG 深度可分离卷积融合操作
                    m.fuse()
                    m.forward = m.forward_fuse  # 更新前向传播方法
            # 打印模型信息
            self.info(verbose=verbose)

        # 返回融合后的模型实例
        return self
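
Editor-added sketch of the algebra that fuse_conv_and_bn folds into the convolution (illustrative only; the actual helper is imported from ultralytics.utils.torch_utils). With BatchNorm running statistics mean, var and affine parameters gamma, beta, a Conv2d followed by BN is equivalent to a single Conv2d with w_fused = w * gamma / sqrt(var + eps) and b_fused = beta + (b - mean) * gamma / sqrt(var + eps). A minimal numerical check of that identity:

conv = nn.Conv2d(8, 16, 3, padding=1, bias=False).eval()
bn = nn.BatchNorm2d(16).eval()
with torch.no_grad():  # give BN non-trivial statistics, then build the fused conv
    bn.running_mean.uniform_(-1, 1)
    bn.running_var.uniform_(0.5, 2.0)
    bn.weight.uniform_(0.5, 1.5)
    bn.bias.uniform_(-0.5, 0.5)
    scale = bn.weight / torch.sqrt(bn.running_var + bn.eps)
    fused = nn.Conv2d(8, 16, 3, padding=1, bias=True).eval()
    fused.weight.copy_(conv.weight * scale[:, None, None, None])
    fused.bias.copy_(bn.bias - bn.running_mean * scale)
x = torch.randn(1, 8, 32, 32)
assert torch.allclose(fused(x), bn(conv(x)), atol=1e-4)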

    def is_fused(self, thresh=10):
        """
        Check if the model has less than a certain threshold of BatchNorm layers.

        Args:
            thresh (int, optional): The threshold number of BatchNorm layers. Default is 10.

        Returns:
            (bool): True if the number of BatchNorm layers in the model is less than the threshold, False otherwise.
        """
        # 获取所有标准化层(如 BatchNorm2d())的类型元组
        bn = tuple(v for k, v in nn.__dict__.items() if "Norm" in k)  # normalization layers, i.e. BatchNorm2d()
        # 返回模型中标准化层数量是否小于阈值 thresh
        return sum(isinstance(v, bn) for v in self.modules()) < thresh  # True if < 'thresh' BatchNorm layers in model

    def info(self, detailed=False, verbose=True, imgsz=640):
        """
        Prints model information.

        Args:
            detailed (bool): if True, prints out detailed information about the model. Defaults to False
            verbose (bool): if True, prints out the model information. Defaults to False
            imgsz (int): the size of the image that the model will be trained on. Defaults to 640
        """
        # 调用 model_info 函数打印模型信息
        return model_info(self, detailed=detailed, verbose=verbose, imgsz=imgsz)
    def _apply(self, fn):
        """
        Applies a function to all the tensors in the model that are not parameters or registered buffers.

        Args:
            fn (function): the function to apply to the model

        Returns:
            (BaseModel): An updated BaseModel object.
        """
        self = super()._apply(fn)  # 调用父类的_apply方法,将fn函数应用于模型中不是参数或注册缓冲区的所有张量
        m = self.model[-1]  # 获取模型中的最后一个子模块(通常是Detect()类型)
        if isinstance(m, Detect):  # 检查最后一个子模块是否属于Detect类或其子类,如Segment, Pose, OBB, WorldDetect
            m.stride = fn(m.stride)  # 将fn函数应用于m的stride属性
            m.anchors = fn(m.anchors)  # 将fn函数应用于m的anchors属性
            m.strides = fn(m.strides)  # 将fn函数应用于m的strides属性
        return self  # 返回更新后的BaseModel对象

    def load(self, weights, verbose=True):
        """
        Load the weights into the model.

        Args:
            weights (dict | torch.nn.Module): The pre-trained weights to be loaded.
            verbose (bool, optional): Whether to log the transfer progress. Defaults to True.
        """
        model = weights["model"] if isinstance(weights, dict) else weights  # 如果weights是字典,则获取字典中的"model"键对应的值,否则直接使用weights
        csd = model.float().state_dict()  # 将模型的state_dict转换为float类型的checkpoint state_dict
        csd = intersect_dicts(csd, self.state_dict())  # 获取模型状态字典和self对象的状态字典的交集,用于加载权重
        self.load_state_dict(csd, strict=False)  # 使用加载的状态字典csd来加载模型参数,strict=False表示允许不严格匹配模型结构
        if verbose:
            LOGGER.info(f"Transferred {len(csd)}/{len(self.model.state_dict())} items from pretrained weights")
            # 如果verbose为True,则打印日志,显示从预训练权重中转移了多少项到当前模型中

    def loss(self, batch, preds=None):
        """
        Compute loss.

        Args:
            batch (dict): Batch to compute loss on
            preds (torch.Tensor | List[torch.Tensor]): Predictions.
        """
        if getattr(self, "criterion", None) is None:
            self.criterion = self.init_criterion()  # 如果模型中的损失函数属性criterion为None,则初始化损失函数

        preds = self.forward(batch["img"]) if preds is None else preds  # 如果未提供预测值preds,则使用模型前向传播得到预测值
        return self.criterion(preds, batch)  # 计算预测值和真实标签之间的损失值,使用模型的损失函数criterion

    def init_criterion(self):
        """Initialize the loss criterion for the BaseModel."""
        raise NotImplementedError("compute_loss() needs to be implemented by task heads")
        # 抛出NotImplementedError异常,提示需要由任务头部实现compute_loss()方法


class DetectionModel(BaseModel):
    """YOLOv8 detection model."""

    def __init__(self, cfg="yolov8n.yaml", ch=3, nc=None, verbose=True):  # model, input channels, number of classes
        """Initialize the YOLOv8 detection model with the given config and parameters."""
        super().__init__()
        self.yaml = cfg if isinstance(cfg, dict) else yaml_model_load(cfg)  # cfg dict
        
        # Check whether the module of the first 'backbone' layer in the YAML config is the deprecated 'Silence'
        if self.yaml["backbone"][0][2] == "Silence":
            LOGGER.warning(
                "WARNING ⚠️ YOLOv9 `Silence` module is deprecated in favor of nn.Identity. "
                "Please delete local *.pt file and re-download the latest model checkpoint."
            )
            # Update 'Silence' to 'nn.Identity' in the YAML config
            self.yaml["backbone"][0][2] = "nn.Identity"

        # Define model configuration parameters
        ch = self.yaml["ch"] = self.yaml.get("ch", ch)  # input channels
        
        # Override the number of classes in the YAML config if 'nc' is provided
        if nc and nc != self.yaml["nc"]:
            LOGGER.info(f"Overriding model.yaml nc={self.yaml['nc']} with nc={nc}")
            self.yaml["nc"] = nc  # override YAML value
        
        # Parse the model based on the YAML configuration
        self.model, self.save = parse_model(deepcopy(self.yaml), ch=ch, verbose=verbose)  # model, savelist
        
        # Create a default names dictionary for the number of classes
        self.names = {i: f"{i}" for i in range(self.yaml["nc"])}  # default names dict
        
        # Check if 'inplace' is specified in the YAML config, default to True if not specified
        self.inplace = self.yaml.get("inplace", True)
        
        # Check if 'end2end' attribute is present in the last model component
        self.end2end = getattr(self.model[-1], "end2end", False)

        # Build strides
        m = self.model[-1]  # Detect()
        
        # Perform specific actions based on the type of 'm' (Detect subclass)
        if isinstance(m, Detect):  # includes all Detect subclasses like Segment, Pose, OBB, WorldDetect
            s = 256  # 2x min stride
            m.inplace = self.inplace
            
            # Define a function for the forward pass based on the 'end2end' attribute
            def _forward(x):
                """Performs a forward pass through the model, handling different Detect subclass types accordingly."""
                if self.end2end:
                    return self.forward(x)["one2many"]
                return self.forward(x)[0] if isinstance(m, (Segment, Pose, OBB)) else self.forward(x)

            # Calculate the stride values based on the input size and the forward pass result
            m.stride = torch.tensor([s / x.shape[-2] for x in _forward(torch.zeros(1, ch, s, s))])  # forward
            self.stride = m.stride
            
            # Initialize biases for 'm'
            m.bias_init()  # only run once
        else:
            # Set default stride for models like RTDETR
            self.stride = torch.Tensor([32])  # default stride for i.e. RTDETR

        # Initialize weights and biases for the model
        initialize_weights(self)
        
        # Output model information if verbose mode is enabled
        if verbose:
            self.info()
            LOGGER.info("")
    # 执行输入图像 x 的增强操作,并返回增强后的推理和训练输出
    def _predict_augment(self, x):
        # 如果设置了 end2end 属性为 True,则警告不支持 'augment=True' 的预测,回退到单尺度预测
        if getattr(self, "end2end", False):
            LOGGER.warning(
                "WARNING ⚠️ End2End model does not support 'augment=True' prediction. "
                "Reverting to single-scale prediction."
            )
            return self._predict_once(x)  # 调用单尺度预测方法
        img_size = x.shape[-2:]  # 获取图像的高度和宽度
        s = [1, 0.83, 0.67]  # 不同尺度的缩放比例
        f = [None, 3, None]  # 不同的翻转方式 (2-上下翻转, 3-左右翻转)
        y = []  # 存储输出
        for si, fi in zip(s, f):
            xi = scale_img(x.flip(fi) if fi else x, si, gs=int(self.stride.max()))  # 缩放图像并根据需要翻转
            yi = super().predict(xi)[0]  # 进行前向推理
            yi = self._descale_pred(yi, fi, si, img_size)  # 对预测结果进行反缩放操作
            y.append(yi)
        y = self._clip_augmented(y)  # 对增强后的结果进行裁剪
        return torch.cat(y, -1), None  # 返回增强后的推理结果和空的训练输出

    @staticmethod
    def _descale_pred(p, flips, scale, img_size, dim=1):
        """对增强推理后的预测进行反缩放操作(逆操作)。"""
        p[:, :4] /= scale  # 反缩放坐标
        x, y, wh, cls = p.split((1, 1, 2, p.shape[dim] - 4), dim)  # 拆分预测结果
        if flips == 2:
            y = img_size[0] - y  # 反上下翻转
        elif flips == 3:
            x = img_size[1] - x  # 反左右翻转
        return torch.cat((x, y, wh, cls), dim)  # 拼接反缩放后的结果

    def _clip_augmented(self, y):
        """裁剪 YOLO 增强推理结果的尾部。"""
        nl = self.model[-1].nl  # 检测层的数量 (P3-P5)
        g = sum(4**x for x in range(nl))  # 网格点数
        e = 1  # 排除层计数
        i = (y[0].shape[-1] // g) * sum(4**x for x in range(e))  # 索引计算
        y[0] = y[0][..., :-i]  # 裁剪大尺度的输出
        i = (y[-1].shape[-1] // g) * sum(4 ** (nl - 1 - x) for x in range(e))  # 索引计算
        y[-1] = y[-1][..., i:]  # 裁剪小尺度的输出
        return y

    def init_criterion(self):
        """初始化检测模型的损失函数。"""
        return E2EDetectLoss(self) if getattr(self, "end2end", False) else v8DetectionLoss(self)


class OBBModel(DetectionModel):
    """YOLOv8 Oriented Bounding Box (OBB) model."""

    def __init__(self, cfg="yolov8n-obb.yaml", ch=3, nc=None, verbose=True):
        """Initialize YOLOv8 OBB model with given config and parameters."""
        super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)
        # 调用父类的初始化方法,设置模型的配置文件路径、通道数、类别数和详细输出标志

    def init_criterion(self):
        """Initialize the loss criterion for the model."""
        return v8OBBLoss(self)
        # 返回一个针对 OBB 模型的损失函数对象 v8OBBLoss



class SegmentationModel(DetectionModel):
    """YOLOv8 segmentation model."""

    def __init__(self, cfg="yolov8n-seg.yaml", ch=3, nc=None, verbose=True):
        """Initialize YOLOv8 segmentation model with given config and parameters."""
        super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)
        # 调用父类的初始化方法,设置模型的配置文件路径、通道数、类别数和详细输出标志

    def init_criterion(self):
        """Initialize the loss criterion for the SegmentationModel."""
        return v8SegmentationLoss(self)
        # 返回一个针对分割模型的损失函数对象 v8SegmentationLoss



class PoseModel(DetectionModel):
    """YOLOv8 pose model."""

    def __init__(self, cfg="yolov8n-pose.yaml", ch=3, nc=None, data_kpt_shape=(None, None), verbose=True):
        """Initialize YOLOv8 Pose model."""
        if not isinstance(cfg, dict):
            cfg = yaml_model_load(cfg)  # load model YAML
        if any(data_kpt_shape) and list(data_kpt_shape) != list(cfg["kpt_shape"]):
            LOGGER.info(f"Overriding model.yaml kpt_shape={cfg['kpt_shape']} with kpt_shape={data_kpt_shape}")
            cfg["kpt_shape"] = data_kpt_shape
        super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)
        # 如果配置不是字典,则加载模型的 YAML 配置文件
        # 如果给定了关键点形状参数,并且与配置文件中的不同,则记录日志并进行覆盖
        # 调用父类的初始化方法,设置模型的配置、通道数、类别数、数据关键点形状和详细输出标志

    def init_criterion(self):
        """Initialize the loss criterion for the PoseModel."""
        return v8PoseLoss(self)
        # 返回一个针对姿态估计模型的损失函数对象 v8PoseLoss



class ClassificationModel(BaseModel):
    """YOLOv8 classification model."""

    def __init__(self, cfg="yolov8n-cls.yaml", ch=3, nc=None, verbose=True):
        """Init ClassificationModel with YAML, channels, number of classes, verbose flag."""
        super().__init__()
        self._from_yaml(cfg, ch, nc, verbose)
        # 调用 BaseModel 的初始化方法,然后调用自身的 _from_yaml 方法进行更详细的初始化

    def _from_yaml(self, cfg, ch, nc, verbose):
        """Set YOLOv8 model configurations and define the model architecture."""
        self.yaml = cfg if isinstance(cfg, dict) else yaml_model_load(cfg)  # cfg dict
        # 如果 cfg 是字典,则直接使用,否则加载 YAML 文件得到配置字典

        # Define model
        ch = self.yaml["ch"] = self.yaml.get("ch", ch)  # input channels
        # 设置输入通道数为配置文件中的 ch 值或者默认值 ch

        if nc and nc != self.yaml["nc"]:
            LOGGER.info(f"Overriding model.yaml nc={self.yaml['nc']} with nc={nc}")
            self.yaml["nc"] = nc  # override YAML value
        elif not nc and not self.yaml.get("nc", None):
            raise ValueError("nc not specified. Must specify nc in model.yaml or function arguments.")
        # 如果给定了类别数 nc 并且与配置文件中的不同,则记录日志并进行覆盖
        # 如果未指定 nc 并且配置文件中也没有指定,则引发 ValueError

        self.model, self.save = parse_model(deepcopy(self.yaml), ch=ch, verbose=verbose)  # model, savelist
        # 使用配置字典和通道数解析模型得到模型对象和保存列表

        self.stride = torch.Tensor([1])  # no stride constraints
        # 设置模型的步长为固定值 1

        self.names = {i: f"{i}" for i in range(self.yaml["nc"])}  # default names dict
        # 根据类别数设置默认的名称字典,键为类别索引,值为类别索引的字符串表示

        self.info()
        # 输出模型的详细信息

    @staticmethod
    def reshape_outputs(model, nc):
        """Update a TorchVision classification model to class count 'n' if required."""
        # 获取模型的最后一个子模块的名称和实例
        name, m = list((model.model if hasattr(model, "model") else model).named_children())[-1]  # last module
        
        # 如果最后一个模块是 Classify 类型(例如 YOLO 的分类头)
        if isinstance(m, Classify):
            # 如果当前输出特征数不等于 nc,则更新线性层的输出特征数
            if m.linear.out_features != nc:
                m.linear = nn.Linear(m.linear.in_features, nc)
        
        # 如果最后一个模块是 nn.Linear 类型(例如 ResNet, EfficientNet)
        elif isinstance(m, nn.Linear):
            # 如果当前输出特征数不等于 nc,则替换当前模块为新的 nn.Linear
            if m.out_features != nc:
                setattr(model, name, nn.Linear(m.in_features, nc))
        
        # 如果最后一个模块是 nn.Sequential 类型
        elif isinstance(m, nn.Sequential):
            # 获取所有子模块的类型列表
            types = [type(x) for x in m]
            
            # 如果类型列表中包含 nn.Linear
            if nn.Linear in types:
                # 找到最后一个 nn.Linear 的索引
                i = len(types) - 1 - types[::-1].index(nn.Linear)  # last nn.Linear index
                # 如果该 nn.Linear 的输出特征数不等于 nc,则更新它
                if m[i].out_features != nc:
                    m[i] = nn.Linear(m[i].in_features, nc)
            
            # 如果类型列表中包含 nn.Conv2d
            elif nn.Conv2d in types:
                # 找到最后一个 nn.Conv2d 的索引
                i = len(types) - 1 - types[::-1].index(nn.Conv2d)  # last nn.Conv2d index
                # 如果该 nn.Conv2d 的输出通道数不等于 nc,则更新它
                if m[i].out_channels != nc:
                    m[i] = nn.Conv2d(m[i].in_channels, nc, m[i].kernel_size, m[i].stride, bias=m[i].bias is not None)

    def init_criterion(self):
        """Initialize the loss criterion for the ClassificationModel."""
        # 返回一个 v8ClassificationLoss 的实例,用于分类模型的损失计算
        return v8ClassificationLoss()
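
Editor-added usage sketch for reshape_outputs above (assumes torchvision is installed; the backbone choice is illustrative only):

import torchvision
backbone = torchvision.models.resnet18(weights=None)   # final layer: fc = Linear(512, 1000)
ClassificationModel.reshape_outputs(backbone, nc=10)    # swap the last nn.Linear for 10 classes
assert backbone.fc.out_features == 10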


# RTDETRDetectionModel extends DetectionModel to implement RT-DETR (Real-time DEtection and Tracking using Transformers).
class RTDETRDetectionModel(DetectionModel):
    """
    RTDETR (Real-time DEtection and Tracking using Transformers) Detection Model class.

    This class is responsible for constructing the RTDETR architecture, defining loss functions, and facilitating both
    the training and inference processes. RTDETR is an object detection and tracking model that extends from the
    DetectionModel base class.

    Attributes:
        cfg (str): The configuration file path or preset string. Default is 'rtdetr-l.yaml'.
        ch (int): Number of input channels. Default is 3 (RGB).
        nc (int, optional): Number of classes for object detection. Default is None.
        verbose (bool): Specifies if summary statistics are shown during initialization. Default is True.

    Methods:
        init_criterion: Initializes the criterion used for loss calculation.
        loss: Computes and returns the loss during training.
        predict: Performs a forward pass through the network and returns the output.
    """

    def __init__(self, cfg="rtdetr-l.yaml", ch=3, nc=None, verbose=True):
        """
        Initialize the RTDETRDetectionModel.

        Args:
            cfg (str): Configuration file name or path.
            ch (int): Number of input channels.
            nc (int, optional): Number of classes. Defaults to None.
            verbose (bool, optional): Print additional information during initialization. Defaults to True.
        """
        # 调用父类 DetectionModel 的初始化方法,传入配置文件名、通道数、类别数和是否显示详细信息
        super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)

    def init_criterion(self):
        """Initialize the loss criterion for the RTDETRDetectionModel."""
        # 导入 RTDETRDetectionLoss 类,用于初始化损失函数,传入类别数和是否使用视觉语义分割损失
        from ultralytics.models.utils.loss import RTDETRDetectionLoss

        return RTDETRDetectionLoss(nc=self.nc, use_vfl=True)
    def loss(self, batch, preds=None):
        """
        Compute the loss for the given batch of data.

        Args:
            batch (dict): Dictionary containing image and label data.
            preds (torch.Tensor, optional): Precomputed model predictions. Defaults to None.

        Returns:
            (tuple): A tuple containing the total loss and main three losses in a tensor.
        """
        if not hasattr(self, "criterion"):
            self.criterion = self.init_criterion()  # 初始化损失函数

        img = batch["img"]
        # NOTE: preprocess gt_bbox and gt_labels to list.
        bs = len(img)
        batch_idx = batch["batch_idx"]
        # 计算每个 batch 中的样本数
        gt_groups = [(batch_idx == i).sum().item() for i in range(bs)]
        # 构建目标数据字典,包括类别、边界框、批次索引和分组信息
        targets = {
            "cls": batch["cls"].to(img.device, dtype=torch.long).view(-1),  # 类别数据
            "bboxes": batch["bboxes"].to(device=img.device),  # 边界框数据
            "batch_idx": batch_idx.to(img.device, dtype=torch.long).view(-1),  # 批次索引
            "gt_groups": gt_groups,  # 分组信息
        }

        # 如果未提供预测值 preds,则使用模型进行预测
        preds = self.predict(img, batch=targets) if preds is None else preds
        # 解析预测结果中的各项数据
        dec_bboxes, dec_scores, enc_bboxes, enc_scores, dn_meta = preds if self.training else preds[1]
        if dn_meta is None:
            dn_bboxes, dn_scores = None, None
        else:
            # 按照 dn_meta 中的信息对预测结果进行分割
            dn_bboxes, dec_bboxes = torch.split(dec_bboxes, dn_meta["dn_num_split"], dim=2)
            dn_scores, dec_scores = torch.split(dec_scores, dn_meta["dn_num_split"], dim=2)

        # 将编码器的预测结果与解码器的预测结果拼接起来
        dec_bboxes = torch.cat([enc_bboxes.unsqueeze(0), dec_bboxes])  # (7, bs, 300, 4)
        dec_scores = torch.cat([enc_scores.unsqueeze(0), dec_scores])

        # 计算损失函数
        loss = self.criterion(
            (dec_bboxes, dec_scores), targets, dn_bboxes=dn_bboxes, dn_scores=dn_scores, dn_meta=dn_meta
        )
        # NOTE: There are like 12 losses in RTDETR, backward with all losses but only show the main three losses.
        # 计算并返回总损失和主要三个损失项的张量形式
        return sum(loss.values()), torch.as_tensor(
            [loss[k].detach() for k in ["loss_giou", "loss_class", "loss_bbox"]], device=img.device
        )
    def predict(self, x, profile=False, visualize=False, batch=None, augment=False, embed=None):
        """
        Perform a forward pass through the model.

        Args:
            x (torch.Tensor): The input tensor.
            profile (bool, optional): If True, profile the computation time for each layer. Defaults to False.
            visualize (bool, optional): If True, save feature maps for visualization. Defaults to False.
            batch (dict, optional): Ground truth data for evaluation. Defaults to None.
            augment (bool, optional): If True, perform data augmentation during inference. Defaults to False.
            embed (list, optional): A list of feature vectors/embeddings to return.

        Returns:
            (torch.Tensor): Model's output tensor.
        """
        y, dt, embeddings = [], [], []  # outputs

        for m in self.model[:-1]:  # iterate through all layers except the last one (head)
            if m.f != -1:  # if not from previous layer
                x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]  # retrieve inputs from earlier layers

            if profile:
                self._profile_one_layer(m, x, dt)  # profile the computation time of the current layer

            x = m(x)  # perform forward pass through the current layer
            y.append(x if m.i in self.save else None)  # save output if specified by self.save

            if visualize:
                feature_visualization(x, m.type, m.i, save_dir=visualize)  # visualize feature maps if enabled

            if embed and m.i in embed:
                # compute embeddings by adaptive average pooling and flattening
                embeddings.append(nn.functional.adaptive_avg_pool2d(x, (1, 1)).squeeze(-1).squeeze(-1))
                if m.i == max(embed):
                    return torch.unbind(torch.cat(embeddings, 1), dim=0)  # return embeddings if the last embedding layer is reached

        head = self.model[-1]
        x = head([y[j] for j in head.f], batch)  # perform inference with the head layer using saved outputs and optional batch data
        return x  # return the final output tensor


class WorldModel(DetectionModel):
    """YOLOv8 World Model."""

    def __init__(self, cfg="yolov8s-world.yaml", ch=3, nc=None, verbose=True):
        """Initialize YOLOv8 world model with given config and parameters."""
        # 创建一个随机初始化的文本特征张量,形状为 (1, nc 或 80, 512),用作特征占位符
        self.txt_feats = torch.randn(1, nc or 80, 512)  # features placeholder
        # 初始化 CLIP 模型占位符为 None
        self.clip_model = None  # CLIP model placeholder
        # 调用父类的初始化方法,传入配置 cfg、通道数 ch、类别数 nc 和是否详细输出 verbose
        super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)

    def set_classes(self, text, batch=80, cache_clip_model=True):
        """Set classes in advance so that model could do offline-inference without clip model."""
        try:
            import clip
        except ImportError:
            # 如果导入 clip 失败,则安装要求的版本
            check_requirements("git+https://github.com/ultralytics/CLIP.git")
            import clip

        # 如果 self.clip_model 属性不存在且 cache_clip_model 为 True,则加载 CLIP 模型
        if (
            not getattr(self, "clip_model", None) and cache_clip_model
        ):  # for backwards compatibility of models lacking clip_model attribute
            self.clip_model = clip.load("ViT-B/32")[0]
        
        # 如果 cache_clip_model 为 True,则使用缓存的 clip_model,否则加载新的 CLIP 模型
        model = self.clip_model if cache_clip_model else clip.load("ViT-B/32")[0]
        # 获取模型所在设备
        device = next(model.parameters()).device
        # 将输入文本转换为 CLIP 模型可接受的 token,并发送到指定设备
        text_token = clip.tokenize(text).to(device)
        # 使用 CLIP 模型对文本 token 进行编码,按批次分割并进行编码,然后分离梯度
        txt_feats = [model.encode_text(token).detach() for token in text_token.split(batch)]
        # 如果只有一个批次,则直接取第一个编码结果;否则在指定维度上拼接所有结果
        txt_feats = txt_feats[0] if len(txt_feats) == 1 else torch.cat(txt_feats, dim=0)
        # 对文本特征进行 L2 范数归一化处理
        txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True)
        # 将归一化后的文本特征重塑为指定形状,更新 self.txt_feats
        self.txt_feats = txt_feats.reshape(-1, len(text), txt_feats.shape[-1])
        # 更新模型最后一层的类别数为文本的长度
        self.model[-1].nc = len(text)
    def predict(self, x, profile=False, visualize=False, txt_feats=None, augment=False, embed=None):
        """
        Perform a forward pass through the model.

        Args:
            x (torch.Tensor): The input tensor.
            profile (bool, optional): If True, profile the computation time for each layer. Defaults to False.
            visualize (bool, optional): If True, save feature maps for visualization. Defaults to False.
            txt_feats (torch.Tensor): The text features, use it if it's given. Defaults to None.
            augment (bool, optional): If True, perform data augmentation during inference. Defaults to False.
            embed (list, optional): A list of feature vectors/embeddings to return.

        Returns:
            (torch.Tensor): Model's output tensor.
        """
        # Convert txt_feats to device and dtype of input tensor x, defaulting to self.txt_feats if txt_feats is None
        txt_feats = (self.txt_feats if txt_feats is None else txt_feats).to(device=x.device, dtype=x.dtype)
        
        # If txt_feats has different length than x, repeat txt_feats to match the length of x
        if len(txt_feats) != len(x):
            txt_feats = txt_feats.repeat(len(x), 1, 1)
        
        # Create a deep copy of txt_feats for potential use later
        ori_txt_feats = txt_feats.clone()
        
        y, dt, embeddings = [], [], []  # Initialize lists for outputs
        
        # Iterate through each module in self.model (except the head part)
        for m in self.model:
            # Check if m.f is not -1, meaning it's not from a previous layer
            if m.f != -1:
                # Determine input x based on m.f, which can be an int or a list of ints
                x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]
            
            # If profiling is enabled, profile the computation time for the current layer
            if profile:
                self._profile_one_layer(m, x, dt)
            
            # Apply specific operations based on module type
            if isinstance(m, C2fAttn):
                x = m(x, txt_feats)  # Apply attention module with text features
            elif isinstance(m, WorldDetect):
                x = m(x, ori_txt_feats)  # Apply world detection module with original text features
            elif isinstance(m, ImagePoolingAttn):
                txt_feats = m(x, txt_feats)  # Apply image pooling attention module to text features
            else:
                x = m(x)  # Perform standard forward pass for other module types
            
            # Save the output of the current module
            y.append(x if m.i in self.save else None)
            
            # If visualization is enabled, save feature maps for visualization
            if visualize:
                feature_visualization(x, m.type, m.i, save_dir=visualize)
            
            # If embeddings are requested and the current module index is in embed list, compute embeddings
            if embed and m.i in embed:
                embeddings.append(nn.functional.adaptive_avg_pool2d(x, (1, 1)).squeeze(-1).squeeze(-1))  # Flatten embeddings
                if m.i == max(embed):
                    return torch.unbind(torch.cat(embeddings, 1), dim=0)  # Return concatenated embeddings
        
        # Return the final output tensor
        return x


    def loss(self, batch, preds=None):
        """
        Compute loss.

        Args:
            batch (dict): Batch to compute loss on.
            preds (torch.Tensor | List[torch.Tensor]): Predictions.
        """
        # Initialize the criterion if it's not already initialized
        if not hasattr(self, "criterion"):
            self.criterion = self.init_criterion()
        
        # If preds is None, compute predictions using forward pass with batch["img"] and optional txt_feats
        if preds is None:
            preds = self.forward(batch["img"], txt_feats=batch["txt_feats"])
        
        # Compute and return the loss using initialized criterion
        return self.criterion(preds, batch)


class Ensemble(nn.ModuleList):
    """Ensemble of models."""

    def __init__(self):
        """Initialize an ensemble of models."""
        super().__init__()

    def forward(self, x, augment=False, profile=False, visualize=False):
        """Function generates the YOLO network's final layer."""
        # 对每个模型进行前向传播,获取输出列表
        y = [module(x, augment, profile, visualize)[0] for module in self]
        # 将每个模型的输出按第2维度(channel维度)拼接起来,用于非极大值抑制
        y = torch.cat(y, 2)  # nms ensemble, y shape(B, HW, C)
        return y, None  # 返回输出以及空对象,用于推断和训练输出


# Functions ------------------------------------------------------------------------------------------------------------


@contextlib.contextmanager
def temporary_modules(modules=None, attributes=None):
    """
    Context manager for temporarily adding or modifying modules in Python's module cache (`sys.modules`).

    This function can be used to change the module paths during runtime. It's useful when refactoring code,
    where you've moved a module from one location to another, but you still want to support the old import
    paths for backwards compatibility.

    Args:
        modules (dict, optional): A dictionary mapping old module paths to new module paths.
        attributes (dict, optional): A dictionary mapping old module attributes to new module attributes.

    Example:
        ```python
        with temporary_modules({'old.module': 'new.module'}, {'old.module.attribute': 'new.module.attribute'}):
            import old.module  # this will now import new.module
            from old.module import attribute  # this will now import new.module.attribute
        ```

    Note:
        The changes are only in effect inside the context manager and are undone once the context manager exits.
        Be aware that directly manipulating `sys.modules` can lead to unpredictable results, especially in larger
        applications or libraries. Use this function with caution.
    """

    if modules is None:
        modules = {}
    if attributes is None:
        attributes = {}
    import sys
    from importlib import import_module

    try:
        # Re-point old attribute names at their new locations (e.g. renamed classes)
        for old, new in attributes.items():
            old_module, old_attr = old.rsplit(".", 1)
            new_module, new_attr = new.rsplit(".", 1)
            setattr(import_module(old_module), old_attr, getattr(import_module(new_module), new_attr))

        # Register the new modules in sys.modules under the old names
        for old, new in modules.items():
            sys.modules[old] = import_module(new)

        yield
    finally:
        # Remove the temporarily added module aliases
        for old in modules:
            if old in sys.modules:
                del sys.modules[old]
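
Beyond the docstring's sketch, a small self-contained illustration of the mechanism: an invented alias `collections.legacy_abc` is mapped onto the real `collections.abc` module and disappears again once the context exits.

```python
import importlib
import sys

with temporary_modules(modules={"collections.legacy_abc": "collections.abc"}):
    legacy = importlib.import_module("collections.legacy_abc")  # resolves to collections.abc
    print(legacy is importlib.import_module("collections.abc"))  # True

print("collections.legacy_abc" in sys.modules)  # False, the alias was removed on exit
```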


def torch_safe_load(weight):
    """
    This function attempts to load a PyTorch model with the torch.load() function. If a ModuleNotFoundError is raised,
    ```
    # Import the download helper
    from ultralytics.utils.downloads import attempt_download_asset

    # Verify that the file has a ".pt" suffix, raising an error otherwise
    check_suffix(file=weight, suffix=".pt")

    # Download the weights if they are not found locally
    file = attempt_download_asset(weight)  # search online if missing locally

    try:
        # Load the checkpoint with temporary module/attribute remappings for backwards compatibility
        with temporary_modules(
            modules={
                "ultralytics.yolo.utils": "ultralytics.utils",
                "ultralytics.yolo.v8": "ultralytics.models.yolo",
                "ultralytics.yolo.data": "ultralytics.data",
            },
            attributes={
                "ultralytics.nn.modules.block.Silence": "torch.nn.Identity",  # YOLOv9e
                "ultralytics.nn.tasks.YOLOv10DetectionModel": "ultralytics.nn.tasks.DetectionModel",  # YOLOv10
            },
        ):
            # Load the checkpoint with torch.load(), mapping all tensors to CPU memory
            ckpt = torch.load(file, map_location="cpu")

    except ModuleNotFoundError as e:  # a required module could not be imported
        if e.name == "models":
            # Raise a TypeError explaining that the model is incompatible and suggesting fixes
            raise TypeError(
                emojis(
                    f"ERROR ❌️ {weight} appears to be an Ultralytics YOLOv5 model originally trained "
                    f"with https://github.com/ultralytics/yolov5.\nThis model is NOT forwards compatible with "
                    f"YOLOv8 at https://github.com/ultralytics/ultralytics."
                    f"\nRecommend fixes are to train a new model using the latest 'ultralytics' package or to "
                    f"run a command with an official Ultralytics model, i.e. 'yolo predict model=yolov8n.pt'"
                )
            ) from e
        # Warn that the model requires a module missing from the Ultralytics requirements
        LOGGER.warning(
            f"WARNING ⚠️ {weight} appears to require '{e.name}', which is not in Ultralytics requirements."
            f"\nAutoInstall will run now for '{e.name}' but this feature will be removed in the future."
            f"\nRecommend fixes are to train a new model using the latest 'ultralytics' package or to "
            f"run a command with an official Ultralytics model, i.e. 'yolo predict model=yolov8n.pt'"
        )
        # Install the missing module via check_requirements (AutoInstall)
        check_requirements(e.name)
        # Retry loading the checkpoint on CPU after installing the dependency
        ckpt = torch.load(file, map_location="cpu")

    if not isinstance(ckpt, dict):
        # The checkpoint is not a dict, so it was probably saved improperly; warn and try to recover
        LOGGER.warning(
            f"WARNING ⚠️ The file '{weight}' appears to be improperly saved or formatted. "
            f"For optimal results, use model.save('filename.pt') to correctly save YOLO models."
        )
        # Assume the file holds a raw YOLO model saved with torch.save(model, "saved_model.pt") and wrap it in a dict
        ckpt = {"model": ckpt.model}

    # Return the loaded checkpoint and the resolved file path
    return ckpt, file  # load
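
A hypothetical call, assuming 'yolov8n.pt' is present locally or can be downloaded; the key names mentioned in the comments are the ones the loaders below rely on.

```python
ckpt, file = torch_safe_load("yolov8n.pt")
print(file)          # resolved path of the checkpoint file
print(sorted(ckpt))  # a dict that typically includes "model", "ema" and "train_args"
model = (ckpt.get("ema") or ckpt["model"]).float().eval()  # same selection the loaders below perform
```
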
# Attempt to load the weights of an ensemble of models, or of a single model
def attempt_load_weights(weights, device=None, inplace=True, fuse=False):
    """Loads an ensemble of models weights=[a,b,c] or a single model weights=[a] or weights=a."""

    # Create an Ensemble container to hold the loaded models
    ensemble = Ensemble()

    # Iterate over the weights; a single weight is wrapped in a list first
    for w in weights if isinstance(weights, list) else [weights]:
        # Load the checkpoint (and resolved path) with torch_safe_load
        ckpt, w = torch_safe_load(w)  # load ckpt

        # Merge "train_args" from the checkpoint with DEFAULT_CFG_DICT when present
        args = {**DEFAULT_CFG_DICT, **ckpt["train_args"]} if "train_args" in ckpt else None  # combined args

        # Prefer the EMA weights over the raw model and move them to the target device as FP32
        model = (ckpt.get("ema") or ckpt["model"]).to(device).float()  # FP32 model

        # Model compatibility updates: attach args, checkpoint path and task
        model.args = args  # attach args to model
        model.pt_path = w  # attach *.pt file path to model
        model.task = guess_model_task(model)

        # Default the stride to [32.0] if the model does not define one
        if not hasattr(model, "stride"):
            model.stride = torch.tensor([32.0])

        # Optionally fuse Conv+BN layers, then switch the model to eval mode
        ensemble.append(model.fuse().eval() if fuse and hasattr(model, "fuse") else model.eval())  # model in eval mode

    # Walk every module in the ensemble to set inplace and apply other compatibility fixes
    for m in ensemble.modules():
        if hasattr(m, "inplace"):
            m.inplace = inplace
        elif isinstance(m, nn.Upsample) and not hasattr(m, "recompute_scale_factor"):
            m.recompute_scale_factor = None  # torch 1.11.0 compatibility

    # Return the bare model when only one weight was loaded
    if len(ensemble) == 1:
        return ensemble[-1]

    # Otherwise copy shared attributes onto the ensemble and return it
    LOGGER.info(f"Ensemble created with {weights}\n")
    for k in "names", "nc", "yaml":
        setattr(ensemble, k, getattr(ensemble[0], k))
    ensemble.stride = ensemble[int(torch.argmax(torch.tensor([m.stride.max() for m in ensemble])))].stride
    assert all(ensemble[0].nc == m.nc for m in ensemble), f"Models differ in class counts {[m.nc for m in ensemble]}"
    return ensemble
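
Hypothetical usage (the weight files are assumed to exist locally or to be downloadable): a single weight returns the model itself, while a list builds an Ensemble with shared attributes.

```python
single = attempt_load_weights("yolov8n.pt", device="cpu", fuse=True)
both = attempt_load_weights(["yolov8n.pt", "yolov8s.pt"], device="cpu")
print(type(single).__name__)           # e.g. DetectionModel
print(type(both).__name__, len(both))  # Ensemble 2
print(both.stride)                     # stride copied from the member with the largest max stride
```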


# Attempt to load the weights of a single model
def attempt_load_one_weight(weight, device=None, inplace=True, fuse=False):
    """Loads a single model weights."""

    # Load the single checkpoint (and resolved path) with torch_safe_load
    ckpt, weight = torch_safe_load(weight)  # load ckpt

    # Merge the default config with the checkpoint's train args, preferring the checkpoint values
    args = {**DEFAULT_CFG_DICT, **(ckpt.get("train_args", {}))}  # combine model and default args, preferring model args

    # Prefer the EMA weights over the raw model and move them to the target device as FP32
    model = (ckpt.get("ema") or ckpt["model"]).to(device).float()  # FP32 model

    # Model compatibility updates: attach filtered args, checkpoint path and task
    model.args = {k: v for k, v in args.items() if k in DEFAULT_CFG_KEYS}  # attach args to model
    model.pt_path = weight  # attach *.pt file path to model
    model.task = guess_model_task(model)

    # Default the stride to [32.0] if the model does not define one
    if not hasattr(model, "stride"):
        model.stride = torch.tensor([32.0])

    # Optionally fuse Conv+BN layers, then switch the model to eval mode
    model = model.fuse().eval() if fuse and hasattr(model, "fuse") else model.eval()  # model in eval mode

    # Walk all modules to set inplace and apply other compatibility fixes
    for m in model.modules():
        if hasattr(m, "inplace"):
            m.inplace = inplace
        elif isinstance(m, nn.Upsample) and not hasattr(m, "recompute_scale_factor"):
            m.recompute_scale_factor = None  # torch 1.11.0 compatibility

    # Return the loaded model and its checkpoint
    return model, ckpt


# Parse a YOLO model dictionary into a PyTorch model
def parse_model(d, ch, verbose=True):  # model_dict, input_channels(3)
    """Parse a YOLO model.yaml dictionary into a PyTorch model."""
    import ast

    # Args
    # Default the maximum channel count to infinity; a scale entry may override it below
    max_channels = float("inf")
    
    # Read nc, activation and scales from the model dict
    nc, act, scales = (d.get(x) for x in ("nc", "activation", "scales"))
    
    # Read depth_multiple, width_multiple and kpt_shape, defaulting to 1.0 when absent
    depth, width, kpt_shape = (d.get(x, 1.0) for x in ("depth_multiple", "width_multiple", "kpt_shape"))
    
    # If per-scale gains are defined
    if scales:
        # Read the requested scale key (e.g. n, s, m, l, x)
        scale = d.get("scale")
        # Fall back to the first defined scale and warn when none was passed
        if not scale:
            scale = tuple(scales.keys())[0]
            LOGGER.warning(f"WARNING ⚠️ no model scale passed. Assuming scale='{scale}'.")
        # Apply the depth, width and max_channels gains for the chosen scale
        depth, width, max_channels = scales[scale]
    
    # If an activation override is given in the yaml
    if act:
        # Redefine the default activation for Conv layers, e.g. Conv.default_act = nn.SiLU()
        Conv.default_act = eval(act)
        # Log the chosen activation when verbose
        if verbose:
            LOGGER.info(f"{colorstr('activation:')} {act}")
    
    # Print the table header when verbose
    if verbose:
        # Columns: from, n (repeats), params, module, arguments
        LOGGER.info(f"\n{'':>3}{'from':>20}{'n':>3}{'params':>10}  {'module':<45}{'arguments':<30}")
    
    # Wrap the input channel count in a list to track per-layer output channels
    ch = [ch]
    
    # Initialize the layer list, the save list, and the running output-channel count c2
    layers, save, c2 = [], [], ch[-1]  # layers, savelist, ch out
    
    # Return the assembled nn.Sequential model and the sorted list of layer indices to save
    return nn.Sequential(*layers), sorted(save)
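
The depth, width and max_channels gains set up above are applied to every layer row in the construction loop. Below is a rough, self-contained sketch of how such gains typically scale a layer's repeat count and output channels; the helper name and exact rounding here are assumptions for illustration, not the library's code.

```python
import math


def make_divisible(x, divisor=8):
    """Round a channel count up to the nearest multiple of divisor."""
    return math.ceil(x / divisor) * divisor


depth, width, max_channels = 0.33, 0.25, 1024  # e.g. gains for a small ('n'-like) scale
n, c2 = 3, 256  # hypothetical repeat count and output channels from one layer row

n = max(round(n * depth), 1) if n > 1 else n  # scale the number of block repeats, never below 1
c2 = make_divisible(min(c2, max_channels) * width, 8)  # cap, scale and round the output channels
print(n, c2)  # 1 64
```
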
def yaml_model_load(path):
    """Load a YOLOv8 model from a YAML file."""
    import re  # regular expressions for filename rewriting

    path = Path(path)  # convert to a Path object
    # Rename legacy YOLOv5/YOLOv8 P6 stems (e.g. yolov8x6) to the new "-p6" naming
    if path.stem in (f"yolov{d}{x}6" for x in "nsmlx" for d in (5, 8)):
        new_stem = re.sub(r"(\d+)([nslmx])6(.+)?$", r"\1\2-p6\3", path.stem)
        LOGGER.warning(f"WARNING ⚠️ Ultralytics YOLO P6 models now use -p6 suffix. Renaming {path.stem} to {new_stem}.")
        path = path.with_name(new_stem + path.suffix)

    # Unify the yaml filename by stripping the scale character, e.g. yolov8x.yaml -> yolov8.yaml
    unified_path = re.sub(r"(\d+)([nslmx])(.+)?$", r"\1\3", str(path))
    # Locate the yaml file, preferring the unified path and falling back to the original path
    yaml_file = check_yaml(unified_path, hard=False) or check_yaml(path)
    d = yaml_load(yaml_file)  # load the yaml contents into the model dict
    d["scale"] = guess_model_scale(path)  # infer the model scale from the filename and store it
    d["yaml_file"] = str(path)  # record the original yaml path
    return d  # return the model dict
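
A quick standalone check of what the two renaming regexes above produce:

```python
import re

# P6 rename: legacy "yolov8x6" stems gain the -p6 suffix
print(re.sub(r"(\d+)([nslmx])6(.+)?$", r"\1\2-p6\3", "yolov8x6"))  # yolov8x-p6
# Unification: the scale character is dropped so one yaml covers all scales
print(re.sub(r"(\d+)([nslmx])(.+)?$", r"\1\3", "yolov8x.yaml"))  # yolov8.yaml
```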


def guess_model_scale(model_path):
    """
    Takes a path to a YOLO model's YAML file as input and extracts the size character of the model's scale. The function
    uses regular expression matching to find the pattern of the model scale in the YAML file name, which is denoted by
    n, s, m, l, or x. The function returns the size character of the model scale as a string.

    Args:
        model_path (str | Path): The path to the YOLO model's YAML file.

    Returns:
        (str): The size character of the model's scale, which can be n, s, m, l, or x.
    """
    with contextlib.suppress(AttributeError):
        import re  # regular expressions for scale extraction

        return re.search(r"yolov\d+([nslmx])", Path(model_path).stem).group(1)  # extract the scale character (n/s/m/l/x)
    return ""  # return an empty string if no scale character is found


def guess_model_task(model):
    """
    Guess the task of a PyTorch model from its architecture or configuration.

    Args:
        model (nn.Module | dict): PyTorch model or model configuration in YAML format.

    Returns:
        (str): Task of the model ('detect', 'segment', 'classify', 'pose').

    Raises:
        SyntaxError: If the task of the model could not be determined.
    """

    def cfg2task(cfg):
        """Guess from YAML dictionary."""
        m = cfg["head"][-1][-2].lower()  # 提取输出模块名称,并转换为小写
        if m in {"classify", "classifier", "cls", "fc"}:
            return "classify"  # 分类任务
        if "detect" in m:
            return "detect"  # 目标检测任务
        if m == "segment":
            return "segment"  # 分割任务
        if m == "pose":
            return "pose"  # 姿态估计任务
        if m == "obb":
            return "obb"  # 方向边界框任务

    # Guess the task from a model configuration dict
    if isinstance(model, dict):
        with contextlib.suppress(Exception):
            return cfg2task(model)

    # Guess the task from a PyTorch model's attributes
    if isinstance(model, nn.Module):  # PyTorch model
        # Check the attribute paths that may carry the task in their args
        for x in "model.args", "model.model.args", "model.model.model.args":
            # Return the task name if it can be read from the args
            with contextlib.suppress(Exception):
                return eval(x)["task"]
        
        # Otherwise try to derive the task from an attached yaml configuration
        for x in "model.yaml", "model.model.yaml", "model.model.model.yaml":
            # Parse the yaml dict into a task name
            with contextlib.suppress(Exception):
                return cfg2task(eval(x))

        # Finally, inspect the head modules directly
        for m in model.modules():
            # Map the head module type to a task
            if isinstance(m, Segment):
                return "segment"
            elif isinstance(m, Classify):
                return "classify"
            elif isinstance(m, Pose):
                return "pose"
            elif isinstance(m, OBB):
                return "obb"
            elif isinstance(m, (Detect, WorldDetect, v10Detect)):
                return "detect"

    # Guess the task from a filename or path
    if isinstance(model, (str, Path)):
        model = Path(model)
        # Look for task markers in the stem or path components
        if "-seg" in model.stem or "segment" in model.parts:
            return "segment"
        elif "-cls" in model.stem or "classify" in model.parts:
            return "classify"
        elif "-pose" in model.stem or "pose" in model.parts:
            return "pose"
        elif "-obb" in model.stem or "obb" in model.parts:
            return "obb"
        elif "detect" in model.parts:
            return "detect"

    # Unable to determine the task; warn and assume detection
    LOGGER.warning(
        "WARNING ⚠️ Unable to automatically guess model task, assuming 'task=detect'. "
        "Explicitly define task for your model, i.e. 'task=detect', 'segment', 'classify','pose' or 'obb'."
    )
    return "detect"  # assume detect