YOLOv8 Source Code Analysis (36)

.\yolov8\ultralytics\models\yolo\pose\__init__.py

# Import the PosePredictor class from the predict module
# Import the PoseTrainer class from the train module
# Import the PoseValidator class from the val module
from .predict import PosePredictor
from .train import PoseTrainer
from .val import PoseValidator

# Define __all__ with the class names this module exposes publicly
__all__ = "PoseTrainer", "PoseValidator", "PosePredictor"

.\yolov8\ultralytics\models\yolo\segment\predict.py

# Import the required modules and classes
from ultralytics.engine.results import Results
from ultralytics.models.yolo.detect.predict import DetectionPredictor
from ultralytics.utils import DEFAULT_CFG, ops

class SegmentationPredictor(DetectionPredictor):
    """
    一个扩展了DetectionPredictor类的类,用于基于分割模型进行预测。

    示例:
        ```python
        from ultralytics.utils import ASSETS
        from ultralytics.models.yolo.segment import SegmentationPredictor

        args = dict(model='yolov8n-seg.pt', source=ASSETS)
        predictor = SegmentationPredictor(overrides=args)
        predictor.predict_cli()
        ```py
    """

    def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
        """
        初始化SegmentationPredictor对象,使用提供的配置、覆盖和回调函数。
        """
        super().__init__(cfg, overrides, _callbacks)
        self.args.task = "segment"  # 设置预测任务为分割任务

    def postprocess(self, preds, img, orig_imgs):
        """
        对每个输入批次中的图像应用非最大抑制,并处理检测结果。
        """
        # 对预测结果应用非最大抑制
        p = ops.non_max_suppression(
            preds[0],
            self.args.conf,
            self.args.iou,
            agnostic=self.args.agnostic_nms,
            max_det=self.args.max_det,
            nc=len(self.model.names),
            classes=self.args.classes,
        )

        # If the input images are a torch.Tensor rather than a list, convert them to a numpy batch
        if not isinstance(orig_imgs, list):
            orig_imgs = ops.convert_torch2numpy_batch(orig_imgs)

        results = []  # accumulated Results objects
        proto = preds[1][-1] if isinstance(preds[1], tuple) else preds[1]  # prototype masks; preds[1] is a tuple for PyTorch models, a tensor when exported
        for i, (pred, orig_img, img_path) in enumerate(zip(p, orig_imgs, self.batch[0])):
            if not len(pred):  # no detections: save empty boxes
                masks = None
            elif self.args.retina_masks:  # high-quality masks at native image resolution
                # Scale boxes to the original image, then build masks from the prototypes
                pred[:, :4] = ops.scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)
                masks = ops.process_mask_native(proto[i], pred[:, 6:], pred[:, :4], orig_img.shape[:2])  # HWC
            else:
                # Build masks at network resolution (upsampled), then scale boxes
                masks = ops.process_mask(proto[i], pred[:, 6:], pred[:, :4], img.shape[2:], upsample=True)  # HWC
                pred[:, :4] = ops.scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape)
            # Collect the processed result for this image
            results.append(Results(orig_img, path=img_path, names=self.model.names, boxes=pred[:, :6], masks=masks))
        return results  # the list of processed results
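
In `pred`, columns 0-5 hold the box, confidence, and class, while columns 6+ are per-instance mask coefficients combined with the shared prototype masks in `proto` (a YOLACT-style decomposition: each instance mask is the sigmoid of a linear combination of prototypes). A minimal pure-PyTorch sketch of that idea, with illustrative shapes rather than the exact internals of `ops.process_mask`:

```python
import torch

# Illustrative shapes: 32 prototypes at 160x160 and 5 detections
c, h, w = 32, 160, 160
proto = torch.randn(c, h, w)  # shared prototype masks for one image
coefs = torch.randn(5, c)     # per-detection mask coefficients (pred[:, 6:])

# Each instance mask is the sigmoid of a linear combination of prototypes
masks = (coefs @ proto.view(c, -1)).sigmoid().view(-1, h, w)
print(masks.shape)  # torch.Size([5, 160, 160])
```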

.\yolov8\ultralytics\models\yolo\segment\train.py

# Import the required modules and classes
from copy import copy
from ultralytics.models import yolo
from ultralytics.nn.tasks import SegmentationModel
from ultralytics.utils import DEFAULT_CFG, RANK
from ultralytics.utils.plotting import plot_images, plot_results

# A trainer class extending DetectionTrainer for segmentation models
class SegmentationTrainer(yolo.detect.DetectionTrainer):
    """
    A class extending the DetectionTrainer class for training based on a segmentation model.

    Example:
        ```python
        from ultralytics.models.yolo.segment import SegmentationTrainer

        args = dict(model='yolov8n-seg.pt', data='coco8-seg.yaml', epochs=3)
        trainer = SegmentationTrainer(overrides=args)
        trainer.train()
        ```
    """

    def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
        """Initialize a SegmentationTrainer object with given arguments."""
        if overrides is None:
            overrides = {}
        # Set the task type to segmentation
        overrides["task"] = "segment"
        super().__init__(cfg, overrides, _callbacks)

    def get_model(self, cfg=None, weights=None, verbose=True):
        """Return SegmentationModel initialized with specified config and weights."""
        # Initialize the segmentation model with the given config and weights
        model = SegmentationModel(cfg, ch=3, nc=self.data["nc"], verbose=verbose and RANK == -1)
        if weights:
            model.load(weights)

        return model

    def get_validator(self):
        """Return an instance of SegmentationValidator for validation of YOLO model."""
        # Return a SegmentationValidator instance for validating the YOLO model
        self.loss_names = "box_loss", "seg_loss", "cls_loss", "dfl_loss"
        return yolo.segment.SegmentationValidator(
            self.test_loader, save_dir=self.save_dir, args=copy(self.args), _callbacks=self.callbacks
        )

    def plot_training_samples(self, batch, ni):
        """Creates a plot of training sample images with labels and box coordinates."""
        # Plot training sample images with labels and box coordinates
        plot_images(
            batch["img"],
            batch["batch_idx"],
            batch["cls"].squeeze(-1),
            batch["bboxes"],
            masks=batch["masks"],
            paths=batch["im_file"],
            fname=self.save_dir / f"train_batch{ni}.jpg",
            on_plot=self.on_plot,
        )

    def plot_metrics(self):
        """Plots training/val metrics."""
        # Plot the training/validation metric curves
        plot_results(file=self.csv, segment=True, on_plot=self.on_plot)  # save results.png

.\yolov8\ultralytics\models\yolo\segment\val.py

# Import required modules
from multiprocessing.pool import ThreadPool
from pathlib import Path

# Import NumPy and PyTorch
import numpy as np
import torch
import torch.nn.functional as F

# Import Ultralytics modules and helpers
from ultralytics.models.yolo.detect import DetectionValidator
from ultralytics.utils import LOGGER, NUM_THREADS, ops
from ultralytics.utils.checks import check_requirements
from ultralytics.utils.metrics import SegmentMetrics, box_iou, mask_iou
from ultralytics.utils.plotting import output_to_target, plot_images

# A SegmentationValidator class extending DetectionValidator
class SegmentationValidator(DetectionValidator):
    """
    A class extending the DetectionValidator class for validation based on a segmentation model.

    Example:
        ```python
        from ultralytics.models.yolo.segment import SegmentationValidator

        args = dict(model='yolov8n-seg.pt', data='coco8-seg.yaml')
        validator = SegmentationValidator(args=args)
        validator()
        ```
    """

    def __init__(self, dataloader=None, save_dir=None, pbar=None, args=None, _callbacks=None):
        """Initialize SegmentationValidator and set task to 'segment', metrics to SegmentMetrics."""
        # Call the parent class initializer
        super().__init__(dataloader, save_dir, pbar, args, _callbacks)
        # Initialize additional attributes
        self.plot_masks = None
        self.process = None
        # Set the task to 'segment' and use SegmentMetrics for evaluation
        self.args.task = "segment"
        self.metrics = SegmentMetrics(save_dir=self.save_dir, on_plot=self.on_plot)

    def preprocess(self, batch):
        """Preprocesses batch by converting masks to float and sending to device."""
        # Call the parent class preprocessing
        batch = super().preprocess(batch)
        # Convert the masks to float and move them to the device
        batch["masks"] = batch["masks"].to(self.device).float()
        return batch

    def init_metrics(self, model):
        """Initialize metrics and select mask processing function based on save_json flag."""
        # Call the parent class metric initialization
        super().init_metrics(model)
        # Initialize the list of masks to plot
        self.plot_masks = []
        # When saving JSON, verify the required pycocotools version
        if self.args.save_json:
            check_requirements("pycocotools>=2.0.6")
        # Select the mask-processing function based on the save flags;
        # saving JSON or TXT uses the more accurate native processing
        self.process = ops.process_mask_native if self.args.save_json or self.args.save_txt else ops.process_mask
        # Initialize the statistics dictionary
        self.stats = dict(tp_m=[], tp=[], conf=[], pred_cls=[], target_cls=[], target_img=[])

    def get_desc(self):
        """Return a formatted description of evaluation metrics."""
        # Return a formatted string describing the evaluation metrics
        return ("%22s" + "%11s" * 10) % (
            "Class",
            "Images",
            "Instances",
            "Box(P",
            "R",
            "mAP50",
            "mAP50-95)",
            "Mask(P",
            "R",
            "mAP50",
            "mAP50-95)",
        )
    def postprocess(self, preds):
        """
        Post-processes YOLO predictions and returns output detections with proto.

        Args:
            preds (list): List of prediction outputs from YOLO model.

        Returns:
            tuple: A tuple containing processed predictions (p) and prototype data (proto).
        """
        # Perform non-maximum suppression on the first prediction output
        p = ops.non_max_suppression(
            preds[0],
            self.args.conf,
            self.args.iou,
            labels=self.lb,
            multi_label=True,
            agnostic=self.args.single_cls,
            max_det=self.args.max_det,
            nc=self.nc,
        )
        # Determine the prototype data from the second prediction output
        proto = preds[1][-1] if len(preds[1]) == 3 else preds[1]  # second output is len 3 if pt, but only 1 if exported
        return p, proto

    def _prepare_batch(self, si, batch):
        """
        Prepares a batch for training or inference by processing images and targets.

        Args:
            si (int): Index of the current sample in the batch.
            batch (dict): Dictionary containing batch data including images and targets.

        Returns:
            dict: A prepared batch dictionary with additional 'masks' data.
        """
        # Call superclass method to prepare the batch
        prepared_batch = super()._prepare_batch(si, batch)
        # Determine which indices to use for masks based on overlap_mask flag
        midx = [si] if self.args.overlap_mask else batch["batch_idx"] == si
        # Add masks data to the prepared batch
        prepared_batch["masks"] = batch["masks"][midx]
        return prepared_batch

    def _prepare_pred(self, pred, pbatch, proto):
        """
        Prepares predictions for training or inference by processing images and targets.

        Args:
            pred (Tensor): Predictions from the model.
            pbatch (dict): Prepared batch data.
            proto (Tensor): Prototype data for processing masks.

        Returns:
            tuple: A tuple containing processed predictions (predn) and processed masks (pred_masks).
        """
        # Call superclass method to prepare predictions
        predn = super()._prepare_pred(pred, pbatch)
        # Process masks using prototype data and prediction outputs
        pred_masks = self.process(proto, pred[:, 6:], pred[:, :4], shape=pbatch["imgsz"])
        return predn, pred_masks
    def update_metrics(self, preds, batch):
        """Update evaluation metrics with the predictions from one batch."""
        # Iterate over each sample's predictions and prototype masks
        for si, (pred, proto) in enumerate(zip(preds[0], preds[1])):
            self.seen += 1  # increment the processed-sample counter
            npr = len(pred)  # number of predictions for this sample
            # Initialize the per-sample statistics
            stat = dict(
                conf=torch.zeros(0, device=self.device),  # confidences
                pred_cls=torch.zeros(0, device=self.device),  # predicted classes
                tp=torch.zeros(npr, self.niou, dtype=torch.bool, device=self.device),  # box true positives
                tp_m=torch.zeros(npr, self.niou, dtype=torch.bool, device=self.device),  # mask true positives
            )
            # Prepare this sample's batch data
            pbatch = self._prepare_batch(si, batch)
            # Split out the class and bounding-box targets
            cls, bbox = pbatch.pop("cls"), pbatch.pop("bbox")
            nl = len(cls)  # number of ground-truth labels
            stat["target_cls"] = cls
            stat["target_img"] = cls.unique()
            
            # No predictions for this sample
            if npr == 0:
                if nl:
                    # Append this sample's statistics to the running totals
                    for k in self.stats.keys():
                        self.stats[k].append(stat[k])
                    # Update the confusion matrix when plotting is enabled
                    if self.args.plots:
                        self.confusion_matrix.process_batch(detections=None, gt_bboxes=bbox, gt_cls=cls)
                continue

            # Ground-truth masks for this sample
            gt_masks = pbatch.pop("masks")

            # Prepare predictions (collapse classes in single-class mode)
            if self.args.single_cls:
                pred[:, 5] = 0
            predn, pred_masks = self._prepare_pred(pred, pbatch, proto)
            stat["conf"] = predn[:, 4]
            stat["pred_cls"] = predn[:, 5]

            # Evaluate predictions against ground truth when labels exist
            if nl:
                stat["tp"] = self._process_batch(predn, bbox, cls)
                stat["tp_m"] = self._process_batch(
                    predn, bbox, cls, pred_masks, gt_masks, self.args.overlap_mask, masks=True
                )
                # Update the confusion matrix when plotting is enabled
                if self.args.plots:
                    self.confusion_matrix.process_batch(predn, bbox, cls)

            # Append this sample's statistics to the running totals
            for k in self.stats.keys():
                self.stats[k].append(stat[k])

            # Convert the predicted masks to uint8 and keep a few for plotting
            pred_masks = torch.as_tensor(pred_masks, dtype=torch.uint8)
            if self.args.plots and self.batch_i < 3:
                self.plot_masks.append(pred_masks[:15].cpu())  # keep at most 15 masks per image

            # Save predictions to JSON
            if self.args.save_json:
                self.pred_to_json(
                    predn,
                    batch["im_file"][si],
                    ops.scale_image(
                        pred_masks.permute(1, 2, 0).contiguous().cpu().numpy(),
                        pbatch["ori_shape"],
                        ratio_pad=batch["ratio_pad"][si],
                    ),
                )
            # Save predictions to a txt label file
            if self.args.save_txt:
                self.save_one_txt(
                    predn,
                    pred_masks,
                    self.args.save_conf,
                    pbatch["ori_shape"],
                    self.save_dir / "labels" / f'{Path(batch["im_file"][si]).stem}.txt',
                )
    def finalize_metrics(self, *args, **kwargs):
        """
        Sets speed and confusion matrix for evaluation metrics.
        """
        # Attach the speed and confusion matrix to the metrics object
        self.metrics.speed = self.speed
        self.metrics.confusion_matrix = self.confusion_matrix

    def _process_batch(self, detections, gt_bboxes, gt_cls, pred_masks=None, gt_masks=None, overlap=False, masks=False):
        """
        Compute correct prediction matrix for a batch based on bounding boxes and optional masks.

        Args:
            detections (torch.Tensor): Tensor of shape (N, 6) representing detected bounding boxes and
                associated confidence scores and class indices. Each row is of the format [x1, y1, x2, y2, conf, class].
            gt_bboxes (torch.Tensor): Tensor of shape (M, 4) representing ground truth bounding box coordinates.
                Each row is of the format [x1, y1, x2, y2].
            gt_cls (torch.Tensor): Tensor of shape (M,) representing ground truth class indices.
            pred_masks (torch.Tensor | None): Tensor representing predicted masks, if available. The shape should
                match the ground truth masks.
            gt_masks (torch.Tensor | None): Tensor of shape (M, H, W) representing ground truth masks, if available.
            overlap (bool): Flag indicating if overlapping masks should be considered.
            masks (bool): Flag indicating if the batch contains mask data.

        Returns:
            (torch.Tensor): A correct prediction matrix of shape (N, 10), where 10 represents different IoU levels.

        Note:
            - If `masks` is True, the function computes IoU between predicted and ground truth masks.
            - If `overlap` is True and `masks` is True, overlapping masks are taken into account when computing IoU.

        Example:
            ```python
            detections = torch.tensor([[25, 30, 200, 300, 0.8, 1], [50, 60, 180, 290, 0.75, 0]])
            gt_bboxes = torch.tensor([[24, 29, 199, 299], [55, 65, 185, 295]])
            gt_cls = torch.tensor([1, 0])
            correct_preds = validator._process_batch(detections, gt_bboxes, gt_cls)
            ```
        """
        if masks:
            # Mask branch
            if overlap:
                # Overlapping masks are stored in one channel with per-instance indices
                nl = len(gt_cls)
                # Build per-instance indices and expand the mask to one channel per instance
                index = torch.arange(nl, device=gt_masks.device).view(nl, 1, 1) + 1
                gt_masks = gt_masks.repeat(nl, 1, 1)
                gt_masks = torch.where(gt_masks == index, 1.0, 0.0)
            if gt_masks.shape[1:] != pred_masks.shape[1:]:
                # Interpolate the ground-truth masks to the predicted mask resolution
                gt_masks = F.interpolate(gt_masks[None], pred_masks.shape[1:], mode="bilinear", align_corners=False)[0]
                gt_masks = gt_masks.gt_(0.5)
            # Compute IoU between flattened masks
            iou = mask_iou(gt_masks.view(gt_masks.shape[0], -1), pred_masks.view(pred_masks.shape[0], -1))
        else:  # box branch
            # Compute IoU between boxes
            iou = box_iou(gt_bboxes, detections[:, :4])

        # Match predictions to ground truth at the configured IoU thresholds
        return self.match_predictions(detections[:, 5], gt_cls, iou)
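
The flattened-mask IoU above is conceptually simple: with 0/1 masks, pairwise intersections are a matrix product and unions follow from the per-mask areas. A minimal sketch of that computation (an illustration, not the Ultralytics `mask_iou` implementation itself):

```python
import torch

def simple_mask_iou(gt: torch.Tensor, pred: torch.Tensor) -> torch.Tensor:
    """IoU between every (gt, pred) pair of flattened binary masks.

    gt:   (M, H*W) tensor of 0/1 values
    pred: (N, H*W) tensor of 0/1 values
    Returns an (M, N) IoU matrix.
    """
    gt, pred = gt.float(), pred.float()
    inter = gt @ pred.T  # (M, N) intersection areas
    union = gt.sum(1, keepdim=True) + pred.sum(1) - inter
    return inter / (union + 1e-7)

gt = torch.tensor([[1, 1, 0, 0], [0, 0, 1, 1]])
pred = torch.tensor([[1, 0, 0, 0], [0, 0, 1, 1]])
print(simple_mask_iou(gt, pred))  # diagonal is approximately [0.5, 1.0]
```
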
    def plot_val_samples(self, batch, ni):
        """Plots validation samples with bounding box labels."""
        # Plot validation images with bounding-box labels via plot_images
        plot_images(
            batch["img"],  # image data
            batch["batch_idx"],  # batch indices
            batch["cls"].squeeze(-1),  # class labels, squeezed
            batch["bboxes"],  # bounding boxes
            masks=batch["masks"],  # instance masks (optional)
            paths=batch["im_file"],  # image file paths
            fname=self.save_dir / f"val_batch{ni}_labels.jpg",  # output file name
            names=self.names,  # class-name mapping
            on_plot=self.on_plot,  # plotting callback
        )

    def plot_predictions(self, batch, preds, ni):
        """Plots batch predictions with masks and bounding boxes."""
        # Plot predicted boxes and masks via plot_images
        plot_images(
            batch["img"],  # image data
            *output_to_target(preds[0], max_det=15),  # convert predictions to target format, at most 15 detections
            torch.cat(self.plot_masks, dim=0) if len(self.plot_masks) else self.plot_masks,  # stacked plot masks
            paths=batch["im_file"],  # image file paths
            fname=self.save_dir / f"val_batch{ni}_pred.jpg",  # output file name
            names=self.names,  # class-name mapping
            on_plot=self.on_plot,  # plotting callback
        )  # pred
        self.plot_masks.clear()  # reset the mask buffer

    def save_one_txt(self, predn, pred_masks, save_conf, shape, file):
        """Save YOLO detections to a txt file in normalized coordinates in a specific format."""
        # Use the Results class to write detections in the normalized txt format
        from ultralytics.engine.results import Results

        Results(
            np.zeros((shape[0], shape[1]), dtype=np.uint8),  # all-zero placeholder image
            path=None,  # no path information saved
            names=self.names,  # class-name mapping
            boxes=predn[:, :6],  # bounding boxes
            masks=pred_masks,  # instance masks
        ).save_txt(file, save_conf=save_conf)  # write the txt file via Results.save_txt

    def pred_to_json(self, predn, filename, pred_masks):
        """
        Save one JSON result.

        Examples:
             >>> result = {"image_id": 42, "category_id": 18, "bbox": [258.15, 41.29, 348.26, 243.78], "score": 0.236}
        """
        # Import the RLE encoder from pycocotools
        from pycocotools.mask import encode  # noqa

        def single_encode(x):
            """Encode one predicted mask as RLE."""
            rle = encode(np.asarray(x[:, :, None], order="F", dtype="uint8"))[0]
            rle["counts"] = rle["counts"].decode("utf-8")  # decode the counts bytes to UTF-8 for JSON
            return rle

        stem = Path(filename).stem  # file name without suffix
        image_id = int(stem) if stem.isnumeric() else stem  # numeric stems become integer image ids
        box = ops.xyxy2xywh(predn[:, :4])  # convert boxes from xyxy to xywh
        box[:, :2] -= box[:, 2:] / 2  # shift the center coordinates to the top-left corner
        pred_masks = np.transpose(pred_masks, (2, 0, 1))  # HWC -> CHW
        with ThreadPool(NUM_THREADS) as pool:  # encode masks in parallel with a thread pool
            rles = pool.map(single_encode, pred_masks)
        for i, (p, b) in enumerate(zip(predn.tolist(), box.tolist())):  # iterate over predictions and boxes
            self.jdict.append(  # append one result dict per detection
                {
                    "image_id": image_id,  # image id
                    "category_id": self.class_map[int(p[5])],  # category id via the class map
                    "bbox": [round(x, 3) for x in b],  # box coordinates, 3 decimals
                    "score": round(p[4], 5),  # confidence, 5 decimals
                    "segmentation": rles[i],  # RLE-encoded mask
                }
            )
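
For reference, `pycocotools.mask.encode` expects a Fortran-ordered uint8 array of shape (H, W, 1), and the returned `counts` field is bytes that must be decoded before JSON serialization, which is exactly what `single_encode` does above. A standalone round-trip sketch (assumes pycocotools is installed):

```python
import numpy as np
from pycocotools.mask import decode, encode

mask = np.zeros((4, 4), dtype=np.uint8)
mask[1:3, 1:3] = 1  # a 2x2 square of foreground pixels

rle = encode(np.asarray(mask[:, :, None], order="F"))[0]
print(rle["size"])  # [4, 4]

# Round-trip check: decoding the RLE recovers the original mask
assert (decode(rle) == mask).all()
```
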
    def eval_json(self, stats):
        """Return COCO-style object detection evaluation metrics."""
        # Only evaluate when saving JSON on a COCO dataset with accumulated predictions
        if self.args.save_json and self.is_coco and len(self.jdict):
            # Paths to the annotation and prediction files
            anno_json = self.data["path"] / "annotations/instances_val2017.json"  # annotations
            pred_json = self.save_dir / "predictions.json"  # predictions
            LOGGER.info(f"\nEvaluating pycocotools mAP using {pred_json} and {anno_json}...")
            try:  # https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocoEvalDemo.ipynb
                # Check for the required pycocotools version, then import its APIs
                check_requirements("pycocotools>=2.0.6")
                from pycocotools.coco import COCO  # noqa
                from pycocotools.cocoeval import COCOeval  # noqa

                # Both files must exist
                for x in anno_json, pred_json:
                    assert x.is_file(), f"{x} file not found"
                anno = COCO(str(anno_json))  # init annotations api
                pred = anno.loadRes(str(pred_json))  # init predictions api (must pass string, not Path)
                # Run both bbox and segm evaluations
                for i, eval in enumerate([COCOeval(anno, pred, "bbox"), COCOeval(anno, pred, "segm")]):
                    # Restrict the evaluation to this dataset's image ids
                    if self.is_coco:
                        eval.params.imgIds = [int(Path(x).stem) for x in self.dataloader.dataset.im_files]  # im to eval
                    eval.evaluate()
                    eval.accumulate()
                    eval.summarize()
                    # Update mAP50-95 and mAP50 in the stats dict
                    idx = i * 4 + 2
                    stats[self.metrics.keys[idx + 1]], stats[self.metrics.keys[idx]] = eval.stats[:2]
            except Exception as e:
                # Log a warning on failure
                LOGGER.warning(f"pycocotools unable to run: {e}")
        # Return the updated statistics
        return stats

.\yolov8\ultralytics\models\yolo\segment\__init__.py

# Import SegmentationPredictor, SegmentationTrainer, and SegmentationValidator from this package
from .predict import SegmentationPredictor
from .train import SegmentationTrainer
from .val import SegmentationValidator

# __all__ specifies the public interface imported by `from package import *`
__all__ = "SegmentationPredictor", "SegmentationTrainer", "SegmentationValidator"

.\yolov8\ultralytics\models\yolo\world\train.py

# Ultralytics YOLO 🚀, AGPL-3.0 license

import itertools  # import the itertools module

from ultralytics.data import build_yolo_dataset  # dataset-building helper
from ultralytics.models import yolo  # YOLO models package
from ultralytics.nn.tasks import WorldModel  # WorldModel class
from ultralytics.utils import DEFAULT_CFG, RANK, checks  # defaults, rank, and check helpers
from ultralytics.utils.torch_utils import de_parallel  # de_parallel helper


def on_pretrain_routine_end(trainer):
    """Callback that sets evaluation class names and loads the frozen CLIP text model."""
    if RANK in {-1, 0}:
        # NOTE: for evaluation
        # Keep only the part before the first "/" of each dataset class name
        names = [name.split("/")[0] for name in list(trainer.test_loader.dataset.data["names"].values())]
        # Set the class names on the EMA model without caching the CLIP model
        de_parallel(trainer.ema.ema).set_classes(names, cache_clip_model=False)
    device = next(trainer.model.parameters()).device  # device of the model's first parameter
    # Load the ViT-B/32 CLIP model onto the same device
    trainer.text_model, _ = trainer.clip.load("ViT-B/32", device=device)
    # Freeze all text-model parameters
    for p in trainer.text_model.parameters():
        p.requires_grad_(False)


class WorldTrainer(yolo.detect.DetectionTrainer):
    """
    A class to fine-tune a world model on a closed-set dataset.

    Example:
        ```python
        from ultralytics.models.yolo.world import WorldModel

        args = dict(model='yolov8s-world.pt', data='coco8.yaml', epochs=3)
        trainer = WorldTrainer(overrides=args)
        trainer.train()
        ```
    """

    def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
        """Initialize a WorldTrainer object with given arguments."""
        if overrides is None:
            overrides = {}
        # Initialize via the parent constructor
        super().__init__(cfg, overrides, _callbacks)

        # Import and assign clip
        try:
            import clip
        except ImportError:
            # Install the CLIP package if it is missing, then import it
            checks.check_requirements("git+https://github.com/ultralytics/CLIP.git")
            import clip
        self.clip = clip  # keep a reference to the clip module

    def get_model(self, cfg=None, weights=None, verbose=True):
        """Return WorldModel initialized with specified config and weights."""
        # NOTE: This `nc` here is the max number of different text samples in one image, rather than the actual `nc`.
        # NOTE: Following the official config, nc hard-coded to 80 for now.
        # Initialize the WorldModel from cfg and weights; nc is capped at min(dataset nc, 80)
        model = WorldModel(
            cfg["yaml_file"] if isinstance(cfg, dict) else cfg,
            ch=3,
            nc=min(self.data["nc"], 80),
            verbose=verbose and RANK == -1,
        )
        if weights:
            model.load(weights)  # load the model weights if provided
        self.add_callback("on_pretrain_routine_end", on_pretrain_routine_end)  # register the pretrain-end callback

        return model  # the initialized WorldModel
    def build_dataset(self, img_path, mode="train", batch=None):
        """Build a YOLO dataset for the given image path, mode, and batch size."""
        # Maximum model stride (at least 32), used as the grid size
        gs = max(int(de_parallel(self.model).stride.max() if self.model else 0), 32)
        # Build and return the YOLO dataset; multi-modal samples are used in train mode
        return build_yolo_dataset(
            self.args, img_path, batch, self.data, mode=mode, rect=mode == "val", stride=gs, multi_modal=mode == "train"
        )


    def preprocess_batch(self, batch):
        """Preprocess a batch of images and attach CLIP text features."""
        # Let the parent class preprocess the image batch
        batch = super().preprocess_batch(batch)

        # NOTE: add text features
        # Flatten the per-image text lists into one list
        texts = list(itertools.chain(*batch["texts"]))
        # Tokenize the texts with CLIP and move them to the image device
        text_token = self.clip.tokenize(texts).to(batch["img"].device)
        # Encode the tokens and cast to the image dtype
        txt_feats = self.text_model.encode_text(text_token).to(dtype=batch["img"].dtype)
        # L2-normalize the text features
        txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True)
        # Reshape to (batch, num_texts, embed_dim) and store in the batch dict
        batch["txt_feats"] = txt_feats.reshape(len(batch["texts"]), -1, txt_feats.shape[-1])
        # Return the preprocessed batch
        return batch
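
The text branch thus yields one L2-normalized embedding per class phrase, so similarity against image features reduces to a dot product. A minimal standalone sketch of that flow using the OpenAI `clip` package (assumes it is installed; the weights download on first use):

```python
import clip
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model, _ = clip.load("ViT-B/32", device=device)

texts = ["person", "bus", "traffic light"]
tokens = clip.tokenize(texts).to(device)
with torch.no_grad():
    feats = model.encode_text(tokens).float()
feats = feats / feats.norm(p=2, dim=-1, keepdim=True)  # unit-norm rows
print(feats.shape, feats.norm(dim=-1))  # torch.Size([3, 512]), norms approximately 1
```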

.\yolov8\ultralytics\models\yolo\world\train_world.py

# Import the required modules and functions
from ultralytics.data import YOLOConcatDataset, build_grounding, build_yolo_dataset
from ultralytics.data.utils import check_det_dataset
from ultralytics.models.yolo.world import WorldTrainer
from ultralytics.utils import DEFAULT_CFG
from ultralytics.utils.torch_utils import de_parallel

# A class derived from WorldTrainer for training a world model from scratch
class WorldTrainerFromScratch(WorldTrainer):
    """
    A class extending the WorldTrainer class for training a world model from scratch on open-set dataset.

    Example:
        ```python
        from ultralytics.models.yolo.world.train_world import WorldTrainerFromScratch
        from ultralytics import YOLOWorld

        data = dict(
            train=dict(
                yolo_data=["Objects365.yaml"],
                grounding_data=[
                    dict(
                        img_path="../datasets/flickr30k/images",
                        json_file="../datasets/flickr30k/final_flickr_separateGT_train.json",
                    ),
                    dict(
                        img_path="../datasets/GQA/images",
                        json_file="../datasets/GQA/final_mixed_train_no_coco.json",
                    ),
                ],
            ),
            val=dict(yolo_data=["lvis.yaml"]),
        )

        model = YOLOWorld("yolov8s-worldv2.yaml")
        model.train(data=data, trainer=WorldTrainerFromScratch)
        ```
    """

    def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
        """Initialize a WorldTrainer object with given arguments."""
        # Default overrides to an empty dict when not provided
        if overrides is None:
            overrides = {}
        # Initialize via the parent constructor
        super().__init__(cfg, overrides, _callbacks)

    def build_dataset(self, img_path, mode="train", batch=None):
        """
        Build YOLO Dataset.

        Args:
            img_path (List[str] | str): Path to the folder containing images.
            mode (str): `train` mode or `val` mode, users are able to customize different augmentations for each mode.
            batch (int, optional): Size of batches, this is for `rect`. Defaults to None.
        """
        # Maximum model stride (at least 32), used as the grid size
        gs = max(int(de_parallel(self.model).stride.max() if self.model else 0), 32)
        # For validation, build a plain YOLO dataset
        if mode != "train":
            return build_yolo_dataset(self.args, img_path, batch, self.data, mode=mode, rect=mode == "val", stride=gs)
        # For training, build a multi-modal YOLO dataset or a grounding dataset per entry
        dataset = [
            build_yolo_dataset(self.args, im_path, batch, self.data, stride=gs, multi_modal=True)
            if isinstance(im_path, str)
            else build_grounding(self.args, im_path["img_path"], im_path["json_file"], batch, stride=gs)
            for im_path in img_path
        ]
        # Concatenate multiple datasets, or return the single dataset directly
        return YOLOConcatDataset(dataset) if len(dataset) > 1 else dataset[0]
    def get_dataset(self):
        """
        Get train, val path from data dict if it exists.

        Returns None if data format is not recognized.
        """
        # The final data dict to assemble
        final_data = {}
        # The data dict passed via args
        data_yaml = self.args.data
        # The data dict must contain train and val entries
        assert data_yaml.get("train", False), "train dataset not found"  # object365.yaml
        assert data_yaml.get("val", False), "validation dataset not found"  # lvis.yaml
        # Check every yolo_data dataset listed for each split
        data = {k: [check_det_dataset(d) for d in v.get("yolo_data", [])] for k, v in data_yaml.items()}
        # Only a single validation dataset is supported
        assert len(data["val"]) == 1, f"Only support validating on 1 dataset for now, but got {len(data['val'])}."
        # Choose the validation split based on the dataset name
        val_split = "minival" if "lvis" in data["val"][0]["val"] else "val"
        # Resolve minival paths (LVIS dataset)
        for d in data["val"]:
            if d.get("minival") is None:  # for lvis dataset
                continue
            d["minival"] = str(d["path"] / d["minival"])
        # Collect the train and val data paths into final_data
        for s in ["train", "val"]:
            final_data[s] = [d["train" if s == "train" else val_split] for d in data[s]]
            # Append any grounding data for this split
            grounding_data = data_yaml[s].get("grounding_data")
            if grounding_data is None:
                continue
            grounding_data = grounding_data if isinstance(grounding_data, list) else [grounding_data]
            # Grounding data entries must be dicts
            for g in grounding_data:
                assert isinstance(g, dict), f"Grounding data should be provided in dict format, but got {type(g)}"
            final_data[s] += grounding_data
        # Set the class count and class names used for training
        final_data["nc"] = data["val"][0]["nc"]
        final_data["names"] = data["val"][0]["names"]
        # Store the assembled data on the trainer
        self.data = final_data
        # Return the train and val paths
        return final_data["train"], final_data["val"][0]

    def plot_training_labels(self):
        """DO NOT plot labels."""
        # Intentionally a no-op so labels are not plotted

    def final_eval(self):
        """Performs final evaluation and validation for object detection YOLO-World model."""
        # The single validation dataset
        val = self.args.data["val"]["yolo_data"][0]
        # Point the validator at that dataset and choose the split
        self.validator.args.data = val
        self.validator.args.split = "minival" if isinstance(val, str) and "lvis" in val else "val"
        # Run the parent class's final evaluation
        return super().final_eval()

.\yolov8\ultralytics\models\yolo\world\__init__.py

# Import the WorldTrainer class from the .train module
from .train import WorldTrainer

# Add WorldTrainer to this module's __all__ so it is exported by import *
__all__ = ["WorldTrainer"]

.\yolov8\ultralytics\models\yolo\__init__.py

# Import the Ultralytics YOLO task modules
from ultralytics.models.yolo import classify, detect, obb, pose, segment, world

# Import the YOLO and YOLOWorld classes from the current package
from .model import YOLO, YOLOWorld

# Define __all__ with all symbols this package exposes
__all__ = "classify", "segment", "detect", "pose", "obb", "world", "YOLO", "YOLOWorld"

.\yolov8\ultralytics\models\__init__.py

# Import the Ultralytics model classes (all AGPL-3.0 licensed)

# Import the FastSAM class from the local fastsam module
from .fastsam import FastSAM

# Import the NAS class from the local nas module
from .nas import NAS

# Import the RTDETR class from the local rtdetr module
from .rtdetr import RTDETR

# Import the SAM class from the local sam module
from .sam import SAM

# Import the YOLO and YOLOWorld classes from the local yolo module
from .yolo import YOLO, YOLOWorld

# Define __all__ so these classes can be imported directly from the package
__all__ = "YOLO", "RTDETR", "SAM", "FastSAM", "NAS", "YOLOWorld"  # allow simpler import

.\yolov8\ultralytics\nn\autobackend.py

# Import the required modules and libraries
import ast  # abstract syntax tree utilities
import contextlib  # context-manager helpers
import json  # JSON handling
import platform  # platform information
import zipfile  # ZIP file handling
from collections import OrderedDict, namedtuple  # ordered dict and named tuple
from pathlib import Path  # filesystem path handling

import cv2  # OpenCV image processing
import numpy as np  # numerical arrays
import torch  # PyTorch deep-learning framework
import torch.nn as nn  # neural-network modules
from PIL import Image  # Python Imaging Library

# Import Ultralytics helper functions and constants
from ultralytics.utils import ARM64, IS_JETSON, IS_RASPBERRYPI, LINUX, LOGGER, ROOT, yaml_load
from ultralytics.utils.checks import check_requirements, check_suffix, check_version, check_yaml
from ultralytics.utils.downloads import attempt_download_asset, is_url


def check_class_names(names):
    """
    检查类别名称。

    如果需要,将 ImageNet 类别映射到可读的名称。将列表转换为字典形式。
    """
    if isinstance(names, list):  # names is a list
        names = dict(enumerate(names))  # convert to a dict
    if isinstance(names, dict):
        # Convert string keys to int, e.g. '0' to 0, and non-string values to strings, e.g. True to 'True'
        names = {int(k): str(v) for k, v in names.items()}
        n = len(names)
        if max(names.keys()) >= n:
            raise KeyError(
                f"{n}-class dataset requires class indices 0-{n - 1}, but you have invalid class indices "
                f"{min(names.keys())}-{max(names.keys())} defined in your dataset YAML."
            )
        if isinstance(names[0], str) and names[0].startswith("n0"):  # ImageNet class codes, e.g. 'n01440764'
            names_map = yaml_load(ROOT / "cfg/datasets/ImageNet.yaml")["map"]  # human-readable names map
            names = {k: names_map[v] for k, v in names.items()}
    return names
    return names
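
A quick usage sketch of the normalization above:

```python
from ultralytics.nn.autobackend import check_class_names

print(check_class_names(["person", "bicycle", "car"]))
# {0: 'person', 1: 'bicycle', 2: 'car'}
print(check_class_names({"0": "person", "1": True}))
# {0: 'person', 1: 'True'}
```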


def default_class_names(data=None):
    """为输入的 YAML 文件应用默认类别名称,或返回数值类别名称。"""
    if data:
        with contextlib.suppress(Exception):
            return yaml_load(check_yaml(data))["names"]
    return {i: f"class{i}" for i in range(999)}  # 如果出错,返回默认的数值类别名称


class AutoBackend(nn.Module):
    """
    处理使用 Ultralytics YOLO 模型进行推理时的动态后端选择。

    AutoBackend 类设计为提供各种推理引擎的抽象层。它支持广泛
    """
    range of formats, each with specific naming conventions as outlined below:

        Supported Formats and Naming Conventions:
            | Format                | File Suffix      |
            |-----------------------|------------------|
            | PyTorch               | *.pt             |
            | TorchScript           | *.torchscript    |
            | ONNX Runtime          | *.onnx           |
            | ONNX OpenCV DNN       | *.onnx (dnn=True)|
            | OpenVINO              | *openvino_model/ |
            | CoreML                | *.mlpackage      |
            | TensorRT              | *.engine         |
            | TensorFlow SavedModel | *_saved_model    |
            | TensorFlow GraphDef   | *.pb             |
            | TensorFlow Lite       | *.tflite         |
            | TensorFlow Edge TPU   | *_edgetpu.tflite |
            | PaddlePaddle          | *_paddle_model   |
            | NCNN                  | *_ncnn_model     |

    This class offers dynamic backend switching capabilities based on the input model format, making it easier to deploy
    models across various platforms.
    """



    # 以下是初始化函数的定义,使用了torch.no_grad()修饰符来确保初始化过程中不会计算梯度
    @torch.no_grad()
    def __init__(
        self,
        weights="yolov8n.pt",
        device=torch.device("cpu"),
        dnn=False,
        data=None,
        fp16=False,
        batch=1,
        fuse=True,
        verbose=True,
    ):
        """
        Initialize the AutoBackend for inference.

        Args:
            weights (str): Path to the model weights file. Defaults to 'yolov8n.pt'.
            device (torch.device): Device to run the model on. Defaults to CPU.
            dnn (bool): Use OpenCV DNN module for ONNX inference. Defaults to False.
            data (str | Path | optional): Path to the additional data.yaml file containing class names. Optional.
            fp16 (bool): Enable half-precision inference. Supported only on specific backends. Defaults to False.
            batch (int): Batch-size to assume for inference.
            fuse (bool): Fuse Conv2D + BatchNorm layers for optimization. Defaults to True.
            verbose (bool): Enable verbose logging. Defaults to True.
        """
    def from_numpy(self, x):
        """
        Convert a numpy array to a tensor.

        Args:
            x (np.ndarray): The array to be converted.

        Returns:
            (torch.Tensor): The converted tensor
        """
        return torch.tensor(x).to(self.device) if isinstance(x, np.ndarray) else x

    def warmup(self, imgsz=(1, 3, 640, 640)):
        """
        Warm up the model by running one forward pass with a dummy input.

        Args:
            imgsz (tuple): The shape of the dummy input tensor in the format (batch_size, channels, height, width)
        """
        import torchvision  # noqa (import here so torchvision import time not recorded in postprocess time)

        # Backend types that benefit from a warmup pass
        warmup_types = self.pt, self.jit, self.onnx, self.engine, self.saved_model, self.pb, self.triton, self.nn_module
        # Warm up when any such backend is active and the device is not CPU (or Triton is in use)
        if any(warmup_types) and (self.device.type != "cpu" or self.triton):
            # Create a dummy input tensor
            im = torch.empty(*imgsz, dtype=torch.half if self.fp16 else torch.float, device=self.device)  # input
            # JIT models get two warmup passes, others get one
            for _ in range(2 if self.jit else 1):
                self.forward(im)  # warmup

    @staticmethod
    def _model_type(p="path/to/model.pt"):
        """
        This function takes a path to a model file and returns the model type. Possibles types are pt, jit, onnx, xml,
        engine, coreml, saved_model, pb, tflite, edgetpu, tfjs, ncnn or paddle.

        Args:
            p: path to the model file. Defaults to path/to/model.pt

        Examples:
            >>> model = AutoBackend(weights="path/to/model.onnx")
            >>> model_type = model._model_type()  # returns "onnx"
        """
        from ultralytics.engine.exporter import export_formats  # import export_formats from ultralytics.engine.exporter

        sf = list(export_formats().Suffix)  # suffixes of the export formats
        if not is_url(p) and not isinstance(p, str):  # p is neither a URL nor a string
            check_suffix(p, sf)  # check that p has an expected suffix
        name = Path(p).name  # file-name portion of the path
        types = [s in name for s in sf]  # which export-format suffixes the name contains
        types[5] |= name.endswith(".mlmodel")  # retain support for older Apple CoreML *.mlmodel formats
        types[8] &= not types[9]  # tflite &= not edgetpu, so Edge TPU models are not also flagged as tflite
        if any(types):  # a known format matched
            triton = False
        else:
            from urllib.parse import urlsplit  # import urlsplit from urllib.parse

            url = urlsplit(p)  # parse p as a URL
            triton = bool(url.netloc) and bool(url.path) and url.scheme in {"http", "grpc"}  # Triton URL check

        return types + [triton]  # the types list plus the triton flag

.\yolov8\ultralytics\nn\modules\activation.py

# Import the required PyTorch modules
import torch
import torch.nn as nn

# AGLU inherits from nn.Module, making it a PyTorch module
class AGLU(nn.Module):
    """Unified activation function module from https://github.com/kostas1515/AGLU."""

    def __init__(self, device=None, dtype=None) -> None:
        """Initialize the Unified activation function module."""
        super().__init__()
        # Softplus with beta=-1.0 serves as the base activation
        self.act = nn.Softplus(beta=-1.0)
        # lambda is a learnable nn.Parameter, initialized from a uniform distribution
        self.lambd = nn.Parameter(nn.init.uniform_(torch.empty(1, device=device, dtype=dtype)))  # lambda parameter
        # kappa is a learnable nn.Parameter, initialized from a uniform distribution
        self.kappa = nn.Parameter(nn.init.uniform_(torch.empty(1, device=device, dtype=dtype)))  # kappa parameter

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Compute the forward pass of the Unified activation function."""
        # Clamp lambda to a minimum of 0.0001 for numerical stability
        lam = torch.clamp(self.lambd, min=0.0001)
        # exp((1 / lambda) * softplus_{beta=-1}(kappa * x - log(lambda)))
        y = torch.exp((1 / lam) * self.act((self.kappa * x) - torch.log(lam)))
        return y
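
A quick usage sketch: AGLU is a drop-in elementwise activation, so the output shape matches the input shape:

```python
import torch

from ultralytics.nn.modules.activation import AGLU

act = AGLU()
x = torch.linspace(-3, 3, 7)
print(act(x).shape)  # torch.Size([7]), applied elementwise
```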

.\yolov8\ultralytics\nn\modules\block.py

# Import the required PyTorch modules
import torch
import torch.nn as nn
import torch.nn.functional as F

# Import the fuse_conv_and_bn helper
from ultralytics.utils.torch_utils import fuse_conv_and_bn

# Import convolution modules from the local conv.py
from .conv import Conv, DWConv, GhostConv, LightConv, RepConv, autopad

# Import TransformerBlock from the local transformer.py
from .transformer import TransformerBlock

# The public interface of this module: all exported classes and functions
__all__ = (
    "DFL",
    "HGBlock",
    "HGStem",
    "SPP",
    "SPPF",
    "C1",
    "C2",
    "C3",
    "C2f",
    "C2fAttn",
    "ImagePoolingAttn",
    "ContrastiveHead",
    "BNContrastiveHead",
    "C3x",
    "C3TR",
    "C3Ghost",
    "GhostBottleneck",
    "Bottleneck",
    "BottleneckCSP",
    "Proto",
    "RepC3",
    "ResNetLayer",
    "RepNCSPELAN4",
    "ELAN1",
    "ADown",
    "AConv",
    "SPPELAN",
    "CBFuse",
    "CBLinear",
    "RepVGGDW",
    "CIB",
    "C2fCIB",
    "Attention",
    "PSA",
    "SCDown",
)

# DFL: the integral module of Distribution Focal Loss
class DFL(nn.Module):
    """
    Integral module of Distribution Focal Loss (DFL).

    Proposed in Generalized Focal Loss: https://ieeexplore.ieee.org/document/9792391
    """

    def __init__(self, c1=16):
        """
        Initialize a convolutional layer with the given number of input channels.

        Args:
            c1 (int): number of input channels (distribution bins)
        """
        super().__init__()
        # A 1x1 conv with c1 input channels, 1 output channel, and no bias; gradients are frozen
        self.conv = nn.Conv2d(c1, 1, 1, bias=False).requires_grad_(False)
        # Fix the kernel weights to the bin indices 0..c1-1
        x = torch.arange(c1, dtype=torch.float)
        self.conv.weight.data[:] = nn.Parameter(x.view(1, c1, 1, 1))
        self.c1 = c1

    def forward(self, x):
        """
        Apply the DFL transform to input tensor 'x' and return the decoded tensor.

        Args:
            x (tensor): input tensor of shape [b, 4 * c1, a]

        Returns:
            tensor: decoded tensor of shape [b, 4, a]
        """
        b, _, a = x.shape  # batch, channels, anchors
        # Reshape, softmax over the c1 bins, then take the expectation via the frozen conv
        return self.conv(x.view(b, 4, self.c1, a).transpose(2, 1).softmax(1)).view(b, 4, a)
        # return self.conv(x.view(b, self.c1, 4, a).softmax(1)).view(b, 4, a)
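
The decoding above is just an expectation over a discretized distribution: softmax over the c1 bins, then a dot product with the bin indices 0..c1-1, which is exactly what the frozen 1x1 conv computes. A worked sketch for a single box edge with c1 = 4:

```python
import torch

logits = torch.tensor([0.0, 0.0, 4.0, 0.0])  # raw scores over c1=4 bins
probs = logits.softmax(0)                    # distribution over the bins
bins = torch.arange(4, dtype=torch.float)    # the frozen conv weights: 0, 1, 2, 3
print((probs * bins).sum())  # ~1.97, the expected offset, close to bin 2
```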

# Proto: the YOLOv8 mask prototype module for segmentation models
class Proto(nn.Module):
    """YOLOv8 mask Proto module for segmentation models."""

    def __init__(self, c1, c_=256, c2=32):
        """
        Initialize the YOLOv8 mask Proto module with the given channel counts.

        Args:
            c1 (int): input channels
            c_ (int): prototype channels, 256 by default
            c2 (int): mask channels, 32 by default
        """
        super().__init__()
        # First conv: c1 -> c_ with a 3x3 kernel
        self.cv1 = Conv(c1, c_, k=3)
        # Upsampling layer: doubles the spatial resolution, channels unchanged
        self.upsample = nn.ConvTranspose2d(c_, c_, 2, 2, 0, bias=True)  # nn.Upsample(scale_factor=2, mode='nearest')
        # Second conv: c_ -> c_ with a 3x3 kernel
        self.cv2 = Conv(c_, c_, k=3)
        # Third conv: c_ -> c2
        self.cv3 = Conv(c_, c2)

    def forward(self, x):
        """
        Perform a forward pass through the layers, upsampling the input along the way.

        Args:
            x (tensor): input tensor

        Returns:
            tensor: output of the Proto module
        """
        return self.cv3(self.cv2(self.upsample(self.cv1(x))))
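
The ConvTranspose2d(c_, c_, 2, 2, 0) used here doubles the spatial resolution, matching the commented-out nn.Upsample alternative. A quick shape check:

```python
import torch
import torch.nn as nn

up = nn.ConvTranspose2d(256, 256, kernel_size=2, stride=2, padding=0)
x = torch.randn(1, 256, 20, 20)
print(up(x).shape)  # torch.Size([1, 256, 40, 40]), spatial dims doubled
```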


class HGStem(nn.Module):
    """
    StemBlock of PPHGNetV2 with 5 convolutions and one MaxPool2d.

    Reference: https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py
    """
    def __init__(self, c1, cm, c2):
        """Initialize the StemBlock of PPHGNetV2 with input/output channels."""
        super().__init__()

        # First conv: c1 -> cm, 3x3 kernel, stride 2, ReLU
        self.stem1 = Conv(c1, cm, 3, 2, act=nn.ReLU())

        # Second conv, branch a: cm -> cm//2, 2x2 kernel, stride 1, no padding, ReLU
        self.stem2a = Conv(cm, cm // 2, 2, 1, 0, act=nn.ReLU())

        # Second conv, branch b: cm//2 -> cm, 2x2 kernel, stride 1, no padding, ReLU
        self.stem2b = Conv(cm // 2, cm, 2, 1, 0, act=nn.ReLU())

        # Third conv: 2*cm -> cm, 3x3 kernel, stride 2, ReLU
        self.stem3 = Conv(cm * 2, cm, 3, 2, act=nn.ReLU())

        # Fourth conv: cm -> c2, 1x1 kernel, stride 1, ReLU
        self.stem4 = Conv(cm, c2, 1, 1, act=nn.ReLU())

        # Max pooling: 2x2 kernel, stride 1, no padding, ceil mode when computing the output size
        self.pool = nn.MaxPool2d(kernel_size=2, stride=1, padding=0, ceil_mode=True)

    def forward(self, x):
        """Forward pass of a PPHGNetV2 backbone layer."""
        x = self.stem1(x)  # first conv
        x = F.pad(x, [0, 1, 0, 1])  # pad one column/row of zeros on the right and bottom
        x2 = self.stem2a(x)  # second conv, branch a
        x2 = F.pad(x2, [0, 1, 0, 1])  # pad one column/row of zeros on the right and bottom
        x2 = self.stem2b(x2)  # second conv, branch b
        x1 = self.pool(x)  # max-pool the main branch
        x = torch.cat([x1, x2], dim=1)  # concatenate both branches along channels
        x = self.stem3(x)  # third conv
        x = self.stem4(x)  # fourth conv
        return x
# HGBlock: a PPHGNetV2 module built from LightConv/Conv blocks plus squeeze and excitation convs
class HGBlock(nn.Module):
    """
    HG_Block of PPHGNetV2 with 2 convolutions and LightConv.

    https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py
    """

    def __init__(self, c1, cm, c2, k=3, n=6, lightconv=False, shortcut=False, act=nn.ReLU()):
        """Initialize the HGBlock with the given input/output channels and block count."""
        super().__init__()
        block = LightConv if lightconv else Conv
        # n LightConv or Conv modules; the first takes c1 channels, the rest cm
        self.m = nn.ModuleList(block(c1 if i == 0 else cm, cm, k=k, act=act) for i in range(n))
        # squeeze conv: merges c1 + n*cm channels down to c2/2 with a 1x1 conv
        self.sc = Conv(c1 + n * cm, c2 // 2, 1, 1, act=act)
        # excitation conv: expands c2/2 channels to c2 with a 1x1 conv
        self.ec = Conv(c2 // 2, c2, 1, 1, act=act)
        # Add the input to the output when shortcut is set and channel counts match
        self.add = shortcut and c1 == c2

    def forward(self, x):
        """Forward pass of a PPHGNetV2 backbone layer."""
        y = [x]
        # Apply the n conv blocks sequentially, keeping every intermediate output
        y.extend(m(y[-1]) for m in self.m)
        # Concatenate all feature maps, then apply the squeeze and excitation convs
        y = self.ec(self.sc(torch.cat(y, 1)))
        # With the shortcut enabled, add the input to the output
        return y + x if self.add else y


# SPP: Spatial Pyramid Pooling layer, see https://arxiv.org/abs/1406.4729
class SPP(nn.Module):
    """Spatial Pyramid Pooling (SPP) layer https://arxiv.org/abs/1406.4729."""

    def __init__(self, c1, c2, k=(5, 9, 13)):
        """Initialize the SPP layer with input/output channels and pooling kernel sizes."""
        super().__init__()
        c_ = c1 // 2  # hidden channels
        # First conv: squeeze c1 channels down to c_
        self.cv1 = Conv(c1, c_, 1, 1)
        # Second conv: squeeze c_ * (len(k) + 1) channels down to c2
        self.cv2 = Conv(c_ * (len(k) + 1), c2, 1, 1)
        # One stride-1 max-pool layer per kernel size in k, padded to keep the resolution
        self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k])

    def forward(self, x):
        """Forward pass of the SPP layer, performing spatial pyramid pooling."""
        # Apply the first conv
        x = self.cv1(x)
        # Concatenate x with all pooled versions of x, then apply the second conv
        return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1))


# SPPF: fast Spatial Pyramid Pooling layer used in YOLOv5
class SPPF(nn.Module):
    """Spatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher."""

    def __init__(self, c1, c2, k=5):
        """
        Initializes the SPPF layer with given input/output channels and kernel size.

        This module is equivalent to SPP(k=(5, 9, 13)).
        """
        super().__init__()
        c_ = c1 // 2  # hidden channels
        # First conv: squeeze c1 channels down to c_
        self.cv1 = Conv(c1, c_, 1, 1)
        # Second conv: squeeze c_ * 4 channels down to c2
        self.cv2 = Conv(c_ * 4, c2, 1, 1)
        # A single stride-1 max-pool with kernel k, padded to keep the resolution
        self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)

    def forward(self, x):
        """Forward pass through the SPPF layer."""
        # Apply the first conv
        y = [self.cv1(x)]
        # Pool three times in sequence, concatenate all four maps, then apply the second conv
        y.extend(self.m(y[-1]) for _ in range(3))
        return self.cv2(torch.cat(y, 1))
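
The equivalence to SPP(k=(5, 9, 13)) comes from the fact that chaining stride-1 5x5 max pools grows the effective window to 9x9 and then 13x13. A pure-PyTorch check of that pooling identity:

```python
import torch
import torch.nn as nn

x = torch.randn(1, 8, 32, 32)
p5 = nn.MaxPool2d(5, stride=1, padding=2)
p9 = nn.MaxPool2d(9, stride=1, padding=4)
p13 = nn.MaxPool2d(13, stride=1, padding=6)

assert torch.equal(p5(p5(x)), p9(x))       # two 5x5 pools == one 9x9 pool
assert torch.equal(p5(p5(p5(x))), p13(x))  # three 5x5 pools == one 13x13 pool
```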


# C1: CSP Bottleneck with 1 convolution
class C1(nn.Module):
    """CSP Bottleneck with 1 convolution."""

    def __init__(self, c1, c2, n=1):
        """Initialize the CSP Bottleneck with 1 convolution given ch_in, ch_out, and number."""
        super().__init__()
        # First conv: c1 -> c2 with a 1x1 kernel
        self.cv1 = Conv(c1, c2, 1, 1)
        # A sequence of n 3x3 conv layers
        self.m = nn.Sequential(*(Conv(c2, c2, 3) for _ in range(n)))

    def forward(self, x):
        """Forward pass of the C1 module with a residual connection."""
        # Apply cv1, run the 3x3 conv sequence, and add cv1's output back as a residual
        y = self.cv1(x)
        return self.m(y) + y


class C2(nn.Module):
    """CSP Bottleneck with 2 convolutions."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        """Initialize the CSP Bottleneck with 2 convolutions given ch_in, ch_out, number, shortcut, groups, expansion."""
        super().__init__()
        self.c = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, 2 * self.c, 1, 1)  # first conv: c1 -> 2*c, 1x1 kernel
        self.cv2 = Conv(2 * self.c, c2, 1)  # second conv: 2*c -> c2, 1x1 kernel (optionally act=FReLU(c2))
        # self.attention = ChannelAttention(2 * self.c)  # or SpatialAttention()
        # n repeated Bottleneck blocks on c channels with 3x3 kernels and expansion 1.0
        self.m = nn.Sequential(*(Bottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n)))

    def forward(self, x):
        """Forward pass through the CSP bottleneck with 2 convolutions."""
        a, b = self.cv1(x).chunk(2, 1)  # split the first conv's output into two halves
        # Run a through the bottlenecks, re-join with b, then apply the second conv
        return self.cv2(torch.cat((self.m(a), b), 1))
class C2f(nn.Module):
    """Faster Implementation of CSP Bottleneck with 2 convolutions."""

    def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5):
        """Initialize CSP bottleneck layer with two convolutions with arguments ch_in, ch_out, number, shortcut, groups,
        expansion.
        """
        super().__init__()
        self.c = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, 2 * self.c, 1, 1)  # First convolution layer
        self.cv2 = Conv((2 + n) * self.c, c2, 1)  # Second convolution layer with optional activation
        self.m = nn.ModuleList(Bottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n))

    def forward(self, x):
        """Forward pass through C2f layer."""
        y = list(self.cv1(x).chunk(2, 1))  # Split output of first convolution into 2 chunks
        y.extend(m(y[-1]) for m in self.m)  # Apply each Bottleneck module to the last chunk
        return self.cv2(torch.cat(y, 1))  # Concatenate all chunks and pass through second convolution

    def forward_split(self, x):
        """Forward pass using split() instead of chunk()."""
        y = list(self.cv1(x).split((self.c, self.c), 1))  # Split output of first convolution into 2 parts
        y.extend(m(y[-1]) for m in self.m)  # Apply each Bottleneck module to the last part
        return self.cv2(torch.cat(y, 1))  # Concatenate all parts and pass through second convolution
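
`forward` and `forward_split` are interchangeable here: for an even channel count, `chunk(2, 1)` and `split((self.c, self.c), 1)` return the same two halves. A quick check:

```python
import torch

x = torch.randn(1, 64, 8, 8)
a1, b1 = x.chunk(2, 1)         # two equal chunks along the channel dim
a2, b2 = x.split((32, 32), 1)  # explicit split sizes
assert torch.equal(a1, a2) and torch.equal(b1, b2)
```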


class C3(nn.Module):
    """CSP Bottleneck with 3 convolutions."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        """Initialize the CSP Bottleneck with given channels, number, shortcut, groups, and expansion values."""
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)  # First convolution layer
        self.cv2 = Conv(c1, c_, 1, 1)  # Second convolution layer
        self.cv3 = Conv(2 * c_, c2, 1)  # Third convolution layer with optional activation
        self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, k=((1, 1), (3, 3)), e=1.0) for _ in range(n)))  # Sequence of Bottleneck modules

    def forward(self, x):
        """Forward pass through the CSP bottleneck with 2 convolutions."""
        return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1))  # Concatenate outputs and pass through third convolution


class C3x(C3):
    """C3 module with cross-convolutions."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        """Initialize C3TR instance and set default parameters."""
        super().__init__(c1, c2, n, shortcut, g, e)
        self.c_ = int(c2 * e)  # hidden channels
        self.m = nn.Sequential(*(Bottleneck(self.c_, self.c_, shortcut, g, k=((1, 3), (3, 1)), e=1) for _ in range(n)))  # Sequence of cross Bottleneck modules


class RepC3(nn.Module):
    """Rep C3."""

    def __init__(self, c1, c2, n=3, e=1.0):
        """Initialize CSP Bottleneck with a single convolution using input channels, output channels, and number."""
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c2, 1, 1)  # First convolution layer
        self.cv2 = Conv(c1, c2, 1, 1)  # Second convolution layer
        self.m = nn.Sequential(*[RepConv(c_, c_) for _ in range(n)])  # Sequence of RepConv modules
        self.cv3 = Conv(c_, c2, 1, 1) if c_ != c2 else nn.Identity()  # Third convolution layer or Identity if channels match

    def forward(self, x):
        """Forward pass of RT-DETR neck layer."""
        return self.cv3(self.m(self.cv1(x)) + self.cv2(x))  # Combine outputs using addition and pass through third convolution


class C3TR(C3):
    """C3 module with TransformerBlock()."""
    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        """Initialize the C3TR module with a TransformerBlock in place of the Bottleneck sequence."""
        # Initialize via the parent C3 constructor
        super().__init__(c1, c2, n, shortcut, g, e)
        # Hidden channel count after expansion
        c_ = int(c2 * e)
        # Replace the bottleneck sequence with a TransformerBlock
        self.m = TransformerBlock(c_, c_, 4, n)

class C3Ghost(C3):
    """C3 module with GhostBottleneck()."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        """Initialize the C3Ghost module with GhostBottleneck blocks."""
        super().__init__(c1, c2, n, shortcut, g, e)
        c_ = int(c2 * e)  # hidden channels
        # A sequence of n GhostBottleneck blocks
        self.m = nn.Sequential(*(GhostBottleneck(c_, c_) for _ in range(n)))


class GhostBottleneck(nn.Module):
    """Ghost Bottleneck https://github.com/huawei-noah/ghostnet."""

    def __init__(self, c1, c2, k=3, s=1):
        """Initialize the GhostBottleneck module with ch_in, ch_out, kernel, and stride."""
        super().__init__()
        c_ = c2 // 2
        # The main conv path
        self.conv = nn.Sequential(
            GhostConv(c1, c_, 1, 1),  # pw
            # Depth-wise conv when stride is 2, identity otherwise
            DWConv(c_, c_, k, s, act=False) if s == 2 else nn.Identity(),  # dw
            GhostConv(c_, c2, 1, 1, act=False),  # pw-linear
        )
        # Shortcut: depth-wise conv plus conv when stride is 2, identity otherwise
        self.shortcut = (
            nn.Sequential(DWConv(c1, c1, k, s, act=False), Conv(c1, c2, 1, 1, act=False)) if s == 2 else nn.Identity()
        )

    def forward(self, x):
        """Sum the conv path and the shortcut path."""
        return self.conv(x) + self.shortcut(x)


class Bottleneck(nn.Module):
    """Standard bottleneck."""

    def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5):
        """Initializes a bottleneck module with given input/output channels, shortcut option, group, kernels, and
        expansion.
        """
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        # Two convolution layers; the residual add is enabled only when shortcut=True and c1 == c2
        self.cv1 = Conv(c1, c_, k[0], 1)
        self.cv2 = Conv(c_, c2, k[1], 1, g=g)
        self.add = shortcut and c1 == c2

    def forward(self, x):
        """Applies the bottleneck with an optional residual (shortcut) connection."""
        # Add the input to the output when the residual is enabled, otherwise return the output directly
        return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))


class BottleneckCSP(nn.Module):
    """CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        """Initializes the CSP Bottleneck given arguments for ch_in, ch_out, number, shortcut, groups, expansion."""
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        # Four convolution layers plus a BatchNorm applied to the concatenated branches
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = nn.Conv2d(c1, c_, 1, 1, bias=False)
        self.cv3 = nn.Conv2d(c_, c_, 1, 1, bias=False)
        self.cv4 = Conv(2 * c_, c2, 1, 1)
        self.bn = nn.BatchNorm2d(2 * c_)  # applied to cat(cv2, cv3)
        self.act = nn.SiLU()
        # Build a sequence of n Bottleneck modules
        self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)))

    def forward(self, x):
        """Applies the CSP bottleneck: two parallel branches that are concatenated, normalized, activated, and fused."""
        # Pass x through both branches, then concatenate, normalize, activate, and fuse with cv4
        y1 = self.cv3(self.m(self.cv1(x)))
        y2 = self.cv2(x)
        return self.cv4(self.act(self.bn(torch.cat((y1, y2), 1))))


class ResNetBlock(nn.Module):
    """ResNet block with standard convolution layers."""
    def __init__(self, c1, c2, s=1, e=4):
        """Initialize the ResNet block with given input/output channels, stride, and expansion factor."""
        super().__init__()
        # Output channels of the third (expanded) convolution
        c3 = e * c2
        # 1x1 convolution: c1 -> c2, stride 1, ReLU activation
        self.cv1 = Conv(c1, c2, k=1, s=1, act=True)
        # 3x3 convolution: c2 -> c2, stride s, padding 1, ReLU activation
        self.cv2 = Conv(c2, c2, k=3, s=s, p=1, act=True)
        # 1x1 convolution: c2 -> c3, no activation
        self.cv3 = Conv(c2, c3, k=1, act=False)
        # Projection shortcut when downsampling (s != 1) or when channel counts differ (c1 != c3)
        self.shortcut = nn.Sequential(Conv(c1, c3, k=1, s=s, act=False)) if s != 1 or c1 != c3 else nn.Identity()

    def forward(self, x):
        """Forward pass through the ResNet block."""
        # cv1 -> cv2 -> cv3, add the shortcut branch, then apply ReLU
        return F.relu(self.cv3(self.cv2(self.cv1(x))) + self.shortcut(x))


class ResNetLayer(nn.Module):
    """ResNet layer with multiple ResNet blocks."""

    def __init__(self, c1, c2, s=1, is_first=False, n=1, e=4):
        """Initializes the ResNetLayer given arguments."""
        super().__init__()
        self.is_first = is_first

        if self.is_first:
            # First layer: a 7x7 stride-2 convolution followed by 3x3 stride-2 max pooling
            self.layer = nn.Sequential(
                Conv(c1, c2, k=7, s=2, p=3, act=True), nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
            )
        else:
            # Subsequent layers: a stack of n ResNetBlock modules (only the first may downsample)
            blocks = [ResNetBlock(c1, c2, s, e=e)]
            blocks.extend([ResNetBlock(e * c2, c2, 1, e=e) for _ in range(n - 1)])
            self.layer = nn.Sequential(*blocks)

    def forward(self, x):
        """Forward pass through the ResNet layer."""
        return self.layer(x)
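
A sketch combining a stem layer (`is_first=True`) with a regular stage; note the stage's output width is `e * c2` (assumed import path as above):

```python
import torch

from ultralytics.nn.modules.block import ResNetLayer  # assumed import path

stem = ResNetLayer(3, 64, is_first=True)    # 7x7 stride-2 conv + 3x3 stride-2 max pool
stage = ResNetLayer(64, 64, s=1, n=3, e=4)  # 3 ResNetBlocks; output channels = e * c2 = 256
x = torch.randn(1, 3, 224, 224)
y = stem(x)
print(y.shape)                              # torch.Size([1, 64, 56, 56])
print(stage(y).shape)                       # torch.Size([1, 256, 56, 56])
```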


class MaxSigmoidAttnBlock(nn.Module):
    """Max Sigmoid attention block."""

    def __init__(self, c1, c2, nh=1, ec=128, gc=512, scale=False):
        """Initializes MaxSigmoidAttnBlock with specified arguments."""
        super().__init__()
        self.nh = nh
        self.hc = c2 // nh
        self.ec = Conv(c1, ec, k=1, act=False) if c1 != ec else None
        self.gl = nn.Linear(gc, ec)
        self.bias = nn.Parameter(torch.zeros(nh))
        self.proj_conv = Conv(c1, c2, k=3, s=1, act=False)
        self.scale = nn.Parameter(torch.ones(1, nh, 1, 1)) if scale else 1.0

    def forward(self, x, guide):
        """Forward process: modulate image features x with guide (text) embeddings via max-sigmoid attention."""
        bs, _, h, w = x.shape

        # Project the guide embeddings and split them across attention heads
        guide = self.gl(guide)
        guide = guide.view(bs, -1, self.nh, self.hc)
        # Embed the image features (optional 1x1 projection) and split across heads
        embed = self.ec(x) if self.ec is not None else x
        embed = embed.view(bs, self.nh, self.hc, h, w)

        # Per-head similarity between every spatial location and every guide token,
        # reduced with a max over the guide dimension, then scaled, biased, and squashed
        aw = torch.einsum("bmchw,bnmc->bmhwn", embed, guide)
        aw = aw.max(dim=-1)[0]
        aw = aw / (self.hc**0.5)
        aw = aw + self.bias[None, :, None, None]
        aw = aw.sigmoid() * self.scale

        # Project x and apply the per-head attention weights
        x = self.proj_conv(x)
        x = x.view(bs, self.nh, -1, h, w)
        x = x * aw.unsqueeze(2)
        return x.view(bs, -1, h, w)
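
The reshapes in `forward` implicitly require the embedding width (`ec`, or `c1` when no projection is created) to equal `c2 = nh * hc`. A minimal sketch that satisfies this constraint:

```python
import torch

from ultralytics.nn.modules.block import MaxSigmoidAttnBlock  # assumed import path

# c1 == ec here, so no extra embedding projection is created; note c2 must equal nh * hc
m = MaxSigmoidAttnBlock(c1=128, c2=128, nh=4, ec=128, gc=512)
x = torch.randn(1, 128, 40, 40)        # image features
guide = torch.randn(1, 8, 512)         # 8 guide (text) embeddings of width gc
print(m(x, guide).shape)               # torch.Size([1, 128, 40, 40])
```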


class C2fAttn(nn.Module):
    """C2f module with an additional attn module."""

    def __init__(self, c1, c2, n=1, ec=128, nh=1, gc=512, shortcut=False, g=1, e=0.5):
        """Initialize CSP bottleneck layer with two convolutions with arguments ch_in, ch_out, number, shortcut, groups,
        expansion.
        """
        super().__init__()
        self.c = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, 2 * self.c, 1, 1)
        self.cv2 = Conv((3 + n) * self.c, c2, 1)  # optional act=FReLU(c2)
        self.m = nn.ModuleList(Bottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n))
        self.attn = MaxSigmoidAttnBlock(self.c, self.c, gc=gc, ec=ec, nh=nh)

    def forward(self, x, guide):
        """Forward pass through C2f layer."""
        y = list(self.cv1(x).chunk(2, 1))
        y.extend(m(y[-1]) for m in self.m)
        y.append(self.attn(y[-1], guide))
        return self.cv2(torch.cat(y, 1))
    def forward_split(self, x, guide):
        """Forward pass using split() instead of chunk()."""
        # Split the cv1 output into two halves of self.c channels each
        y = list(self.cv1(x).split((self.c, self.c), 1))
        # Apply each Bottleneck module to the latest element and append the results
        y.extend(m(y[-1]) for m in self.m)
        # Apply the attention block to the last element, guided by the text embeddings
        y.append(self.attn(y[-1], guide))
        # Concatenate everything along the channel dimension and fuse with cv2
        return self.cv2(torch.cat(y, 1))
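
A usage sketch for C2fAttn; `ec` is chosen equal to the hidden width `int(c2 * e)` so the attention block's reshapes line up:

```python
import torch

from ultralytics.nn.modules.block import C2fAttn  # assumed import path

m = C2fAttn(c1=256, c2=512, n=1, ec=256, nh=8, gc=512)
x = torch.randn(1, 256, 40, 40)
guide = torch.randn(1, 16, 512)        # e.g. 16 text embeddings of width gc
print(m(x, guide).shape)               # torch.Size([1, 512, 40, 40])
```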


class ImagePoolingAttn(nn.Module):
    """ImagePoolingAttn: Enhance the text embeddings with image-aware information."""

    def __init__(self, ec=256, ch=(), ct=512, nh=8, k=3, scale=False):
        """Initializes ImagePoolingAttn with specified arguments."""
        super().__init__()

        nf = len(ch)
        # Define layers for query, key, and value transformations
        self.query = nn.Sequential(nn.LayerNorm(ct), nn.Linear(ct, ec))
        self.key = nn.Sequential(nn.LayerNorm(ec), nn.Linear(ec, ec))
        self.value = nn.Sequential(nn.LayerNorm(ec), nn.Linear(ec, ec))
        
        # Projection layer to transform enhanced embeddings back to original dimension
        self.proj = nn.Linear(ec, ct)
        
        # Scaling factor for attention weights
        self.scale = nn.Parameter(torch.tensor([0.0]), requires_grad=True) if scale else 1.0
        
        # Convolutional projections for image features
        self.projections = nn.ModuleList([nn.Conv2d(in_channels, ec, kernel_size=1) for in_channels in ch])
        
        # Adaptive max pooling layers for image feature pooling
        self.im_pools = nn.ModuleList([nn.AdaptiveMaxPool2d((k, k)) for _ in range(nf)])
        
        # Store other parameters
        self.ec = ec
        self.nh = nh
        self.nf = nf
        self.hc = ec // nh
        self.k = k

    def forward(self, x, text):
        """Executes attention mechanism on input tensor x and guide tensor."""
        bs = x[0].shape[0]  # Batch size
        assert len(x) == self.nf  # Ensure correct number of image features
        num_patches = self.k**2  # Number of patches in each image feature
        
        # Process each image feature with projection and pooling
        x = [pool(proj(x)).view(bs, -1, num_patches) for (x, proj, pool) in zip(x, self.projections, self.im_pools)]
        x = torch.cat(x, dim=-1).transpose(1, 2)  # Concatenate and transpose for attention computation
        
        # Transform text input using query, key, and value networks
        q = self.query(text)
        k = self.key(x)
        v = self.value(x)

        # Reshape query, key, and value tensors for batched matrix multiplication
        q = q.reshape(bs, -1, self.nh, self.hc)
        k = k.reshape(bs, -1, self.nh, self.hc)
        v = v.reshape(bs, -1, self.nh, self.hc)

        # Compute attention weights using matrix multiplication and softmax
        aw = torch.einsum("bnmc,bkmc->bmnk", q, k)
        aw = aw / (self.hc**0.5)
        aw = F.softmax(aw, dim=-1)

        # Compute attended output using weighted sum of values
        x = torch.einsum("bmnk,bkmc->bnmc", aw, v)
        x = self.proj(x.reshape(bs, -1, self.ec))  # Project back to original embedding dimension
        return x * self.scale + text  # Scale and add residual connection with text embeddings
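
A sketch feeding one feature map per entry of `ch` plus a batch of text embeddings (same assumed import path):

```python
import torch

from ultralytics.nn.modules.block import ImagePoolingAttn  # assumed import path

m = ImagePoolingAttn(ec=256, ch=(128, 256, 512), ct=512, nh=8, k=3)
feats = [torch.randn(1, 128, 80, 80),   # one feature map per entry in ch
         torch.randn(1, 256, 40, 40),
         torch.randn(1, 512, 20, 20)]
text = torch.randn(1, 32, 512)          # 32 text embeddings of width ct
print(m(feats, text).shape)             # torch.Size([1, 32, 512]) -- text enhanced with image info
```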


class ContrastiveHead(nn.Module):
    """Contrastive Head for YOLO-World compute the region-text scores according to the similarity between image and text
    features.
    """

    def __init__(self):
        """Initializes ContrastiveHead with specified region-text similarity parameters."""
        super().__init__()
        # Initialize bias for contrastive head
        self.bias = nn.Parameter(torch.tensor([-10.0]))
        # Initialize logit scale for contrastive head
        self.logit_scale = nn.Parameter(torch.ones([]) * torch.tensor(1 / 0.07).log())

    def forward(self, x, w):
        """Forward function of contrastive learning."""
        x = F.normalize(x, dim=1, p=2)  # Normalize input embeddings
        w = F.normalize(w, dim=-1, p=2)  # Normalize text embeddings
        x = torch.einsum("bchw,bkc->bkhw", x, w)  # Compute contrastive scores
        return x * self.logit_scale.exp() + self.bias  # Scale and add bias
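
A minimal sketch of the region-text scoring (assumed import path as above):

```python
import torch

from ultralytics.nn.modules.block import ContrastiveHead  # assumed import path

head = ContrastiveHead()
x = torch.randn(1, 256, 40, 40)     # image embeddings (b, c, h, w)
w = torch.randn(1, 80, 256)         # text embeddings, e.g. 80 class prompts (b, k, c)
print(head(x, w).shape)             # torch.Size([1, 80, 40, 40]) -- per-location region-text scores
```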


class BNContrastiveHead(nn.Module):
    """
    Batch Norm Contrastive Head for YOLO-World using batch norm instead of l2-normalization.

    Args:
        embed_dims (int): Embed dimensions of text and image features.
    """
    def __init__(self, embed_dims: int):
        """Initialize ContrastiveHead with region-text similarity parameters."""
        super().__init__()
        # 初始化一个二维批归一化层,用于处理输入特征图的通道维度
        self.norm = nn.BatchNorm2d(embed_dims)
        
        # NOTE: 使用 -10.0 来保持初始的类别损失与其他损失一致
        # 初始化一个偏置参数,用于调整模型预测中的偏差
        self.bias = nn.Parameter(torch.tensor([-10.0]))
        
        # use -1.0 is more stable
        # 初始化一个对数尺度参数,用于缩放模型输出的对数概率
        self.logit_scale = nn.Parameter(-1.0 * torch.ones([]))

    def forward(self, x, w):
        """Forward function of contrastive learning."""
        # 对输入特征图进行批归一化处理
        x = self.norm(x)
        
        # 对输入的权重向量进行 L2 归一化处理
        w = F.normalize(w, dim=-1, p=2)
        
        # 使用 Einstein Summation (einsum) 完成张量乘法,计算特征图与权重的点积
        x = torch.einsum("bchw,bkc->bkhw", x, w)
        
        # 返回加权后的特征图,通过指数函数和偏置进行缩放和平移
        return x * self.logit_scale.exp() + self.bias
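
The batch-norm variant has the same call signature; a sketch (batch size 2 so `BatchNorm2d` works in training mode):

```python
import torch

from ultralytics.nn.modules.block import BNContrastiveHead  # assumed import path

head = BNContrastiveHead(embed_dims=256)
x = torch.randn(2, 256, 40, 40)     # batch norm requires batch size > 1 in training mode
w = torch.randn(2, 80, 256)
print(head(x, w).shape)             # torch.Size([2, 80, 40, 40])
```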


class RepBottleneck(Bottleneck):
    """Rep bottleneck."""

    def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5):
        """Initializes a RepBottleneck module with customizable in/out channels, shortcut option, groups and expansion
        ratio."""
        super().__init__(c1, c2, shortcut, g, k, e)
        # Calculate the number of hidden channels
        c_ = int(c2 * e)  # hidden channels
        # Initialize RepConv module with input channels c1, output channels c_, and kernel size k[0]
        self.cv1 = RepConv(c1, c_, k[0], 1)


class RepCSP(C3):
    """Rep CSP Bottleneck with 3 convolutions."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        """Initializes RepCSP layer with given channels, repetitions, shortcut, groups and expansion ratio."""
        super().__init__(c1, c2, n, shortcut, g, e)
        # Calculate the number of hidden channels
        c_ = int(c2 * e)  # hidden channels
        # Create a sequence of RepBottleneck modules with input channels c_, output channels c_, and other parameters
        self.m = nn.Sequential(*(RepBottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)))


class RepNCSPELAN4(nn.Module):
    """CSP-ELAN."""

    def __init__(self, c1, c2, c3, c4, n=1):
        """Initializes CSP-ELAN layer with specified channel sizes, repetitions, and convolutions."""
        super().__init__()
        # Calculate the number of channels for the first convolution layer
        self.c = c3 // 2
        # Initialize the first convolution layer with input channels c1 and output channels c3
        self.cv1 = Conv(c1, c3, 1, 1)
        # Create a sequence of RepCSP module followed by a Conv module
        self.cv2 = nn.Sequential(RepCSP(c3 // 2, c4, n), Conv(c4, c4, 3, 1))
        # Create another sequence of RepCSP module followed by a Conv module
        self.cv3 = nn.Sequential(RepCSP(c4, c4, n), Conv(c4, c4, 3, 1))
        # Initialize the last convolution layer with input channels c3 + 2 * c4 and output channels c2
        self.cv4 = Conv(c3 + (2 * c4), c2, 1, 1)

    def forward(self, x):
        """Forward pass through RepNCSPELAN4 layer."""
        # Apply the first convolution layer and split its output into two parts
        y = list(self.cv1(x).chunk(2, 1))
        # Apply self.cv2 and self.cv3 sequentially on the last split part of y
        y.extend((m(y[-1])) for m in [self.cv2, self.cv3])
        # Concatenate all parts of y and apply the last convolution layer
        return self.cv4(torch.cat(y, 1))

    def forward_split(self, x):
        """Forward pass using split() instead of chunk()."""
        # Apply the first convolution layer and split its output into two parts
        y = list(self.cv1(x).split((self.c, self.c), 1))
        # Apply self.cv2 and self.cv3 sequentially on the last split part of y
        y.extend(m(y[-1]) for m in [self.cv2, self.cv3])
        # Concatenate all parts of y and apply the last convolution layer
        return self.cv4(torch.cat(y, 1))
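
A shape sketch; the concatenation width is `c3 + 2 * c4` (assumed import path as above):

```python
import torch

from ultralytics.nn.modules.block import RepNCSPELAN4  # assumed import path

m = RepNCSPELAN4(c1=64, c2=128, c3=64, c4=32, n=1)
x = torch.randn(1, 64, 40, 40)
print(m(x).shape)                   # torch.Size([1, 128, 40, 40]); cat width is c3 + 2*c4 = 128
```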


class ELAN1(RepNCSPELAN4):
    """ELAN1 module with 4 convolutions."""

    def __init__(self, c1, c2, c3, c4):
        """Initializes ELAN1 layer with specified channel sizes."""
        super().__init__(c1, c2, c3, c4)
        # Calculate the number of channels for the first convolution layer
        self.c = c3 // 2
        # Initialize the first convolution layer with input channels c1 and output channels c3
        self.cv1 = Conv(c1, c3, 1, 1)
        # Initialize the second convolution layer with input channels c3 // 2 and output channels c4
        self.cv2 = Conv(c3 // 2, c4, 3, 1)
        # Initialize the third convolution layer with input and output channels both as c4
        self.cv3 = Conv(c4, c4, 3, 1)
        # Initialize the last convolution layer with input channels c3 + 2 * c4 and output channels c2
        self.cv4 = Conv(c3 + (2 * c4), c2, 1, 1)


class AConv(nn.Module):
    """AConv."""

    def __init__(self, c1, c2):
        """Initializes AConv module with convolution layers."""
        super().__init__()
        # Initialize convolution layer with input channels c1, output channels c2, kernel size 3x3, stride 2, padding 1
        self.cv1 = Conv(c1, c2, 3, 2, 1)

    def forward(self, x):
        """Forward pass through AConv layer."""
        # 2x2 average pooling with stride 1 and no padding, followed by the stride-2 convolution
        x = torch.nn.functional.avg_pool2d(x, 2, 1, 0, False, True)
        # Apply the initialized convolution layer
        return self.cv1(x)


class ADown(nn.Module):
    """ADown."""

    def __init__(self, c1, c2):
        """Initializes ADown module with convolution layers to downsample input from channels c1 to c2."""
        super().__init__()
        # Calculate the number of channels for the first convolution layer
        self.c = c2 // 2
        # Initialize the first convolution layer to downsample input from c1 // 2 channels to self.c channels
        self.cv1 = Conv(c1 // 2, self.c, 3, 2, 1)
        # Initialize the second convolution layer with input channels c1 // 2, output channels self.c, kernel size 1x1, stride 1, padding 0
        self.cv2 = Conv(c1 // 2, self.c, 1, 1, 0)

    def forward(self, x):
        """Forward pass through ADown layer."""
        # 2x2 average pooling with stride 1 and no padding (lightly smooths the features)
        x = torch.nn.functional.avg_pool2d(x, 2, 1, 0, False, True)
        # Split the pooled tensor into two halves along the channel dimension
        x1, x2 = x.chunk(2, 1)
        # Downsample the first half with the stride-2 convolution cv1
        x1 = self.cv1(x1)
        # Downsample the second half with 3x3 max pooling, stride 2, padding 1
        x2 = torch.nn.functional.max_pool2d(x2, 3, 2, 1)
        # Refine the pooled half with the 1x1 convolution cv2
        x2 = self.cv2(x2)
        # Concatenate both halves along the channel dimension
        return torch.cat((x1, x2), 1)
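
A sketch of the downsampling behavior (channels `c1 -> c2`, spatial size halved):

```python
import torch

from ultralytics.nn.modules.block import ADown  # assumed import path

m = ADown(c1=64, c2=128)
x = torch.randn(1, 64, 40, 40)
print(m(x).shape)                   # torch.Size([1, 128, 20, 20]) -- spatial size halved
```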


class SPPELAN(nn.Module):
    """SPP-ELAN."""

    def __init__(self, c1, c2, c3, k=5):
        """Initializes SPP-ELAN block with convolution and max pooling layers for spatial pyramid pooling."""
        super().__init__()
        self.c = c3
        self.cv1 = Conv(c1, c3, 1, 1)  # 1x1 convolution reducing channels to c3
        self.cv2 = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)  # size-preserving max pooling
        self.cv3 = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)  # size-preserving max pooling
        self.cv4 = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)  # size-preserving max pooling
        self.cv5 = Conv(4 * c3, c2, 1, 1)  # 1x1 convolution fusing the concatenated branches

    def forward(self, x):
        """Forward pass through SPPELAN layer."""
        y = [self.cv1(x)]  # first convolution applied to the input
        y.extend(m(y[-1]) for m in [self.cv2, self.cv3, self.cv4])  # pool the latest output repeatedly, keeping each result
        return self.cv5(torch.cat(y, 1))  # concatenate along channels and fuse with cv5
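
Since the pooling layers are stride-1 with `k // 2` padding, SPPELAN preserves spatial size; a sketch:

```python
import torch

from ultralytics.nn.modules.block import SPPELAN  # assumed import path

m = SPPELAN(c1=256, c2=256, c3=128, k=5)
x = torch.randn(1, 256, 20, 20)
print(m(x).shape)                   # torch.Size([1, 256, 20, 20]); stride-1 pooling keeps the size
```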


class CBLinear(nn.Module):
    """CBLinear."""

    def __init__(self, c1, c2s, k=1, s=1, p=None, g=1):
        """Initializes the CBLinear module, passing inputs unchanged."""
        super(CBLinear, self).__init__()
        self.c2s = c2s
        self.conv = nn.Conv2d(c1, sum(c2s), k, s, autopad(k, p), groups=g, bias=True)  # single conv producing sum(c2s) channels

    def forward(self, x):
        """Forward pass through CBLinear layer."""
        return self.conv(x).split(self.c2s, dim=1)  # apply the conv, then split the channels into the widths given by c2s


class CBFuse(nn.Module):
    """CBFuse."""

    def __init__(self, idx):
        """Initializes CBFuse module with layer index for selective feature fusion."""
        super(CBFuse, self).__init__()
        self.idx = idx  # per-input indices selecting which CBLinear split to fuse

    def forward(self, xs):
        """Forward pass through CBFuse layer."""
        target_size = xs[-1].shape[2:]  # spatial size of the last input tensor
        # Select the idx-th split of each earlier input and resize it to the target size
        res = [F.interpolate(x[self.idx[i]], size=target_size, mode="nearest") for i, x in enumerate(xs[:-1])]
        return torch.sum(torch.stack(res + xs[-1:]), dim=0)  # stack the resized maps with the last input and sum them
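
CBLinear and CBFuse are designed to be used together: CBLinear splits one feature map into several widths, and CBFuse resizes selected splits and sums them with a target map. A hedged sketch:

```python
import torch

from ultralytics.nn.modules.block import CBFuse, CBLinear  # assumed import path

lin = CBLinear(c1=256, c2s=[64, 128])
outs = lin(torch.randn(1, 256, 40, 40))
print([o.shape[1] for o in outs])          # [64, 128] -- one tensor per requested width

# Fuse the 64-channel split (index 0) into a 64-channel target map at lower resolution
fuse = CBFuse(idx=[0])
target = torch.randn(1, 64, 20, 20)
print(fuse([outs, target]).shape)          # torch.Size([1, 64, 20, 20])
```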


class RepVGGDW(torch.nn.Module):
    """RepVGGDW is a class that represents a depth wise separable convolutional block in RepVGG architecture."""

    def __init__(self, ed) -> None:
        """Initializes RepVGGDW with depthwise separable convolutional layers for efficient processing."""
        super().__init__()
        self.conv = Conv(ed, ed, 7, 1, 3, g=ed, act=False)  # 7x7 depth-wise convolution (groups=ed)
        self.conv1 = Conv(ed, ed, 3, 1, 1, g=ed, act=False)  # 3x3 depth-wise convolution (groups=ed)
        self.dim = ed
        self.act = nn.SiLU()

    def forward(self, x):
        """
        Performs a forward pass of the RepVGGDW block.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after applying the depth wise separable convolution.
        """
        return self.act(self.conv(x) + self.conv1(x))  # sum both depth-wise branches, then apply SiLU

    def forward_fuse(self, x):
        """
        Performs a forward pass of the RepVGGDW block after the convolutions have been fused into a single branch.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after applying the fused depth wise separable convolution.
        """
        return self.act(self.conv(x))  # only the fused convolution remains, followed by SiLU

    @torch.no_grad()
    def fuse(self):
        """
        Fuses the convolutional layers in the RepVGGDW block.

        This method fuses the convolutional layers and updates the weights and biases accordingly.
        """
        # Fuse the first convolution with its batch norm
        conv = fuse_conv_and_bn(self.conv.conv, self.conv.bn)
        # Fuse the second convolution with its batch norm
        conv1 = fuse_conv_and_bn(self.conv1.conv, self.conv1.bn)

        # Fused weights and bias of the 7x7 branch
        conv_w = conv.weight
        conv_b = conv.bias
        # Fused weights and bias of the 3x3 branch
        conv1_w = conv1.weight
        conv1_b = conv1.bias

        # Zero-pad the 3x3 kernel by 2 on every side so it matches the 7x7 kernel
        conv1_w = torch.nn.functional.pad(conv1_w, [2, 2, 2, 2])

        # Sum the kernels and biases to obtain a single equivalent convolution
        final_conv_w = conv_w + conv1_w
        final_conv_b = conv_b + conv1_b

        # Copy the merged weights and bias into the fused convolution
        conv.weight.data.copy_(final_conv_w)
        conv.bias.data.copy_(final_conv_b)

        # Replace self.conv with the fused convolution and drop the 3x3 branch
        self.conv = conv
        del self.conv1
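
The point of `fuse()` is that the padded 3x3 kernel can be added to the 7x7 kernel, so the fused single branch is numerically equivalent to the two-branch forward. A sketch verifying this (in eval mode so batch norm uses its running statistics; `fuse()` is under `@torch.no_grad()`, so it can be called directly):

```python
import torch

from ultralytics.nn.modules.block import RepVGGDW  # assumed import path

m = RepVGGDW(64).eval()                    # eval() so batch norm uses running statistics
x = torch.randn(1, 64, 40, 40)
y_train = m(x)                             # two-branch forward
m.fuse()                                   # merge both branches into one 7x7 depth-wise conv
y_fused = m.forward_fuse(x)                # single-branch forward
print(torch.allclose(y_train, y_fused, atol=1e-5))  # expected True, up to numerical tolerance
```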


class CIB(nn.Module):
    """
    Conditional Identity Block (CIB) module.

    Args:
        c1 (int): Number of input channels.
        c2 (int): Number of output channels.
        shortcut (bool, optional): Whether to add a shortcut connection. Defaults to True.
        e (float, optional): Scaling factor for the hidden channels. Defaults to 0.5.
        lk (bool, optional): Whether to use RepVGGDW for the third convolutional layer. Defaults to False.
    """

    def __init__(self, c1, c2, shortcut=True, e=0.5, lk=False):
        """Initializes the custom model with optional shortcut, scaling factor, and RepVGGDW layer."""
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = nn.Sequential(
            Conv(c1, c1, 3, g=c1),  # 3x3 convolution with c1 channels
            Conv(c1, 2 * c_, 1),    # 1x1 convolution increasing channels to 2*c_
            RepVGGDW(2 * c_) if lk else Conv(2 * c_, 2 * c_, 3, g=2 * c_),  # RepVGGDW or 3x3 convolution based on lk
            Conv(2 * c_, c2, 1),    # 1x1 convolution reducing channels to c2
            Conv(c2, c2, 3, g=c2),  # 3x3 convolution with c2 channels
        )

        self.add = shortcut and c1 == c2  # Determine whether to add shortcut connection

    def forward(self, x):
        """
        Forward pass of the CIB module.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor.
        """
        return x + self.cv1(x) if self.add else self.cv1(x)


class C2fCIB(C2f):
    """
    C2fCIB class represents a convolutional block with C2f and CIB modules.

    Args:
        c1 (int): Number of input channels.
        c2 (int): Number of output channels.
        n (int, optional): Number of CIB modules to stack. Defaults to 1.
        shortcut (bool, optional): Whether to use shortcut connection. Defaults to False.
        lk (bool, optional): Whether to use local key connection. Defaults to False.
        g (int, optional): Number of groups for grouped convolution. Defaults to 1.
        e (float, optional): Expansion ratio for CIB modules. Defaults to 0.5.
    """

    def __init__(self, c1, c2, n=1, shortcut=False, lk=False, g=1, e=0.5):
        """Initializes the module with specified parameters for channel, shortcut, local key, groups, and expansion."""
        super().__init__(c1, c2, n, shortcut, g, e)
        self.m = nn.ModuleList(CIB(self.c, self.c, shortcut, e=1.0, lk=lk) for _ in range(n))


class Attention(nn.Module):
    """
    Attention module that performs self-attention on the input tensor.

    Args:
        dim (int): The input tensor dimension.
        num_heads (int): The number of attention heads.
        attn_ratio (float): The ratio of the attention key dimension to the head dimension.

    Attributes:
        num_heads (int): The number of attention heads.
        head_dim (int): The dimension of each attention head.
        key_dim (int): The dimension of the attention key.
        scale (float): The scaling factor for the attention scores.
        qkv (Conv): Convolutional layer for computing the query, key, and value.
        proj (Conv): Convolutional layer for projecting the attended values.
        pe (Conv): Convolutional layer for positional encoding.
    """

    def __init__(self, dim, num_heads=8, attn_ratio=0.5):
        """Initializes multi-head attention module with query, key, and value convolutions and positional encoding."""
        super().__init__()
        # Number of attention heads
        self.num_heads = num_heads
        # Dimension of each attention head
        self.head_dim = dim // num_heads
        # Dimension of the attention keys, scaled by attn_ratio
        self.key_dim = int(self.head_dim * attn_ratio)
        # Scaling factor applied to the attention scores
        self.scale = self.key_dim**-0.5
        # Total key dimension across all heads
        nh_kd = self.key_dim * num_heads
        # Output channels of the qkv convolution (queries + keys + values)
        h = dim + nh_kd * 2
        # 1x1 convolution computing queries, keys, and values in one pass
        self.qkv = Conv(dim, h, 1, act=False)
        # 1x1 convolution projecting the attended values
        self.proj = Conv(dim, dim, 1, act=False)
        # Depth-wise convolution providing positional encoding
        self.pe = Conv(dim, dim, 3, 1, g=dim, act=False)

    def forward(self, x):
        """
        Forward pass of the Attention module.

        Args:
            x (torch.Tensor): The input tensor.

        Returns:
            (torch.Tensor): The output tensor after self-attention.
        """
        # Input tensor shape
        B, C, H, W = x.shape
        # Number of spatial positions
        N = H * W
        # Compute queries, keys, and values with a single convolution
        qkv = self.qkv(x)
        # Split the qkv tensor into per-head queries, keys, and values
        q, k, v = qkv.view(B, self.num_heads, self.key_dim * 2 + self.head_dim, N).split(
            [self.key_dim, self.key_dim, self.head_dim], dim=2
        )
        # Scaled attention scores
        attn = (q.transpose(-2, -1) @ k) * self.scale
        # Softmax over the key dimension
        attn = attn.softmax(dim=-1)
        # Attend to the values and add the positional encoding
        x = (v @ attn.transpose(-2, -1)).view(B, C, H, W) + self.pe(v.reshape(B, C, H, W))
        # Final projection
        x = self.proj(x)
        return x
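
A shape-preserving self-attention sketch; `dim` must be divisible by `num_heads`:

```python
import torch

from ultralytics.nn.modules.block import Attention  # assumed import path

m = Attention(dim=256, num_heads=8, attn_ratio=0.5)  # head_dim=32, key_dim=16
x = torch.randn(1, 256, 20, 20)
print(m(x).shape)                   # torch.Size([1, 256, 20, 20]) -- shape-preserving self-attention
```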


class PSA(nn.Module):
    """
    Position-wise Spatial Attention module.

    Args:
        c1 (int): Number of input channels.
        c2 (int): Number of output channels.
        e (float): Expansion factor for the intermediate channels. Default is 0.5.

    Attributes:
        c (int): Number of intermediate channels.
        cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c.
        cv2 (Conv): 1x1 convolution layer that fuses the concatenated branches back to c1 channels.
        attn (Attention): Attention module for spatial attention.
        ffn (nn.Sequential): Feed-forward network module.
    """

    def __init__(self, c1, c2, e=0.5):
        """Initializes convolution layers, attention module, and feed-forward network with channel reduction."""
        super().__init__()
        assert c1 == c2
        # Number of intermediate channels
        self.c = int(c1 * e)
        # 1x1 convolution splitting the input into 2*c intermediate channels
        self.cv1 = Conv(c1, 2 * self.c, 1, 1)
        # 1x1 convolution fusing the concatenated branches back to c1 channels
        self.cv2 = Conv(2 * self.c, c1, 1)

        # Self-attention module applied to the second branch
        self.attn = Attention(self.c, attn_ratio=0.5, num_heads=self.c // 64)
        # Feed-forward network (1x1 expand, then 1x1 reduce)
        self.ffn = nn.Sequential(Conv(self.c, self.c * 2, 1), Conv(self.c * 2, self.c, 1, act=False))

    def forward(self, x):
        """
        Forward pass of the PSA module.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor.
        """
        # Split the cv1 output into branches a and b, each with self.c channels
        a, b = self.cv1(x).split((self.c, self.c), dim=1)
        # Apply self-attention to b with a residual connection
        b = b + self.attn(b)
        # Apply the feed-forward network to b with a residual connection
        b = b + self.ffn(b)
        # Concatenate a and the updated b, then fuse with cv2
        return self.cv2(torch.cat((a, b), 1))
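
PSA requires `c1 == c2` (see the assert), and `self.c` should be a multiple of 64 so `self.c // 64` yields at least one attention head. A sketch:

```python
import torch

from ultralytics.nn.modules.block import PSA  # assumed import path

m = PSA(256, 256)                   # self.c = 128, so 128 // 64 = 2 attention heads
x = torch.randn(1, 256, 20, 20)
print(m(x).shape)                   # torch.Size([1, 256, 20, 20])
```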


class SCDown(nn.Module):
    """Spatial Channel Downsample (SCDown) module for reducing spatial and channel dimensions."""

    def __init__(self, c1, c2, k, s):
        """
        Spatial Channel Downsample (SCDown) module.

        Args:
            c1 (int): Number of input channels.
            c2 (int): Number of output channels.
            k (int): Kernel size for the convolutional layer.
            s (int): Stride for the convolutional layer.
        """
        super().__init__()
        # 1x1 convolution reducing the channel count from c1 to c2
        self.cv1 = Conv(c1, c2, 1, 1)
        # Depth-wise convolution (groups=c2) performing the spatial downsampling
        self.cv2 = Conv(c2, c2, k=k, s=s, g=c2, act=False)

    def forward(self, x):
        """
        Forward pass of the SCDown module.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after applying the SCDown module.
        """
        # Reduce channels with cv1, then downsample spatially with cv2
        return self.cv2(self.cv1(x))
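
A sketch of the combined channel and spatial reduction:

```python
import torch

from ultralytics.nn.modules.block import SCDown  # assumed import path

m = SCDown(c1=256, c2=128, k=3, s=2)
x = torch.randn(1, 256, 40, 40)
print(m(x).shape)                   # torch.Size([1, 128, 20, 20]) -- channels and resolution both reduced
```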