Transformers Source Code Walkthrough (71)

.\models\mask2former\modeling_mask2former.py

# File encoding: UTF-8
# Copyright notice for Meta Platforms, Inc. and The HuggingFace Inc. team
# Licensed under the Apache License, Version 2.0; see https://www.apache.org/licenses/LICENSE-2.0
# You may not use this file except in compliance with the License
# The code is distributed on an "AS IS" basis, without warranties or conditions of any kind, either express or implied
# See the License for the specific language governing permissions and limitations

""" PyTorch Mask2Former model. """

# Import the required libraries and modules
import math  # math utilities
import warnings  # warning handling
from dataclasses import dataclass  # decorator for data classes
from typing import Dict, List, Optional, Tuple  # typing helpers

import numpy as np  # NumPy
import torch  # PyTorch
from torch import Tensor, nn  # PyTorch tensor and neural-network modules

# Imports from the Hugging Face library
from ...activations import ACT2FN  # mapping of activation function names to implementations
from ...file_utils import (
    ModelOutput,  # base class for model outputs
    add_start_docstrings,  # decorator that prepends docstrings
    add_start_docstrings_to_model_forward,  # decorator that prepends docstrings to forward methods
    is_scipy_available,  # check whether SciPy is installed
    replace_return_docstrings,  # decorator that replaces the return docstring
    requires_backends,  # helper that enforces backend requirements
)
from ...modeling_outputs import (
    BaseModelOutput,  # base model output class
    BaseModelOutputWithCrossAttentions,  # base model output with cross-attentions
)
from ...modeling_utils import PreTrainedModel  # base class for pretrained models
from ...pytorch_utils import is_torch_greater_or_equal_than_2_1  # check for PyTorch >= 2.1
from ...utils import is_accelerate_available, logging  # accelerate availability check and logging utilities
from ...utils.backbone_utils import load_backbone  # helper that loads the backbone network
from .configuration_mask2former import Mask2FormerConfig  # Mask2Former configuration class

# If SciPy is available, import the linear sum assignment solver
if is_scipy_available():
    from scipy.optimize import linear_sum_assignment

# If accelerate is available, import PartialState and the reduce utility
if is_accelerate_available():
    from accelerate import PartialState
    from accelerate.utils import reduce

# Logger for this module
logger = logging.get_logger(__name__)

# Documentation-related constants
_CONFIG_FOR_DOC = "Mask2FormerConfig"  # configuration class referenced in the docs
_CHECKPOINT_FOR_DOC = "facebook/mask2former-swin-small-coco-instance"  # checkpoint referenced in the docs
_IMAGE_PROCESSOR_FOR_DOC = "Mask2FormerImageProcessor"  # image processor referenced in the docs

# List of pretrained model archives
MASK2FORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "facebook/mask2former-swin-small-coco-instance",  # Mask2Former checkpoint released by Facebook
    # See all Mask2Former models at https://huggingface.co/models?filter=mask2former
]
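
# Illustrative usage sketch (not from the original file): the constants above are the names
# normally passed to `from_pretrained`. Assumes network access to the Hugging Face Hub.
def _demo_load_pretrained_checkpoint():
    from transformers import Mask2FormerForUniversalSegmentation, Mask2FormerImageProcessor

    image_processor = Mask2FormerImageProcessor.from_pretrained(_CHECKPOINT_FOR_DOC)
    model = Mask2FormerForUniversalSegmentation.from_pretrained(_CHECKPOINT_FOR_DOC)
    return image_processor, model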

@dataclass
class Mask2FormerPixelDecoderOutput(ModelOutput):
    """
    Mask2Former's pixel decoder module output, practically a Multi-Scale Deformable Attention based decoder. It returns
    the mask features and the multiscale features.
    """
    pass
    Args:
        multi_scale_features (`tuple(torch.FloatTensor)`):
            Tuple of multi-scale features of scales [1/8, 1/16, 1/32] and shape `(batch_size, num_channels, height,
            width)`from the Multi-Scale Deformable Attention based Pixel Decoder.
            多尺度特征的元组,包含比例为 [1/8, 1/16, 1/32] 的特征,形状为 `(batch_size, num_channels, height, width)`,
            来自基于多尺度可变注意力的像素解码器。
        mask_features (`torch.FloatTensor`):
            Tensor of shape `(batch_size, num_channels, height, width)`, 1/4 scale features from the last Pixel Decoder
            Layer.
            形状为 `(batch_size, num_channels, height, width)` 的张量,来自最后一个像素解码器层的1/4比例特征。
        attentions (`tuple(torch.FloatTensor)`, *optional*):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights from pixel decoder. Returned when `output_attentions=True` is passed
            or when `config.output_attentions=True`
            可选的注意力权重元组,每个元素的形状为 `(batch_size, num_heads, sequence_length, sequence_length)`,
            表示像素解码器中的注意力权重。在设置 `output_attentions=True` 或 `config.output_attentions=True` 时返回。
    """

    multi_scale_features: Tuple[torch.FloatTensor] = None
    mask_features: torch.FloatTensor = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class Mask2FormerMaskedAttentionDecoderOutput(BaseModelOutputWithCrossAttentions):
    """
    Mask2FormerMaskedAttentionDecoderOutput 类用于表示 Transformer 解码器的输出。
    它在 BaseModelOutputWithCrossAttentions 的基础上添加了两个属性:mask 预测的 logits 和中间解码器激活的元组,
    即每个解码器层的输出,每个输出都经过 layernorm 处理。

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            模型最后一层的隐藏状态序列。
        hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            一个元组,包含 `torch.FloatTensor` 类型的张量。第一个张量是从嵌入层输出的结果,其余每个张量对应每个层的输出,
            形状为 `(batch_size, sequence_length, hidden_size)`。当 `output_hidden_states=True` 时返回。
        attentions (`tuple(torch.FloatTensor)`, *optional*):
            一个元组,包含 `torch.FloatTensor` 类型的张量,每个张量的形状为 `(batch_size, num_heads, sequence_length,
            sequence_length)`。表示经过注意力 softmax 后的注意力权重,用于计算自注意力头中的加权平均。
            当 `output_attentions=True` 时返回。
        masks_queries_logits (`tuple(torch.FloatTensor)` of shape `(batch_size, num_queries, height, width)`):
            一个元组,包含 Transformer 解码器所有层的 mask 预测 logits。
        intermediate_hidden_states (`tuple(torch.FloatTensor)` of shape `(num_queries, 1, hidden_size)`):
            中间解码器激活的元组,即每个解码器层的输出,每个输出都经过 layernorm 处理。
    """

    last_hidden_state: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[torch.FloatTensor] = None
    masks_queries_logits: Tuple[torch.FloatTensor] = None
    intermediate_hidden_states: Tuple[torch.FloatTensor] = None


@dataclass
class Mask2FormerPixelLevelModuleOutput(ModelOutput):
    """
    Mask2FormerPixelLevelModuleOutput 类表示 Mask2Former 模型的像素级模块输出。
    它返回了编码器的输出(可选)以及 `decoder` 的所有隐藏状态(多尺度特征)。
    默认情况下,`encoder` 是 Swin 骨干网络,`decoder` 是基于多尺度可变形注意力的解码器。

    `decoder_last_hidden_state` 是每个像素的嵌入,而 `decoder_hidden_states` 指的是使用论文中定义的多尺度策略产生的多尺度特征图。

    Args:
        decoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            解码器最后一层的每个像素的嵌入。
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            一个元组,包含 `torch.FloatTensor` 类型的张量。表示使用多尺度策略生成的多尺度特征图。
    """
    # 定义函数的参数列表,包括四个输入参数,均为torch.FloatTensor类型
    Args:
        encoder_last_hidden_state (`torch.FloatTensor`):
            编码器最后的隐藏状态,即最后阶段编码器的最终特征图,形状为`(batch_size, num_channels, height, width)`
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            编码器每个阶段输出的隐藏状态的元组。每个元素是形状为`(batch_size, num_channels, height, width)`的torch.FloatTensor。
            如果设置了output_hidden_states为True,则返回此参数。
        decoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width))`:
            解码器最后一个Pixel解码层的1/4比例特征。
        decoder_hidden_states (`tuple(torch.FloatTensor)`):
            解码器每个阶段输出的隐藏状态的元组。每个元素是形状为`(batch_size, num_channels, height, width)`的torch.FloatTensor。
        """
    
    # 初始化函数内的变量,默认值为None
    encoder_last_hidden_state: torch.FloatTensor = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    decoder_last_hidden_state: torch.FloatTensor = None
    decoder_hidden_states: Tuple[torch.FloatTensor] = None
@dataclass
class Mask2FormerModelOutput(ModelOutput):
    """
    Class for outputs of [`Mask2FormerModel`]. This class returns all the needed hidden states to compute the logits.

    Args:
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`, *optional*):
            Last hidden states (final feature map) of the last stage of the encoder model (backbone). Returned when
            `output_hidden_states=True` is passed.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, num_channels, height, width)`. Hidden-states (also called feature maps) of the encoder
            model at the output of each stage. Returned when `output_hidden_states=True` is passed.
        pixel_decoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`, *optional*):
            Last hidden states (final feature map) of the last stage of the pixel decoder model.
        pixel_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, num_channels, height, width)`. Hidden-states (also called feature maps) of the pixel
            decoder model at the output of each stage. Returned when `output_hidden_states=True` is passed.
        transformer_decoder_last_hidden_state (`tuple(torch.FloatTensor)`):
            Final output of the transformer decoder `(batch_size, sequence_length, hidden_size)`.
        transformer_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states (also called feature maps) of the
            transformer decoder at the output of each stage. Returned when `output_hidden_states=True` is passed.
        transformer_decoder_intermediate_states (`tuple(torch.FloatTensor)` of shape `(num_queries, 1, hidden_size)`):
            Intermediate decoder activations, i.e. the output of each decoder layer, each of them gone through a
            layernorm.
        masks_queries_logits (`tuple(torch.FloatTensor)` of shape `(batch_size, num_queries, height, width)`)
            Mask Predictions from each layer in the transformer decoder.
        attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed):
            Tuple of `tuple(torch.FloatTensor)` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Self attentions weights from transformer decoder.
    """

    encoder_last_hidden_state: torch.FloatTensor = None
    pixel_decoder_last_hidden_state: torch.FloatTensor = None
    transformer_decoder_last_hidden_state: torch.FloatTensor = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    pixel_decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    transformer_decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    transformer_decoder_intermediate_states: Tuple[torch.FloatTensor] = None
    masks_queries_logits: Tuple[torch.FloatTensor] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    """
@dataclass
class Mask2FormerForUniversalSegmentationOutput(ModelOutput):
    """
    [`Mask2FormerForUniversalSegmentationOutput`]的输出类。

    这个输出可以直接传递给[`~Mask2FormerImageProcessor.post_process_semantic_segmentation`]、
    [`~Mask2FormerImageProcessor.post_process_instance_segmentation`]或
    [`~Mask2FormerImageProcessor.post_process_panoptic_segmentation`]以计算最终的分割图。
    请参阅[`~Mask2FormerImageProcessor`]获取有关使用的详细信息。
    """

    loss: Optional[torch.FloatTensor] = None
    class_queries_logits: torch.FloatTensor = None
    masks_queries_logits: torch.FloatTensor = None
    auxiliary_logits: Optional[List[Dict[str, torch.FloatTensor]]] = None
    encoder_last_hidden_state: torch.FloatTensor = None
    pixel_decoder_last_hidden_state: torch.FloatTensor = None
    transformer_decoder_last_hidden_state: torch.FloatTensor = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    pixel_decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    transformer_decoder_hidden_states: Optional[torch.FloatTensor] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
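
# Illustrative sketch (not from the original file): this output is designed to be passed directly
# to the image processor's post-processing helpers mentioned in the docstring above. `outputs` and
# `image_processor` are assumed to come from a prior forward pass and `from_pretrained`, respectively.
def _demo_post_process_semantic(outputs, image_processor, target_size=(480, 640)):
    # returns a (height, width) map of per-pixel class ids for the first image in the batch
    return image_processor.post_process_semantic_segmentation(outputs, target_sizes=[target_size])[0]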


# Adapted from https://github.com/facebookresearch/detectron2/blob/main/projects/PointRend/point_rend/point_features.py
def sample_point(
    input_features: torch.Tensor, point_coordinates: torch.Tensor, add_dim=False, **kwargs
) -> torch.Tensor:
    """
    一个对`torch.nn.functional.grid_sample`进行包装的函数,支持3D点坐标张量。

    Args:
        input_features (`torch.Tensor` of shape (batch_size, channels, height, width)):
            包含在高度*宽度网格上的特征映射的张量
        point_coordinates (`torch.Tensor` of shape (batch_size, num_points, 2) or (batch_size, grid_height, grid_width,
        2)):
            包含[0, 1] * [0, 1]规范化点坐标的张量
        add_dim (`bool`):
            用于跟踪是否添加了维度

    Returns:
        point_features (`torch.Tensor` of shape (batch_size, channels, num_points) or (batch_size, channels,
        height_grid, width_grid)):
            包含`point_coordinates`中点的特征的张量。
    """
    if point_coordinates.dim() == 3:
        add_dim = True
        point_coordinates = point_coordinates.unsqueeze(2)

    # 使用nn.functional.grid_sample通过双线性插值获取`point_coordinates`中点的特征
    point_features = torch.nn.functional.grid_sample(input_features, 2.0 * point_coordinates - 1.0, **kwargs)
    if add_dim:
        point_features = point_features.squeeze(3)

    return point_features
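
# Illustrative sketch (not from the original file): `sample_point` rescales [0, 1] coordinates to
# grid_sample's [-1, 1] range and bilinearly samples the feature map at those points.
def _demo_sample_point():
    features = torch.randn(2, 256, 32, 32)   # (batch_size, channels, height, width)
    coordinates = torch.rand(2, 100, 2)       # 100 normalized (x, y) points per image
    sampled = sample_point(features, coordinates, align_corners=False)
    return sampled.shape                      # torch.Size([2, 256, 100])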


# Copied from transformers.models.maskformer.modeling_maskformer.dice_loss
def dice_loss(inputs: Tensor, labels: Tensor, num_masks: int) -> Tensor:
    r"""
    计算DICE损失,类似于掩码的广义IOU,计算方式如下:
    """
    计算二进制分割任务中的 Dice Loss。

    Args:
        inputs (`torch.Tensor`):
            表示一个掩码的张量。
        labels (`torch.Tensor`):
            与输入张量具有相同形状的张量。存储每个元素的二进制分类标签
            (0表示负类,1表示正类)。
        num_masks (`int`):
            当前批次中存在的掩码数量,用于归一化。

    Returns:
        `torch.Tensor`: 计算得到的损失值。
    """
    # 计算概率,并将结果展平为二维数组
    probs = inputs.sigmoid().flatten(1)
    # 计算 Dice 损失的分子部分
    numerator = 2 * (probs * labels).sum(-1)
    # 计算 Dice 损失的分母部分
    denominator = probs.sum(-1) + labels.sum(-1)
    # 计算最终的 Dice 损失
    loss = 1 - (numerator + 1) / (denominator + 1)
    # 将损失值对每个掩码进行求和并进行归一化
    loss = loss.sum() / num_masks
    return loss
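
# Illustrative sketch (not from the original file): with logits that match the labels the dice loss
# is close to 0, while inverted predictions push it towards 1.
def _demo_dice_loss():
    labels = torch.tensor([[1.0, 0.0, 1.0, 0.0]])             # one flattened ground-truth mask
    good_logits = torch.tensor([[10.0, -10.0, 10.0, -10.0]])  # prediction matching the labels
    bad_logits = -good_logits                                  # prediction opposite to the labels
    return dice_loss(good_logits, labels, num_masks=1), dice_loss(bad_logits, labels, num_masks=1)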


# Computes the sigmoid cross-entropy loss between the input logits and the labels
def sigmoid_cross_entropy_loss(inputs: torch.Tensor, labels: torch.Tensor, num_masks: int) -> torch.Tensor:
    r"""
    Args:
        inputs (`torch.Tensor`):
            A float tensor of arbitrary shape.
        labels (`torch.Tensor`):
            A tensor with the same shape as inputs. Stores the binary classification label for each element
            (0 for the negative class and 1 for the positive class).

    Returns:
        loss (`torch.Tensor`): The computed loss tensor.
    """
    # per-element binary cross-entropy with logits, without reduction
    criterion = nn.BCEWithLogitsLoss(reduction="none")
    # compute the cross-entropy loss
    cross_entropy_loss = criterion(inputs, labels)

    # average over the points of each mask, sum over masks and normalize by num_masks
    loss = cross_entropy_loss.mean(1).sum() / num_masks
    return loss


# Copied from transformers.models.maskformer.modeling_maskformer.pair_wise_dice_loss
def pair_wise_dice_loss(inputs: Tensor, labels: Tensor) -> Tensor:
    """
    A pair-wise version of the dice loss, see `dice_loss` for usage.

    Args:
        inputs (`torch.Tensor`):
            A tensor representing a mask.
        labels (`torch.Tensor`):
            A tensor with the same shape as inputs. Stores the binary classification label for each element
            (0 for the negative class and 1 for the positive class).

    Returns:
        `torch.Tensor`: The loss computed between every pair.
    """
    # apply sigmoid to the inputs and flatten from the first dimension
    inputs = inputs.sigmoid().flatten(1)
    numerator = 2 * torch.matmul(inputs, labels.T)
    # use broadcasting to get a [num_queries, NUM_CLASSES] matrix
    denominator = inputs.sum(-1)[:, None] + labels.sum(-1)[None, :]
    loss = 1 - (numerator + 1) / (denominator + 1)
    return loss


# Computes the pair-wise sigmoid cross-entropy loss between the input logits and the labels
def pair_wise_sigmoid_cross_entropy_loss(inputs: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    r"""
    A pair-wise version of the cross-entropy loss, see `sigmoid_cross_entropy_loss` for usage.

    Args:
        inputs (`torch.Tensor`):
            A tensor representing a mask.
        labels (`torch.Tensor`):
            A tensor with the same shape as inputs. Stores the binary classification label for each element
            (0 for the negative class and 1 for the positive class).

    Returns:
        loss (`torch.Tensor`): The loss computed between every pair.
    """

    # number of sampled points per mask (the flattened height * width dimension)
    height_and_width = inputs.shape[1]

    # per-element binary cross-entropy with logits, without reduction
    criterion = nn.BCEWithLogitsLoss(reduction="none")
    # cross-entropy against all-ones and all-zeros targets, respectively
    cross_entropy_loss_pos = criterion(inputs, torch.ones_like(inputs))
    cross_entropy_loss_neg = criterion(inputs, torch.zeros_like(inputs))

    # combine the positive and negative terms according to the labels
    loss_pos = torch.matmul(cross_entropy_loss_pos / height_and_width, labels.T)
    loss_neg = torch.matmul(cross_entropy_loss_neg / height_and_width, (1 - labels).T)
    loss = loss_pos + loss_neg
    return loss
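
# Illustrative sketch (not from the original file): the two pair-wise losses above return
# (num_queries, num_labels) cost matrices, which the Hungarian matcher below combines.
def _demo_pair_wise_costs():
    num_queries, num_labels, num_points = 5, 3, 16
    pred_masks = torch.randn(num_queries, num_points)                     # flattened predicted mask logits
    target_masks = torch.randint(0, 2, (num_labels, num_points)).float()  # flattened binary target masks
    cost_dice = pair_wise_dice_loss(pred_masks, target_masks)
    cost_mask = pair_wise_sigmoid_cross_entropy_loss(pred_masks, target_masks)
    return cost_dice.shape, cost_mask.shape                               # both torch.Size([5, 3])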


# Adapted from https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/modeling/matcher.py
class Mask2FormerHungarianMatcher(nn.Module):
    """This class computes an assignment between the labels and the predictions of the network.

    For efficiency reasons, the labels don't include the no_object. Because of this, in general, there are more
    predictions than labels. In this case, we do a 1-to-1 matching of the best predictions, while the others are
    un-matched (and thus treated as non-objects).
    """

    def __init__(
        self, cost_class: float = 1.0, cost_mask: float = 1.0, cost_dice: float = 1.0, num_points: int = 12544
    ):
        """Creates the matcher

        Params:
            cost_class (`float`, *optional*, defaults to 1.0):
                Relative weight of the classification error in the matching cost.
            cost_mask (`float`, *optional*,  defaults to 1.0):
                This is the relative weight of the focal loss of the binary mask in the matching cost.
            cost_dice (`float`, *optional*, defaults to 1.0):
                This is the relative weight of the dice loss of the binary mask in the matching cost.
            num_points (`int`, *optional*, defaults to 12544):
                No. of points to sample on which the mask loss will be calculated. The same set of K points are
                uniformly sampled for all prediction and ground truth masks to construct the cost matrix for bipartite
                matching.
        """
        # call the parent constructor
        super().__init__()
        # raise if the classification, mask and dice cost weights are all zero
        if cost_class == 0 and cost_mask == 0 and cost_dice == 0:
            raise ValueError("All costs cant be 0")

        # store the attributes
        self.num_points = num_points  # number of points to sample
        self.cost_class = cost_class  # weight of the classification error
        self.cost_mask = cost_mask    # weight of the mask (focal) loss
        self.cost_dice = cost_dice    # weight of the dice loss

    @torch.no_grad()
    def forward(
        self,
        masks_queries_logits: torch.Tensor,
        class_queries_logits: torch.Tensor,
        mask_labels: torch.Tensor,
        class_labels: torch.Tensor,
    ) -> List[Tuple[Tensor]]:
        ...


# Adapted from https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/modeling/criterion.py

# Define the Mask2FormerLoss class, which inherits from nn.Module
class Mask2FormerLoss(nn.Module):
    def __init__(self, config: Mask2FormerConfig, weight_dict: Dict[str, float]):
        """
        The Mask2Former Loss. The loss is computed very similar to DETR. The process happens in two steps: 1) we
        compute hungarian assignment between ground truth masks and the outputs of the model 2) we supervise each pair
        of matched ground-truth / prediction (supervise class and mask)

        Args:
            config (`Mask2FormerConfig`):
                The configuration for Mask2Former model also containing loss calculation specific parameters.
            weight_dict (`Dict[str, float]`):
                A dictionary of weights to be applied to the different losses.
        """
        super().__init__()
        requires_backends(self, ["scipy"])  # 确保需要的后端库被加载
        self.num_labels = config.num_labels  # 从配置中获取标签数量
        self.weight_dict = weight_dict  # 保存权重字典

        # Weight to apply to the null class
        self.eos_coef = config.no_object_weight  # 获取空对象的权重系数
        empty_weight = torch.ones(self.num_labels + 1)
        empty_weight[-1] = self.eos_coef
        self.register_buffer("empty_weight", empty_weight)  # 将权重缓存起来

        # pointwise mask loss parameters
        self.num_points = config.train_num_points  # 获取训练点数
        self.oversample_ratio = config.oversample_ratio  # 获取过采样比例
        self.importance_sample_ratio = config.importance_sample_ratio  # 获取重要性采样比例

        # 初始化匈牙利匹配器,用于计算损失
        self.matcher = Mask2FormerHungarianMatcher(
            cost_class=1.0,
            cost_dice=config.dice_weight,
            cost_mask=config.mask_weight,
            num_points=self.num_points,
        )

    # 从 sizes 列表中找到每个维度的最大值,并返回最大值列表
    def _max_by_axis(self, sizes: List[List[int]]) -> List[int]:
        maxes = sizes[0]
        for sublist in sizes[1:]:
            for index, item in enumerate(sublist):
                maxes[index] = max(maxes[index], item)
        return maxes

    # 将输入的张量列表进行填充,使它们的尺寸达到批次中最大的尺寸,并返回填充后的张量及其对应的填充掩码
    # 函数功能类似于原始实现中的 nested_tensor_from_tensor_list()
    def _pad_images_to_max_in_batch(self, tensors: List[Tensor]) -> Tuple[Tensor, Tensor]:
        # 获取批次中的最大尺寸
        max_size = self._max_by_axis([list(tensor.shape) for tensor in tensors])
        # 计算最终的批次形状
        batch_shape = [len(tensors)] + max_size
        batch_size, _, height, width = batch_shape
        dtype = tensors[0].dtype
        device = tensors[0].device
        padded_tensors = torch.zeros(batch_shape, dtype=dtype, device=device)
        padding_masks = torch.ones((batch_size, height, width), dtype=torch.bool, device=device)
        # 将张量填充到最大尺寸
        for tensor, padded_tensor, padding_mask in zip(tensors, padded_tensors, padding_masks):
            padded_tensor[: tensor.shape[0], : tensor.shape[1], : tensor.shape[2]].copy_(tensor)
            padding_mask[: tensor.shape[1], : tensor.shape[2]] = False

        return padded_tensors, padding_masks
    def loss_labels(
        self, class_queries_logits: Tensor, class_labels: List[Tensor], indices: Tuple[np.array]
    ) -> Dict[str, Tensor]:
        """Compute the losses related to the labels using cross entropy.

        Args:
            class_queries_logits (`torch.Tensor`):
                A tensor of shape `batch_size, num_queries, num_labels`
            class_labels (`List[torch.Tensor]`):
                List of class labels of shape `(labels)`.
            indices (`Tuple[np.array])`:
                The indices computed by the Hungarian matcher.

        Returns:
            `Dict[str, Tensor]`: A dict of `torch.Tensor` containing the following key:
            - **loss_cross_entropy** -- The loss computed using cross entropy on the predicted and ground truth labels.
        """
        # Assigning the predicted logits to a local variable
        pred_logits = class_queries_logits
        # Extracting dimensions from the predicted logits tensor
        batch_size, num_queries, _ = pred_logits.shape
        # Defining the cross entropy loss criterion with optional weights
        criterion = nn.CrossEntropyLoss(weight=self.empty_weight)
        # Obtaining permutation indices for predictions based on the Hungarian matcher
        idx = self._get_predictions_permutation_indices(indices)  # shape of (batch_size, num_queries)
        # Concatenating target mask labels based on indices
        target_classes_o = torch.cat(
            [target[j] for target, (_, j) in zip(class_labels, indices)]
        )  # shape of (batch_size, num_queries)
        # Creating a tensor filled with a specific value for target classes
        target_classes = torch.full(
            (batch_size, num_queries), fill_value=self.num_labels, dtype=torch.int64, device=pred_logits.device
        )
        # Assigning the concatenated target classes to their respective positions using indices
        target_classes[idx] = target_classes_o
        # Transposing the predicted logits tensor for cross entropy computation
        pred_logits_transposed = pred_logits.transpose(1, 2)
        # Calculating cross entropy loss between transposed logits and target classes
        loss_ce = criterion(pred_logits_transposed, target_classes)
        # Constructing dictionary containing the computed cross entropy loss
        losses = {"loss_cross_entropy": loss_ce}
        return losses

    def loss_masks(
        self,
        masks_queries_logits: torch.Tensor,
        mask_labels: List[torch.Tensor],
        indices: Tuple[np.array],
        num_masks: int,
    ) -> Dict[str, torch.Tensor]:
        """Compute the losses related to the masks using sigmoid_cross_entropy_loss and dice loss.

        Args:
            masks_queries_logits (`torch.Tensor`):
                A tensor of shape `(batch_size, num_queries, height, width)`.
            mask_labels (`torch.Tensor`):
                List of mask labels of shape `(labels, height, width)`.
            indices (`Tuple[np.array])`:
                The indices computed by the Hungarian matcher.
            num_masks (`int)`:
                The number of masks, used for normalization.

        Returns:
            losses (`Dict[str, Tensor]`): A dict of `torch.Tensor` containing two keys:
            - **loss_mask** -- The loss computed using sigmoid cross entropy loss on the predicted and ground truth
              masks.
            - **loss_dice** -- The loss computed using dice loss on the predicted and ground truth masks.
        """
        # 获取预测排序后的索引
        src_idx = self._get_predictions_permutation_indices(indices)
        # 获取目标排序后的索引
        tgt_idx = self._get_targets_permutation_indices(indices)
        # shape (batch_size * num_queries, height, width)
        # 从预测的logits中选择对应src_idx的预测掩码
        pred_masks = masks_queries_logits[src_idx]
        # shape (batch_size, num_queries, height, width)
        # 将目标掩码进行填充以匹配批次中最大的图像,并在num_labels维度上堆叠
        target_masks, _ = self._pad_images_to_max_in_batch(mask_labels)
        # 根据tgt_idx选择目标掩码
        target_masks = target_masks[tgt_idx]

        # 由于使用了归一化坐标,不需要对预测进行上采样
        pred_masks = pred_masks[:, None]
        target_masks = target_masks[:, None]

        # 采样点坐标
        with torch.no_grad():
            point_coordinates = self.sample_points_using_uncertainty(
                pred_masks,
                lambda logits: self.calculate_uncertainty(logits),
                self.num_points,
                self.oversample_ratio,
                self.importance_sample_ratio,
            )

            # 使用采样点从目标掩码中获取点标签,不对齐角点
            point_labels = sample_point(target_masks, point_coordinates, align_corners=False).squeeze(1)

        # 使用采样点从预测掩码中获取点logits,不对齐角点
        point_logits = sample_point(pred_masks, point_coordinates, align_corners=False).squeeze(1)

        # 计算损失
        losses = {
            "loss_mask": sigmoid_cross_entropy_loss(point_logits, point_labels, num_masks),
            "loss_dice": dice_loss(point_logits, point_labels, num_masks),
        }

        # 清理临时变量
        del pred_masks
        del target_masks
        return losses

    def _get_predictions_permutation_indices(self, indices):
        # permute the predictions following the indices computed by the matcher
        batch_indices = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
        predictions_indices = torch.cat([src for (src, _) in indices])
        return batch_indices, predictions_indices

    # permute the labels following the given indices
    def _get_targets_permutation_indices(self, indices):
        # batch indices, repeating the batch index for every matched label
        batch_indices = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
        # target indices, concatenating all matched label indices into one tensor
        target_indices = torch.cat([tgt for (_, tgt) in indices])
        return batch_indices, target_indices

    # compute uncertainty scores
    def calculate_uncertainty(self, logits: torch.Tensor) -> torch.Tensor:
        """
        In the Mask2Former paper, uncertainty is estimated as the L1 distance between 0.0 and the logit prediction of
        the foreground class in `logits`.

        Args:
            logits (`torch.Tensor`): A tensor of shape (R, 1, ...), where R is the total number of predicted masks in
            all images and C is the number of foreground classes. The values are logits.

        Returns:
            scores (`torch.Tensor`): A tensor of shape (R, 1, ...) that contains uncertainty scores, with the most
            uncertain locations having the highest score.
        """
        # uncertainty is the negative absolute value of the logits
        uncertainty_scores = -(torch.abs(logits))
        return uncertainty_scores

    # sample points guided by the uncertainty function
    def sample_points_using_uncertainty(
        self,
        logits: torch.Tensor,
        uncertainty_function,
        num_points: int,
        oversample_ratio: int,
        importance_sample_ratio: float,
    ) -> torch.Tensor:
        """
        This function samples points in [0, 1] * [0, 1] coordinate space based on uncertainty of logits predictions.

        Args:
            logits (`torch.Tensor`):
                Logit predictions for bounding boxes.
            uncertainty_function:
                Function to calculate uncertainties based on logit predictions.
            num_points (`int`):
                Number of points to sample.
            oversample_ratio (`int`):
                Oversampling ratio for point sampling.
            importance_sample_ratio (`float`):
                Ratio of points sampled via importance sampling.

        Returns:
            point_coordinates (`torch.Tensor`):
                Coordinates of sampled points.
        """

        num_boxes = logits.shape[0]
        num_points_sampled = int(num_points * oversample_ratio)

        # Get random coordinates for points within each bounding box
        point_coordinates = torch.rand(num_boxes, num_points_sampled, 2, device=logits.device)

        # Sample logits values at the sampled coordinates
        point_logits = sample_point(logits, point_coordinates, align_corners=False)

        # Calculate uncertainties based on the sampled logits values
        point_uncertainties = uncertainty_function(point_logits)

        num_uncertain_points = int(importance_sample_ratio * num_points)
        num_random_points = num_points - num_uncertain_points

        # Select uncertain points based on top uncertainties
        idx = torch.topk(point_uncertainties[:, 0, :], k=num_uncertain_points, dim=1)[1]
        shift = num_points_sampled * torch.arange(num_boxes, dtype=torch.long, device=logits.device)
        idx += shift[:, None]
        point_coordinates = point_coordinates.view(-1, 2)[idx.view(-1), :].view(num_boxes, num_uncertain_points, 2)

        # Add random points to complete the required number of points
        if num_random_points > 0:
            point_coordinates = torch.cat(
                [point_coordinates, torch.rand(num_boxes, num_random_points, 2, device=logits.device)],
                dim=1,
            )

        return point_coordinates
        """
        This performs the loss computation.

        Args:
            masks_queries_logits (`torch.Tensor`):
                A tensor of shape `(batch_size, num_queries, height, width)`.
                Contains logits for predicted masks.
            class_queries_logits (`torch.Tensor`):
                A tensor of shape `(batch_size, num_queries, num_labels)`.
                Contains logits for predicted class labels.
            mask_labels (`torch.Tensor`):
                List of mask labels of shape `(labels, height, width)`.
                Ground truth masks.
            class_labels (`List[torch.Tensor]`):
                List of class labels of shape `(labels)`.
                Ground truth class labels.
            auxiliary_predictions (`Dict[str, torch.Tensor]`, *optional*):
                if `use_auxiliary_loss` was set to `true` in [`Mask2FormerConfig`], then it contains the logits from
                the inner layers of the Mask2FormerMaskedAttentionDecoder.
                Dictionary of auxiliary predictions from intermediate layers.

        Returns:
            losses (`Dict[str, Tensor]`): A dict of `torch.Tensor` containing three keys:
            - **loss_cross_entropy** -- The loss computed using cross entropy on the predicted and ground truth labels.
            - **loss_mask** -- The loss computed using sigmoid cross_entropy loss on the predicted and ground truth
              masks.
            - **loss_dice** -- The loss computed using dice loss on the predicted and ground truth masks.
            if `use_auxiliary_loss` was set to `true` in [`Mask2FormerConfig`], the dictionary contains additional
            losses for each auxiliary predictions.
        """

        # retrieve the matching between the outputs of the last layer and the labels
        indices = self.matcher(masks_queries_logits, class_queries_logits, mask_labels, class_labels)
        # compute the average number of target masks for normalization purposes
        num_masks = self.get_num_masks(class_labels, device=class_labels[0].device)
        # get all the losses
        losses: Dict[str, Tensor] = {
            **self.loss_masks(masks_queries_logits, mask_labels, indices, num_masks),
            **self.loss_labels(class_queries_logits, class_labels, indices),
        }
        # in case of auxiliary losses, we repeat this process with the output of each intermediate layer.
        if auxiliary_predictions is not None:
            for idx, aux_outputs in enumerate(auxiliary_predictions):
                masks_queries_logits = aux_outputs["masks_queries_logits"]
                class_queries_logits = aux_outputs["class_queries_logits"]
                loss_dict = self.forward(masks_queries_logits, class_queries_logits, mask_labels, class_labels)
                loss_dict = {f"{key}_{idx}": value for key, value in loss_dict.items()}
                losses.update(loss_dict)

        return losses
    def get_num_masks(self, class_labels: torch.Tensor, device: torch.device) -> torch.Tensor:
        """
        Computes the average number of target masks across the batch, for normalization purposes.
        """
        # total number of target masks, i.e. the total length of the per-sample class label lists
        num_masks = sum([len(classes) for classes in class_labels])
        # convert the count to a tensor with the requested dtype and device
        num_masks = torch.as_tensor(num_masks, dtype=torch.float, device=device)
        # default world size of 1
        world_size = 1
        # if accelerate is available and a distributed state has been initialized,
        # reduce the mask count across processes
        if is_accelerate_available():
            if PartialState._shared_state != {}:
                num_masks = reduce(num_masks)
                world_size = PartialState().num_processes

        # normalize by the world size and clamp to at least 1
        num_masks = torch.clamp(num_masks / world_size, min=1)
        return num_masks
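
# Illustrative sketch (not from the original file, assumes SciPy is installed): the matcher turns a
# (num_queries, num_labels) cost matrix into one-to-one (prediction index, label index) pairs via
# scipy.optimize.linear_sum_assignment; such pairs are what `indices` holds in the loss above.
def _demo_hungarian_assignment():
    cost_matrix = np.array(
        [[0.1, 0.9, 0.8],
         [0.7, 0.2, 0.9],
         [0.8, 0.6, 0.3],
         [0.5, 0.5, 0.5]]  # 4 predictions, 3 labels
    )
    prediction_indices, label_indices = linear_sum_assignment(cost_matrix)
    return prediction_indices, label_indices  # array([0, 1, 2]), array([0, 1, 2])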


# Copied from transformers.models.deformable_detr.modeling_deformable_detr.multi_scale_deformable_attention
def multi_scale_deformable_attention(
    value: Tensor, value_spatial_shapes: Tensor, sampling_locations: Tensor, attention_weights: Tensor
) -> Tensor:
    # 获取输入张量的形状信息
    batch_size, _, num_heads, hidden_dim = value.shape
    _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
    # 将value按照空间形状切分成不同的部分
    value_list = value.split([height.item() * width.item() for height, width in value_spatial_shapes], dim=1)
    # 计算采样网格的位置
    sampling_grids = 2 * sampling_locations - 1
    sampling_value_list = []
    for level_id, (height, width) in enumerate(value_spatial_shapes):
        # 将value_list中的每个部分展平,并且重塑成合适的形状
        value_l_ = (
            value_list[level_id].flatten(2).transpose(1, 2).reshape(batch_size * num_heads, hidden_dim, height, width)
        )
        # 调整采样网格的形状以匹配value_l_的大小,并使用双线性插值进行采样
        sampling_grid_l_ = sampling_grids[:, :, :, level_id].transpose(1, 2).flatten(0, 1)
        sampling_value_l_ = nn.functional.grid_sample(
            value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False
        )
        sampling_value_list.append(sampling_value_l_)
    # 重新组织注意力权重的形状,以便与采样值匹配
    attention_weights = attention_weights.transpose(1, 2).reshape(
        batch_size * num_heads, 1, num_queries, num_levels * num_points
    )
    # 计算最终的输出,进行加权求和并调整形状
    output = (
        (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights)
        .sum(-1)
        .view(batch_size, num_heads * hidden_dim, num_queries)
    )
    # 转置输出张量并确保连续的内存布局
    return output.transpose(1, 2).contiguous()
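
# Illustrative sketch (not from the original file): the tensor shapes expected by
# multi_scale_deformable_attention for 2 feature levels, 8 heads and 4 sampling points per level.
def _demo_multi_scale_deformable_attention():
    batch_size, num_heads, hidden_dim = 1, 8, 32
    spatial_shapes = torch.tensor([[8, 8], [4, 4]])                              # (num_levels, 2)
    sequence_length = int((spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum())   # 64 + 16 = 80
    num_queries, num_levels, num_points = sequence_length, 2, 4
    value = torch.randn(batch_size, sequence_length, num_heads, hidden_dim)
    sampling_locations = torch.rand(batch_size, num_queries, num_heads, num_levels, num_points, 2)
    attention_weights = torch.softmax(
        torch.randn(batch_size, num_queries, num_heads, num_levels * num_points), dim=-1
    ).view(batch_size, num_queries, num_heads, num_levels, num_points)
    output = multi_scale_deformable_attention(value, spatial_shapes, sampling_locations, attention_weights)
    return output.shape  # (batch_size, num_queries, num_heads * hidden_dim) == (1, 80, 256)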


# Copied from transformers.models.maskformer.modeling_maskformer.MaskFormerSinePositionEmbedding with MaskFormer->Mask2Former
class Mask2FormerSinePositionEmbedding(nn.Module):
    """
    This is a more standard version of the position embedding, very similar to the one used in the "Attention is all
    you need" paper, generalized to work on images.
    """

    def __init__(
        self, num_pos_feats: int = 64, temperature: int = 10000, normalize: bool = False, scale: Optional[float] = None
    ):
        super().__init__()
        # scale only makes sense together with normalize=True
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        self.scale = 2 * math.pi if scale is None else scale

    def forward(self, x: Tensor, mask: Optional[Tensor] = None) -> Tensor:
        if mask is None:
            # if no mask is given, create an all-False mask with the same spatial size as x
            mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool)
        # invert the mask and cast it to the dtype of x
        not_mask = (~mask).to(x.dtype)
        # cumulative sums along height and width give the y and x position indices
        y_embed = not_mask.cumsum(1)
        x_embed = not_mask.cumsum(2)
        if self.normalize:
            eps = 1e-6
            # normalize the positions to [0, scale], adding eps to avoid division by zero
            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

        # frequency terms: integer sequence from 0 to num_pos_feats - 1
        dim_t = torch.arange(self.num_pos_feats, dtype=torch.int64, device=x.device).type_as(x)
        # temperature raised to the (paired) frequency exponent
        dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.num_pos_feats)

        # compute the x and y position encodings
        pos_x = x_embed[:, :, :, None] / dim_t
        pos_y = y_embed[:, :, :, None] / dim_t
        # interleave sine and cosine, flatten, then concatenate the y and x encodings
        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
        # return the position encoding tensor
        return pos
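
# Illustrative sketch (not from the original file): the embedding returns 2 * num_pos_feats channels
# (interleaved sine/cosine for the y and x coordinates) at every spatial location.
def _demo_sine_position_embedding():
    position_embedding = Mask2FormerSinePositionEmbedding(num_pos_feats=128, normalize=True)
    features = torch.randn(2, 256, 16, 16)
    pos = position_embedding(features)
    return pos.shape  # torch.Size([2, 256, 16, 16])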


# Modified from DeformableDetrMultiscaleDeformableAttention in the Deformable DETR model
class Mask2FormerPixelDecoderEncoderMultiscaleDeformableAttention(nn.Module):
    """
    Multiscale deformable attention as proposed in Deformable DETR.
    """

    def __init__(self, embed_dim: int, num_heads: int, n_levels: int, n_points: int):
        super().__init__()
        # 确保 embed_dim 可以被 num_heads 整除
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim (d_model) must be divisible by num_heads, but got {embed_dim} and {num_heads}"
            )
        # 计算每个头部的维度
        dim_per_head = embed_dim // num_heads
        # 检查 dim_per_head 是否为2的幂
        if not ((dim_per_head & (dim_per_head - 1) == 0) and dim_per_head != 0):
            warnings.warn(
                "You'd better set embed_dim (d_model) in DeformableDetrMultiscaleDeformableAttention to make the"
                " dimension of each attention head a power of 2 which is more efficient in the authors' CUDA"
                " implementation."
            )

        # 每次 im2col 转换的步长
        self.im2col_step = 128

        # 初始化模型的参数
        self.d_model = embed_dim
        self.n_levels = n_levels
        self.n_heads = num_heads
        self.n_points = n_points

        # 定义用于偏移量的线性层
        self.sampling_offsets = nn.Linear(embed_dim, num_heads * n_levels * n_points * 2)
        # 定义注意力权重的线性层
        self.attention_weights = nn.Linear(embed_dim, num_heads * n_levels * n_points)
        # 对值进行投影的线性层
        self.value_proj = nn.Linear(embed_dim, embed_dim)
        # 对输出进行投影的线性层
        self.output_proj = nn.Linear(embed_dim, embed_dim)

    def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]):
        # 如果存在位置嵌入,将其加到张量中
        return tensor if position_embeddings is None else tensor + position_embeddings

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        position_embeddings: Optional[torch.Tensor] = None,
        reference_points=None,
        spatial_shapes=None,
        level_start_index=None,
        output_attentions: bool = False,
    ):
        # 在投影到查询和键之前,向隐藏状态添加位置嵌入
        if position_embeddings is not None:
            hidden_states = self.with_pos_embed(hidden_states, position_embeddings)

        # 获取隐藏状态张量的形状信息
        batch_size, num_queries, _ = hidden_states.shape
        batch_size, sequence_length, _ = encoder_hidden_states.shape
        
        # 检查空间形状与编码器隐藏状态序列长度是否对齐
        if (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() != sequence_length:
            raise ValueError(
                "Make sure to align the spatial shapes with the sequence length of the encoder hidden states"
            )

        # 对编码器隐藏状态进行值投影
        value = self.value_proj(encoder_hidden_states)

        # 如果存在注意力掩码,则反转注意力掩码
        if attention_mask is not None:
            value = value.masked_fill(attention_mask[..., None], float(0))

        # 重塑值张量的形状以便多头注意力机制处理
        value = value.view(batch_size, sequence_length, self.n_heads, self.d_model // self.n_heads)

        # 计算采样偏移量
        sampling_offsets = self.sampling_offsets(hidden_states).view(
            batch_size, num_queries, self.n_heads, self.n_levels, self.n_points, 2
        )

        # 计算注意力权重
        attention_weights = self.attention_weights(hidden_states).view(
            batch_size, num_queries, self.n_heads, self.n_levels * self.n_points
        )

        # 对注意力权重进行 softmax 归一化
        attention_weights = nn.functional.softmax(attention_weights, -1).view(
            batch_size, num_queries, self.n_heads, self.n_levels, self.n_points
        )

        # 如果参考点张量的最后一个维度为2
        if reference_points.shape[-1] == 2:
            # 计算采样位置
            offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
            sampling_locations = (
                reference_points[:, :, None, :, None, :]
                + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
            )
        # 如果参考点张量的最后一个维度为4
        elif reference_points.shape[-1] == 4:
            # 计算采样位置
            sampling_locations = (
                reference_points[:, :, None, :, None, :2]
                + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
            )
        else:
            # 抛出异常,参考点张量的最后一个维度必须是2或4
            raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}")

        # 多尺度可变形注意力机制计算输出
        output = multi_scale_deformable_attention(value, spatial_shapes, sampling_locations, attention_weights)

        # 对输出进行最终的投影
        output = self.output_proj(output)

        # 返回输出和注意力权重
        return output, attention_weights


# Mask2FormerPixelDecoderEncoderLayer: one deformable-attention encoder layer of the pixel decoder
class Mask2FormerPixelDecoderEncoderLayer(nn.Module):
    # 初始化函数,接收一个 Mask2FormerConfig 类型的参数 config
    def __init__(self, config: Mask2FormerConfig):
        # 调用父类 nn.Module 的初始化函数
        super().__init__()
        # 设置 embed_dim 属性为 config 中的 feature_size,表示特征的维度大小
        self.embed_dim = config.feature_size
        # 初始化 self_attn 属性为 Mask2FormerPixelDecoderEncoderMultiscaleDeformableAttention 类的实例
        # 参数包括 embed_dim(特征维度)、num_heads(注意力头数)、n_levels(多尺度层数)、n_points(变形注意力的采样点数)
        self.self_attn = Mask2FormerPixelDecoderEncoderMultiscaleDeformableAttention(
            embed_dim=self.embed_dim,
            num_heads=config.num_attention_heads,
            n_levels=3,
            n_points=4,
        )

        # 初始化 self_attn_layer_norm 属性为 LayerNorm 层,输入维度为 embed_dim
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        # 设置 dropout 属性为 config 中的 dropout 概率
        self.dropout = config.dropout
        # 设置 activation_fn 属性为 relu 激活函数
        self.activation_fn = nn.functional.relu
        # 设置 activation_dropout 属性为 config 中的 dropout 概率
        self.activation_dropout = config.dropout
        # 初始化 fc1 属性为 Linear 层,输入维度为 embed_dim,输出维度为 config 中的 encoder_feedforward_dim
        self.fc1 = nn.Linear(self.embed_dim, config.encoder_feedforward_dim)
        # 初始化 fc2 属性为 Linear 层,输入维度为 config 中的 encoder_feedforward_dim,输出维度为 embed_dim
        self.fc2 = nn.Linear(config.encoder_feedforward_dim, self.embed_dim)
        # 初始化 final_layer_norm 属性为 LayerNorm 层,输入维度为 embed_dim
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    # 前向传播函数,定义网络的数据流向
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        position_embeddings: torch.Tensor = None,
        reference_points=None,
        spatial_shapes=None,
        level_start_index=None,
        output_attentions: bool = False,
    ):
        """
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                输入到层的输入。
            attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
                注意力遮罩。
            position_embeddings (`torch.FloatTensor`, *optional*):
                位置嵌入,将要添加到 `hidden_states` 中。
            reference_points (`torch.FloatTensor`, *optional*):
                参考点。
            spatial_shapes (`torch.LongTensor`, *optional*):
                主干特征图的空间形状。
            level_start_index (`torch.LongTensor`, *optional*):
                层级起始索引。
            output_attentions (`bool`, *optional*):
                是否返回所有注意力层的注意力张量。查看返回的张量中的 `attentions` 以获取更多细节。
        """
        residual = hidden_states

        # 在多尺度特征图上应用多尺度可变形注意力模块。
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            position_embeddings=position_embeddings,
            reference_points=reference_points,
            spatial_shapes=spatial_shapes,
            level_start_index=level_start_index,
            output_attentions=output_attentions,
        )

        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        residual = hidden_states
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)

        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        hidden_states = residual + hidden_states
        hidden_states = self.final_layer_norm(hidden_states)

        if self.training:
            # during training, clamp inf/nan activations to avoid overflow
            if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any():
                clamp_value = torch.finfo(hidden_states.dtype).max - 1000
                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        outputs = (hidden_states,)

        if output_attentions:
            # also return the attention weights when requested
            outputs += (attn_weights.transpose(1, 0),)

        return outputs
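
# Illustrative sketch (not from the original file): exercising one encoder layer with three dummy
# feature levels. Assumes the default Mask2FormerConfig (feature_size=256, 8 attention heads).
def _demo_pixel_decoder_encoder_layer():
    config = Mask2FormerConfig()
    layer = Mask2FormerPixelDecoderEncoderLayer(config)
    spatial_shapes = torch.tensor([[8, 8], [4, 4], [2, 2]])                      # 3 levels, as hardcoded above
    sequence_length = int((spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum())   # 64 + 16 + 4 = 84
    hidden_states = torch.randn(1, sequence_length, config.feature_size)
    reference_points = torch.rand(1, sequence_length, 3, 2)                      # normalized (x, y) per level
    outputs = layer(
        hidden_states, attention_mask=None, reference_points=reference_points, spatial_shapes=spatial_shapes
    )
    return outputs[0].shape  # torch.Size([1, 84, 256])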


# Modified from transformers.models.detr.modeling_deformable_detr.DeformableDetrEncoder with DeformableDetrEncoder->Mask2FormerPixelDecoderEncoderOnly
class Mask2FormerPixelDecoderEncoderOnly(nn.Module):
    """
    Transformer encoder consisting of *config.encoder_layers* deformable attention layers. Each layer is a
    [`Mask2FormerPixelDecoderEncoderLayer`]. The encoder updates the flattened multi-scale feature maps through
    multiple deformable attention layers.

    Args:
        config: Mask2FormerConfig
    """

    def __init__(self, config: Mask2FormerConfig):
        super().__init__()

        # 保存配置信息
        self.config = config
        # 定义 dropout 概率
        self.dropout = config.dropout
        # 创建多个 Mask2FormerPixelDecoderEncoderLayer 层,数量由 config.encoder_layers 决定
        self.layers = nn.ModuleList(
            [Mask2FormerPixelDecoderEncoderLayer(config) for _ in range(config.encoder_layers)]
        )

    @staticmethod
    def get_reference_points(spatial_shapes, valid_ratios, device):
        """
        Get reference points for each feature map. Used in decoder.

        Args:
            spatial_shapes (`torch.LongTensor`):
                Spatial shapes of each feature map, has shape of `(num_feature_levels, 2)`.
            valid_ratios (`torch.FloatTensor`):
                Valid ratios of each feature map, has shape of `(batch_size, num_feature_levels, 2)`.
            device (`torch.device`):
                Device on which to create the tensors.
        Returns:
            `torch.FloatTensor` of shape `(batch_size, num_queries, num_feature_levels, 2)`
        """
        # 初始化 reference_points_list 作为一个空列表
        reference_points_list = []
        # 遍历每个特征图的空间形状
        for lvl, (height, width) in enumerate(spatial_shapes):
            # 创建网格矩阵 ref_y, ref_x 作为参考点的 y 和 x 坐标
            ref_y, ref_x = torch.meshgrid(
                torch.linspace(0.5, height - 0.5, height, dtype=valid_ratios.dtype, device=device),
                torch.linspace(0.5, width - 0.5, width, dtype=valid_ratios.dtype, device=device),
                indexing="ij",
            )
            # 将 ref_y, ref_x 转换成一维数组,然后按比例缩放
            ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * height)
            ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * width)
            # 将 ref_x, ref_y 合并成 ref 的形式
            ref = torch.stack((ref_x, ref_y), -1)
            # 将当前级别的参考点 ref 添加到 reference_points_list 中
            reference_points_list.append(ref)

        # 拼接所有级别的 reference_points
        reference_points = torch.cat(reference_points_list, 1)
        # 根据 valid_ratios 调整 reference_points 的形状
        reference_points = reference_points[:, :, None] * valid_ratios[:, None]

        return reference_points

    def forward(
        self,
        inputs_embeds=None,
        attention_mask=None,
        position_embeddings=None,
        spatial_shapes=None,
        level_start_index=None,
        valid_ratios=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        ...


# Modified from transformers.models.detr.modeling_deformable_detr.DeformableDetrModel with DeformableDetrModel->Mask2FormerPixelDecoder
class Mask2FormerPixelDecoder(nn.Module):
    # 初始化函数,接受配置对象和特征通道数作为参数
    def __init__(self, config: Mask2FormerConfig, feature_channels):
        # 调用父类的初始化方法
        super().__init__()

        # 将配置对象保存在实例变量中
        self.config = config

        # 从配置对象中获取特征大小和掩码特征大小
        feature_dim = config.feature_size
        mask_dim = config.mask_feature_size
        # 计算每一层的正向特征数量
        num_pos_features = feature_dim // 2

        # 创建位置嵌入对象,使用正向特征数量和归一化参数
        self.position_embedding = Mask2FormerSinePositionEmbedding(num_pos_feats=num_pos_features, normalize=True)
        # 定义特征层级数量为3
        self.num_feature_levels = 3
        # 从输入特征通道中提取变压器输入通道数
        transformer_in_channels = feature_channels[-self.num_feature_levels :]

        # 获取变压器特征步幅和通道信息
        self.transformer_feature_strides = config.feature_strides[-self.num_feature_levels :]
        self.feature_channels = feature_channels
        # 创建层级嵌入参数,使用指定的特征层级和特征维度
        self.level_embed = nn.Parameter(torch.Tensor(self.num_feature_levels, feature_dim))

        # 创建输入投影层
        if self.num_feature_levels > 1:
            input_projections_list = []
            # 遍历反向变压器输入通道列表
            for in_channels in transformer_in_channels[::-1]:
                # 使用卷积层和分组规范化创建顺序模块
                input_projections_list.append(
                    nn.Sequential(
                        nn.Conv2d(in_channels, feature_dim, kernel_size=1),
                        nn.GroupNorm(32, feature_dim),
                    )
                )
            # 使用模块列表创建输入投影层
            self.input_projections = nn.ModuleList(input_projections_list)
        else:
            # 若特征层级为1,创建单一模块列表
            self.input_projections = nn.ModuleList(
                [
                    nn.Sequential(
                        nn.Conv2d(transformer_in_channels[-1], feature_dim, kernel_size=1),
                        nn.GroupNorm(32, feature_dim),
                    )
                ]
            )

        # 创建解码器编码器对象
        self.encoder = Mask2FormerPixelDecoderEncoderOnly(config)
        # 创建掩码投影卷积层,使用特征维度和掩码特征维度
        self.mask_projection = nn.Conv2d(feature_dim, mask_dim, kernel_size=1, stride=1, padding=0)

        # 额外的特征金字塔网络层级
        stride = min(self.transformer_feature_strides)
        self.common_stride = config.common_stride
        # 计算FPN层级数目
        self.num_fpn_levels = int(np.log2(stride) - np.log2(self.common_stride))

        lateral_convs = []
        output_convs = []

        # 遍历特征通道列表的前几个通道,创建侧向和输出卷积层
        for idx, in_channels in enumerate(self.feature_channels[: self.num_fpn_levels]):
            # 创建侧向卷积层,使用1x1卷积和分组规范化
            lateral_conv = nn.Sequential(
                nn.Conv2d(in_channels, feature_dim, kernel_size=1, bias=False),
                nn.GroupNorm(32, feature_dim),
            )

            # 创建输出卷积层,使用3x3卷积、分组规范化和ReLU激活函数
            output_conv = nn.Sequential(
                nn.Conv2d(feature_dim, feature_dim, kernel_size=3, stride=1, padding=1, bias=False),
                nn.GroupNorm(32, feature_dim),
                nn.ReLU(),
            )
            # 将侧向和输出卷积层作为模块添加到模型中
            self.add_module("adapter_{}".format(idx + 1), lateral_conv)
            self.add_module("layer_{}".format(idx + 1), output_conv)

            # 将侧向和输出卷积层添加到列表中
            lateral_convs.append(lateral_conv)
            output_convs.append(output_conv)

        # 将侧向卷积层和输出卷积层反转顺序,以便从低分辨率到高分辨率排序
        self.lateral_convolutions = lateral_convs[::-1]
        self.output_convolutions = output_convs[::-1]
    # 计算输入掩码中每个特征图的有效比率
    def get_valid_ratio(self, mask, dtype=torch.float32):
        """Get the valid ratio of all feature maps."""

        # 获取掩码的形状信息,并解构为批大小(忽略)、高度、宽度
        _, height, width = mask.shape

        # 统计每个样本在高度方向上的有效(非填充)像素数量
        valid_height = torch.sum(~mask[:, :, 0], 1)

        # 统计每个样本在宽度方向上的有效(非填充)像素数量
        valid_width = torch.sum(~mask[:, 0, :], 1)
        
        # 将有效像素数转换为有效比率,使用给定的数据类型
        valid_ratio_heigth = valid_height.to(dtype) / height
        valid_ratio_width = valid_width.to(dtype) / width
        
        # 将高度和宽度的有效比率合并成一个张量,形状为 [batch_size, 2]
        valid_ratio = torch.stack([valid_ratio_width, valid_ratio_heigth], -1)
        
        # 返回有效比率张量
        return valid_ratio

    # 模型前向传播函数
    def forward(
        self,
        features,
        encoder_outputs=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
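
# 【编者补充示例,非源码】用一个玩具掩码演示上面 get_valid_ratio 的计算逻辑。
# 这里假设掩码中 True 表示填充像素、False 表示有效像素(与可变形 DETR 的约定一致):
# 有效比率 = 有效像素数 / 特征图边长,forward 中会用它把归一化坐标映射回各层级的有效区域。
import torch

toy_mask = torch.zeros(1, 4, 6, dtype=torch.bool)  # batch_size=1,高 4,宽 6
toy_mask[:, 3:, :] = True  # 最后一行为填充
toy_mask[:, :, 4:] = True  # 最后两列为填充

valid_height = torch.sum(~toy_mask[:, :, 0], 1)  # tensor([3])
valid_width = torch.sum(~toy_mask[:, 0, :], 1)   # tensor([4])
valid_ratio = torch.stack([valid_width / 6, valid_height / 4], -1)
print(valid_ratio)  # tensor([[0.6667, 0.7500]]),即 [宽度方向比率, 高度方向比率]
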
class Mask2FormerPixelLevelModule(nn.Module):
    def __init__(self, config: Mask2FormerConfig):
        """
        Pixel Level Module proposed in [Masked-attention Mask Transformer for Universal Image
        Segmentation](https://arxiv.org/abs/2112.01527). It runs the input image through a backbone and a pixel
        decoder, generating multi-scale feature maps and pixel embeddings.

        Args:
            config ([`Mask2FormerConfig`]):
                The configuration used to instantiate this model.
        """
        super().__init__()

        # 加载指定配置的骨干网络
        self.encoder = load_backbone(config)
        # 使用骨干网络的通道数初始化像素解码器
        self.decoder = Mask2FormerPixelDecoder(config, feature_channels=self.encoder.channels)

    def forward(self, pixel_values: Tensor, output_hidden_states: bool = False) -> Mask2FormerPixelLevelModuleOutput:
        # 通过骨干网络获取特征图
        backbone_features = self.encoder(pixel_values).feature_maps
        # 使用解码器处理特征图,生成输出
        decoder_output = self.decoder(backbone_features, output_hidden_states=output_hidden_states)

        return Mask2FormerPixelLevelModuleOutput(
            encoder_last_hidden_state=backbone_features[-1],
            encoder_hidden_states=tuple(backbone_features) if output_hidden_states else None,
            decoder_last_hidden_state=decoder_output.mask_features,
            decoder_hidden_states=decoder_output.multi_scale_features,
        )
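
# 【编者补充示例,非源码】像素级模块 = 骨干网络(encoder)+ 像素解码器(decoder)。
# 下面是一个简化的使用示意(假设可以联网下载预训练权重),通过公开的 Mask2FormerModel
# 观察像素解码器的最终输出,即上文 decoder_output.mask_features 对应的 1/4 分辨率特征:
import numpy as np
import torch
from transformers import AutoImageProcessor, Mask2FormerModel

processor = AutoImageProcessor.from_pretrained("facebook/mask2former-swin-small-coco-instance")
model = Mask2FormerModel.from_pretrained("facebook/mask2former-swin-small-coco-instance")

image = np.random.randint(0, 256, (384, 384, 3), dtype=np.uint8)  # 用随机图像代替真实输入
inputs = processor(images=image, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

# pixel_decoder_last_hidden_state 形状约为 (batch_size, mask_feature_size, height/4, width/4)
print(outputs.pixel_decoder_last_hidden_state.shape)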


# Modified from transformers.models.detr.modeling_detr.DetrAttention with Detr->Mask2Former
class Mask2FormerAttention(nn.Module):
    """
    Multi-headed attention from 'Attention Is All You Need' paper. Here, we add position embeddings to the queries and
    keys (as explained in the DETR paper).
    """

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        if self.head_dim * num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {num_heads})."
            )
        self.scaling = self.head_dim**-0.5

        # 初始化线性变换层
        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):
        # 重塑张量以便进行多头注意力计算
        return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]):
        # 如果存在位置编码张量,则将其加到输入张量上
        return tensor if position_embeddings is None else tensor + position_embeddings
    # 定义一个方法 `forward`,用于执行模型的前向传播过程
    def forward(
        self,
        # 输入参数 `hidden_states`,表示模型的隐藏状态,是一个张量
        hidden_states: torch.Tensor,
        # 输入参数 `attention_mask`,表示注意力掩码,可以为空
        attention_mask: Optional[torch.Tensor] = None,
        # 输入参数 `position_embeddings`,表示位置嵌入,可以为空
        position_embeddings: Optional[torch.Tensor] = None,
        # 输入参数 `key_value_states`,表示键值状态,可以为空
        key_value_states: Optional[torch.Tensor] = None,
        # 输入参数 `key_value_position_embeddings`,表示键值位置嵌入,可以为空
        key_value_position_embeddings: Optional[torch.Tensor] = None,
        # 输入参数 `output_attentions`,表示是否输出注意力权重,默认为 False
        output_attentions: bool = False,
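
# 【编者补充示例,非源码】Mask2FormerAttention 沿用 DETR 的做法:位置嵌入只加到
# query 和 key 上,value 保持不变。下面用随机张量给出一个最小示意(仅说明数据流):
import torch

seq_len, batch_size, embed_dim = 5, 2, 16
hidden_states = torch.randn(batch_size, seq_len, embed_dim)
position_embeddings = torch.randn(batch_size, seq_len, embed_dim)

queries = hidden_states + position_embeddings  # 即 with_pos_embed(hidden_states, pos)
keys = hidden_states + position_embeddings
values = hidden_states                         # value 不加位置嵌入

attn_weights = torch.softmax(queries @ keys.transpose(-1, -2) / embed_dim**0.5, dim=-1)
attn_output = attn_weights @ values
print(attn_output.shape)  # torch.Size([2, 5, 16])
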
    """
    Mask2FormerMaskedAttentionDecoderLayer由self-attention、交叉(masked)attention和FFN块组成。
    在Mask2FormerMaskedAttentionDecoderLayer中使用的交叉attention实际上是一种限制注意力在预测段周围局部特征的masked attention块,
    这导致更快的收敛和更好的性能。相比标准的DetrDecoder,Mask2FormerMaskedAttentionDecoder中的self和cross(即masked)attention块的顺序被交换,
    这是一种优化改进。

    Args:
        config (`Mask2FormerConfig`):
            用于初始化Mask2FormerMaskedAttentionDecoder的配置。
    """
    
    def __init__(self, config: Mask2FormerConfig):
        super().__init__()
        self.config = config
        self.embed_dim = self.config.hidden_dim  # 设置嵌入维度为配置中的隐藏维度
        self.pre_norm = self.config.pre_norm  # 设置预规范化标志为配置中的预规范化标志
        
        # 初始化self-attention层,使用Mask2FormerAttention类
        self.self_attn = Mask2FormerAttention(
            embed_dim=self.embed_dim,
            num_heads=config.num_attention_heads,
            dropout=config.dropout,
            is_decoder=True,
        )
        
        self.dropout = self.config.dropout  # 设置dropout率为配置中的dropout率
        self.activation_fn = ACT2FN[self.config.activation_function]  # 根据配置选择激活函数
        self.activation_dropout = self.config.dropout  # 设置激活函数的dropout率为配置中的dropout率
        
        # 初始化self-attention层的LayerNorm
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        
        # 初始化交叉attention层,使用nn.MultiheadAttention类
        self.cross_attn = nn.MultiheadAttention(self.embed_dim, self.config.num_attention_heads, self.config.dropout)
        
        # 初始化交叉attention层的LayerNorm
        self.cross_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        
        # 初始化前向传播网络的第一个线性层
        self.fc1 = nn.Linear(self.embed_dim, self.config.dim_feedforward)
        
        # 初始化前向传播网络的第二个线性层
        self.fc2 = nn.Linear(self.config.dim_feedforward, self.embed_dim)
        
        # 初始化最终输出的LayerNorm
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
        """
        如果位置编码pos不为None,则将其添加到张量tensor中;否则返回原始张量tensor。

        Args:
            tensor (torch.Tensor): 输入张量
            pos (Optional[Tensor]): 位置编码张量,可选

        Returns:
            torch.Tensor: 处理后的张量
        """
        return tensor if pos is None else tensor + pos

    def forward_post(
        self,
        hidden_states: torch.Tensor,
        level_index: int = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_embeddings: Optional[torch.Tensor] = None,
        query_position_embeddings: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ):
        # Masked(Cross)-Attention Block
        cross_attn_weights = None  # 初始化跨注意力权重为 None
        self_attn_weights = None    # 初始化自注意力权重为 None

        residual = hidden_states    # 保存输入的隐藏状态作为残差连接的基准

        # 执行跨注意力机制
        hidden_states, cross_attn_weights = self.cross_attn(
            query=self.with_pos_embed(hidden_states, query_position_embeddings),  # 使用位置嵌入增强查询
            key=self.with_pos_embed(encoder_hidden_states[level_index], position_embeddings[level_index]),  # 使用位置嵌入增强键
            value=encoder_hidden_states[level_index],  # 使用编码器隐藏状态作为值
            attn_mask=encoder_attention_mask,  # 编码器注意力掩码
            key_padding_mask=None,  # 键的填充掩码暂未指定
        )

        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)  # 使用丢弃来处理隐藏状态
        hidden_states = residual + hidden_states  # 残差连接
        hidden_states = self.cross_attn_layer_norm(hidden_states)  # 使用层归一化处理隐藏状态

        # Self Attention Block
        residual = hidden_states  # 保存当前隐藏状态作为自注意力块的残差基准

        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,  # 使用当前隐藏状态
            position_embeddings=query_position_embeddings,  # 查询位置嵌入
            attention_mask=None,  # 注意力掩码暂未指定
            output_attentions=True,  # 输出注意力权重
        )

        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)  # 使用丢弃来处理隐藏状态
        hidden_states = residual + hidden_states  # 残差连接
        hidden_states = self.self_attn_layer_norm(hidden_states)  # 使用层归一化处理隐藏状态

        # Fully Connected
        residual = hidden_states  # 保存当前隐藏状态作为全连接块的残差基准
        hidden_states = self.activation_fn(self.fc1(hidden_states))  # 使用激活函数处理第一个全连接层
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)  # 使用激活函数的丢弃
        hidden_states = self.fc2(hidden_states)  # 第二个全连接层
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)  # 第二个全连接层的丢弃
        hidden_states = residual + hidden_states  # 残差连接
        hidden_states = self.final_layer_norm(hidden_states)  # 使用层归一化处理隐藏状态

        outputs = (hidden_states,)  # 输出为处理后的隐藏状态

        if output_attentions:  # 如果需要输出注意力权重
            outputs += (self_attn_weights, cross_attn_weights)  # 将自注意力和跨注意力权重添加到输出中

        return outputs  # 返回输出结果

    def forward_pre(
        self,
        hidden_states: torch.Tensor,
        level_index: int = None,  # 编码器层索引
        attention_mask: Optional[torch.Tensor] = None,  # 注意力掩码(可选)
        position_embeddings: Optional[torch.Tensor] = None,  # 位置嵌入(可选)
        query_position_embeddings: Optional[torch.Tensor] = None,  # 查询位置嵌入(可选)
        encoder_hidden_states: Optional[torch.Tensor] = None,  # 编码器隐藏状态(可选)
        encoder_attention_mask: Optional[torch.Tensor] = None,  # 编码器注意力掩码(可选)
        output_attentions: Optional[bool] = False,  # 是否输出注意力权重(默认为 False)
    ):
        # Masked(Cross)-Attention Block
        cross_attn_weights = None  # 初始化交叉注意力权重为None
        self_attn_weights = None   # 初始化自注意力权重为None

        residual = hidden_states   # 保存原始的隐藏状态作为残差连接的输入

        hidden_states = self.cross_attn_layer_norm(hidden_states)  # 使用层归一化处理隐藏状态

        # 执行交叉注意力计算
        hidden_states, cross_attn_weights = self.cross_attn(
            query=self.with_pos_embed(hidden_states, query_position_embeddings),
            key=self.with_pos_embed(encoder_hidden_states[level_index], position_embeddings[level_index]),
            value=encoder_hidden_states[level_index],
            attn_mask=encoder_attention_mask,
            key_padding_mask=None,
        )

        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)  # 对隐藏状态应用Dropout
        hidden_states = residual + hidden_states  # 执行残差连接

        # Self Attention Block
        residual = hidden_states   # 保存当前隐藏状态作为自注意力的残差连接输入

        hidden_states = self.self_attn_layer_norm(hidden_states)  # 使用层归一化处理隐藏状态

        # 执行自注意力计算
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=query_position_embeddings,
            attention_mask=None,
            output_attentions=True,
        )

        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)  # 对隐藏状态应用Dropout
        hidden_states = residual + hidden_states  # 执行残差连接

        # Fully Connected
        residual = hidden_states   # 保存当前隐藏状态作为全连接的残差连接输入

        hidden_states = self.final_layer_norm(hidden_states)  # 使用层归一化处理隐藏状态
        hidden_states = self.activation_fn(self.fc1(hidden_states))  # 使用激活函数处理全连接层1
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)  # 对全连接结果应用Dropout
        hidden_states = self.fc2(hidden_states)  # 执行全连接层2
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)  # 对全连接结果应用Dropout
        hidden_states = residual + hidden_states  # 执行残差连接

        outputs = (hidden_states,)  # 将隐藏状态作为输出的第一个元素

        if output_attentions:
            outputs += (self_attn_weights, cross_attn_weights)  # 如果需要输出注意力权重,则将自注意力和交叉注意力的权重添加到输出中

        return outputs  # 返回最终的输出结果

    def forward(
        self,
        hidden_states: torch.Tensor,
        level_index: int = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_embeddings: Optional[torch.Tensor] = None,
        query_position_embeddings: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ):
        """
        Args:
            hidden_states (`torch.FloatTensor`):
                输入到层的张量,形状为 `(seq_len, batch, embed_dim)`。
            attention_mask (`torch.FloatTensor`):
                注意力遮罩张量,形状为 `(1, seq_len, tgt_len, src_len)`。
            position_embeddings (`torch.FloatTensor`, *可选*):
                添加到掩码注意力层中键的位置嵌入。
            query_position_embeddings (`torch.FloatTensor`, *可选*):
                添加到自注意力层中查询和键的位置嵌入。
            encoder_hidden_states (`torch.FloatTensor`):
                层的交叉注意力输入张量,形状为 `(seq_len, batch, embed_dim)`。
            encoder_attention_mask (`torch.FloatTensor`):
                编码器注意力遮罩张量,大小为 `(1, seq_len, tgt_len, src_len)`。
            output_attentions (`bool`, *可选*):
                是否返回所有注意力层的注意力张量。查看返回的张量中的 `attentions` 以获取更多细节。
        """

        # 如果使用预归一化
        if self.pre_norm:
            # 调用预归一化前向传播函数
            outputs = self.forward_pre(
                hidden_states=hidden_states,
                level_index=level_index,
                position_embeddings=position_embeddings,
                query_position_embeddings=query_position_embeddings,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                output_attentions=output_attentions,
            )
        else:
            # 调用后归一化前向传播函数
            outputs = self.forward_post(
                hidden_states=hidden_states,
                level_index=level_index,
                position_embeddings=position_embeddings,
                query_position_embeddings=query_position_embeddings,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                output_attentions=output_attentions,
            )

        # 返回模型层的输出
        return outputs
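
# 【编者补充示例,非源码】forward_pre 与 forward_post 的区别只在 LayerNorm 的位置:
# post-norm 先做子层计算、残差相加后再归一化;pre-norm 先归一化再做子层计算。
# 下面用一个极简的残差块示意两种顺序(sublayer 这里用线性层代替注意力/FFN 子层):
import torch
from torch import nn

embed_dim = 16
norm = nn.LayerNorm(embed_dim)
sublayer = nn.Linear(embed_dim, embed_dim)
x = torch.randn(2, 4, embed_dim)

post = norm(x + sublayer(x))   # post-norm(对应 forward_post)
pre = x + sublayer(norm(x))    # pre-norm(对应 forward_pre)
print(post.shape, pre.shape)   # torch.Size([2, 4, 16]) torch.Size([2, 4, 16])
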
# 定义一个基于 Transformer 的解码器类,包含多个层
class Mask2FormerMaskedAttentionDecoder(nn.Module):
    """
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a
    [`Mask2FormerMaskedAttentionDecoderLayer`]. The decoder updates the query embeddings through multiple cross
    (masked) and self-attention layers. The decoder uses a new **masked attention** mechanism instead of the standard
    cross-attention, which extracts localized features by constraining cross-attention to within the foreground region
    of the predicted mask for each query, instead of attending to the full feature map.

    Args:
        config (`Mask2FormerConfig`):
            Configuration used to instantiate Mask2FormerMaskedAttentionDecoder.
    """

    def __init__(self, config: Mask2FormerConfig):
        super().__init__()

        # 初始化解码器的配置和参数
        self.config = config
        self.mask_feature_size = config.mask_feature_size  # 掩码特征大小
        self.dropout = config.dropout  # 丢弃率
        self.layerdrop = config.dropout  # 层丢弃率
        self.num_feature_levels = 3  # 特征层级数(使用 3 个尺度的特征)
        self.decoder_layers = config.decoder_layers - 1  # 解码层数

        # 创建解码器层列表,每层是一个 Mask2FormerMaskedAttentionDecoderLayer 实例
        self.layers = nn.ModuleList(
            [Mask2FormerMaskedAttentionDecoderLayer(self.config) for _ in range(self.decoder_layers)]
        )
        self.layernorm = nn.LayerNorm(config.hidden_dim)  # 归一化层

        # 创建掩码预测器,用于生成掩码预测
        self.mask_predictor = Mask2FormerMaskPredictor(
            hidden_size=config.hidden_dim,
            num_heads=config.num_attention_heads,
            mask_feature_size=self.mask_feature_size,
        )

        self.gradient_checkpointing = False  # 梯度检查点开关

    # 前向传播函数定义
    def forward(
        self,
        inputs_embeds: torch.Tensor = None,  # 输入嵌入
        multi_stage_positional_embeddings: torch.Tensor = None,  # 多阶段位置嵌入
        pixel_embeddings: torch.Tensor = None,  # 像素嵌入
        encoder_hidden_states: torch.Tensor = None,  # 编码器隐藏状态
        query_position_embeddings: torch.Tensor = None,  # 查询位置嵌入
        feature_size_list: List = None,  # 特征大小列表
        output_attentions: Optional[bool] = None,  # 是否输出注意力权重
        output_hidden_states: Optional[bool] = None,  # 是否输出隐藏状态
        return_dict: Optional[bool] = None,  # 是否返回字典形式的输出
# 从 transformers.models.maskformer.modeling_maskformer.PredictionBlock 复制,将 MaskFormer 改为 Mask2Former
class Mask2FormerPredictionBlock(nn.Module):
    def __init__(self, in_dim: int, out_dim: int, activation: nn.Module) -> None:
        super().__init__()
        self.layers = [nn.Linear(in_dim, out_dim), activation]
        # 保持子模块索引,仿佛是顺序块的一部分
        for i, layer in enumerate(self.layers):
            self.add_module(str(i), layer)

    # 前向传播函数定义
    def forward(self, input: Tensor) -> Tensor:
        hidden_state = input
        # 应用每一层线性变换和激活函数到输入张量上
        for layer in self.layers:
            hidden_state = layer(hidden_state)
        return hidden_state

class Mask2FormerMLPPredictionHead(nn.Module):
    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int, num_layers: int = 3):
        """
        A classic Multi Layer Perceptron (MLP).

        Args:
            input_dim (`int`):
                The input dimensions.
            hidden_dim (`int`):
                The hidden dimensions.
            output_dim (`int`):
                The output dimensions.
            num_layers (int, *optional*, defaults to 3):
                The number of layers.
        """
        super().__init__()  # 调用父类的初始化方法

        # 定义每层的输入和输出维度
        in_dims = [input_dim] + [hidden_dim] * (num_layers - 1)
        out_dims = [hidden_dim] * (num_layers - 1) + [output_dim]

        self.layers = []  # 初始化存储层的列表
        for i, (in_dim, out_dim) in enumerate(zip(in_dims, out_dims)):
            # 根据层数选择激活函数,最后一层使用恒等映射作为激活函数
            activation = nn.ReLU() if i < num_layers - 1 else nn.Identity()
            # 创建 Mask2FormerPredictionBlock 实例作为当前层
            layer = Mask2FormerPredictionBlock(in_dim, out_dim, activation=activation)
            self.layers.append(layer)  # 将当前层添加到层列表中

            # 为了向后兼容,特别是当类继承自 nn.Sequential 时
            # 在 nn.Sequential 的子类中,层的名称是它在序列中的索引
            # 在 nn.Module 的子类中,它们根据分配给它们的实例属性命名,例如 self.my_layer_name = Layer()
            # 由于不能给实例属性整数名称(例如 self.0 是不允许的),因此需要显式注册模块
            self.add_module(str(i), layer)  # 将当前层以索引 i 的字符串形式注册为模块

    def forward(self, input: Tensor) -> Tensor:
        hidden_state = input  # 初始化输入数据为隐藏状态

        # 逐层计算前向传播
        for layer in self.layers:
            hidden_state = layer(hidden_state)  # 应用当前层到隐藏状态

        return hidden_state  # 返回最终的隐藏状态作为输出
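
# 【编者补充示例,非源码】Mask2FormerMLPPredictionHead 会构造 num_layers 个
# "线性层 + 激活" 的 Mask2FormerPredictionBlock,其中最后一层使用恒等激活。
# 以假想的 input_dim=64、hidden_dim=128、output_dim=32、num_layers=3 为例,各层的 (输入, 输出) 维度如下:
input_dim, hidden_dim, output_dim, num_layers = 64, 128, 32, 3
in_dims = [input_dim] + [hidden_dim] * (num_layers - 1)
out_dims = [hidden_dim] * (num_layers - 1) + [output_dim]
print(list(zip(in_dims, out_dims)))  # [(64, 128), (128, 128), (128, 32)]
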
class Mask2FormerMaskPredictor(nn.Module):
    def __init__(self, hidden_size: int, num_heads: int, mask_feature_size: torch.Tensor):
        """
        This class is used to get the predicted mask for a given Mask2FormerMaskedAttentionDecoder layer. It also
        generates the binarized attention mask associated with the given predicted mask. The attention mask obtained
        using predicted mask of the (l-1)th decoder layer is fed to the cross(masked)-attention block of the next
        decoder layer as input.

        Args:
            hidden_size (`int`):
                The feature dimension of the Mask2FormerMaskedAttentionDecoder
            num_heads (`int`):
                The number of heads used in the Mask2FormerMaskedAttentionDecoder
            mask_feature_size (`torch.Tensor`):
                one of the output dimensions of the predicted masks for each query
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads

        # Initialize the mask_embedder using Mask2FormerMLPPredictionHead module
        self.mask_embedder = Mask2FormerMLPPredictionHead(self.hidden_size, self.hidden_size, mask_feature_size)

    def forward(self, outputs: torch.Tensor, pixel_embeddings: torch.Tensor, attention_mask_target_size: int = None):
        # Generate mask embeddings using the mask_embedder
        mask_embeddings = self.mask_embedder(outputs.transpose(0, 1))

        # Check if the model is in tracing mode or compiling mode for TorchScript
        is_tracing = (
            torch.jit.is_tracing()
            or isinstance(outputs, torch.fx.Proxy)
            or (hasattr(torch, "_dynamo") and torch._dynamo.is_compiling())
        )

        # Sum up over the channels using either a loop (if not using Torch 2.1 or higher) or einsum
        if is_tracing and not is_torch_greater_or_equal_than_2_1:
            # Loop through channels and accumulate outputs_mask
            batch_size, num_queries, num_channels = mask_embeddings.shape
            _, _, height, width = pixel_embeddings.shape
            outputs_mask = torch.zeros((batch_size, num_queries, height, width), device=mask_embeddings.device)
            for c in range(num_channels):
                outputs_mask += mask_embeddings[..., c][..., None, None] * pixel_embeddings[:, None, c]
        else:
            # Use einsum to perform tensor contraction
            outputs_mask = torch.einsum("bqc, bchw -> bqhw", mask_embeddings, pixel_embeddings)

        # Resize the outputs_mask to attention_mask_target_size using bilinear interpolation
        attention_mask = nn.functional.interpolate(
            outputs_mask, size=attention_mask_target_size, mode="bilinear", align_corners=False
        )

        # Apply sigmoid activation and reshape for multi-head attention compatibility
        attention_mask = attention_mask.sigmoid().flatten(2).unsqueeze(1).repeat(1, self.num_heads, 1, 1)

        # Binarize the attention_mask based on a threshold of 0.5 and detach it from the computation graph
        attention_mask = (attention_mask.flatten(0, 1) < 0.5).bool()
        attention_mask = attention_mask.detach()

        # Return the generated outputs_mask and attention_mask
        return outputs_mask, attention_mask
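
# 【编者补充示例,非源码】用小尺寸随机张量演示掩码预测的两个核心步骤:
# (1) einsum("bqc, bchw -> bqhw") 把每个 query 的嵌入与像素嵌入做通道内积,得到各 query 的掩码 logits;
# (2) 对 sigmoid 后小于 0.5 的位置置 True,作为下一层 masked attention 的 attn_mask
#     (nn.MultiheadAttention 中 attn_mask 为 True 表示该位置不允许被关注)。
import torch

batch_size, num_queries, num_channels, height, width = 1, 2, 4, 3, 3
mask_embeddings = torch.randn(batch_size, num_queries, num_channels)
pixel_embeddings = torch.randn(batch_size, num_channels, height, width)

outputs_mask = torch.einsum("bqc, bchw -> bqhw", mask_embeddings, pixel_embeddings)
print(outputs_mask.shape)  # torch.Size([1, 2, 3, 3])

attention_mask = outputs_mask.sigmoid().flatten(2) < 0.5  # (batch_size, num_queries, height*width)
print(attention_mask.shape)  # torch.Size([1, 2, 9])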


class Mask2FormerTransformerModule(nn.Module):
    """
    The Mask2Former's transformer module.
    """
    def __init__(self, in_features: int, config: Mask2FormerConfig):
        super().__init__()
        hidden_dim = config.hidden_dim
        self.num_feature_levels = 3
        # 初始化位置编码器,使用 Mask2FormerSinePositionEmbedding 类
        self.position_embedder = Mask2FormerSinePositionEmbedding(num_pos_feats=hidden_dim // 2, normalize=True)
        # 初始化查询的嵌入层,使用 nn.Embedding 类
        self.queries_embedder = nn.Embedding(config.num_queries, hidden_dim)
        # 初始化查询的特征嵌入层,使用 nn.Embedding 类
        self.queries_features = nn.Embedding(config.num_queries, hidden_dim)
        # 输入投影层列表
        self.input_projections = []

        # 根据 num_feature_levels 创建输入投影层
        for _ in range(self.num_feature_levels):
            if in_features != hidden_dim or config.enforce_input_projection:
                # 如果输入特征维度不等于隐藏维度或者配置要求强制投影,则添加卷积层
                self.input_projections.append(nn.Conv2d(in_features, hidden_dim, kernel_size=1))
            else:
                # 否则添加空的序列(空的 nn.Sequential())
                self.input_projections.append(nn.Sequential())

        # 初始化解码器,使用 Mask2FormerMaskedAttentionDecoder 类
        self.decoder = Mask2FormerMaskedAttentionDecoder(config=config)
        # 等级嵌入层,使用 nn.Embedding 类
        self.level_embed = nn.Embedding(self.num_feature_levels, hidden_dim)

    def forward(
        self,
        multi_scale_features: List[Tensor],
        mask_features: Tensor,
        output_hidden_states: bool = False,
        output_attentions: bool = False,
    ) -> Mask2FormerMaskedAttentionDecoderOutput:
        # 多尺度特征列表
        multi_stage_features = []
        # 多尺度位置嵌入列表
        multi_stage_positional_embeddings = []
        # 尺寸列表
        size_list = []

        # 遍历 num_feature_levels
        for i in range(self.num_feature_levels):
            # 记录每个特征的尺寸
            size_list.append(multi_scale_features[i].shape[-2:])
            # 获取多尺度位置嵌入并展平
            multi_stage_positional_embeddings.append(self.position_embedder(multi_scale_features[i], None).flatten(2))
            # 获取多尺度特征并展平,加上等级嵌入
            multi_stage_features.append(
                self.input_projections[i](multi_scale_features[i]).flatten(2)
                + self.level_embed.weight[i][None, :, None]
            )

            # 转置操作,将维度重新排列为 (height*width, batch_size, num_channels)
            multi_stage_positional_embeddings[-1] = multi_stage_positional_embeddings[-1].permute(2, 0, 1)
            multi_stage_features[-1] = multi_stage_features[-1].permute(2, 0, 1)

        # 获取 batch_size
        _, batch_size, _ = multi_stage_features[0].shape

        # 查询嵌入,扩展为 [num_queries, batch_size, hidden_dim]
        query_embeddings = self.queries_embedder.weight.unsqueeze(1).repeat(1, batch_size, 1)
        # 查询特征嵌入,扩展为 [num_queries, batch_size, hidden_dim]
        query_features = self.queries_features.weight.unsqueeze(1).repeat(1, batch_size, 1)

        # 调用解码器进行解码操作
        decoder_output = self.decoder(
            inputs_embeds=query_features,
            multi_stage_positional_embeddings=multi_stage_positional_embeddings,
            pixel_embeddings=mask_features,
            encoder_hidden_states=multi_stage_features,
            query_position_embeddings=query_embeddings,
            feature_size_list=size_list,
            output_hidden_states=output_hidden_states,
            output_attentions=output_attentions,
            return_dict=True,
        )

        # 返回解码器的输出
        return decoder_output
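
# 【编者补充示例,非源码】上面 forward 中反复出现的 flatten(2) + permute(2, 0, 1)
# 负责把 (batch_size, num_channels, height, width) 的特征图变成
# (height*width, batch_size, num_channels) 的序列布局,供解码器按 token 序列处理:
import torch

feature_map = torch.randn(2, 256, 16, 20)           # (B, C, H, W)
sequence = feature_map.flatten(2).permute(2, 0, 1)  # (H*W, B, C)
print(sequence.shape)  # torch.Size([320, 2, 256])
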
# 定义一个长字符串,描述了该模型是一个 PyTorch 的 `torch.nn.Module` 的子类,用于普通的 PyTorch 模型使用,并引用了 PyTorch 文档以获取有关一般用法和行为的信息。
MASK2FORMER_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`Mask2FormerConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# 定义了另一个长字符串,描述了模型的输入参数和可选参数的详细说明。
MASK2FORMER_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`AutoImageProcessor.preprocess`] for details.
        pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:

            - 1 for pixels that are real (i.e. **not masked**),
            - 0 for pixels that are padding (i.e. **masked**).

            [What are attention masks?](../glossary#attention-mask)
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of Detr's decoder attention layers.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~Mask2FormerModelOutput`] instead of a plain tuple.
"""

# 定义了一个模型类 `Mask2FormerModel`,继承自 `Mask2FormerPreTrainedModel`,表示 Mask2Former 模型的主体结构。
@add_start_docstrings(
    "The bare Mask2Former Model outputting raw hidden-states without any specific head on top.",
    MASK2FORMER_START_DOCSTRING,
)
class Mask2FormerModel(Mask2FormerPreTrainedModel):
    main_input_name = "pixel_values"

    def __init__(self, config: Mask2FormerConfig):
        super().__init__(config)
        # 初始化模型的像素级模块和 Transformer 模块,使用给定的配置参数
        self.pixel_level_module = Mask2FormerPixelLevelModule(config)
        self.transformer_module = Mask2FormerTransformerModule(in_features=config.feature_size, config=config)

        self.post_init()

    @add_start_docstrings_to_model_forward(MASK2FORMER_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Mask2FormerModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Tensor,
        pixel_mask: Optional[Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
@add_start_docstrings(
    "The Mask2Former Model with heads on top for instance/semantic/panoptic segmentation.",
    MASK2FORMER_START_DOCSTRING,
)
class Mask2FormerForUniversalSegmentation(Mask2FormerPreTrainedModel):
    main_input_name = "pixel_values"

    def __init__(self, config: Mask2FormerConfig):
        super().__init__(config)
        # 使用给定配置初始化 Mask2FormerModel 模型
        self.model = Mask2FormerModel(config)

        # 初始化损失权重字典,包括交叉熵损失、Mask 损失和 Dice 损失的权重
        self.weight_dict: Dict[str, float] = {
            "loss_cross_entropy": config.class_weight,
            "loss_mask": config.mask_weight,
            "loss_dice": config.dice_weight,
        }

        # 创建一个线性层用于类别预测,输出维度为 config.num_labels + 1
        self.class_predictor = nn.Linear(config.hidden_dim, config.num_labels + 1)

        # 初始化损失函数,使用 Mask2FormerLoss 类,传入配置和权重字典
        self.criterion = Mask2FormerLoss(config=config, weight_dict=self.weight_dict)
        # 调用后初始化方法
        self.post_init()

    def get_loss_dict(
        self,
        masks_queries_logits: Tensor,
        class_queries_logits: Tensor,
        mask_labels: Tensor,
        class_labels: Tensor,
        auxiliary_predictions: Dict[str, Tensor],
    ) -> Dict[str, Tensor]:
        # 计算损失字典,调用 self.criterion 对象的 __call__ 方法
        loss_dict: Dict[str, Tensor] = self.criterion(
            masks_queries_logits=masks_queries_logits,
            class_queries_logits=class_queries_logits,
            mask_labels=mask_labels,
            class_labels=class_labels,
            auxiliary_predictions=auxiliary_predictions,
        )

        # 根据 self.weight_dict 中的权重对每个损失进行加权,包括辅助损失
        for key, weight in self.weight_dict.items():
            for loss_key, loss in loss_dict.items():
                if key in loss_key:
                    loss *= weight

        return loss_dict

    def get_loss(self, loss_dict: Dict[str, Tensor]) -> Tensor:
        # 计算总损失,将损失字典中的所有值相加
        return sum(loss_dict.values())

    def get_auxiliary_logits(self, classes: torch.Tensor, output_masks: torch.Tensor):
        # 获取辅助预测的 logits 列表
        auxiliary_logits: List[Dict(str, Tensor)] = []

        # 遍历除最后一层外的 masks 和 classes(最后一层的输出作为最终预测,其余层作为辅助预测)
        for aux_binary_masks, aux_classes in zip(output_masks[:-1], classes[:-1]):
            auxiliary_logits.append({"masks_queries_logits": aux_binary_masks, "class_queries_logits": aux_classes})

        return auxiliary_logits

    @add_start_docstrings_to_model_forward(MASK2FORMER_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Mask2FormerForUniversalSegmentationOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Tensor,
        mask_labels: Optional[List[Tensor]] = None,
        class_labels: Optional[List[Tensor]] = None,
        pixel_mask: Optional[Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        output_auxiliary_logits: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        # 正向传播函数,接收多个参数,返回 Mask2FormerForUniversalSegmentationOutput 对象
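
# 【编者补充示例,非源码】Mask2FormerForUniversalSegmentation 的典型推理流程
# (简化示意,假设可以联网下载预训练权重;实际使用时通常传入 PIL 图像):
import numpy as np
import torch
from transformers import AutoImageProcessor, Mask2FormerForUniversalSegmentation

processor = AutoImageProcessor.from_pretrained("facebook/mask2former-swin-small-coco-instance")
model = Mask2FormerForUniversalSegmentation.from_pretrained(
    "facebook/mask2former-swin-small-coco-instance"
)

image = np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8)  # 用随机图像代替真实输入
inputs = processor(images=image, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

# class_queries_logits: (batch_size, num_queries, num_labels + 1)
# masks_queries_logits: (batch_size, num_queries, height, width),约为输入的 1/4 分辨率
print(outputs.class_queries_logits.shape, outputs.masks_queries_logits.shape)

# 后处理得到实例分割结果(分割图 + 每个实例的类别与得分)
results = processor.post_process_instance_segmentation(outputs, target_sizes=[(480, 640)])
print(results[0].keys())  # dict_keys(['segmentation', 'segments_info'])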

.\models\mask2former\__init__.py

# 引入模块的版权声明和许可证信息
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 引入类型检查模块
from typing import TYPE_CHECKING

# 引入必要的依赖项检查函数和模块
# 从 transformers 顶层的 utils 模块中引入相关函数和类
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available

# 定义模块的导入结构,包含配置和模型
_import_structure = {
    "configuration_mask2former": [
        "MASK2FORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "Mask2FormerConfig",
    ],
}

# 检查是否存在视觉处理依赖,如果不可用则抛出OptionalDependencyNotAvailable异常
try:
    if not is_vision_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 将视觉处理相关模块添加到导入结构中
    _import_structure["image_processing_mask2former"] = ["Mask2FormerImageProcessor"]

# 检查是否存在PyTorch依赖,如果不可用则抛出OptionalDependencyNotAvailable异常
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 将模型处理相关模块添加到导入结构中
    _import_structure["modeling_mask2former"] = [
        "MASK2FORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
        "Mask2FormerForUniversalSegmentation",
        "Mask2FormerModel",
        "Mask2FormerPreTrainedModel",
    ]

# 如果当前环境支持类型检查,引入配置相关的类和变量
if TYPE_CHECKING:
    from .configuration_mask2former import MASK2FORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, Mask2FormerConfig

    # 检查视觉处理是否可用,如果可用则引入相关处理类
    try:
        if not is_vision_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .image_processing_mask2former import Mask2FormerImageProcessor

    # 检查PyTorch是否可用,如果可用则引入相关模型类和变量
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_mask2former import (
            MASK2FORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
            Mask2FormerForUniversalSegmentation,
            Mask2FormerModel,
            Mask2FormerPreTrainedModel,
        )

# 如果当前环境不支持类型检查,将模块设置为LazyModule以支持按需导入
else:
    import sys

    # 将当前模块替换为LazyModule,用于按需加载导入的模块
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
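
# 【编者补充示例,非源码】得益于上面的 _LazyModule 机制,子模块只在真正访问时才被导入,
# 用户仍然可以直接从 transformers 顶层导入这些类(简化示意,假设已安装 torch):
from transformers import Mask2FormerConfig, Mask2FormerForUniversalSegmentation

config = Mask2FormerConfig()  # 使用默认参数
model = Mask2FormerForUniversalSegmentation(config)  # 按配置随机初始化权重,不下载任何文件
print(type(model).__name__)  # Mask2FormerForUniversalSegmentation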

.\models\maskformer\configuration_maskformer.py

# coding=utf-8
# Copyright 2022 Meta Platforms, Inc.and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" MaskFormer model configuration"""
from typing import Dict, Optional

from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto import CONFIG_MAPPING
from ..detr import DetrConfig
from ..swin import SwinConfig


MASKFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "facebook/maskformer-swin-base-ade": (
        "https://huggingface.co/facebook/maskformer-swin-base-ade/blob/main/config.json"
    )
    # See all MaskFormer models at https://huggingface.co/models?filter=maskformer
}

# 获取全局日志记录器实例
logger = logging.get_logger(__name__)


class MaskFormerConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MaskFormerModel`]. It is used to instantiate a
    MaskFormer model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the MaskFormer
    [facebook/maskformer-swin-base-ade](https://huggingface.co/facebook/maskformer-swin-base-ade) architecture trained
    on [ADE20k-150](https://huggingface.co/datasets/scene_parse_150).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Currently, MaskFormer only supports the [Swin Transformer](swin) as backbone.
    Args:
        # 控制掩码特征的大小,默认为 256
        mask_feature_size (`int`, *optional*, defaults to 256):
            The masks' features size, this value will also be used to specify the Feature Pyramid Network features'
            size.
        
        # 控制无物体类别的权重,默认为 0.1
        no_object_weight (`float`, *optional*, defaults to 0.1):
            Weight to apply to the null (no object) class.
        
        # 是否使用辅助损失,默认为 False
        use_auxiliary_loss(`bool`, *optional*, defaults to `False`):
            If `True` [`MaskFormerForInstanceSegmentationOutput`] will contain the auxiliary losses computed using the
            logits from each decoder's stage.
        
        # 如果未设置 backbone_config,则使用默认配置 `swin-base-patch4-window12-384` 的配置
        backbone_config (`Dict`, *optional*):
            The configuration passed to the backbone, if unset, the configuration corresponding to
            `swin-base-patch4-window12-384` will be used.
        
        # 当 backbone_config 为 None 时,使用此参数指定要使用的骨干网络名称
        backbone (`str`, *optional*):
            Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
            will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
            is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
        
        # 是否使用预训练的骨干网络权重,默认为 False
        use_pretrained_backbone (`bool`, *optional*, `False`):
            Whether to use pretrained weights for the backbone.
        
        # 是否从 timm 库中加载 backbone,默认为 False
        use_timm_backbone (`bool`, *optional*, `False`):
            Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers
            library.
        
        # 当从检查点加载时,传递给 AutoBackbone 的关键字参数,例如 `{'out_indices': (0, 1, 2, 3)}`
        backbone_kwargs (`dict`, *optional*):
            Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
            e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
        
        # 配置传递给变换器解码模型的参数,如果未设置,则使用 `detr-resnet-50` 的基本配置
        decoder_config (`Dict`, *optional*):
            The configuration passed to the transformer decoder model, if unset the base config for `detr-resnet-50`
            will be used.
        
        # 初始化所有权重矩阵的截断正态初始化器的标准差,默认为 0.02
        init_std (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        
        # HM Attention map 模块中用于 Xavier 初始化增益的缩放因子,默认为 1
        init_xavier_std (`float`, *optional*, defaults to 1):
            The scaling factor used for the Xavier initialization gain in the HM Attention map module.
        
        # Dice 损失的权重,默认为 1.0
        dice_weight (`float`, *optional*, defaults to 1.0):
            The weight for the dice loss.
        
        # 交叉熵损失的权重,默认为 1.0
        cross_entropy_weight (`float`, *optional*, defaults to 1.0):
            The weight for the cross entropy loss.
        
        # 掩码损失的权重,默认为 20.0
        mask_weight (`float`, *optional*, defaults to 20.0):
            The weight for the mask loss.
        
        # 模型是否输出其辅助 logits,默认未指定
        output_auxiliary_logits (`bool`, *optional*):
            Should the model output its `auxiliary_logits` or not.
    
    # 当所选的骨干模型类型不在 `["swin"]` 中或解码器模型类型不在 `["detr"]` 中时,引发 `ValueError`
    Raises:
        `ValueError`:
            Raised if the backbone model type selected is not in `["swin"]` or the decoder model type selected is not
            in `["detr"]`
    
    Examples:
    
    >>> # 从 transformers 库导入 MaskFormerConfig 和 MaskFormerModel 类
    >>> from transformers import MaskFormerConfig, MaskFormerModel

    >>> # Initializing a MaskFormer configuration object using default values
    >>> configuration = MaskFormerConfig()

    >>> # Initializing a MaskFormerModel object with the specified configuration, initially with random weights
    >>> model = MaskFormerModel(configuration)

    >>> # Accessing the configuration of the model instance
    >>> configuration = model.config

.\models\maskformer\configuration_maskformer_swin.py

# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
MaskFormer Swin Transformer model configuration
"""

# 导入必要的配置类和工具函数
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices

# 获取日志记录器
logger = logging.get_logger(__name__)

# 定义 MaskFormerSwinConfig 类,继承自 BackboneConfigMixin 和 PretrainedConfig
class MaskFormerSwinConfig(BackboneConfigMixin, PretrainedConfig):
    """
    This is the configuration class to store the configuration of a [`MaskFormerSwinModel`]. It is used to instantiate
    a MaskFormer Swin backbone model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Swin
    [microsoft/swin-tiny-patch4-window7-224](https://huggingface.co/microsoft/swin-tiny-patch4-window7-224)
    architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Example:

    ```
    >>> from transformers import MaskFormerSwinConfig, MaskFormerSwinModel

    >>> # Initializing a microsoft/swin-tiny-patch4-window7-224 style configuration
    >>> configuration = MaskFormerSwinConfig()

    >>> # Initializing a model (with random weights) from the microsoft/swin-tiny-patch4-window7-224 style configuration
    >>> model = MaskFormerSwinModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    # 模型类型标识
    model_type = "maskformer-swin"

    # 属性映射字典,将配置中的参数映射到实际使用的参数名称
    attribute_map = {
        "num_attention_heads": "num_heads",
        "num_hidden_layers": "num_layers",
    }

    # 初始化方法,定义了模型的配置参数
    def __init__(
        self,
        image_size=224,                         # 图像大小
        patch_size=4,                           # 补丁大小
        num_channels=3,                         # 输入通道数
        embed_dim=96,                           # 嵌入维度
        depths=[2, 2, 6, 2],                    # 每个阶段的深度
        num_heads=[3, 6, 12, 24],               # 每个阶段的注意力头数
        window_size=7,                          # 窗口大小
        mlp_ratio=4.0,                          # MLP 的尺度比率
        qkv_bias=True,                          # 是否在 QKV 中使用偏置
        hidden_dropout_prob=0.0,                # 隐藏层的dropout概率
        attention_probs_dropout_prob=0.0,       # 注意力层的dropout概率
        drop_path_rate=0.1,                     # DropPath 的概率
        hidden_act="gelu",                      # 隐藏层激活函数
        use_absolute_embeddings=False,          # 是否使用绝对位置嵌入
        initializer_range=0.02,                 # 初始化范围
        layer_norm_eps=1e-5,                    # LayerNorm 的 epsilon 值
        out_features=None,                      # 输出特征
        out_indices=None,                       # 输出索引
        **kwargs,                               # 其他参数
    ):
        # 调用父类的初始化方法,传入所有关键字参数
        super().__init__(**kwargs)

        # 设置图像大小属性
        self.image_size = image_size
        # 设置补丁大小属性
        self.patch_size = patch_size
        # 设置通道数属性
        self.num_channels = num_channels
        # 设置嵌入维度属性
        self.embed_dim = embed_dim
        # 设置每个阶段的深度列表
        self.depths = depths
        # 计算阶段数目
        self.num_layers = len(depths)
        # 设置注意力头数目
        self.num_heads = num_heads
        # 设置窗口大小属性
        self.window_size = window_size
        # 设置MLP扩展比例属性
        self.mlp_ratio = mlp_ratio
        # 设置注意力机制中的query/key/value是否带偏置
        self.qkv_bias = qkv_bias
        # 设置隐藏层dropout概率属性
        self.hidden_dropout_prob = hidden_dropout_prob
        # 设置注意力概率dropout概率属性
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        # 设置dropout路径丢弃率属性
        self.drop_path_rate = drop_path_rate
        # 设置隐藏层激活函数属性
        self.hidden_act = hidden_act
        # 设置是否使用绝对位置嵌入属性
        self.use_absolute_embeddings = use_absolute_embeddings
        # 设置层归一化epsilon值属性
        self.layer_norm_eps = layer_norm_eps
        # 设置初始化范围属性
        self.initializer_range = initializer_range
        # 设置隐藏大小属性,以便使Swin与VisionEncoderDecoderModel兼容
        # 这表示模型最后阶段后的通道维度
        self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
        # 设置阶段名称列表,包括"stem"和"stage1"到"stageN"
        self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
        # 获取对齐的输出特征和输出索引,确保与阶段名称对齐
        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
        )
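
# 【编者补充示例,非源码】以默认参数 embed_dim=96、depths=[2, 2, 6, 2] 为例,
# 验证上面 hidden_size 与 stage_names 的推导:
embed_dim, depths = 96, [2, 2, 6, 2]
hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
print(hidden_size)   # 768
print(stage_names)   # ['stem', 'stage1', 'stage2', 'stage3', 'stage4']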

.\models\maskformer\convert_maskformer_original_pytorch_checkpoint_to_pytorch.py

# coding=utf-8
# Copyright 2022 Meta Platforms, Inc. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys  # 导入系统模块
from argparse import ArgumentParser  # 导入命令行参数解析模块
from dataclasses import dataclass  # 导入数据类装饰器
from pathlib import Path  # 导入处理路径的模块
from pprint import pformat  # 导入格式化输出模块
from typing import Any, Dict, Iterator, List, Set, Tuple  # 导入类型提示模块

import requests  # 导入处理 HTTP 请求的模块
import torch  # 导入 PyTorch 深度学习框架
import torchvision.transforms as T  # 导入图像转换模块
from detectron2.checkpoint import DetectionCheckpointer  # 导入检查点模块
from detectron2.config import get_cfg  # 导入配置获取函数
from detectron2.data import MetadataCatalog  # 导入元数据目录模块
from detectron2.projects.deeplab import add_deeplab_config  # 导入 DeepLab 配置模块
from PIL import Image  # 导入 Python 图像处理库
from torch import Tensor, nn  # 导入张量和神经网络模块

# 导入 MaskFormer 相关模块
from transformers.models.maskformer.feature_extraction_maskformer import MaskFormerImageProcessor
from transformers.models.maskformer.modeling_maskformer import (
    MaskFormerConfig,
    MaskFormerForInstanceSegmentation,
    MaskFormerForInstanceSegmentationOutput,
    MaskFormerModel,
    MaskFormerModelOutput,
)
from transformers.utils import logging  # 导入日志模块

StateDict = Dict[str, Tensor]  # 定义状态字典类型别名

logging.set_verbosity_info()  # 设置日志输出详细程度为信息级别
logger = logging.get_logger()  # 获取日志记录器对象

torch.manual_seed(0)  # 设置随机种子以确保实验结果可复现


class TrackedStateDict:
    def __init__(self, to_track: Dict):
        """This class "tracks" a python dictionary by keeping track of which item is accessed.

        Args:
            to_track (Dict): The dictionary we wish to track
        """
        self.to_track = to_track  # 初始化要跟踪的字典
        self._seen: Set[str] = set()  # 初始化一个集合,用于记录已经访问的键名

    def __getitem__(self, key: str) -> Any:
        return self.to_track[key]  # 返回指定键名对应的值

    def __setitem__(self, key: str, item: Any):
        self._seen.add(key)  # 将访问过的键名添加到集合中
        self.to_track[key] = item  # 更新字典中指定键名的值

    def diff(self) -> List[str]:
        """This method returns a set difference between the keys in the tracked state dict and the one we have access so far.
        This is an effective method to check if we have update all the keys

        Returns:
            List[str]: List of keys not yet updated
        """
        return set(self.to_track.keys()) - self._seen  # 返回未更新的键名列表

    def copy(self) -> Dict:
        # proxy the call to the internal dictionary
        return self.to_track.copy()  # 返回字典的浅拷贝
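
# 【编者补充示例,非源码】TrackedStateDict 的用途:记录转换脚本写入过哪些键,
# 再用 diff() 找出尚未被更新的权重键(沿用上面定义的 TrackedStateDict,取两个假想键演示):
tracked = TrackedStateDict({"backbone.weight": 1, "head.weight": 2})
tracked["backbone.weight"] = 10  # 写入即标记为已访问
print(sorted(tracked.diff()))    # ['head.weight'],表示该键还没有被更新
print(tracked.copy())            # {'backbone.weight': 10, 'head.weight': 2}
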


# We will verify our results on an image of cute cats
def prepare_img():
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"  # 定义图像的 URL
    img_data = requests.get(url, stream=True).raw  # 从 URL 获取图像数据
    im = Image.open(img_data)  # 打开图像数据
    return im  # 返回图像对象


@dataclass
class Args:
    """Fake command line arguments needed by maskformer/detectron implementation"""

    config_file: str  # 命令行参数类的属性:配置文件路径
# 从文件和命令行参数中加载配置信息
def setup_cfg(args: Args):
    # 获取一个新的配置对象
    cfg = get_cfg()
    # 添加 DeepLab 配置到配置对象
    add_deeplab_config(cfg)
    # 添加 MaskFormer 配置到配置对象
    add_mask_former_config(cfg)
    # 从配置文件中加载更多配置到当前配置对象
    cfg.merge_from_file(args.config_file)
    # 冻结配置对象,防止后续修改
    cfg.freeze()
    # 返回配置对象
    return cfg


class OriginalMaskFormerConfigToOursConverter:
    def __call__(self, original_config: object) -> MaskFormerConfig:
        # 获取原始配置对象的模型部分
        model = original_config.MODEL
        # 获取模型中的 MASK_FORMER 部分
        mask_former = model.MASK_FORMER
        # 获取模型中的 SWIN 部分
        swin = model.SWIN

        # 从元数据目录中获取测试数据集的类别信息
        dataset_catalog = MetadataCatalog.get(original_config.DATASETS.TEST[0])
        # 创建从类别 ID 到类别名称的映射字典
        id2label = dict(enumerate(dataset_catalog.stuff_classes))
        # 创建从类别名称到类别 ID 的映射字典
        label2id = {label: idx for idx, label in id2label.items()}

        # 创建 MaskFormerConfig 对象,并填充其属性值
        config: MaskFormerConfig = MaskFormerConfig(
            fpn_feature_size=model.SEM_SEG_HEAD.CONVS_DIM,
            mask_feature_size=model.SEM_SEG_HEAD.MASK_DIM,
            num_labels=model.SEM_SEG_HEAD.NUM_CLASSES,
            no_object_weight=mask_former.NO_OBJECT_WEIGHT,
            num_queries=mask_former.NUM_OBJECT_QUERIES,
            backbone_config={
                "pretrain_img_size": swin.PRETRAIN_IMG_SIZE,
                "image_size": swin.PRETRAIN_IMG_SIZE,
                "in_channels": 3,
                "patch_size": swin.PATCH_SIZE,
                "embed_dim": swin.EMBED_DIM,
                "depths": swin.DEPTHS,
                "num_heads": swin.NUM_HEADS,
                "window_size": swin.WINDOW_SIZE,
                "drop_path_rate": swin.DROP_PATH_RATE,
                "model_type": "swin",
            },
            dice_weight=mask_former.DICE_WEIGHT,
            ce_weight=1.0,
            mask_weight=mask_former.MASK_WEIGHT,
            decoder_config={
                "model_type": "detr",
                "max_position_embeddings": 1024,
                "encoder_layers": 6,
                "encoder_ffn_dim": 2048,
                "encoder_attention_heads": 8,
                "decoder_layers": mask_former.DEC_LAYERS,
                "decoder_ffn_dim": mask_former.DIM_FEEDFORWARD,
                "decoder_attention_heads": mask_former.NHEADS,
                "encoder_layerdrop": 0.0,
                "decoder_layerdrop": 0.0,
                "d_model": mask_former.HIDDEN_DIM,
                "dropout": mask_former.DROPOUT,
                "attention_dropout": 0.0,
                "activation_dropout": 0.0,
                "init_std": 0.02,
                "init_xavier_std": 1.0,
                "scale_embedding": False,
                "auxiliary_loss": False,
                "dilation": False,
                # 默认的预训练配置数值
            },
            id2label=id2label,
            label2id=label2id,
        )

        # 返回配置对象
        return config


class OriginalMaskFormerConfigToImageProcessorConverter:
    # 定义一个特殊方法,使得对象可以被调用,并返回一个 MaskFormerImageProcessor 实例
    def __call__(self, original_config: object) -> MaskFormerImageProcessor:
        # 从配置中获取模型对象
        model = original_config.MODEL
        # 从配置中获取输入设置
        model_input = original_config.INPUT
        # 获取测试数据集的元数据目录
        dataset_catalog = MetadataCatalog.get(original_config.DATASETS.TEST[0])

        # 返回一个 MaskFormerImageProcessor 实例,并传入以下参数:
        return MaskFormerImageProcessor(
            # 计算并转换像素均值为列表形式
            image_mean=(torch.tensor(model.PIXEL_MEAN) / 255).tolist(),
            # 计算并转换像素标准差为列表形式
            image_std=(torch.tensor(model.PIXEL_STD) / 255).tolist(),
            # 设置测试图像的最小尺寸
            size=model_input.MIN_SIZE_TEST,
            # 设置测试图像的最大尺寸
            max_size=model_input.MAX_SIZE_TEST,
            # 设置语义分割头部的类别数目
            num_labels=model.SEM_SEG_HEAD.NUM_CLASSES,
            # 设置忽略索引,通常用于标注中的背景类别
            ignore_index=dataset_catalog.ignore_label,
            # 设置尺寸可分割性,通常为模型要求的倍数,这里为32,适用于 Swin 模型
            size_divisibility=32,
        )
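
# 【编者补充示例,非源码】上面的转换器把 detectron2 配置中 0~255 范围的像素均值/标准差
# 换算成 transformers 图像处理器使用的 0~1 范围,例如常见的 ImageNet 统计值:
import torch

PIXEL_MEAN = [123.675, 116.280, 103.530]
PIXEL_STD = [58.395, 57.120, 57.375]
print((torch.tensor(PIXEL_MEAN) / 255).tolist())  # ≈ [0.485, 0.456, 0.406]
print((torch.tensor(PIXEL_STD) / 255).tolist())   # ≈ [0.229, 0.224, 0.225]
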
# 定义一个类用于将原始模型的检查点转换为新模型的检查点
class OriginalMaskFormerCheckpointToOursConverter:
    # 初始化方法,接收原始模型和配置对象作为参数
    def __init__(self, original_model: nn.Module, config: MaskFormerConfig):
        self.original_model = original_model  # 存储原始模型
        self.config = config  # 存储配置对象

    # 弹出并重命名所有给定键对应的值,并将其添加到目标状态字典中
    def pop_all(self, renamed_keys: List[Tuple[str, str]], dst_state_dict: StateDict, src_state_dict: StateDict):
        for src_key, dst_key in renamed_keys:
            dst_state_dict[dst_key] = src_state_dict.pop(src_key)

    # 替换像素模块的特定部分,并根据配置更新相应的目标状态字典
    def replace_pixel_module(self, dst_state_dict: StateDict, src_state_dict: StateDict):
        dst_prefix: str = "pixel_level_module.decoder"  # 目标状态字典的前缀
        src_prefix: str = "sem_seg_head.pixel_decoder"  # 源状态字典的前缀

        # 使用给定配置替换骨干网络(backbone)的权重
        self.replace_backbone(dst_state_dict, src_state_dict, self.config)

        # 定义一个函数用于为卷积层重命名键
        def rename_keys_for_conv(detectron_conv: str, mine_conv: str):
            return [
                (f"{detectron_conv}.weight", f"{mine_conv}.0.weight"),
                (f"{detectron_conv}.norm.weight", f"{mine_conv}.1.weight"),
                (f"{detectron_conv}.norm.bias", f"{mine_conv}.1.bias"),
            ]

        # 添加用于转换的特定键对,如掩码特征的权重和偏置
        renamed_keys = [
            (f"{src_prefix}.mask_features.weight", f"{dst_prefix}.mask_projection.weight"),
            (f"{src_prefix}.mask_features.bias", f"{dst_prefix}.mask_projection.bias"),
        ]
        
        # 添加用于转换的卷积层的键对,例如特征金字塔网络(FPN)的stem层
        renamed_keys.extend(rename_keys_for_conv(f"{src_prefix}.layer_4", f"{dst_prefix}.fpn.stem"))

        # 循环添加FPN的各层,根据配置参数确定层数
        for src_i, dst_i in zip(range(3, 0, -1), range(0, 3)):
            renamed_keys.extend(
                rename_keys_for_conv(f"{src_prefix}.adapter_{src_i}", f"{dst_prefix}.fpn.layers.{dst_i}.proj")
            )
            renamed_keys.extend(
                rename_keys_for_conv(f"{src_prefix}.layer_{src_i}", f"{dst_prefix}.fpn.layers.{dst_i}.block")
            )

        # 调用pop_all方法,将所有重命名的键对应的值从源状态字典中弹出,并添加到目标状态字典中
        self.pop_all(renamed_keys, dst_state_dict, src_state_dict)
    # 定义一个方法,用于重命名 DETR 解码器的状态字典中的键
    def rename_keys_in_detr_decoder(self, dst_state_dict: StateDict, src_state_dict: StateDict):
        # 目标状态字典的键前缀
        dst_prefix: str = "transformer_module.decoder"
        # 源状态字典的键前缀
        src_prefix: str = "sem_seg_head.predictor.transformer.decoder"
        
        # not sure why we are not popping directly here!
        
        # 在下面列出需要重命名的所有键(左侧为原始名称,右侧为我们的名称)
        rename_keys = []
        
        # 循环遍历解码器配置中的每一层
        for i in range(self.config.decoder_config.decoder_layers):
            # 添加重命名规则:自注意力机制的输出投影权重
            rename_keys.append(
                (
                    f"{src_prefix}.layers.{i}.self_attn.out_proj.weight",
                    f"{dst_prefix}.layers.{i}.self_attn.out_proj.weight",
                )
            )
            # 添加重命名规则:自注意力机制的输出投影偏置
            rename_keys.append(
                (
                    f"{src_prefix}.layers.{i}.self_attn.out_proj.bias",
                    f"{dst_prefix}.layers.{i}.self_attn.out_proj.bias",
                )
            )
            # 添加重命名规则:多头注意力机制的输出投影权重
            rename_keys.append(
                (
                    f"{src_prefix}.layers.{i}.multihead_attn.out_proj.weight",
                    f"{dst_prefix}.layers.{i}.encoder_attn.out_proj.weight",
                )
            )
            # 添加重命名规则:多头注意力机制的输出投影偏置
            rename_keys.append(
                (
                    f"{src_prefix}.layers.{i}.multihead_attn.out_proj.bias",
                    f"{dst_prefix}.layers.{i}.encoder_attn.out_proj.bias",
                )
            )
            # 添加重命名规则:线性层1的权重
            rename_keys.append((f"{src_prefix}.layers.{i}.linear1.weight", f"{dst_prefix}.layers.{i}.fc1.weight"))
            # 添加重命名规则:线性层1的偏置
            rename_keys.append((f"{src_prefix}.layers.{i}.linear1.bias", f"{dst_prefix}.layers.{i}.fc1.bias"))
            # 添加重命名规则:线性层2的权重
            rename_keys.append((f"{src_prefix}.layers.{i}.linear2.weight", f"{dst_prefix}.layers.{i}.fc2.weight"))
            # 添加重命名规则:线性层2的偏置
            rename_keys.append((f"{src_prefix}.layers.{i}.linear2.bias", f"{dst_prefix}.layers.{i}.fc2.bias"))
            # 添加重命名规则:层归一化1的权重
            rename_keys.append(
                (f"{src_prefix}.layers.{i}.norm1.weight", f"{dst_prefix}.layers.{i}.self_attn_layer_norm.weight")
            )
            # 添加重命名规则:层归一化1的偏置
            rename_keys.append(
                (f"{src_prefix}.layers.{i}.norm1.bias", f"{dst_prefix}.layers.{i}.self_attn_layer_norm.bias")
            )
            # 添加重命名规则:层归一化2的权重
            rename_keys.append(
                (f"{src_prefix}.layers.{i}.norm2.weight", f"{dst_prefix}.layers.{i}.encoder_attn_layer_norm.weight")
            )
            # 添加重命名规则:层归一化2的偏置
            rename_keys.append(
                (f"{src_prefix}.layers.{i}.norm2.bias", f"{dst_prefix}.layers.{i}.encoder_attn_layer_norm.bias")
            )
            # 添加重命名规则:层归一化3的权重
            rename_keys.append(
                (f"{src_prefix}.layers.{i}.norm3.weight", f"{dst_prefix}.layers.{i}.final_layer_norm.weight")
            )
            # 添加重命名规则:层归一化3的偏置
            rename_keys.append(
                (f"{src_prefix}.layers.{i}.norm3.bias", f"{dst_prefix}.layers.{i}.final_layer_norm.bias")
            )

        # 返回包含所有重命名规则的列表
        return rename_keys
    # 定义一个方法用于替换 DETR 解码器中的权重和偏置
    def replace_q_k_v_in_detr_decoder(self, dst_state_dict: StateDict, src_state_dict: StateDict):
        # 设置目标状态字典中的键前缀
        dst_prefix: str = "transformer_module.decoder"
        # 设置源状态字典中的键前缀
        src_prefix: str = "sem_seg_head.predictor.transformer.decoder"
        # 循环遍历解码器层数量次数
        for i in range(self.config.decoder_config.decoder_layers):
            # 从源状态字典中弹出自注意力层的输入投影层的权重和偏置
            in_proj_weight = src_state_dict.pop(f"{src_prefix}.layers.{i}.self_attn.in_proj_weight")
            in_proj_bias = src_state_dict.pop(f"{src_prefix}.layers.{i}.self_attn.in_proj_bias")
            # 将自注意力层的查询、键和值(按顺序)添加到目标状态字典中
            dst_state_dict[f"{dst_prefix}.layers.{i}.self_attn.q_proj.weight"] = in_proj_weight[:256, :]
            dst_state_dict[f"{dst_prefix}.layers.{i}.self_attn.q_proj.bias"] = in_proj_bias[:256]
            dst_state_dict[f"{dst_prefix}.layers.{i}.self_attn.k_proj.weight"] = in_proj_weight[256:512, :]
            dst_state_dict[f"{dst_prefix}.layers.{i}.self_attn.k_proj.bias"] = in_proj_bias[256:512]
            dst_state_dict[f"{dst_prefix}.layers.{i}.self_attn.v_proj.weight"] = in_proj_weight[-256:, :]
            dst_state_dict[f"{dst_prefix}.layers.{i}.self_attn.v_proj.bias"] = in_proj_bias[-256:]
            # 从源状态字典中读取跨注意力层的输入投影层的权重和偏置
            in_proj_weight_cross_attn = src_state_dict.pop(f"{src_prefix}.layers.{i}.multihead_attn.in_proj_weight")
            in_proj_bias_cross_attn = src_state_dict.pop(f"{src_prefix}.layers.{i}.multihead_attn.in_proj_bias")
            # 将跨注意力层的查询、键和值(按顺序)添加到目标状态字典中
            dst_state_dict[f"{dst_prefix}.layers.{i}.encoder_attn.q_proj.weight"] = in_proj_weight_cross_attn[:256, :]
            dst_state_dict[f"{dst_prefix}.layers.{i}.encoder_attn.q_proj.bias"] = in_proj_bias_cross_attn[:256]
            dst_state_dict[f"{dst_prefix}.layers.{i}.encoder_attn.k_proj.weight"] = in_proj_weight_cross_attn[256:512, :]
            dst_state_dict[f"{dst_prefix}.layers.{i}.encoder_attn.k_proj.bias"] = in_proj_bias_cross_attn[256:512]
            dst_state_dict[f"{dst_prefix}.layers.{i}.encoder_attn.v_proj.weight"] = in_proj_weight_cross_attn[-256:, :]
            dst_state_dict[f"{dst_prefix}.layers.{i}.encoder_attn.v_proj.bias"] = in_proj_bias_cross_attn[-256:]
    # 用于替换`detr`模型的解码器部分的权重和偏置
    def replace_detr_decoder(self, dst_state_dict: StateDict, src_state_dict: StateDict):
        # 目标模型权重前缀
        dst_prefix: str = "transformer_module.decoder"
        # 源模型权重前缀
        src_prefix: str = "sem_seg_head.predictor.transformer.decoder"
        
        # 重命名两个模型权重的键名列表
        renamed_keys = self.rename_keys_in_detr_decoder(dst_state_dict, src_state_dict)
        
        # 添加更多的键名映射,例如层归一化的权重和偏置
        renamed_keys.extend(
            [
                (f"{src_prefix}.norm.weight", f"{dst_prefix}.layernorm.weight"),
                (f"{src_prefix}.norm.bias", f"{dst_prefix}.layernorm.bias"),
            ]
        )

        # 根据映射关系从源模型中移除对应的键值对
        self.pop_all(renamed_keys, dst_state_dict, src_state_dict)

        # 替换`detr`模型解码器的query、key和value权重
        self.replace_q_k_v_in_detr_decoder(dst_state_dict, src_state_dict)

    # 替换`transformer_module`中的权重和偏置
    def replace_transformer_module(self, dst_state_dict: StateDict, src_state_dict: StateDict):
        # 目标模型权重前缀
        dst_prefix: str = "transformer_module"
        # 源模型权重前缀
        src_prefix: str = "sem_seg_head.predictor"

        # 调用`replace_detr_decoder`函数,替换解码器部分的权重和偏置
        self.replace_detr_decoder(dst_state_dict, src_state_dict)

        # 重命名`transformer_module`中的特定权重和偏置
        renamed_keys = [
            (f"{src_prefix}.query_embed.weight", f"{dst_prefix}.queries_embedder.weight"),
            (f"{src_prefix}.input_proj.weight", f"{dst_prefix}.input_projection.weight"),
            (f"{src_prefix}.input_proj.bias", f"{dst_prefix}.input_projection.bias"),
        ]

        # 根据映射关系从源模型中移除对应的键值对
        self.pop_all(renamed_keys, dst_state_dict, src_state_dict)

    # 替换实例分割模块中的权重和偏置
    def replace_instance_segmentation_module(self, dst_state_dict: StateDict, src_state_dict: StateDict):
        # NOTE: in our case there is no destination prefix, so the leading "." is dropped from the destination keys
        dst_prefix: str = ""
        # 源模型权重前缀
        src_prefix: str = "sem_seg_head.predictor"

        # 定义要重命名的键名映射列表
        renamed_keys = [
            (f"{src_prefix}.class_embed.weight", f"{dst_prefix}class_predictor.weight"),
            (f"{src_prefix}.class_embed.bias", f"{dst_prefix}class_predictor.bias"),
        ]

        # 循环处理MLP层,构建映射列表
        mlp_len = 3
        for i in range(mlp_len):
            renamed_keys.extend(
                [
                    (f"{src_prefix}.mask_embed.layers.{i}.weight", f"{dst_prefix}mask_embedder.{i}.0.weight"),
                    (f"{src_prefix}.mask_embed.layers.{i}.bias", f"{dst_prefix}mask_embedder.{i}.0.bias"),
                ]
            )
        
        # 记录日志,显示替换的键名映射列表
        logger.info(f"Replacing keys {pformat(renamed_keys)}")
        
        # 根据映射关系从源模型中移除对应的键值对
        self.pop_all(renamed_keys, dst_state_dict, src_state_dict)

    # 执行模型权重的转换
    def convert(self, mask_former: MaskFormerModel) -> MaskFormerModel:
        # 创建目标模型状态字典,基于输入模型的状态字典
        dst_state_dict = TrackedStateDict(mask_former.state_dict())
        # 获取原始模型的状态字典
        src_state_dict = self.original_model.state_dict()

        # 替换像素模块中的权重和偏置
        self.replace_pixel_module(dst_state_dict, src_state_dict)
        
        # 替换`transformer_module`中的权重和偏置
        self.replace_transformer_module(dst_state_dict, src_state_dict)

        # 记录未匹配的键名差异
        logger.info(f"Missed keys are {pformat(dst_state_dict.diff())}")
        # 记录未复制的源模型键名列表
        logger.info(f"Not copied keys are {pformat(src_state_dict.keys())}")
        # 日志记录:操作完成
        logger.info("🙌 Done")

        # 使用更新后的目标状态字典加载模型权重
        mask_former.load_state_dict(dst_state_dict)

        # 返回更新后的模型
        return mask_former
    # 将给定的实例分割模型转换为另一种实例分割模型类型,并返回转换后的模型
    def convert_instance_segmentation(
        self, mask_former: MaskFormerForInstanceSegmentation
    ) -> MaskFormerForInstanceSegmentation:
        # 创建目标模型的状态字典,复制输入模型的状态字典
        dst_state_dict = TrackedStateDict(mask_former.state_dict())
        # 获取原始模型的状态字典
        src_state_dict = self.original_model.state_dict()

        # 用原始模型的状态字典替换目标模型中的实例分割模块
        self.replace_instance_segmentation_module(dst_state_dict, src_state_dict)

        # 将更新后的状态字典加载到输入的实例分割模型中
        mask_former.load_state_dict(dst_state_dict)

        # 返回更新后的实例分割模型
        return mask_former

    @staticmethod
    # 返回一个迭代器,逐个生成 (配置文件路径, 检查点文件路径) 的元组
    def using_dirs(checkpoints_dir: Path, config_dir: Path) -> Iterator[Tuple[object, Path, Path]]:
        # 获取检查点目录下所有的.pkl文件路径列表
        checkpoints: List[Path] = checkpoints_dir.glob("**/*.pkl")

        # 遍历每个检查点文件路径
        for checkpoint in checkpoints:
            # 记录信息:转换正在处理的检查点文件名(不带扩展名)
            logger.info(f"💪 Converting {checkpoint.stem}")
            # 查找与当前检查点文件关联的配置文件路径
            config: Path = config_dir / checkpoint.parents[0].stem / "swin" / f"{checkpoint.stem}.yaml"

            # 返回当前配置文件路径、检查点文件路径和配置目录路径的元组
            yield config, checkpoint
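
A short usage sketch of `using_dirs`; the directory names below are hypothetical placeholders following the `<DIR_NAME>/<DATASET_NAME>/...` layout expected by the script:

from pathlib import Path

checkpoints_dir = Path("checkpoints")  # hypothetical: contains <DATASET_NAME>/<CONFIG_NAME>.pkl files
config_dir = Path("configs")           # hypothetical: contains <DATASET_NAME>/swin/<CONFIG_NAME>.yaml files

for config_file, checkpoint_file in OriginalMaskFormerCheckpointToOursConverter.using_dirs(checkpoints_dir, config_dir):
    print(config_file, checkpoint_file)
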
def test(original_model, our_model: MaskFormerForInstanceSegmentation, image_processor: MaskFormerImageProcessor):
    # 使用torch.no_grad()上下文管理器,关闭梯度计算以加快推断速度
    with torch.no_grad():
        # 将原始模型和我们的模型设为评估模式
        original_model = original_model.eval()
        our_model = our_model.eval()

        # 准备图像数据
        im = prepare_img()

        # 图像转换的组合操作,包括调整大小、转换为Tensor、归一化
        tr = T.Compose(
            [
                T.Resize((384, 384)),  # 调整图像大小为384x384
                T.ToTensor(),  # 转换为Tensor
                T.Normalize(  # 归一化操作
                    mean=torch.tensor([123.675, 116.280, 103.530]) / 255.0,
                    std=torch.tensor([58.395, 57.120, 57.375]) / 255.0,
                ),
            ],
        )

        # 对输入图像应用转换操作,并扩展维度以匹配模型的输入要求
        x = tr(im).unsqueeze(0)

        # 使用原始模型的backbone提取特征
        original_model_backbone_features = original_model.backbone(x.clone())

        # 使用我们的模型进行推断,同时请求输出隐藏状态
        our_model_output: MaskFormerModelOutput = our_model.model(x.clone(), output_hidden_states=True)

        # 对比原始模型和我们的模型的backbone特征是否接近
        for original_model_feature, our_model_feature in zip(
            original_model_backbone_features.values(), our_model_output.encoder_hidden_states
        ):
            assert torch.allclose(
                original_model_feature, our_model_feature, atol=1e-3
            ), "The backbone features are not the same."

        # 使用原始模型的语义分割头部进行像素解码
        original_model_pixel_out = original_model.sem_seg_head.pixel_decoder.forward_features(
            original_model_backbone_features
        )

        # 对比原始模型和我们的模型的像素解码器的最后隐藏状态是否接近
        assert torch.allclose(
            original_model_pixel_out[0], our_model_output.pixel_decoder_last_hidden_state, atol=1e-4
        ), "The pixel decoder feature are not the same"

        # 测试完整模型的输出
        original_model_out = original_model([{"image": x.squeeze(0)}])

        # 获取原始模型的语义分割结果
        original_segmentation = original_model_out[0]["sem_seg"]

        # 使用我们的模型进行推断,并后处理分割结果
        our_model_out: MaskFormerForInstanceSegmentationOutput = our_model(x)

        our_segmentation = image_processor.post_process_segmentation(our_model_out, target_size=(384, 384))

        # 对比原始模型和我们的模型的语义分割结果是否接近
        assert torch.allclose(
            original_segmentation, our_segmentation, atol=1e-3
        ), "The segmentation image is not the same."

        # 记录测试通过的信息
        logger.info("✅ Test passed!")


def get_name(checkpoint_file: Path):
    # 从检查点文件名中提取模型名称
    model_name_raw: str = checkpoint_file.stem
    # model_name_raw 的格式类似于 maskformer_panoptic_swin_base_IN21k_384_bs64_554k
    parent_name: str = checkpoint_file.parents[0].stem
    backbone = "swin"
    dataset = ""
    
    # 根据父文件夹名称确定数据集类型
    if "coco" in parent_name:
        dataset = "coco"
    elif "ade" in parent_name:
        dataset = "ade"
    else:
        raise ValueError(f"{parent_name} must be wrong since we didn't find 'coco' or 'ade' in it ")

    # 支持的backbone类型列表
    backbone_types = ["tiny", "small", "base", "large"]

    # 从模型名称中匹配backbone类型
    backbone_type = list(filter(lambda x: x in model_name_raw, backbone_types))[0]

    # 组合最终的模型名称
    model_name = f"maskformer-{backbone}-{backbone_type}-{dataset}"

    return model_name
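
As an illustration (hypothetical path), a checkpoint stored under a `coco` folder with the stem mentioned in the comment above resolves to the name shown here:

from pathlib import Path

# hypothetical checkpoint path; the parent folder name encodes the dataset
print(get_name(Path("coco/maskformer_panoptic_swin_base_IN21k_384_bs64_554k.pkl")))
# -> maskformer-swin-base-coco
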


if __name__ == "__main__":
    # 命令行解析器,用于转换原始的MaskFormers模型到我们的实现
    parser = ArgumentParser(
        description="Command line to convert the original maskformers (with swin backbone) to our implementations."
    )
    parser.add_argument(
        "--checkpoints_dir",
        type=Path,
        help=(
            "A directory containing the model's checkpoints. The directory has to have the following structure:"
            " <DIR_NAME>/<DATASET_NAME>/<CONFIG_NAME>.pkl"
        ),
    )
    parser.add_argument(
        "--configs_dir",
        type=Path,
        help=(
            "A directory containing the model's configs, see detectron2 doc. The directory has to have the following"
            " structure: <DIR_NAME>/<DATASET_NAME>/<CONFIG_NAME>.yaml"
        ),
    )
    parser.add_argument(
        "--pytorch_dump_folder_path",
        required=True,
        type=Path,
        help="Path to the folder to output PyTorch models.",
    )
    parser.add_argument(
        "--maskformer_dir",
        required=True,
        type=Path,
        help=(
            "A path to MaskFormer's original implementation directory. You can download from here:"
            " https://github.com/facebookresearch/MaskFormer"
        ),
    )

    # 解析命令行参数
    args = parser.parse_args()

    # 将命令行参数转换为对应的变量
    checkpoints_dir: Path = args.checkpoints_dir
    config_dir: Path = args.configs_dir
    save_directory: Path = args.pytorch_dump_folder_path
    maskformer_dir: Path = args.maskformer_dir

    # 将 MaskFormer 的父目录添加到系统路径中
    sys.path.append(str(maskformer_dir.parent))
    
    # 导入所需的模块和类
    from MaskFormer.mask_former import add_mask_former_config
    from MaskFormer.mask_former.mask_former_model import MaskFormer as OriginalMaskFormer

    # 如果保存模型的目录不存在,则创建它及其父目录
    if not save_directory.exists():
        save_directory.mkdir(parents=True)

    # 循环遍历原始 MaskFormer 的配置文件和检查点文件
    for config_file, checkpoint_file in OriginalMaskFormerCheckpointToOursConverter.using_dirs(
        checkpoints_dir, config_dir
    ):
        # 创建原始 MaskFormer 配置到图像处理器的转换器实例,并调用它得到图像处理器
        image_processor = OriginalMaskFormerConfigToImageProcessorConverter()(setup_cfg(Args(config_file=config_file)))

        # 使用给定的配置文件设置配置对象
        original_config = setup_cfg(Args(config_file=config_file))

        # 根据原始配置构建原始 MaskFormer 模型的参数
        mask_former_kwargs = OriginalMaskFormer.from_config(original_config)

        # 创建原始 MaskFormer 模型的实例并设置为评估模式
        original_model = OriginalMaskFormer(**mask_former_kwargs).eval()

        # 加载预训练检查点文件到原始模型
        DetectionCheckpointer(original_model).load(str(checkpoint_file))

        # 将原始配置转换为我们的 MaskFormerConfig 配置对象
        config: MaskFormerConfig = OriginalMaskFormerConfigToOursConverter()(original_config)

        # 创建我们的 MaskFormerModel 实例并设置为评估模式
        mask_former = MaskFormerModel(config=config).eval()

        # 创建用于将原始 MaskFormer 检查点转换为我们实现的转换器
        converter = OriginalMaskFormerCheckpointToOursConverter(original_model, config)

        # 将原始模型的权重(原地)转换到我们的 MaskFormerModel 中
        maskformer = converter.convert(mask_former)

        # 创建用于实例分割的 MaskFormerForInstanceSegmentation 实例并设置为评估模式
        mask_former_for_instance_segmentation = MaskFormerForInstanceSegmentation(config=config).eval()

        # 将已转换的基础模型挂载到实例分割模型上
        mask_former_for_instance_segmentation.model = mask_former

        # 将实例分割模型转换为我们的形式
        mask_former_for_instance_segmentation = converter.convert_instance_segmentation(
            mask_former_for_instance_segmentation
        )

        # 运行测试函数,传入原始模型、实例分割模型和图像处理器
        test(original_model, mask_former_for_instance_segmentation, image_processor)

        # 获取检查点文件的名称
        model_name = get_name(checkpoint_file)

        # 记录保存操作信息
        logger.info(f"🪄 Saving {model_name}")

        # 保存图像处理器预训练模型到指定目录
        image_processor.save_pretrained(save_directory / model_name)

        # 保存实例分割模型到指定目录
        mask_former_for_instance_segmentation.save_pretrained(save_directory / model_name)

        # 将图像处理器推送到 Hub 上
        image_processor.push_to_hub(
            repo_path_or_name=save_directory / model_name,
            commit_message="Add model",
            use_temp_dir=True,
        )

        # 将实例分割模型推送到 Hub 上
        mask_former_for_instance_segmentation.push_to_hub(
            repo_path_or_name=save_directory / model_name,
            commit_message="Add model",
            use_temp_dir=True,
        )

.\models\maskformer\convert_maskformer_resnet_to_pytorch.py

# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert MaskFormer checkpoints with ResNet backbone from the original repository. URL:
https://github.com/facebookresearch/MaskFormer"""


import argparse
import json
import pickle
from pathlib import Path

import requests
import torch
from huggingface_hub import hf_hub_download
from PIL import Image

from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation, MaskFormerImageProcessor, ResNetConfig
from transformers.utils import logging


logging.set_verbosity_info()  # 设置日志输出级别为信息级别
logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器


def get_maskformer_config(model_name: str):
    # 根据模型名称获取相应的 MaskFormer 配置
    if "resnet101c" in model_name:
        # TODO add support for ResNet-C backbone, which uses a "deeplab" stem
        raise NotImplementedError("To do")  # 抛出未实现的错误,提示需要添加对 ResNet-C 的支持
    elif "resnet101" in model_name:
        # 使用 Microsoft 的 ResNet-101 作为骨干网络配置
        backbone_config = ResNetConfig.from_pretrained(
            "microsoft/resnet-101", out_features=["stage1", "stage2", "stage3", "stage4"]
        )
    else:
        # 默认使用 Microsoft 的 ResNet-50 作为骨干网络配置
        backbone_config = ResNetConfig.from_pretrained(
            "microsoft/resnet-50", out_features=["stage1", "stage2", "stage3", "stage4"]
        )
    config = MaskFormerConfig(backbone_config=backbone_config)

    # 根据模型名称设置相应的标签数量和文件名
    repo_id = "huggingface/label-files"
    if "ade20k-full" in model_name:
        config.num_labels = 847
        filename = "maskformer-ade20k-full-id2label.json"
    elif "ade" in model_name:
        config.num_labels = 150
        filename = "ade20k-id2label.json"
    elif "coco-stuff" in model_name:
        config.num_labels = 171
        filename = "maskformer-coco-stuff-id2label.json"
    elif "coco" in model_name:
        # TODO
        config.num_labels = 133
        filename = "coco-panoptic-id2label.json"
    elif "cityscapes" in model_name:
        config.num_labels = 19
        filename = "cityscapes-id2label.json"
    elif "vistas" in model_name:
        config.num_labels = 65
        filename = "mapillary-vistas-id2label.json"

    # 从 HF Hub 下载指定的标签文件,并加载为 id 到 label 的映射字典
    id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
    id2label = {int(k): v for k, v in id2label.items()}  # 将 id 转换为整数类型
    config.id2label = id2label
    config.label2id = {v: k for k, v in id2label.items()}  # 构建 label 到 id 的映射字典

    return config
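
The downloaded JSON maps string ids to class names, so the keys are cast to `int` before building the inverse `label2id` mapping. A toy illustration of the resulting structure (not the real ADE20K label file):

# toy subset, purely illustrative
id2label = {0: "wall", 1: "building", 2: "sky"}
label2id = {v: k for k, v in id2label.items()}
assert label2id["sky"] == 2
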


def create_rename_keys(config):
    rename_keys = []
    # 添加重命名键,映射 backbone.stem.conv1.weight 到 model.pixel_level_module.encoder.embedder.embedder.convolution.weight
    rename_keys.append(("backbone.stem.conv1.weight", "model.pixel_level_module.encoder.embedder.embedder.convolution.weight"))
    # 添加新的键值对到 rename_keys 列表中,用于将模型中的特定参数路径重命名为新路径
    rename_keys.append(("backbone.stem.conv1.norm.weight", "model.pixel_level_module.encoder.embedder.embedder.normalization.weight"))
    rename_keys.append(("backbone.stem.conv1.norm.bias", "model.pixel_level_module.encoder.embedder.embedder.normalization.bias"))
    rename_keys.append(("backbone.stem.conv1.norm.running_mean", "model.pixel_level_module.encoder.embedder.embedder.normalization.running_mean"))
    rename_keys.append(("backbone.stem.conv1.norm.running_var", "model.pixel_level_module.encoder.embedder.embedder.normalization.running_var"))

    # fmt: on

    # stages
    # FPN

    # fmt: off

    # 将 sem_seg_head.layer_4 的权重重命名为 model.pixel_level_module.decoder.fpn.stem.0 的权重
    rename_keys.append(("sem_seg_head.layer_4.weight", "model.pixel_level_module.decoder.fpn.stem.0.weight"))
    # 将 sem_seg_head.layer_4 的归一化权重重命名为 model.pixel_level_module.decoder.fpn.stem.1 的权重
    rename_keys.append(("sem_seg_head.layer_4.norm.weight", "model.pixel_level_module.decoder.fpn.stem.1.weight"))
    # 将 sem_seg_head.layer_4 的归一化偏置重命名为 model.pixel_level_module.decoder.fpn.stem.1 的偏置
    rename_keys.append(("sem_seg_head.layer_4.norm.bias", "model.pixel_level_module.decoder.fpn.stem.1.bias"))

    # 针对一系列逆序的源索引和目标索引,将 sem_seg_head.adapter_{source_index} 的权重和归一化参数
    # 重命名为 model.pixel_level_module.decoder.fpn.layers.{target_index} 下的对应投影层权重和偏置
    for source_index, target_index in zip(range(3, 0, -1), range(0, 3)):
        rename_keys.append((f"sem_seg_head.adapter_{source_index}.weight", f"model.pixel_level_module.decoder.fpn.layers.{target_index}.proj.0.weight"))
        rename_keys.append((f"sem_seg_head.adapter_{source_index}.norm.weight", f"model.pixel_level_module.decoder.fpn.layers.{target_index}.proj.1.weight"))
        rename_keys.append((f"sem_seg_head.adapter_{source_index}.norm.bias", f"model.pixel_level_module.decoder.fpn.layers.{target_index}.proj.1.bias"))
        rename_keys.append((f"sem_seg_head.layer_{source_index}.weight", f"model.pixel_level_module.decoder.fpn.layers.{target_index}.block.0.weight"))
        rename_keys.append((f"sem_seg_head.layer_{source_index}.norm.weight", f"model.pixel_level_module.decoder.fpn.layers.{target_index}.block.1.weight"))
        rename_keys.append((f"sem_seg_head.layer_{source_index}.norm.bias", f"model.pixel_level_module.decoder.fpn.layers.{target_index}.block.1.bias"))

    # 将 sem_seg_head.mask_features 的权重重命名为 model.pixel_level_module.decoder.mask_projection 的权重
    rename_keys.append(("sem_seg_head.mask_features.weight", "model.pixel_level_module.decoder.mask_projection.weight"))
    # 将 sem_seg_head.mask_features 的偏置重命名为 model.pixel_level_module.decoder.mask_projection 的偏置
    rename_keys.append(("sem_seg_head.mask_features.bias", "model.pixel_level_module.decoder.mask_projection.bias"))

    # fmt: on

    # Transformer decoder
    # fmt: off
    for idx in range(config.decoder_config.decoder_layers):
        # self-attention out projection
        rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.self_attn.out_proj.weight", f"model.transformer_module.decoder.layers.{idx}.self_attn.out_proj.weight"))
        rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.self_attn.out_proj.bias", f"model.transformer_module.decoder.layers.{idx}.self_attn.out_proj.bias"))
        # cross-attention out projection
        rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.multihead_attn.out_proj.weight", f"model.transformer_module.decoder.layers.{idx}.encoder_attn.out_proj.weight"))
        rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.multihead_attn.out_proj.bias", f"model.transformer_module.decoder.layers.{idx}.encoder_attn.out_proj.bias"))
        # MLP 1
        rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.linear1.weight", f"model.transformer_module.decoder.layers.{idx}.fc1.weight"))
        rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.linear1.bias", f"model.transformer_module.decoder.layers.{idx}.fc1.bias"))
        # MLP 2
        rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.linear2.weight", f"model.transformer_module.decoder.layers.{idx}.fc2.weight"))
        rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.linear2.bias", f"model.transformer_module.decoder.layers.{idx}.fc2.bias"))
        # layernorm 1 (self-attention layernorm)
        rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.norm1.weight", f"model.transformer_module.decoder.layers.{idx}.self_attn_layer_norm.weight"))
        rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.norm1.bias", f"model.transformer_module.decoder.layers.{idx}.self_attn_layer_norm.bias"))
        # layernorm 2 (cross-attention layernorm)
        rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.norm2.weight", f"model.transformer_module.decoder.layers.{idx}.encoder_attn_layer_norm.weight"))
        rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.norm2.bias", f"model.transformer_module.decoder.layers.{idx}.encoder_attn_layer_norm.bias"))
        # layernorm 3 (final layernorm)
        rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.norm3.weight", f"model.transformer_module.decoder.layers.{idx}.final_layer_norm.weight"))
        rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.norm3.bias", f"model.transformer_module.decoder.layers.{idx}.final_layer_norm.bias"))

    # Add renaming for the final layer norm weight
    rename_keys.append(("sem_seg_head.predictor.transformer.decoder.norm.weight", "model.transformer_module.decoder.layernorm.weight"))
    # 将旧的键值对添加到重命名列表中,用新的键值对替换
    rename_keys.append(("sem_seg_head.predictor.transformer.decoder.norm.bias", "model.transformer_module.decoder.layernorm.bias"))
    # fmt: on

    # heads on top (query embedder, input projection, class predictor, mask embedder)
# 从字典 dct 中弹出键 old 对应的值,并赋值给变量 val
def rename_key(dct, old, new):
    val = dct.pop(old)
    # 将键 new 添加到字典 dct,并将其值设为 val
    dct[new] = val
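
A tiny illustration of `rename_key` on a toy dict:

d = {"old.key": 1}
rename_key(d, "old.key", "new.key")
assert d == {"new.key": 1}
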


# 将每个编码器层的矩阵拆分为查询(queries)、键(keys)和值(values)
def read_in_decoder_q_k_v(state_dict, config):
    # fmt: off
    # 从配置中获取解码器隐藏层的大小
    hidden_size = config.decoder_config.hidden_size
    # 遍历所有解码器层
    for idx in range(config.decoder_config.decoder_layers):
        # 读取自注意力输入投影层的权重和偏置
        in_proj_weight = state_dict.pop(f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.self_attn.in_proj_weight")
        in_proj_bias = state_dict.pop(f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.self_attn.in_proj_bias")
        # 将查询(q_proj)、键(k_proj)和值(v_proj)添加到状态字典中
        state_dict[f"model.transformer_module.decoder.layers.{idx}.self_attn.q_proj.weight"] = in_proj_weight[: hidden_size, :]
        state_dict[f"model.transformer_module.decoder.layers.{idx}.self_attn.q_proj.bias"] = in_proj_bias[:config.hidden_size]
        state_dict[f"model.transformer_module.decoder.layers.{idx}.self_attn.k_proj.weight"] = in_proj_weight[hidden_size : hidden_size * 2, :]
        state_dict[f"model.transformer_module.decoder.layers.{idx}.self_attn.k_proj.bias"] = in_proj_bias[hidden_size : hidden_size * 2]
        state_dict[f"model.transformer_module.decoder.layers.{idx}.self_attn.v_proj.weight"] = in_proj_weight[-hidden_size :, :]
        state_dict[f"model.transformer_module.decoder.layers.{idx}.self_attn.v_proj.bias"] = in_proj_bias[-hidden_size :]
        
        # 读取交叉注意力输入投影层的权重和偏置
        in_proj_weight = state_dict.pop(f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.multihead_attn.in_proj_weight")
        in_proj_bias = state_dict.pop(f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.multihead_attn.in_proj_bias")
        # 将查询(q_proj)、键(k_proj)和值(v_proj)添加到状态字典中
        state_dict[f"model.transformer_module.decoder.layers.{idx}.encoder_attn.q_proj.weight"] = in_proj_weight[: hidden_size, :]
        state_dict[f"model.transformer_module.decoder.layers.{idx}.encoder_attn.q_proj.bias"] = in_proj_bias[:config.hidden_size]
        state_dict[f"model.transformer_module.decoder.layers.{idx}.encoder_attn.k_proj.weight"] = in_proj_weight[hidden_size : hidden_size * 2, :]
        state_dict[f"model.transformer_module.decoder.layers.{idx}.encoder_attn.k_proj.bias"] = in_proj_bias[hidden_size : hidden_size * 2]
        state_dict[f"model.transformer_module.decoder.layers.{idx}.encoder_attn.v_proj.weight"] = in_proj_weight[-hidden_size :, :]
        state_dict[f"model.transformer_module.decoder.layers.{idx}.encoder_attn.v_proj.bias"] = in_proj_bias[-hidden_size :]
    # fmt: on


# 我们将在一张可爱猫咪的图片上验证我们的结果
def prepare_img() -> torch.Tensor:
    # 定义一个 URL 变量,指向一个图像文件的地址
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    # 使用 requests 库发送 GET 请求,获取图像文件的内容流
    # 并使用 Image.open 方法打开流,返回一个图像对象
    im = Image.open(requests.get(url, stream=True).raw)
    # 返回获取的图像对象
    return im
@torch.no_grad()
# 使用装饰器 torch.no_grad() 包装函数,确保在该函数内部的所有操作都不会进行梯度计算

def convert_maskformer_checkpoint(
    model_name: str, checkpoint_path: str, pytorch_dump_folder_path: str, push_to_hub: bool = False
):
    """
    Copy/paste/tweak model's weights to our MaskFormer structure.
    """
    # 根据模型名称获取对应的 MaskFormer 配置信息
    config = get_maskformer_config(model_name)

    # 从文件中加载原始的状态字典数据
    with open(checkpoint_path, "rb") as f:
        data = pickle.load(f)
    state_dict = data["model"]

    # 根据预定义的映射关系重命名状态字典中的键
    rename_keys = create_rename_keys(config)
    for src, dest in rename_keys:
        rename_key(state_dict, src, dest)
    
    # 读取 Decoder 部分的 q, k, v 参数信息并更新到状态字典中
    read_in_decoder_q_k_v(state_dict, config)

    # 将状态字典中的 numpy 数组转换为 torch 张量
    for key, value in state_dict.items():
        state_dict[key] = torch.from_numpy(value)

    # 加载 MaskFormer 模型,并设为评估模式
    model = MaskFormerForInstanceSegmentation(config)
    model.eval()

    # 加载状态字典到模型中
    model.load_state_dict(state_dict)

    # 验证模型预期输出
    image = prepare_img()
    if "vistas" in model_name:
        ignore_index = 65
    elif "cityscapes" in model_name:
        ignore_index = 65535
    else:
        ignore_index = 255
    reduce_labels = True if "ade" in model_name else False
    
    # 创建图像处理器实例,用于处理模型的输出
    image_processor = MaskFormerImageProcessor(ignore_index=ignore_index, reduce_labels=reduce_labels)

    # 准备输入数据
    inputs = image_processor(image, return_tensors="pt")

    # 调用模型进行推理
    outputs = model(**inputs)

    # 根据模型名称设置预期的 logits 值
    if model_name == "maskformer-resnet50-ade":
        expected_logits = torch.tensor(
            [[6.7710, -0.1452, -3.5687], [1.9165, -1.0010, -1.8614], [3.6209, -0.2950, -1.3813]]
        )
    elif model_name == "maskformer-resnet101-ade":
        expected_logits = torch.tensor(
            [[4.0381, -1.1483, -1.9688], [2.7083, -1.9147, -2.2555], [3.4367, -1.3711, -2.1609]]
        )
    elif model_name == "maskformer-resnet50-coco-stuff":
        expected_logits = torch.tensor(
            [[3.2309, -3.0481, -2.8695], [5.4986, -5.4242, -2.4211], [6.2100, -5.2279, -2.7786]]
        )
    elif model_name == "maskformer-resnet101-coco-stuff":
        expected_logits = torch.tensor(
            [[4.7188, -3.2585, -2.8857], [6.6871, -2.9181, -1.2487], [7.2449, -2.2764, -2.1874]]
        )
    elif model_name == "maskformer-resnet101-cityscapes":
        expected_logits = torch.tensor(
            [[-1.8861, -1.5465, 0.6749], [-2.3677, -1.6707, -0.0867], [-2.2314, -1.9530, -0.9132]]
        )
    elif model_name == "maskformer-resnet50-vistas":
        expected_logits = torch.tensor(
            [[-6.3917, -1.5216, -1.1392], [-5.5335, -4.5318, -1.8339], [-4.3576, -4.0301, 0.2162]]
        )
    elif model_name == "maskformer-resnet50-ade20k-full":
        expected_logits = torch.tensor(
            [[3.6146, -1.9367, -3.2534], [4.0099, 0.2027, -2.7576], [3.3913, -2.3644, -3.9519]]
        )
    elif model_name == "maskformer-resnet101-ade20k-full":
        expected_logits = torch.tensor(
            [[3.2211, -1.6550, -2.7605], [2.8559, -2.4512, -2.9574], [2.6331, -2.6775, -2.1844]]
        )
    # 断言:检查模型输出的前三个类别查询的对数概率是否与预期值在给定的误差范围内相等
    assert torch.allclose(outputs.class_queries_logits[0, :3, :3], expected_logits, atol=1e-4)
    # 打印消息,表示检查通过
    print("Looks ok!")

    # 如果提供了 PyTorch 模型保存路径
    if pytorch_dump_folder_path is not None:
        # 打印消息,指示正在保存模型和图像处理器到指定路径
        print(f"Saving model and image processor of {model_name} to {pytorch_dump_folder_path}")
        # 确保保存路径存在,如果不存在则创建
        Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
        # 将模型保存到指定路径
        model.save_pretrained(pytorch_dump_folder_path)
        # 将图像处理器保存到指定路径
        image_processor.save_pretrained(pytorch_dump_folder_path)

    # 如果需要推送到模型中心(hub)
    if push_to_hub:
        # 打印消息,表示正在推送模型和图像处理器到中心(hub)
        print(f"Pushing model and image processor of {model_name} to the hub...")
        # 将模型推送到模型中心(hub)
        model.push_to_hub(f"facebook/{model_name}")
        # 将图像处理器推送到模型中心(hub)
        image_processor.push_to_hub(f"facebook/{model_name}")
if __name__ == "__main__":
    # 如果脚本作为主程序运行,则执行以下代码

    parser = argparse.ArgumentParser()
    # 创建参数解析器对象

    # Required parameters
    parser.add_argument(
        "--model_name",
        default="maskformer-resnet50-ade",
        type=str,
        required=True,
        choices=[
            "maskformer-resnet50-ade",
            "maskformer-resnet101-ade",
            "maskformer-resnet50-coco-stuff",
            "maskformer-resnet101-coco-stuff",
            "maskformer-resnet101-cityscapes",
            "maskformer-resnet50-vistas",
            "maskformer-resnet50-ade20k-full",
            "maskformer-resnet101-ade20k-full",
        ],
        help=("Name of the MaskFormer model you'd like to convert",),
    )
    # 添加必需的参数:模型名称,指定默认值和可选的模型名称列表

    parser.add_argument(
        "--checkpoint_path",
        type=str,
        required=True,
        help=("Path to the original pickle file (.pkl) of the original checkpoint.",),
    )
    # 添加参数:原始检查点文件的路径,必须提供路径值

    parser.add_argument(
        "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
    )
    # 添加参数:输出 PyTorch 模型的目录路径,默认为 None

    parser.add_argument(
        "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
    )
    # 添加参数:是否将转换后的模型推送到 🤗 hub

    args = parser.parse_args()
    # 解析命令行参数并返回一个命名空间对象 args

    convert_maskformer_checkpoint(
        args.model_name, args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub
    )
    # 调用函数 convert_maskformer_checkpoint,传递命令行参数中的模型名称、检查点路径、PyTorch 模型输出路径和推送标志作为参数

.\models\maskformer\convert_maskformer_swin_to_pytorch.py

# 设置编码格式为 UTF-8
# 版权声明和许可证信息,指定代码使用 Apache License, Version 2.0
# 导入所需模块和库
# 这个脚本用于从原始仓库转换 MaskFormer 模型检查点,详细信息参见 https://github.com/facebookresearch/MaskFormer

import argparse  # 导入命令行参数解析模块
import json  # 导入处理 JSON 格式数据的模块
import pickle  # 导入序列化和反序列化 Python 对象的模块
from pathlib import Path  # 导入处理路径操作的模块

import requests  # 导入发送 HTTP 请求的库
import torch  # 导入 PyTorch 深度学习库
from huggingface_hub import hf_hub_download  # 导入从 Hugging Face Hub 下载资源的函数
from PIL import Image  # 导入 Python Imaging Library,用于图像处理

from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation, MaskFormerImageProcessor, SwinConfig  # 导入 MaskFormer 相关类
from transformers.utils import logging  # 导入日志记录工具

logging.set_verbosity_info()  # 设置日志记录器的详细程度为信息级别
logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器


def get_maskformer_config(model_name: str):
    # 根据预训练的 Swin 模型配置 MaskFormerConfig
    backbone_config = SwinConfig.from_pretrained(
        "microsoft/swin-tiny-patch4-window7-224", out_features=["stage1", "stage2", "stage3", "stage4"]
    )
    config = MaskFormerConfig(backbone_config=backbone_config)

    repo_id = "huggingface/label-files"
    if "ade20k-full" in model_name:
        # 设置适用于 ade20k-full 模型的类别数和标签映射文件名
        config.num_labels = 847
        filename = "maskformer-ade20k-full-id2label.json"
    elif "ade" in model_name:
        # 设置适用于 ade 模型的类别数和标签映射文件名
        config.num_labels = 150
        filename = "ade20k-id2label.json"
    elif "coco-stuff" in model_name:
        # 设置适用于 coco-stuff 模型的类别数和标签映射文件名
        config.num_labels = 171
        filename = "maskformer-coco-stuff-id2label.json"
    elif "coco" in model_name:
        # TODO
        config.num_labels = 133
        filename = "coco-panoptic-id2label.json"
    elif "cityscapes" in model_name:
        # 设置适用于 cityscapes 模型的类别数和标签映射文件名
        config.num_labels = 19
        filename = "cityscapes-id2label.json"
    elif "vistas" in model_name:
        # 设置适用于 vistas 模型的类别数和标签映射文件名
        config.num_labels = 65
        filename = "mapillary-vistas-id2label.json"

    # 从 Hugging Face Hub 下载指定文件并加载为字典格式
    id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
    id2label = {int(k): v for k, v in id2label.items()}

    return config


def create_rename_keys(config):
    rename_keys = []
    # 定义需要重命名的键列表
    # stem
    # fmt: off
    rename_keys.append(("backbone.patch_embed.proj.weight", "model.pixel_level_module.encoder.model.embeddings.patch_embeddings.projection.weight"))
    rename_keys.append(("backbone.patch_embed.proj.bias", "model.pixel_level_module.encoder.model.embeddings.patch_embeddings.projection.bias"))
    rename_keys.append(("backbone.patch_embed.norm.weight", "model.pixel_level_module.encoder.model.embeddings.norm.weight"))
    # fmt: on
    # 将键值对("backbone.patch_embed.norm.bias", "model.pixel_level_module.encoder.model.embeddings.norm.bias")添加到rename_keys列表中
    rename_keys.append(("backbone.patch_embed.norm.bias", "model.pixel_level_module.encoder.model.embeddings.norm.bias"))

    # 将以下键值对依次添加到rename_keys列表中,用于重命名模型结构中的参数
    rename_keys.append(("sem_seg_head.layer_4.weight", "model.pixel_level_module.decoder.fpn.stem.0.weight"))
    rename_keys.append(("sem_seg_head.layer_4.norm.weight", "model.pixel_level_module.decoder.fpn.stem.1.weight"))
    rename_keys.append(("sem_seg_head.layer_4.norm.bias", "model.pixel_level_module.decoder.fpn.stem.1.bias"))

    # 使用循环将逐个source_index到target_index的适配器和层参数重命名添加到rename_keys列表中
    for source_index, target_index in zip(range(3, 0, -1), range(0, 3)):
        rename_keys.append((f"sem_seg_head.adapter_{source_index}.weight", f"model.pixel_level_module.decoder.fpn.layers.{target_index}.proj.0.weight"))
        rename_keys.append((f"sem_seg_head.adapter_{source_index}.norm.weight", f"model.pixel_level_module.decoder.fpn.layers.{target_index}.proj.1.weight"))
        rename_keys.append((f"sem_seg_head.adapter_{source_index}.norm.bias", f"model.pixel_level_module.decoder.fpn.layers.{target_index}.proj.1.bias"))
        rename_keys.append((f"sem_seg_head.layer_{source_index}.weight", f"model.pixel_level_module.decoder.fpn.layers.{target_index}.block.0.weight"))
        rename_keys.append((f"sem_seg_head.layer_{source_index}.norm.weight", f"model.pixel_level_module.decoder.fpn.layers.{target_index}.block.1.weight"))
        rename_keys.append((f"sem_seg_head.layer_{source_index}.norm.bias", f"model.pixel_level_module.decoder.fpn.layers.{target_index}.block.1.bias"))

    # 将键值对("sem_seg_head.mask_features.weight", "model.pixel_level_module.decoder.mask_projection.weight")添加到rename_keys列表中
    rename_keys.append(("sem_seg_head.mask_features.weight", "model.pixel_level_module.decoder.mask_projection.weight"))
    # 将键值对("sem_seg_head.mask_features.bias", "model.pixel_level_module.decoder.mask_projection.bias")添加到rename_keys列表中
    rename_keys.append(("sem_seg_head.mask_features.bias", "model.pixel_level_module.decoder.mask_projection.bias"))
    
    # Transformer decoder
    # 遍历从配置中获取的解码器层数
    for idx in range(config.decoder_config.decoder_layers):
        # 处理自注意力机制的输出投影层权重和偏置
        rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.self_attn.out_proj.weight", f"model.transformer_module.decoder.layers.{idx}.self_attn.out_proj.weight"))
        rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.self_attn.out_proj.bias", f"model.transformer_module.decoder.layers.{idx}.self_attn.out_proj.bias"))
        
        # 处理跨注意力机制的输出投影层权重和偏置
        rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.multihead_attn.out_proj.weight", f"model.transformer_module.decoder.layers.{idx}.encoder_attn.out_proj.weight"))
        rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.multihead_attn.out_proj.bias", f"model.transformer_module.decoder.layers.{idx}.encoder_attn.out_proj.bias"))
        
        # 处理MLP第一层的权重和偏置
        rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.linear1.weight", f"model.transformer_module.decoder.layers.{idx}.fc1.weight"))
        rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.linear1.bias", f"model.transformer_module.decoder.layers.{idx}.fc1.bias"))
        
        # 处理MLP第二层的权重和偏置
        rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.linear2.weight", f"model.transformer_module.decoder.layers.{idx}.fc2.weight"))
        rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.linear2.bias", f"model.transformer_module.decoder.layers.{idx}.fc2.bias"))
        
        # 处理自注意力机制的LayerNorm层的权重和偏置
        rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.norm1.weight", f"model.transformer_module.decoder.layers.{idx}.self_attn_layer_norm.weight"))
        rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.norm1.bias", f"model.transformer_module.decoder.layers.{idx}.self_attn_layer_norm.bias"))
        
        # 处理跨注意力机制的LayerNorm层的权重和偏置
        rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.norm2.weight", f"model.transformer_module.decoder.layers.{idx}.encoder_attn_layer_norm.weight"))
        rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.norm2.bias", f"model.transformer_module.decoder.layers.{idx}.encoder_attn_layer_norm.bias"))
        
        # 处理最终LayerNorm层的权重和偏置
        rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.norm3.weight", f"model.transformer_module.decoder.layers.{idx}.final_layer_norm.weight"))
        rename_keys.append((f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.norm3.bias", f"model.transformer_module.decoder.layers.{idx}.final_layer_norm.bias"))

    # 将最后一个未处理的LayerNorm层的权重和偏置添加到重命名列表中
    rename_keys.append(("sem_seg_head.predictor.transformer.decoder.norm.weight", "model.transformer_module.decoder.layernorm.weight"))
    # 将旧的模型参数名称与新模型参数名称配对并添加到重命名键列表中
    rename_keys.append(("sem_seg_head.predictor.transformer.decoder.norm.bias", "model.transformer_module.decoder.layernorm.bias"))

    # 将旧的模型参数名称与新模型参数名称配对并添加到重命名键列表中,用于顶部的头部模块
    rename_keys.append(("sem_seg_head.predictor.query_embed.weight", "model.transformer_module.queries_embedder.weight"))

    # 将旧的模型参数名称与新模型参数名称配对并添加到重命名键列表中,用于输入投影权重
    rename_keys.append(("sem_seg_head.predictor.input_proj.weight", "model.transformer_module.input_projection.weight"))
    # 将旧的模型参数名称与新模型参数名称配对并添加到重命名键列表中,用于输入投影偏置
    rename_keys.append(("sem_seg_head.predictor.input_proj.bias", "model.transformer_module.input_projection.bias"))

    # 将旧的模型参数名称与新模型参数名称配对并添加到重命名键列表中,用于类别预测权重
    rename_keys.append(("sem_seg_head.predictor.class_embed.weight", "class_predictor.weight"))
    # 将旧的模型参数名称与新模型参数名称配对并添加到重命名键列表中,用于类别预测偏置
    rename_keys.append(("sem_seg_head.predictor.class_embed.bias", "class_predictor.bias"))

    # 循环处理每个掩码嵌入层,将旧的模型参数名称与新模型参数名称配对并添加到重命名键列表中
    for i in range(3):
        rename_keys.append((f"sem_seg_head.predictor.mask_embed.layers.{i}.weight", f"mask_embedder.{i}.0.weight"))
        rename_keys.append((f"sem_seg_head.predictor.mask_embed.layers.{i}.bias", f"mask_embedder.{i}.0.bias"))
    # fmt: on

    # 返回最终的重命名键列表
    return rename_keys
# 重新命名字典 `dct` 中键 `old` 为 `new`
def rename_key(dct, old, new):
    val = dct.pop(old)  # 弹出键为 `old` 的值,并保存到变量 `val`
    dct[new] = val  # 将值 `val` 与新键 `new` 关联并添加到字典中

# we split up the matrix of each encoder layer into queries, keys and values
# 将每个编码器层的矩阵拆分为查询、键和值
def read_in_swin_q_k_v(state_dict, backbone_config):
    num_features = [int(backbone_config.embed_dim * 2**i) for i in range(len(backbone_config.depths))]
    for i in range(len(backbone_config.depths)):
        dim = num_features[i]
        for j in range(backbone_config.depths[i]):
            # fmt: off
            # 读取输入投影层 (in_proj) 的权重和偏置 (在原始实现中,这是一个单独的矩阵加偏置)
            in_proj_weight = state_dict.pop(f"backbone.layers.{i}.blocks.{j}.attn.qkv.weight")
            in_proj_bias = state_dict.pop(f"backbone.layers.{i}.blocks.{j}.attn.qkv.bias")
            # 接下来,按顺序添加查询、键和值到状态字典
            state_dict[f"model.pixel_level_module.encoder.model.encoder.layers.{i}.blocks.{j}.attention.self.query.weight"] = in_proj_weight[:dim, :]
            state_dict[f"model.pixel_level_module.encoder.model.encoder.layers.{i}.blocks.{j}.attention.self.query.bias"] = in_proj_bias[: dim]
            state_dict[f"model.pixel_level_module.encoder.model.encoder.layers.{i}.blocks.{j}.attention.self.key.weight"] = in_proj_weight[
                dim : dim * 2, :
            ]
            state_dict[f"model.pixel_level_module.encoder.model.encoder.layers.{i}.blocks.{j}.attention.self.key.bias"] = in_proj_bias[
                dim : dim * 2
            ]
            state_dict[f"model.pixel_level_module.encoder.model.encoder.layers.{i}.blocks.{j}.attention.self.value.weight"] = in_proj_weight[
                -dim :, :
            ]
            state_dict[f"model.pixel_level_module.encoder.model.encoder.layers.{i}.blocks.{j}.attention.self.value.bias"] = in_proj_bias[-dim :]
            # fmt: on
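
The `num_features` list above doubles the embedding dimension at every stage. Assuming the Swin-tiny defaults used by this script (`embed_dim=96`, `depths=[2, 2, 6, 2]`), the per-stage dimensions work out as follows:

# Swin-tiny values as configured in get_maskformer_config above (assumed defaults)
embed_dim = 96
depths = [2, 2, 6, 2]
num_features = [int(embed_dim * 2**i) for i in range(len(depths))]
print(num_features)  # [96, 192, 384, 768]
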

# we split up the matrix of each encoder layer into queries, keys and values
# 将每个解码器层的矩阵拆分为查询、键和值
def read_in_decoder_q_k_v(state_dict, config):
    # fmt: off
    hidden_size = config.decoder_config.hidden_size
    # 遍历解码器层次的数量
    for idx in range(config.decoder_config.decoder_layers):
        # 读取自注意力输入投影层的权重和偏置(在原始实现中,这是单独的矩阵和偏置)
        in_proj_weight = state_dict.pop(f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.self_attn.in_proj_weight")
        in_proj_bias = state_dict.pop(f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.self_attn.in_proj_bias")
        
        # 将查询(query)、键(keys)和值(values)依次添加到状态字典中
        state_dict[f"model.transformer_module.decoder.layers.{idx}.self_attn.q_proj.weight"] = in_proj_weight[: hidden_size, :]
        state_dict[f"model.transformer_module.decoder.layers.{idx}.self_attn.q_proj.bias"] = in_proj_bias[:config.hidden_size]
        state_dict[f"model.transformer_module.decoder.layers.{idx}.self_attn.k_proj.weight"] = in_proj_weight[hidden_size : hidden_size * 2, :]
        state_dict[f"model.transformer_module.decoder.layers.{idx}.self_attn.k_proj.bias"] = in_proj_bias[hidden_size : hidden_size * 2]
        state_dict[f"model.transformer_module.decoder.layers.{idx}.self_attn.v_proj.weight"] = in_proj_weight[-hidden_size :, :]
        state_dict[f"model.transformer_module.decoder.layers.{idx}.self_attn.v_proj.bias"] = in_proj_bias[-hidden_size :]
        
        # 读取交叉注意力输入投影层的权重和偏置(在原始实现中,这是单独的矩阵和偏置)
        in_proj_weight = state_dict.pop(f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.multihead_attn.in_proj_weight")
        in_proj_bias = state_dict.pop(f"sem_seg_head.predictor.transformer.decoder.layers.{idx}.multihead_attn.in_proj_bias")
        
        # 将查询(query)、键(keys)和值(values)依次添加到状态字典中
        state_dict[f"model.transformer_module.decoder.layers.{idx}.encoder_attn.q_proj.weight"] = in_proj_weight[: hidden_size, :]
        state_dict[f"model.transformer_module.decoder.layers.{idx}.encoder_attn.q_proj.bias"] = in_proj_bias[:config.hidden_size]
        state_dict[f"model.transformer_module.decoder.layers.{idx}.encoder_attn.k_proj.weight"] = in_proj_weight[hidden_size : hidden_size * 2, :]
        state_dict[f"model.transformer_module.decoder.layers.{idx}.encoder_attn.k_proj.bias"] = in_proj_bias[hidden_size : hidden_size * 2]
        state_dict[f"model.transformer_module.decoder.layers.{idx}.encoder_attn.v_proj.weight"] = in_proj_weight[-hidden_size :, :]
        state_dict[f"model.transformer_module.decoder.layers.{idx}.encoder_attn.v_proj.bias"] = in_proj_bias[-hidden_size :]
    
    # 格式化结束
    # fmt: on
# We will verify our results on an image of cute cats
def prepare_img() -> torch.Tensor:
    # 定义图像的 URL
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    # 通过 HTTP 请求获取图像的原始数据流,并用 PIL 库打开图像
    im = Image.open(requests.get(url, stream=True).raw)
    return im

@torch.no_grad()
def convert_maskformer_checkpoint(
    model_name: str, checkpoint_path: str, pytorch_dump_folder_path: str, push_to_hub: bool = False
):
    """
    Copy/paste/tweak model's weights to our MaskFormer structure.
    """
    # 根据模型名获取 MaskFormer 的配置信息
    config = get_maskformer_config(model_name)

    # 加载原始的状态字典
    with open(checkpoint_path, "rb") as f:
        data = pickle.load(f)
    state_dict = data["model"]

    # 打印状态字典中每个键和对应的形状(注释掉的部分)
    # for name, param in state_dict.items():
    #     print(name, param.shape)

    # 根据配置信息创建重命名键列表
    rename_keys = create_rename_keys(config)
    # 对状态字典中的键进行重命名操作
    for src, dest in rename_keys:
        rename_key(state_dict, src, dest)
    # 从状态字典中读取 Swin Transformer 的 QKV 参数
    read_in_swin_q_k_v(state_dict, config.backbone_config)
    # 从状态字典中读取解码器的 QKV 参数
    read_in_decoder_q_k_v(state_dict, config)

    # 将所有值转换为 Torch 张量
    for key, value in state_dict.items():
        state_dict[key] = torch.from_numpy(value)

    # 加载 MaskFormer 模型
    model = MaskFormerForInstanceSegmentation(config)
    model.eval()

    # 打印模型中每个参数的名称和形状
    for name, param in model.named_parameters():
        print(name, param.shape)

    # 加载状态字典到模型中,并检查缺失和多余的键
    missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
    assert missing_keys == [
        "model.pixel_level_module.encoder.model.layernorm.weight",
        "model.pixel_level_module.encoder.model.layernorm.bias",
    ]
    assert len(unexpected_keys) == 0, f"Unexpected keys: {unexpected_keys}"

    # 验证模型在给定图像上的输出结果
    image = prepare_img()
    # 根据模型名设置忽略的索引值
    if "vistas" in model_name:
        ignore_index = 65
    elif "cityscapes" in model_name:
        ignore_index = 65535
    else:
        ignore_index = 255
    # 根据模型名设置是否减少标签数
    reduce_labels = True if "ade" in model_name else False
    # 创建 MaskFormerImageProcessor 实例来处理图像
    image_processor = MaskFormerImageProcessor(ignore_index=ignore_index, reduce_labels=reduce_labels)

    # 对输入图像进行预处理,返回模型所需的输入张量
    inputs = image_processor(image, return_tensors="pt")

    # 在模型上执行前向传播,获取输出
    outputs = model(**inputs)

    # 打印输出张量的一部分内容(Logits)
    print("Logits:", outputs.class_queries_logits[0, :3, :3])

    # 根据模型名设置期望的 Logits 值,用于断言验证
    if model_name == "maskformer-swin-tiny-ade":
        expected_logits = torch.tensor(
            [[3.6353, -4.4770, -2.6065], [0.5081, -4.2394, -3.5343], [2.1909, -5.0353, -1.9323]]
        )
    assert torch.allclose(outputs.class_queries_logits[0, :3, :3], expected_logits, atol=1e-4)
    print("Looks ok!")

    # 如果指定了 pytorch_dump_folder_path,则保存模型和图像处理器
    if pytorch_dump_folder_path is not None:
        print(f"Saving model and image processor to {pytorch_dump_folder_path}")
        Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
        model.save_pretrained(pytorch_dump_folder_path)
        image_processor.save_pretrained(pytorch_dump_folder_path)

    # 如果 push_to_hub 为 True,则将模型和图像处理器推送到模型中心
    if push_to_hub:
        print("Pushing model and image processor to the hub...")
        model.push_to_hub(f"nielsr/{model_name}")
        image_processor.push_to_hub(f"nielsr/{model_name}")


if __name__ == "__main__":
    # 主程序入口点:解析命令行参数并执行转换
    # 创建参数解析器对象
    parser = argparse.ArgumentParser()
    
    # 添加命令行参数:模型名称
    parser.add_argument(
        "--model_name",
        default="maskformer-swin-tiny-ade",
        type=str,
        help=("Name of the MaskFormer model you'd like to convert",),
    )
    
    # 添加命令行参数:检查点路径
    parser.add_argument(
        "--checkpoint_path",
        default="/Users/nielsrogge/Documents/MaskFormer_checkpoints/MaskFormer-Swin-tiny-ADE20k/model.pkl",
        type=str,
        help="Path to the original state dict (.pth file).",
    )
    
    # 添加命令行参数:PyTorch 模型输出目录路径
    parser.add_argument(
        "--pytorch_dump_folder_path", 
        default=None, 
        type=str, 
        help="Path to the output PyTorch model directory."
    )
    
    # 添加命令行参数:是否推送模型到 🤗 hub
    parser.add_argument(
        "--push_to_hub", 
        action="store_true", 
        help="Whether or not to push the converted model to the 🤗 hub."
    )
    
    # 解析命令行参数,将结果存储在 args 变量中
    args = parser.parse_args()
    
    # 调用函数来转换 MaskFormer 模型的检查点
    convert_maskformer_checkpoint(
        args.model_name, args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub
    )

.\models\maskformer\feature_extraction_maskformer.py

# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Feature extractor class for MaskFormer.
"""

import warnings  # warnings module, used for the deprecation notice below

from ...utils import logging  # logging utilities
from .image_processing_maskformer import MaskFormerImageProcessor  # the class this deprecation shim forwards to


logger = logging.get_logger(__name__)  # module-level logger


class MaskFormerFeatureExtractor(MaskFormerImageProcessor):
    def __init__(self, *args, **kwargs) -> None:
        # Warn that MaskFormerFeatureExtractor is deprecated and will be removed in a future release
        warnings.warn(
            "The class MaskFormerFeatureExtractor is deprecated and will be removed in version 5 of Transformers."
            " Please use MaskFormerImageProcessor instead.",
            FutureWarning,
        )
        super().__init__(*args, **kwargs)  # initialize the underlying MaskFormerImageProcessor
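# --- Illustrative sketch (not part of the file above): the deprecation shim in action. ---
# MaskFormerFeatureExtractor only adds a FutureWarning on top of MaskFormerImageProcessor, so it accepts
# the same constructor arguments and behaves identically afterwards.
import warnings

from transformers import MaskFormerFeatureExtractor, MaskFormerImageProcessor

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    extractor = MaskFormerFeatureExtractor(do_resize=True)

print(any(issubclass(w.category, FutureWarning) for w in caught))  # True: the deprecation warning fired
print(isinstance(extractor, MaskFormerImageProcessor))             # True: it is just a subclass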

.\models\maskformer\image_processing_maskformer.py

# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
"""Image processor class for MaskFormer."""

import math
import warnings
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Set, Tuple, Union

import numpy as np

from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import (
    PaddingMode,
    get_resize_output_image_size,
    pad,
    rescale,
    resize,
    to_channel_dimension_format,
)
from ...image_utils import (
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    get_image_size,
    infer_channel_dimension_format,
    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
    validate_kwargs,
    validate_preprocess_arguments,
)
from ...utils import (
    IMAGENET_DEFAULT_MEAN,
    IMAGENET_DEFAULT_STD,
    TensorType,
    is_torch_available,
    is_torch_tensor,
    logging,
)

# Get the module logger
logger = logging.get_logger(__name__)

# Imported only for type checking
if TYPE_CHECKING:
    from transformers import MaskFormerForInstanceSegmentationOutput

# Only import torch if it is available
if is_torch_available():
    import torch
    from torch import nn

# Copied from transformers.models.detr.image_processing_detr.max_across_indices
def max_across_indices(values: Iterable[Any]) -> List[Any]:
    """
    Return the maximum value across all indices of an iterable of values.
    """
    return [max(values_i) for values_i in zip(*values)]

# Copied from transformers.models.detr.image_processing_detr.get_max_height_width
def get_max_height_width(
    images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> List[int]:
    """
    Get the maximum height and width across all images in a batch.
    """
    if input_data_format is None:
        input_data_format = infer_channel_dimension_format(images[0])

    if input_data_format == ChannelDimension.FIRST:
        _, max_height, max_width = max_across_indices([img.shape for img in images])
    elif input_data_format == ChannelDimension.LAST:
        max_height, max_width, _ = max_across_indices([img.shape for img in images])
    else:
        raise ValueError(f"Invalid channel dimension format: {input_data_format}")
    return (max_height, max_width)

# Copied from transformers.models.detr.image_processing_detr.make_pixel_mask
def make_pixel_mask(
    image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> np.ndarray:
    """
    Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.

    Args:
        image (`np.ndarray`):
            Image to make the pixel mask for.
        output_size (`Tuple[int, int]`):
            Output size of the mask.
    """
    # Get the height and width of the input image
    input_height, input_width = get_image_size(image, channel_dim=input_data_format)
    # Create an all-zero mask of shape output_size and dtype np.int64
    mask = np.zeros(output_size, dtype=np.int64)
    # Mark the region covered by the actual image as valid (1)
    mask[:input_height, :input_width] = 1
    return mask


# Copied from transformers.models.detr.image_processing_detr.binary_mask_to_rle
def binary_mask_to_rle(mask):
    """
    Converts given binary mask of shape `(height, width)` to the run-length encoding (RLE) format.

    Args:
        mask (`torch.Tensor` or `numpy.array`):
            A binary mask tensor of shape `(height, width)` where 0 denotes background and 1 denotes the target
            segment_id or class_id.
    Returns:
        `List`: Run-length encoded list of the binary mask. Refer to COCO API for more information about the RLE
        format.
    """
    # Convert a torch.Tensor mask to a numpy array
    if is_torch_tensor(mask):
        mask = mask.numpy()

    # Flatten the binary mask to a 1D array
    pixels = mask.flatten()
    # Pad both ends with a 0 so that runs touching the borders are handled correctly
    pixels = np.concatenate([[0], pixels, [0]])
    # Find the indices where the pixel value changes and build the run-length encoding
    runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
    runs[1::2] -= runs[::2]
    return list(runs)
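# --- Illustrative sketch (not part of the file): binary_mask_to_rle on a tiny mask. ---
# The encoding alternates (start_index, run_length) pairs over the flattened mask, 1-indexed as in the COCO API.
import numpy as np

from transformers.models.maskformer.image_processing_maskformer import binary_mask_to_rle

tiny_mask = np.array(
    [
        [0, 1, 1, 0],
        [0, 1, 1, 0],
    ]
)
# Flattened: 0 1 1 0 0 1 1 0 -> runs of ones start at 1-indexed positions 2 and 6, each of length 2
print(binary_mask_to_rle(tiny_mask))  # [2, 2, 6, 2]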


# Copied from transformers.models.detr.image_processing_detr.convert_segmentation_to_rle
def convert_segmentation_to_rle(segmentation):
    """
    Converts given segmentation map of shape `(height, width)` to the run-length encoding (RLE) format.

    Args:
        segmentation (`torch.Tensor` or `numpy.array`):
            A segmentation map of shape `(height, width)` where each value denotes a segment or class id.
    Returns:
        `List[List]`: A list of lists, where each list is the run-length encoding of a segment / class id.
    """
    # Get all unique segment / class ids present in the segmentation map
    segment_ids = torch.unique(segmentation)

    run_length_encodings = []
    # Iterate over each unique id
    for idx in segment_ids:
        # Build the binary mask matching the current id
        mask = torch.where(segmentation == idx, 1, 0)
        # Convert the binary mask to its run-length encoding (RLE)
        rle = binary_mask_to_rle(mask)
        run_length_encodings.append(rle)

    return run_length_encodings


# Copied from transformers.models.detr.image_processing_detr.remove_low_and_no_objects
def remove_low_and_no_objects(masks, scores, labels, object_mask_threshold, num_labels):
    """
    Binarize the given masks using `object_mask_threshold`, it returns the associated values of `masks`, `scores` and
    `labels`.

    Args:
        masks (`torch.Tensor`):
            A tensor of shape `(num_queries, height, width)`.
        scores (`torch.Tensor`):
            A tensor of shape `(num_queries)`.
        labels (`torch.Tensor`):
            A tensor of shape `(num_queries)`.
        object_mask_threshold (`float`):
            A number between 0 and 1 used to binarize the masks.
    Raises:
        `ValueError`: Raised when the first dimension doesn't match in all input tensors.
    """
    # Check that `masks`, `scores` and `labels` share the same first dimension
    if not (masks.shape[0] == scores.shape[0] == labels.shape[0]):
        # Raise an error if the first dimensions differ
        raise ValueError("mask, scores and labels must have the same shape!")

    # Keep a query when its label is not the "no object" class and its score exceeds the threshold
    to_keep = labels.ne(num_labels) & (scores > object_mask_threshold)

    # Return `masks`, `scores` and `labels` filtered by the `to_keep` boolean mask
    return masks[to_keep], scores[to_keep], labels[to_keep]
# Copied from transformers.models.detr.image_processing_detr.check_segment_validity
def check_segment_validity(mask_labels, mask_probs, k, mask_threshold=0.5, overlap_mask_area_threshold=0.8):
    # Get the mask associated with query k
    mask_k = mask_labels == k
    # Total area assigned to query k in the fused prediction
    mask_k_area = mask_k.sum()

    # Total area of query k's own mask after thresholding its probabilities
    original_area = (mask_probs[k] >= mask_threshold).sum()
    # A valid segment needs both areas to be non-empty
    mask_exists = mask_k_area > 0 and original_area > 0

    # Eliminate small, disconnected fragments
    if mask_exists:
        # Ratio between the fused area and the query's own area
        area_ratio = mask_k_area / original_area
        # Discard the segment if the ratio does not exceed the overlap threshold
        if not area_ratio.item() > overlap_mask_area_threshold:
            mask_exists = False

    return mask_exists, mask_k


# Copied from transformers.models.detr.image_processing_detr.compute_segments
def compute_segments(
    mask_probs,
    pred_scores,
    pred_labels,
    mask_threshold: float = 0.5,
    overlap_mask_area_threshold: float = 0.8,
    label_ids_to_fuse: Optional[Set[int]] = None,
    target_size: Tuple[int, int] = None,
):
    height = mask_probs.shape[1] if target_size is None else target_size[0]
    width = mask_probs.shape[2] if target_size is None else target_size[1]

    # Segmentation map that will hold a segment id for every pixel
    segmentation = torch.zeros((height, width), dtype=torch.int32, device=mask_probs.device)
    # List of all detected segments
    segments: List[Dict] = []

    if target_size is not None:
        # If a target size is given, resize the mask probabilities with bilinear interpolation
        mask_probs = nn.functional.interpolate(
            mask_probs.unsqueeze(0), size=target_size, mode="bilinear", align_corners=False
        )[0]

    current_segment_id = 0

    # Weigh each query's mask by its predicted score
    mask_probs *= pred_scores.view(-1, 1, 1)
    # For every pixel, pick the query with the highest weighted probability
    mask_labels = mask_probs.argmax(0)  # [height, width]

    # Keeps track of the segment id assigned to each fused ("stuff") class
    stuff_memory_list: Dict[str, int] = {}
    for k in range(pred_labels.shape[0]):
        pred_class = pred_labels[k].item()
        should_fuse = pred_class in label_ids_to_fuse

        # Check whether the query produces a valid, sufficiently large segment
        mask_exists, mask_k = check_segment_validity(
            mask_labels, mask_probs, k, mask_threshold, overlap_mask_area_threshold
        )

        if mask_exists:
            if pred_class in stuff_memory_list:
                current_segment_id = stuff_memory_list[pred_class]
            else:
                current_segment_id += 1

            # Write the segment id of the current object into the final segmentation map
            segmentation[mask_k] = current_segment_id
            segment_score = round(pred_scores[k].item(), 6)
            # Record the segment's metadata
            segments.append(
                {
                    "id": current_segment_id,
                    "label_id": pred_class,
                    "was_fused": should_fuse,
                    "score": segment_score,
                }
            )
            if should_fuse:
                stuff_memory_list[pred_class] = current_segment_id

    return segmentation, segments
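# --- Illustrative sketch (not part of the file): compute_segments on toy predictions. ---
# Two queries over a 4x4 grid: query 0 claims the left half, query 1 the right half; with an empty
# `label_ids_to_fuse` set, each query becomes its own segment.
import torch

from transformers.models.maskformer.image_processing_maskformer import compute_segments

mask_probs = torch.zeros(2, 4, 4)
mask_probs[0, :, :2] = 0.9  # query 0 -> left columns
mask_probs[1, :, 2:] = 0.9  # query 1 -> right columns
pred_scores = torch.tensor([0.8, 0.7])
pred_labels = torch.tensor([3, 5])

segmentation, segments = compute_segments(
    mask_probs, pred_scores, pred_labels, mask_threshold=0.5, overlap_mask_area_threshold=0.8, label_ids_to_fuse=set()
)
print(segmentation)  # left half filled with segment id 1, right half with segment id 2
print(segments)      # [{'id': 1, 'label_id': 3, ...}, {'id': 2, 'label_id': 5, ...}]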


# TODO: (Amy) Move to image_transforms
def convert_segmentation_map_to_binary_masks(
    segmentation_map: "np.ndarray",
    instance_id_to_semantic_id: Optional[Dict[int, int]] = None,
    ignore_index: Optional[int] = None,
    reduce_labels: bool = False,
):
    # `reduce_labels` maps the background (0) to `ignore_index` and shifts all other labels down by 1,
    # so an explicit `ignore_index` is required in that case
    if reduce_labels and ignore_index is None:
        raise ValueError("If `reduce_labels` is True, `ignore_index` must be provided.")

    if reduce_labels:
        # Replace background pixels (0) with `ignore_index` and decrement all other labels by 1
        segmentation_map = np.where(segmentation_map == 0, ignore_index, segmentation_map - 1)

    # Get unique label ids (based on whether the input contains class or instance ids)
    all_labels = np.unique(segmentation_map)

    # Drop the background label if `ignore_index` is provided
    if ignore_index is not None:
        all_labels = all_labels[all_labels != ignore_index]

    # Generate one binary mask per object instance
    binary_masks = [(segmentation_map == i) for i in all_labels]
    binary_masks = np.stack(binary_masks, axis=0)  # (num_labels, height, width)

    # Convert instance ids to class ids if a mapping is provided
    if instance_id_to_semantic_id is not None:
        labels = np.zeros(all_labels.shape[0])

        for label in all_labels:
            # Undo the label shift when looking up the instance id if `reduce_labels` is set
            class_id = instance_id_to_semantic_id[label + 1 if reduce_labels else label]
            labels[all_labels == label] = class_id - 1 if reduce_labels else class_id
    else:
        labels = all_labels

    # Return float32 binary masks and int64 labels
    return binary_masks.astype(np.float32), labels.astype(np.int64)
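# --- Illustrative sketch (not part of the file): binary masks from a tiny instance map. ---
# Instance ids 0, 1 and 2 are mapped to semantic classes 0, 7 and 7; one binary mask is produced per id.
import numpy as np

from transformers.models.maskformer.image_processing_maskformer import convert_segmentation_map_to_binary_masks

seg_map = np.array(
    [
        [0, 1, 1],
        [0, 2, 2],
    ]
)
masks, labels = convert_segmentation_map_to_binary_masks(
    seg_map, instance_id_to_semantic_id={0: 0, 1: 7, 2: 7}, ignore_index=None, reduce_labels=False
)
print(masks.shape)  # (3, 2, 3): one float32 mask per unique id
print(labels)       # [0 7 7]: the semantic class of each mask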


def get_maskformer_resize_output_image_size(
    image: np.ndarray,
    size: Union[int, Tuple[int, int], List[int], Tuple[int]],
    max_size: Optional[int] = None,
    size_divisor: int = 0,
    default_to_square: bool = True,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
    """
    根据所需大小计算输出图像的大小。

    Args:
        image (`np.ndarray`):
            输入图像。
        size (`int` or `Tuple[int, int]` or `List[int]` or `Tuple[int]`):
            输出图像的大小。
        max_size (`int`, *可选*):
            输出图像的最大大小。
        size_divisor (`int`, *可选*, 默认为 0):
            如果提供了 `size_divisor`,输出图像大小将可以被此数整除。
        default_to_square (`bool`, *可选*, 默认为 `True`):
            如果未提供大小是否默认为正方形。
        input_data_format (`ChannelDimension` or `str`, *可选*):
            输入图像的通道维度格式。如果未设置,则使用输入的推断格式。

    Returns:
        `Tuple[int, int]`: 输出图像的大小。
    """
    output_size = get_resize_output_image_size(
        input_image=image,
        size=size,
        default_to_square=default_to_square,
        max_size=max_size,
        input_data_format=input_data_format,
    )

    if size_divisor > 0:
        height, width = output_size
        height = int(math.ceil(height / size_divisor) * size_divisor)
        width = int(math.ceil(width / size_divisor) * size_divisor)
        output_size = (height, width)

    # Return the computed output size
    return output_size
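# --- Illustrative sketch (not part of the file): the effect of `size_divisor` rounding. ---
# A 480x640 image with size=800 and max_size=1333 is first scaled so its short side becomes 800 (-> 800x1066),
# then each side is rounded up to the next multiple of `size_divisor` (only the long side changes here,
# since 800 is already a multiple of 32).
import numpy as np

from transformers.models.maskformer.image_processing_maskformer import get_maskformer_resize_output_image_size

image = np.zeros((480, 640, 3), dtype=np.uint8)
output_size = get_maskformer_resize_output_image_size(
    image, size=800, max_size=1333, size_divisor=32, default_to_square=False
)
print(output_size)  # (800, 1088)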


class MaskFormerImageProcessor(BaseImageProcessor):
    r"""
    Constructs a MaskFormer image processor. The image processor can be used to prepare image(s) and optional targets
    for the model.

    This image processor inherits from [`BaseImageProcessor`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the input to a certain `size`.
        size (`int`, *optional*, defaults to 800):
            Resize the input to the given size. Only has an effect if `do_resize` is set to `True`. If size is a
            sequence like `(width, height)`, output size will be matched to this. If size is an int, smaller edge of
            the image will be matched to this number. i.e, if `height > width`, then image will be rescaled to `(size *
            height / width, size)`.
        size_divisor (`int`, *optional*, defaults to 32):
            Some backbones need images divisible by a certain number. If not passed, it defaults to the value used in
            Swin Transformer.
        resample (`int`, *optional*, defaults to `Resampling.BILINEAR`):
            An optional resampling filter. This can be one of `PIL.Image.Resampling.NEAREST`,
            `PIL.Image.Resampling.BOX`, `PIL.Image.Resampling.BILINEAR`, `PIL.Image.Resampling.HAMMING`,
            `PIL.Image.Resampling.BICUBIC` or `PIL.Image.Resampling.LANCZOS`. Only has an effect if `do_resize` is set
            to `True`.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the input to a certain `scale`.
        rescale_factor (`float`, *optional*, defaults to `1/ 255`):
            Rescale the input by the given factor. Only has an effect if `do_rescale` is set to `True`.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether or not to normalize the input with mean and standard deviation.
        image_mean (`int`, *optional*, defaults to `[0.485, 0.456, 0.406]`):
            The sequence of means for each channel, to be used when normalizing images. Defaults to the ImageNet mean.
        image_std (`int`, *optional*, defaults to `[0.229, 0.224, 0.225]`):
            The sequence of standard deviations for each channel, to be used when normalizing images. Defaults to the
            ImageNet std.
        ignore_index (`int`, *optional*):
            Label to be assigned to background pixels in segmentation maps. If provided, segmentation map pixels
            denoted with 0 (background) will be replaced with `ignore_index`.
        do_reduce_labels (`bool`, *optional*, defaults to `False`):
            Whether or not to decrement all label values of segmentation maps by 1. Usually used for datasets where 0
            is used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k).
            The background label will be replaced by `ignore_index`.

    """
    # Model inputs produced by this processor: pixel values and the corresponding pixel mask
    model_input_names = ["pixel_values", "pixel_mask"]

    # Constructor: creates and configures the image processor
    def __init__(
        self,
        do_resize: bool = True,  # whether to resize input images, True by default
        size: Dict[str, int] = None,  # target size as a dict (e.g. shortest_edge / longest_edge)
        size_divisor: int = 32,  # output sizes are rounded up to a multiple of this value
        resample: PILImageResampling = PILImageResampling.BILINEAR,  # resampling filter used when resizing
        do_rescale: bool = True,  # whether to rescale pixel values, True by default
        rescale_factor: float = 1 / 255,  # rescaling factor, 1/255 by default
        do_normalize: bool = True,  # whether to normalize with mean and std, True by default
        image_mean: Union[float, List[float]] = None,  # per-channel mean(s) used for normalization
        image_std: Union[float, List[float]] = None,  # per-channel std(s) used for normalization
        ignore_index: Optional[int] = None,  # optional label index to ignore
        do_reduce_labels: bool = False,  # whether to decrement all label values by 1, False by default
        **kwargs,  # any other keyword arguments
    ):
            # 检查是否传入了 `size_divisibility` 参数,如果有则发出警告并使用 `size_divisor` 替代
            if "size_divisibility" in kwargs:
                warnings.warn(
                    "The `size_divisibility` argument is deprecated and will be removed in v4.27. Please use "
                    "`size_divisor` instead.",
                    FutureWarning,
                )
                size_divisor = kwargs.pop("size_divisibility")

            # 检查是否传入了 `max_size` 参数,如果有则发出警告并将其作为私有属性 `_max_size` 存储
            if "max_size" in kwargs:
                warnings.warn(
                    "The `max_size` argument is deprecated and will be removed in v4.27. Please use size['longest_edge']"
                    " instead.",
                    FutureWarning,
                )
                # 将 `max_size` 作为默认值传递给 `preprocess` 方法的私有属性 `_max_size`
                self._max_size = kwargs.pop("max_size")
            else:
                # 如果未传入 `max_size` 参数,默认设为 1333
                self._max_size = 1333

            # 检查是否传入了 `reduce_labels` 参数,如果有则发出警告并使用 `do_reduce_labels` 替代
            if "reduce_labels" in kwargs:
                warnings.warn(
                    "The `reduce_labels` argument is deprecated and will be removed in v4.27. Please use "
                    "`do_reduce_labels` instead.",
                    FutureWarning,
                )
                do_reduce_labels = kwargs.pop("reduce_labels")

            # 如果未指定 `size` 参数,则设置默认的 `size` 字典,包括 `shortest_edge` 和 `longest_edge`
            size = size if size is not None else {"shortest_edge": 800, "longest_edge": self._max_size}
            # 获取处理后的 `size` 字典,确保不超过 `max_size` 的限制
            size = get_size_dict(size, max_size=self._max_size, default_to_square=False)

            # 调用父类的初始化方法,传入所有的关键字参数
            super().__init__(**kwargs)

            # 初始化对象的各种属性
            self.do_resize = do_resize
            self.size = size
            self.resample = resample
            self.size_divisor = size_divisor
            self.do_rescale = do_rescale
            self.rescale_factor = rescale_factor
            self.do_normalize = do_normalize
            self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
            self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
            self.ignore_index = ignore_index
            self.do_reduce_labels = do_reduce_labels

            # 定义有效的处理器关键字列表,用于后续处理器方法的调用和验证
            self._valid_processor_keys = [
                "images",
                "segmentation_maps",
                "instance_id_to_semantic_id",
                "do_resize",
                "size",
                "size_divisor",
                "resample",
                "do_rescale",
                "rescale_factor",
                "do_normalize",
                "image_mean",
                "image_std",
                "ignore_index",
                "do_reduce_labels",
                "return_tensors",
                "data_format",
                "input_data_format",
            ]

    # Overrides the base class `from_dict` so that parameters are updated when the image processor is created from a dict
    @classmethod
    def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
        """
        Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is
        created using from_dict and kwargs e.g. `MaskFormerImageProcessor.from_pretrained(checkpoint, max_size=800)`
        """
        # Copy the input dict so the original is not modified
        image_processor_dict = image_processor_dict.copy()

        # If `max_size` is passed via kwargs, move it into `image_processor_dict`
        if "max_size" in kwargs:
            image_processor_dict["max_size"] = kwargs.pop("max_size")

        # If `size_divisibility` is passed via kwargs, move it into `image_processor_dict`
        if "size_divisibility" in kwargs:
            image_processor_dict["size_divisibility"] = kwargs.pop("size_divisibility")

        # Call the base class `from_dict` with the updated dict and the remaining kwargs
        return super().from_dict(image_processor_dict, **kwargs)

    # Method used to resize images
    def resize(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        size_divisor: int = 0,
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        data_format=None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Resize the image to the given size. Size can be min_size (scalar) or `(height, width)` tuple. If size is an
        int, smaller edge of the image will be matched to this number.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`Dict[str, int]`):
                The size of the output image.
            size_divisor (`int`, *optional*, defaults to 0):
                If `size_divisor` is given, the output image size will be divisible by the number.
            resample (`PILImageResampling` resampling filter, *optional*, defaults to `PILImageResampling.BILINEAR`):
                Resampling filter to use when resizing the image.
            data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
        """
        # Check if the deprecated `max_size` parameter is used and issue a warning
        if "max_size" in kwargs:
            warnings.warn(
                "The `max_size` parameter is deprecated and will be removed in v4.27. "
                "Please specify in `size['longest_edge'] instead`.",
                FutureWarning,
            )
            max_size = kwargs.pop("max_size")
        else:
            max_size = None
        # Transform `size` into a standardized dictionary format
        size = get_size_dict(size, max_size=max_size, default_to_square=False)
        # Handle different formats of `size` and set `size` and `max_size` accordingly
        if "shortest_edge" in size and "longest_edge" in size:
            size, max_size = size["shortest_edge"], size["longest_edge"]
        elif "height" in size and "width" in size:
            size = (size["height"], size["width"])
            max_size = None
        else:
            # Raise an error if `size` does not contain expected keys
            raise ValueError(
                "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
                f" {size.keys()}."
            )
        # Compute the output image size after resizing
        size = get_maskformer_resize_output_image_size(
            image=image,
            size=size,
            max_size=max_size,
            size_divisor=size_divisor,
            default_to_square=False,
            input_data_format=input_data_format,
        )
        # Resize the input `image` using specified parameters
        image = resize(
            image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
        )
        # Return the resized image
        return image
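# --- Illustrative sketch (not part of the class above): how shortest_edge / longest_edge interact. ---
# A simplified, standalone re-computation of the rule used by `resize`: the short side targets `shortest_edge`,
# but the scale is capped whenever that would push the long side past `longest_edge`. The library's exact
# integer rounding may differ by a pixel.
def shortest_longest_edge_size(height: int, width: int, shortest_edge: int, longest_edge: int):
    short, long = min(height, width), max(height, width)
    scale = shortest_edge / short
    if long * scale > longest_edge:
        scale = longest_edge / long  # cap the scale so the long side fits
    return round(height * scale), round(width * scale)


print(shortest_longest_edge_size(480, 640, 800, 1333))   # (800, 1067): the short side drives the scale
print(shortest_longest_edge_size(480, 1280, 800, 1333))  # (500, 1333): the long side caps the scale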

    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale
    def rescale(
        self,
        image: np.ndarray,
        rescale_factor: float,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.ndarray:
        """
        Rescale the image by the given factor. image = image * rescale_factor.

        Args:
            image (`np.ndarray`):
                Image to rescale.
            rescale_factor (`float`):
                The value to use for rescaling.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the input image. If unset, is inferred from the input image. Can be
                one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
        """
        return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format)

    def convert_segmentation_map_to_binary_masks(
        self,
        segmentation_map: "np.ndarray",
        instance_id_to_semantic_id: Optional[Dict[int, int]] = None,
        ignore_index: Optional[int] = None,
        reduce_labels: bool = False,
    ):
        """
        Convert a segmentation map to binary masks.

        Args:
            segmentation_map (`np.ndarray`):
                The input segmentation map.
            instance_id_to_semantic_id (Optional[Dict[int, int]]):
                Mapping from instance IDs to semantic IDs. If not provided, no mapping is applied.
            ignore_index (Optional[int]):
                Index to ignore in the segmentation map.
            reduce_labels (bool):
                Whether to reduce the number of labels in the output.

        Returns:
            Binary masks corresponding to the segmentation map.
        """
        reduce_labels = reduce_labels if reduce_labels is not None else self.reduce_labels
        ignore_index = ignore_index if ignore_index is not None else self.ignore_index
        return convert_segmentation_map_to_binary_masks(
            segmentation_map=segmentation_map,
            instance_id_to_semantic_id=instance_id_to_semantic_id,
            ignore_index=ignore_index,
            reduce_labels=reduce_labels,
        )

    def __call__(self, images, segmentation_maps=None, **kwargs) -> BatchFeature:
        """
        Callable interface for preprocessing images and segmentation maps.

        Args:
            images:
                Images to preprocess.
            segmentation_maps:
                Segmentation maps associated with the images.
            **kwargs:
                Additional keyword arguments for preprocessing.

        Returns:
            Preprocessed batch of features.
        """
        return self.preprocess(images, segmentation_maps=segmentation_maps, **kwargs)

    def _preprocess(
        self,
        image: ImageInput,
        do_resize: bool = None,
        size: Dict[str, int] = None,
        size_divisor: int = None,
        resample: PILImageResampling = None,
        do_rescale: bool = None,
        rescale_factor: float = None,
        do_normalize: bool = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        """
        Internal preprocessing function for handling various transformations on images.

        Args:
            image (ImageInput):
                Input image to preprocess.
            do_resize (bool, optional):
                Whether to resize the image.
            size (Dict[str, int], optional):
                Desired size for resizing (width, height).
            size_divisor (int, optional):
                Divisor for resizing the image dimensions.
            resample (PILImageResampling, optional):
                Resampling method for resizing.
            do_rescale (bool, optional):
                Whether to rescale the image.
            rescale_factor (float, optional):
                Scaling factor for image rescaling.
            do_normalize (bool, optional):
                Whether to normalize the image.
            image_mean (Union[float, List[float]], optional):
                Mean values for image normalization.
            image_std (Union[float, List[float]], optional):
                Standard deviation values for image normalization.
            input_data_format (Union[str, ChannelDimension], optional):
                Format of the input image data.

        Returns:
            Preprocessed image based on the specified transformations.
        """
        # Resize the image if requested
        if do_resize:
            image = self.resize(
                image, size=size, size_divisor=size_divisor, resample=resample, input_data_format=input_data_format
            )
        # Rescale the image if requested
        if do_rescale:
            image = self.rescale(image, rescale_factor=rescale_factor, input_data_format=input_data_format)
        # Normalize the image if requested
        if do_normalize:
            image = self.normalize(image, mean=image_mean, std=image_std, input_data_format=input_data_format)
        # Return the preprocessed image
        return image

    def _preprocess_image(
        self,
        image: ImageInput,
        do_resize: bool = None,
        size: Dict[str, int] = None,
        size_divisor: int = None,
        resample: PILImageResampling = None,
        do_rescale: bool = None,
        rescale_factor: float = None,
        do_normalize: bool = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.ndarray:
        """Preprocesses a single image."""
        # 将图像转换为 numpy 数组,因为所有的转换操作都要求输入为 numpy 数组
        image = to_numpy_array(image)
        # 如果图像已经进行了缩放,并且需要进行重新缩放,则记录警告信息
        if is_scaled_image(image) and do_rescale:
            logger.warning_once(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )
        # 推断图像的通道格式(数据格式)如果未指定
        if input_data_format is None:
            input_data_format = infer_channel_dimension_format(image)
        # 调用 _preprocess 方法进行实际的图像预处理
        image = self._preprocess(
            image=image,
            do_resize=do_resize,
            size=size,
            size_divisor=size_divisor,
            resample=resample,
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            input_data_format=input_data_format,
        )
        # 如果指定了数据格式,则将图像转换为该格式
        if data_format is not None:
            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
        # 返回预处理后的图像 numpy 数组
        return image

    def _preprocess_mask(
        self,
        segmentation_map: ImageInput,
        do_resize: bool = None,
        size: Dict[str, int] = None,
        size_divisor: int = 0,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.ndarray:
        """Preprocesses a single mask."""
        # 将分割地图转换为 NumPy 数组
        segmentation_map = to_numpy_array(segmentation_map)
        # 如果分割地图的维度为2,添加通道维度,因为某些变换需要
        if segmentation_map.ndim == 2:
            added_channel_dim = True
            segmentation_map = segmentation_map[None, ...]
            input_data_format = ChannelDimension.FIRST
        else:
            added_channel_dim = False
            # 如果输入数据格式未指定,根据分割地图推断通道维度的格式
            if input_data_format is None:
                input_data_format = infer_channel_dimension_format(segmentation_map, num_channels=1)
        # TODO: (Amy)
        # 重新设计分割地图处理过程,包括减少标签数量和大小调整,不丢弃大于255的分割ID。
        segmentation_map = self._preprocess(
            image=segmentation_map,
            do_resize=do_resize,
            resample=PILImageResampling.NEAREST,
            size=size,
            size_divisor=size_divisor,
            do_rescale=False,
            do_normalize=False,
            input_data_format=input_data_format,
        )
        # 如果为了处理而添加了额外的通道维度,则去除它
        if added_channel_dim:
            segmentation_map = segmentation_map.squeeze(0)
        return segmentation_map

    def preprocess(
        self,
        images: ImageInput,
        segmentation_maps: Optional[ImageInput] = None,
        instance_id_to_semantic_id: Optional[Dict[int, int]] = None,
        do_resize: Optional[bool] = None,
        size: Optional[Dict[str, int]] = None,
        size_divisor: Optional[int] = None,
        resample: PILImageResampling = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        ignore_index: Optional[int] = None,
        do_reduce_labels: Optional[bool] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ):
        # The body of `preprocess` is omitted from this listing
        ...

    # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor._pad_image
    def _pad_image(
        self,
        image: np.ndarray,
        output_size: Tuple[int, int],
        constant_values: Union[float, Iterable[float]] = 0,
        data_format: Optional[ChannelDimension] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    # Pads an image with zeros to the given output size
    ) -> np.ndarray:
        """
        Pad an image with zeros to the given size.
        """
        # Get the height and width of the input image
        input_height, input_width = get_image_size(image, channel_dim=input_data_format)
        # Get the height and width of the output image
        output_height, output_width = output_size

        # Number of pixels to add at the bottom and on the right
        pad_bottom = output_height - input_height
        pad_right = output_width - input_width
        # Pad only at the bottom and on the right; nothing is added at the top or on the left
        padding = ((0, pad_bottom), (0, pad_right))
        # Apply constant (zero by default) padding
        padded_image = pad(
            image,
            padding,
            mode=PaddingMode.CONSTANT,
            constant_values=constant_values,
            data_format=data_format,
            input_data_format=input_data_format,
        )
        # Return the padded image
        return padded_image
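# --- Illustrative sketch (not part of the class above): bottom/right zero padding with plain NumPy. ---
# Equivalent to what `_pad_image` does for a channels-last image: padding is only added below and to the right,
# so the coordinates of the original pixels are unchanged.
import numpy as np

image = np.ones((2, 3, 3), dtype=np.float32)  # (height=2, width=3, channels=3)
output_height, output_width = 4, 5
pad_bottom, pad_right = output_height - 2, output_width - 3
padded = np.pad(image, ((0, pad_bottom), (0, pad_right), (0, 0)), mode="constant", constant_values=0)
print(padded.shape)     # (4, 5, 3)
print(padded[:, :, 0])  # ones in the top-left 2x3 block, zeros everywhere else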

    # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.pad
    def pad(
        self,
        images: List[np.ndarray],
        constant_values: Union[float, Iterable[float]] = 0,
        return_pixel_mask: bool = True,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> BatchFeature:
        """
        Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
        in the batch and optionally returns their corresponding pixel mask.

        Args:
            image (`np.ndarray`):
                Image to pad.
            constant_values (`float` or `Iterable[float]`, *optional*):
                The value to use for the padding if `mode` is `"constant"`.
            return_pixel_mask (`bool`, *optional*, defaults to `True`):
                Whether to return a pixel mask.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                    - Unset: Return a list of `np.ndarray`.
                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
        """
        # Calculate the maximum height and width required for padding
        pad_size = get_max_height_width(images, input_data_format=input_data_format)

        # Pad each image in the batch to match `pad_size`
        padded_images = [
            self._pad_image(
                image,
                pad_size,
                constant_values=constant_values,
                data_format=data_format,
                input_data_format=input_data_format,
            )
            for image in images
        ]
        
        # Prepare data dictionary to store padded images
        data = {"pixel_values": padded_images}

        # Optionally, generate pixel masks for the padded images
        if return_pixel_mask:
            masks = [
                make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format)
                for image in images
            ]
            data["pixel_mask"] = masks

        # Return BatchFeature object containing padded images and masks (if generated)
        return BatchFeature(data=data, tensor_type=return_tensors)
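# --- Illustrative sketch (not part of the class above): padding a batch of unevenly sized images. ---
# Two channels-first images of different sizes are padded to the common (maximum) size; `pixel_mask`
# marks the valid, non-padded region of each image with ones.
import numpy as np

from transformers import MaskFormerImageProcessor

processor = MaskFormerImageProcessor()
images = [np.zeros((3, 64, 80), dtype=np.float32), np.zeros((3, 96, 48), dtype=np.float32)]
batch = processor.pad(images, return_pixel_mask=True, return_tensors="np")
print(batch["pixel_values"].shape)   # (2, 3, 96, 80)
print(batch["pixel_mask"].shape)     # (2, 96, 80)
print(batch["pixel_mask"][0].sum())  # 64 * 80 = 5120 valid pixels for the first image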

    def encode_inputs(
        self,
        pixel_values_list: List[ImageInput],
        segmentation_maps: ImageInput = None,
        instance_id_to_semantic_id: Optional[Union[List[Dict[int, int]], Dict[int, int]]] = None,
        ignore_index: Optional[int] = None,
        reduce_labels: bool = False,
        return_tensors: Optional[Union[str, TensorType]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        """
        Encodes input data into a format suitable for model input, optionally handling segmentation maps and instance IDs.

        Args:
            pixel_values_list (`List[ImageInput]`):
                List of images to encode.
            segmentation_maps (`ImageInput`, *optional*):
                Segmentation maps corresponding to images.
            instance_id_to_semantic_id (`Optional[Union[List[Dict[int, int]], Dict[int, int]]]`, *optional*):
                Mapping from instance IDs to semantic IDs.
            ignore_index (`Optional[int]`, *optional*):
                Index to ignore during encoding.
            reduce_labels (`bool`, *optional*, defaults to `False`):
                Whether to reduce the number of unique labels.
            return_tensors (`Optional[Union[str, TensorType]]`, *optional*):
                The type of tensors to return (e.g., `'tf'`, `'pt'`, `'np'`, `'jax'`).
            input_data_format (`Optional[Union[str, ChannelDimension]]`, *optional*):
                The channel dimension format of the input data.

        Returns:
            BatchFeature:
                Encoded inputs wrapped in a `BatchFeature` object.
        """
        # Function implementation is omitted for brevity in the comment block

    def post_process_segmentation(
        self, outputs: "MaskFormerForInstanceSegmentationOutput", target_size: Tuple[int, int] = None
    ) -> "torch.Tensor":
        """
        Converts the output of [`MaskFormerForInstanceSegmentationOutput`] into image segmentation predictions. Only
        supports PyTorch.

        Args:
            outputs ([`MaskFormerForInstanceSegmentationOutput`]):
                The outputs from [`MaskFormerForInstanceSegmentation`].

            target_size (`Tuple[int, int]`, *optional*):
                If set, the `masks_queries_logits` will be resized to `target_size`.

        Returns:
            `torch.Tensor`:
                A tensor of shape (`batch_size, num_class_labels, height, width`).
        """
        # Emit a warning about deprecation of this function
        logger.warning(
            "`post_process_segmentation` is deprecated and will be removed in v5 of Transformers, please use"
            " `post_process_instance_segmentation`",
            FutureWarning,
        )

        # class_queries_logits has shape [BATCH, QUERIES, CLASSES + 1]
        class_queries_logits = outputs.class_queries_logits
        # masks_queries_logits has shape [BATCH, QUERIES, HEIGHT, WIDTH]
        masks_queries_logits = outputs.masks_queries_logits

        # Resize masks if target_size is provided
        if target_size is not None:
            masks_queries_logits = torch.nn.functional.interpolate(
                masks_queries_logits,
                size=target_size,
                mode="bilinear",
                align_corners=False,
            )

        # Remove the null class from class_queries_logits
        masks_classes = class_queries_logits.softmax(dim=-1)[..., :-1]

        # Calculate mask probabilities
        masks_probs = masks_queries_logits.sigmoid()

        # Perform segmentation by combining class probabilities and mask probabilities
        # using Einstein summation notation
        # $ out_{c,h,w} =  \sum_q p_{q,c} * m_{q,h,w} $
        # where $ softmax(p) \in R^{q, c} $ is the mask classes
        # and $ sigmoid(m) \in R^{q, h, w}$ is the mask probabilities
        segmentation = torch.einsum("bqc, bqhw -> bchw", masks_classes, masks_probs)

        return segmentation
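# --- Illustrative sketch (not part of the class above): what the einsum in `post_process_segmentation` computes. ---
# For every class c and pixel (h, w), the segmentation score is the sum over queries of
# class probability * mask probability; the explicit loop reproduces the einsum on tiny random tensors.
import torch

batch, queries, classes, height, width = 1, 3, 4, 2, 2
masks_classes = torch.rand(batch, queries, classes)
masks_probs = torch.rand(batch, queries, height, width)

via_einsum = torch.einsum("bqc, bqhw -> bchw", masks_classes, masks_probs)
via_loop = torch.zeros(batch, classes, height, width)
for q in range(queries):
    via_loop += masks_classes[:, q, :, None, None] * masks_probs[:, q, None, :, :]

print(torch.allclose(via_einsum, via_loop))  # True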
    def post_process_semantic_segmentation(
        self, outputs, target_sizes: Optional[List[Tuple[int, int]]] = None
    ) -> "torch.Tensor":
        """
        Converts the outputs of [`MaskFormerForInstanceSegmentation`] into semantic segmentation maps. Only supports
        PyTorch.

        Args:
            outputs ([`MaskFormerForInstanceSegmentation`]):
                Raw outputs of the model.
            target_sizes (`List[Tuple[int, int]]`, *optional*):
                List of tuples (`Tuple[int, int]`) containing the target size (height, width) of each prediction. If
                left to `None`, predictions will not be resized.

        Returns:
            `List[torch.Tensor]`:
                A list of length `batch_size`, where each item is a semantic segmentation map of shape (height, width)
                corresponding to the `target_sizes` entry (if `target_sizes` is specified). Each entry of each
                `torch.Tensor` corresponds to a semantic class id.
        """
        class_queries_logits = outputs.class_queries_logits  # [batch_size, num_queries, num_classes+1]
        masks_queries_logits = outputs.masks_queries_logits  # [batch_size, num_queries, height, width]

        # Remove the null class `[..., :-1]`
        masks_classes = class_queries_logits.softmax(dim=-1)[..., :-1]
        masks_probs = masks_queries_logits.sigmoid()  # [batch_size, num_queries, height, width]

        # Compute segmentation logits of shape (batch_size, num_classes, height, width)
        segmentation = torch.einsum("bqc, bqhw -> bchw", masks_classes, masks_probs)
        batch_size = class_queries_logits.shape[0]

        # Resize logits and compute semantic segmentation maps
        if target_sizes is not None:
            if batch_size != len(target_sizes):
                raise ValueError(
                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
                )

            semantic_segmentation = []
            for idx in range(batch_size):
                # Resize logits using bilinear interpolation
                resized_logits = torch.nn.functional.interpolate(
                    segmentation[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False
                )
                # Obtain semantic segmentation map by selecting class with highest probability
                semantic_map = resized_logits[0].argmax(dim=0)
                semantic_segmentation.append(semantic_map)
        else:
            # If target_sizes is None, directly compute semantic segmentation maps
            semantic_segmentation = segmentation.argmax(dim=1)
            semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])]

        return semantic_segmentation
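# --- Illustrative sketch (not part of the class above): semantic post-processing end to end. ---
# Runs a pretrained MaskFormer (here the ADE20k Swin-base checkpoint from the Hub) on one image and resizes
# the predicted semantic map back to the input resolution.
import requests
import torch
from PIL import Image

from transformers import MaskFormerForInstanceSegmentation, MaskFormerImageProcessor

processor = MaskFormerImageProcessor.from_pretrained("facebook/maskformer-swin-base-ade")
model = MaskFormerForInstanceSegmentation.from_pretrained("facebook/maskformer-swin-base-ade")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(image, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

semantic_map = processor.post_process_semantic_segmentation(outputs, target_sizes=[image.size[::-1]])[0]
print(semantic_map.shape)  # (height, width) of the original image; each entry is a semantic class id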
    # Post-processes the model outputs into a panoptic segmentation prediction
    def post_process_panoptic_segmentation(
        self,
        outputs,
        threshold: float = 0.5,
        mask_threshold: float = 0.5,
        overlap_mask_area_threshold: float = 0.8,
        label_ids_to_fuse: Optional[Set[int]] = None,
        target_sizes: Optional[List[Tuple[int, int]]] = None,