Transformers Source Code Analysis (118)

Transformers Source Code Analysis (Part 118)

.\models\vit\__init__.py

# Typing helpers for static type checking
from typing import TYPE_CHECKING

# Import the required helper functions and exception class from the package-level utils module
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_flax_available,
    is_tf_available,
    is_torch_available,
    is_vision_available,
)

# Define the module's import structure: each submodule mapped to the names it exposes
_import_structure = {"configuration_vit": ["VIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTConfig", "ViTOnnxConfig"]}

# Check whether the vision dependencies are available; raise OptionalDependencyNotAvailable if not
try:
    if not is_vision_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # If vision support is available, register the feature extraction and image processing modules
    _import_structure["feature_extraction_vit"] = ["ViTFeatureExtractor"]
    _import_structure["image_processing_vit"] = ["ViTImageProcessor"]

# Check whether PyTorch is available; raise OptionalDependencyNotAvailable if not
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # If PyTorch is available, register the PyTorch model classes
    _import_structure["modeling_vit"] = [
        "VIT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "ViTForImageClassification",
        "ViTForMaskedImageModeling",
        "ViTModel",
        "ViTPreTrainedModel",
    ]

# Check whether TensorFlow is available; raise OptionalDependencyNotAvailable if not
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # If TensorFlow is available, register the TensorFlow model classes
    _import_structure["modeling_tf_vit"] = [
        "TFViTForImageClassification",
        "TFViTModel",
        "TFViTPreTrainedModel",
    ]

# Check whether Flax is available; raise OptionalDependencyNotAvailable if not
try:
    if not is_flax_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # If Flax is available, register the Flax model classes
    _import_structure["modeling_flax_vit"] = [
        "FlaxViTForImageClassification",
        "FlaxViTModel",
        "FlaxViTPreTrainedModel",
    ]

# When type checking, import the symbols directly so static analyzers can resolve them
if TYPE_CHECKING:
    # Import the config archive map, the config class and the ONNX config
    from .configuration_vit import VIT_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTConfig, ViTOnnxConfig

    # Check whether the vision dependencies are available; raise if not
    try:
        if not is_vision_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # If available, import the feature extractor and image processor classes
        from .feature_extraction_vit import ViTFeatureExtractor
        from .image_processing_vit import ViTImageProcessor

    # Check whether PyTorch is available; raise if not
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # If PyTorch is available, import the PyTorch model classes
        from .modeling_vit import (
            VIT_PRETRAINED_MODEL_ARCHIVE_LIST,
            ViTForImageClassification,
            ViTForMaskedImageModeling,
            ViTModel,
            ViTPreTrainedModel,
        )
    # Check whether TensorFlow is available; raise OptionalDependencyNotAvailable if not
    try:
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        # TensorFlow is not installed: skip the TensorFlow imports
        pass
    else:
        # TensorFlow is available: import the TensorFlow model classes
        from .modeling_tf_vit import TFViTForImageClassification, TFViTModel, TFViTPreTrainedModel

    # Check whether Flax is available; raise OptionalDependencyNotAvailable if not
    try:
        if not is_flax_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        # Flax is not installed: skip the Flax imports
        pass
    else:
        # Flax is available: import the Flax model classes
        from .modeling_flax_vit import FlaxViTForImageClassification, FlaxViTModel, FlaxViTPreTrainedModel
else:
    # Outside of type checking, register the module lazily
    import sys

    # Replace this module in sys.modules with a _LazyModule wrapper, so that the submodules and classes
    # listed in _import_structure are only imported when they are first accessed
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
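
The `_LazyModule` registration above is what keeps `import transformers` cheap: a heavyweight backend is only imported when one of its symbols is first accessed. As a rough, self-contained sketch of the same idea (the `LazyModule` class below is a simplified stand-in, not the real `_LazyModule`):

import importlib
import types


class LazyModule(types.ModuleType):
    """Toy stand-in for transformers.utils._LazyModule: resolve attributes to submodules on first access."""

    def __init__(self, name, import_structure):
        super().__init__(name)
        # Map each exported symbol to the module that defines it, e.g. {"dumps": "json"}
        self._symbol_to_module = {
            symbol: module_name for module_name, symbols in import_structure.items() for symbol in symbols
        }

    def __getattr__(self, name):
        try:
            module_name = self._symbol_to_module[name]
        except KeyError:
            raise AttributeError(name)
        value = getattr(importlib.import_module(module_name), name)
        setattr(self, name, value)  # cache, so the import only happens once
        return value


# "json" is only imported when `dumps` is first touched.
lazy = LazyModule("demo", {"json": ["dumps", "loads"]})
print(lazy.dumps({"hello": "world"}))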

.\models\vitdet\configuration_vitdet.py

# File encoding: UTF-8
# Copyright 2023 HuggingFace Inc. All rights reserved.
# Licensed under the Apache License, Version 2.0; you may not use this file except in compliance with the License.
# A copy of the License is available at http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed
# on an "AS IS" basis, without warranties or conditions of any kind, either express or implied.
# See the License for the specific language governing permissions and limitations.

""" VitDet 模型配置"""

# Import the PretrainedConfig base class via a relative import
from ...configuration_utils import PretrainedConfig
# Import the logging utilities
from ...utils import logging
# Import BackboneConfigMixin and get_aligned_output_features_output_indices from backbone_utils
from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices

# Module-level logger
logger = logging.get_logger(__name__)

# Map from pretrained VitDet checkpoint name to the URL of its config.json
VITDET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "facebook/vit-det-base": "https://huggingface.co/facebook/vit-det-base/resolve/main/config.json",
}

# VitDetConfig inherits from both BackboneConfigMixin and PretrainedConfig
class VitDetConfig(BackboneConfigMixin, PretrainedConfig):
    r"""
    这是存储 [`VitDetModel`] 配置的类。它用于根据指定的参数实例化 VitDet 模型,定义模型架构。
    使用默认配置实例化一个配置对象将会生成类似于 VitDet [google/vitdet-base-patch16-224] 架构的配置。

    配置对象继承自 [`PretrainedConfig`],可用于控制模型的输出。详细信息请参阅 [`PretrainedConfig`] 的文档。

    示例:

    ```
    >>> from transformers import VitDetConfig, VitDetModel

    >>> # 初始化 VitDet 配置
    >>> configuration = VitDetConfig()

    >>> # 使用配置对象实例化一个模型(带有随机权重)
    >>> model = VitDetModel(configuration)

    >>> # 访问模型配置
    >>> configuration = model.config
    ```
    """

    # The model type identifier
    model_type = "vitdet"

    # Constructor defining all configurable hyperparameters of the model
    def __init__(
        self,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        mlp_ratio=4,
        hidden_act="gelu",
        dropout_prob=0.0,
        initializer_range=0.02,
        layer_norm_eps=1e-6,
        image_size=224,
        pretrain_image_size=224,
        patch_size=16,
        num_channels=3,
        qkv_bias=True,
        drop_path_rate=0.0,
        window_block_indices=[],
        residual_block_indices=[],
        use_absolute_position_embeddings=True,
        use_relative_position_embeddings=False,
        window_size=0,
        out_features=None,
        out_indices=None,
        **kwargs,
    ):
        # Forward any remaining keyword arguments to the parent constructor
        super().__init__(**kwargs)

        # Store the model hyperparameters
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.mlp_ratio = mlp_ratio
        self.hidden_act = hidden_act
        self.dropout_prob = dropout_prob
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.image_size = image_size
        self.pretrain_image_size = pretrain_image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.qkv_bias = qkv_bias
        self.drop_path_rate = drop_path_rate
        self.window_block_indices = window_block_indices
        self.residual_block_indices = residual_block_indices
        self.use_absolute_position_embeddings = use_absolute_position_embeddings
        self.use_relative_position_embeddings = use_relative_position_embeddings
        self.window_size = window_size

        # Stage names: an initial "stem" followed by "stage1" ... "stageN", one per hidden layer
        self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, self.num_hidden_layers + 1)]

        # Align the requested output features / output indices with the stage names
        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
        )
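
The call to `get_aligned_output_features_output_indices` reconciles `out_features` (stage names) with `out_indices` (positions in `stage_names`), filling in whichever one was omitted. A small usage sketch with the defaults above (the printed values are what the alignment implies, stated here as an illustration rather than captured output):

from transformers import VitDetConfig

config = VitDetConfig(out_features=["stage4", "stage8", "stage12"])
print(config.stage_names[:3])  # ['stem', 'stage1', 'stage2']
print(config.out_features)     # ['stage4', 'stage8', 'stage12']
print(config.out_indices)      # the matching positions in stage_names: 4, 8 and 12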

.\models\vitdet\modeling_vitdet.py

# coding=utf-8
# Copyright Meta AI and The HuggingFace Inc. team. All rights reserved.
# Licensed under the Apache License, Version 2.0; you may not use this file except in compliance with the License.
# A copy of the License is available at http://www.apache.org/licenses/LICENSE-2.0

""" PyTorch ViTDet backbone."""
# Standard-library and PyTorch imports
import collections.abc  # abstract base classes, used to check whether a value is iterable
import math
from typing import Dict, List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

# Imports from other parts of the library
from ...activations import ACT2FN  # mapping from activation-function name to implementation
from ...modeling_outputs import BackboneOutput, BaseModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from ...utils.backbone_utils import BackboneMixin
from .configuration_vitdet import VitDetConfig

logger = logging.get_logger(__name__)  # module-level logger

# General docstring
_CONFIG_FOR_DOC = "VitDetConfig"

# List of pretrained checkpoints for this architecture
VITDET_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "facebook/vit-det-base",
    # See all ViTDet models at https://huggingface.co/models?filter=vitdet
]


class VitDetEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) to be consumed by a Transformer.
    """

    def __init__(self, config):
        super().__init__()

        # Read the relevant hyperparameters from the configuration
        image_size, patch_size = config.pretrain_image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        # Normalize image size and patch size to (height, width) tuples
        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)

        # Number of patches the pretraining image is split into
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])

        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches

        # If configured, create learnable absolute position embeddings (with an extra slot for a cls token)
        if config.use_absolute_position_embeddings:
            num_positions = num_patches + 1
            self.position_embeddings = nn.Parameter(torch.zeros(1, num_positions, config.hidden_size))
        else:
            self.position_embeddings = None

        # Patch projection: maps image channels to the hidden size with a strided convolution
        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def get_absolute_positions(self, abs_pos_embeddings, has_cls_token, height, width):
        """
        Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token dimension for the
        original embeddings.

        Args:
            abs_pos_embeddings (`torch.Tensor`):
                Absolute positional embeddings with (1, num_position, num_channels).
            has_cls_token (`bool`):
                If true, has 1 embedding in abs_pos_embeddings for cls token.
            height (`int`):
                Height of input image tokens.
            width (`int`):
                Width of input image tokens.

        Returns:
            Absolute positional embeddings after processing with shape (1, height, width, num_channels)
        """
        # If the input has cls_token, remove the first embedding dimension
        if has_cls_token:
            abs_pos_embeddings = abs_pos_embeddings[:, 1:]

        # Calculate the number of position embeddings
        num_position = abs_pos_embeddings.shape[1]

        # Determine the size of the square matrix from the number of position embeddings
        size = int(math.sqrt(num_position))
        if size * size != num_position:
            raise ValueError("Absolute position embeddings must be a square number.")

        # If the size of embeddings does not match input height or width, resize them
        if size != height or size != width:
            new_abs_pos_embeddings = nn.functional.interpolate(
                abs_pos_embeddings.reshape(1, size, size, -1).permute(0, 3, 1, 2),
                size=(height, width),
                mode="bicubic",
                align_corners=False,
            )
            # Rearrange dimensions to match the expected output shape
            return new_abs_pos_embeddings.permute(0, 2, 3, 1)
        else:
            # Reshape embeddings to the expected output shape
            return abs_pos_embeddings.reshape(1, height, width, -1)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        # Check if the number of channels in pixel_values matches the expected configuration
        num_channels = pixel_values.shape[1]
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
                f" Expected {self.num_channels} but got {num_channels}."
            )

        # Project pixel values to obtain embeddings
        embeddings = self.projection(pixel_values)

        # If position embeddings are provided, incorporate them into the embeddings
        if self.position_embeddings is not None:
            # Rearrange dimensions of embeddings to (batch_size, height, width, num_channels)
            embeddings = embeddings.permute(0, 2, 3, 1)
            
            # Add absolute positional embeddings to the embeddings
            embeddings = embeddings + self.get_absolute_positions(
                self.position_embeddings, True, embeddings.shape[1], embeddings.shape[2]
            )
            
            # Rearrange dimensions back to (batch_size, num_channels, height, width)
            embeddings = embeddings.permute(0, 3, 1, 2)

        # Return the processed embeddings
        return embeddings
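
# Illustration (not part of the original file): with the default pretrain_image_size=224 and patch_size=16,
# the projection above yields a 14x14 grid of patch tokens, and the absolute position table has 196 + 1 rows
# (the extra row is the cls-token slot that get_absolute_positions strips off). A self-contained shape check:
import torch
from torch import nn

hidden_size, patch_size, image_size = 768, 16, 224
projection = nn.Conv2d(3, hidden_size, kernel_size=patch_size, stride=patch_size)
pixel_values = torch.randn(1, 3, image_size, image_size)
print(projection(pixel_values).shape)  # torch.Size([1, 768, 14, 14])

num_patches = (image_size // patch_size) ** 2
position_embeddings = torch.zeros(1, num_patches + 1, hidden_size)
print(position_embeddings.shape)       # torch.Size([1, 197, 768])
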
def get_rel_pos(q_size, k_size, rel_pos):
    """
    Get relative positional embeddings according to the relative positions of query and key sizes.

    Args:
        q_size (`int`):
            Size of query q.
        k_size (`int`):
            Size of key k.
        rel_pos (`torch.Tensor`):
            Relative position embeddings (num_embeddings, num_channels).

    Returns:
        Extracted positional embeddings according to relative positions.
    """
    # Maximum relative distance between a query position and a key position
    max_rel_dist = int(2 * max(q_size, k_size) - 1)

    # If the stored table does not have max_rel_dist entries, interpolate it to that length
    if rel_pos.shape[0] != max_rel_dist:
        # Linearly interpolate the relative position embeddings along the position axis
        rel_pos_resized = nn.functional.interpolate(
            rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
            size=max_rel_dist,
            mode="linear",
        )
        rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
    else:
        rel_pos_resized = rel_pos

    # Scale the coordinates when query and key have different resolutions, then shift so indices are non-negative
    q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
    k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
    relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)

    return rel_pos_resized[relative_coords.long()]
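
# Illustration (not part of the original file): for q_size == k_size == 14 the shifted lookup indices cover
# 0..26, which is why the table needs 2 * 14 - 1 = 27 rows. Reproducing just the index arithmetic from above:
import torch

q_size = k_size = 14
q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)
print(relative_coords.min().item(), relative_coords.max().item())  # 0.0 26.0
print(int(2 * max(q_size, k_size) - 1))                            # 27
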


def add_decomposed_relative_positions(attn, queries, rel_pos_h, rel_pos_w, q_size, k_size):
    """
    Calculate decomposed Relative Positional Embeddings as introduced in
    [MViT2](https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py).

    Args:
        attn (`torch.Tensor`):
            Attention map.
        queries (`torch.Tensor`):
            Query q in the attention layer with shape (batch_size, queries_height * queries_width, num_channels).
        rel_pos_h (`torch.Tensor`):
            Relative position embeddings (Lh, num_channels) for height axis.
        rel_pos_w (`torch.Tensor`):
            Relative position embeddings (Lw, num_channels) for width axis.
        q_size (`Tuple[int]`):
            Spatial sequence size of query q with (queries_height, queries_width).
        k_size (`Tuple[int]`):
            Spatial sequence size of key k with (keys_height, keys_width).

    Returns:
        attn (Tensor): attention map with added relative positional embeddings.
    """
    # Unpack the spatial sizes of the queries and keys
    queries_height, queries_width = q_size
    keys_height, keys_width = k_size

    # Look up the relative position embeddings along the height and width axes
    relative_height = get_rel_pos(queries_height, keys_height, rel_pos_h)
    relative_width = get_rel_pos(queries_width, keys_width, rel_pos_w)

    batch_size, _, dim = queries.shape
    r_q = queries.reshape(batch_size, queries_height, queries_width, dim)

    # Contract the query features against the relative embeddings (Einstein summation)
    relative_height = torch.einsum("bhwc,hkc->bhwk", r_q, relative_height)
    relative_weight = torch.einsum("bhwc,wkc->bhwk", r_q, relative_width)

    # Reshape the attention map to 5D, add the height/width relative terms, and flatten back to 2D scores
    attn = (
        attn.view(batch_size, queries_height, queries_width, keys_height, keys_width)
        + relative_height[:, :, :, :, None]
        + relative_weight[:, :, :, None, :]
    ).view(batch_size, queries_height * queries_width, keys_height * keys_width)

    # Return the attention map with the decomposed relative position bias added
    return attn
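
# Illustration (not part of the original file): a shape-only check of the decomposed bias, using random tensors
# with the shapes documented above and assuming the two helper functions defined earlier are in scope.
import torch

batch_heads, height, width, head_dim = 2, 14, 14, 64
attn = torch.zeros(batch_heads, height * width, height * width)
queries = torch.randn(batch_heads, height * width, head_dim)
rel_pos_h = torch.randn(2 * height - 1, head_dim)
rel_pos_w = torch.randn(2 * width - 1, head_dim)
out = add_decomposed_relative_positions(attn, queries, rel_pos_h, rel_pos_w, (height, width), (height, width))
print(out.shape)  # torch.Size([2, 196, 196])
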
class VitDetAttention(nn.Module):
    """Multi-head Attention block with relative position embeddings."""

    def __init__(self, config, input_size=None):
        """
        Args:
            config (`VitDetConfig`):
                Model configuration.
            input_size (`Tuple[int]`, *optional*):
                Input resolution, only required in case relative position embeddings are added.
        """
        super().__init__()

        dim = config.hidden_size
        num_heads = config.num_attention_heads

        self.num_heads = num_heads
        head_dim = dim // num_heads  # dimensionality of a single attention head
        self.scale = head_dim**-0.5  # scaling factor applied to the queries

        self.qkv = nn.Linear(dim, dim * 3, bias=config.qkv_bias)  # joint projection for queries, keys and values
        self.proj = nn.Linear(dim, dim)  # output projection

        self.use_relative_position_embeddings = config.use_relative_position_embeddings
        if self.use_relative_position_embeddings:
            # Learnable relative position tables for the height and width axes
            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))

    def forward(self, hidden_state, output_attentions=False):
        batch_size, height, width, _ = hidden_state.shape
        # Project to queries/keys/values and reorder to (3, batch_size, num_heads, seq_len, head_dim)
        qkv = self.qkv(hidden_state).reshape(batch_size, height * width, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        # Fold the head dimension into the batch dimension and split into queries, keys, values
        queries, keys, values = qkv.reshape(3, batch_size * self.num_heads, height * width, -1).unbind(0)

        # Scaled dot-product attention scores
        attention_scores = (queries * self.scale) @ keys.transpose(-2, -1)

        if self.use_relative_position_embeddings:
            # Add the decomposed relative position bias to the attention scores
            attention_scores = add_decomposed_relative_positions(
                attention_scores, queries, self.rel_pos_h, self.rel_pos_w, (height, width), (height, width)
            )

        attention_probs = attention_scores.softmax(dim=-1)

        hidden_state = attention_probs @ values  # weighted sum of the values
        hidden_state = hidden_state.view(batch_size, self.num_heads, height, width, -1)
        hidden_state = hidden_state.permute(0, 2, 3, 1, 4)  # move the head dimension next to the channels
        hidden_state = hidden_state.reshape(batch_size, height, width, -1)  # merge heads back into the channel dim
        hidden_state = self.proj(hidden_state)  # final output projection

        if output_attentions:
            attention_probs = attention_probs.reshape(
                batch_size, self.num_heads, attention_probs.shape[-2], attention_probs.shape[-1]
            )
            outputs = (hidden_state, attention_probs)  # also return the attention weights if requested
        else:
            outputs = (hidden_state,)

        return outputs
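
# Illustration (not part of the original file): shape walkthrough for the attention block above, assuming
# VitDetAttention (an internal class, not exported from `transformers`) is in scope alongside VitDetConfig.
import torch

demo_config = VitDetConfig()  # hidden_size=768, num_attention_heads=12 by default
demo_attention = VitDetAttention(demo_config, input_size=(14, 14))
demo_hidden = torch.randn(2, 14, 14, demo_config.hidden_size)
(demo_output,) = demo_attention(demo_hidden)
print(demo_output.shape)  # torch.Size([2, 14, 14, 768])
demo_output, demo_probs = demo_attention(demo_hidden, output_attentions=True)
print(demo_probs.shape)   # torch.Size([2, 12, 196, 196])
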


# Copied from transformers.models.beit.modeling_beit.drop_path
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    """
    # If the drop probability is 0 or we are not in training mode, return the input unchanged
    if drop_prob == 0.0 or not training:
        return input
    # Probability of keeping a sample's residual path
    keep_prob = 1 - drop_prob
    # Random tensor broadcastable over all but the batch dimension (works for tensors of any rank)
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize: 1 keeps the sample, 0 drops it
    # Rescale the kept samples so the expected value matches the input
    output = input.div(keep_prob) * random_tensor
    return output
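
# Illustration (not part of the original file): dividing by keep_prob keeps the expected activation unchanged,
# so drop_path is unbiased during training and a plain identity in eval mode. A quick empirical check:
import torch

torch.manual_seed(0)
x = torch.ones(10000, 1, 1, 1)
print(drop_path(x, drop_prob=0.3, training=True).mean().item())   # close to 1.0
print(drop_path(x, drop_prob=0.3, training=False).mean().item())  # exactly 1.0 (no-op outside training)
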
# Copied from transformers.models.beit.modeling_beit.BeitDropPath
class VitDetDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return "p={}".format(self.drop_prob)  # shown when the module is printed


class VitDetLayerNorm(nn.Module):
    """
    A LayerNorm variant, popularized by Transformers, that performs point-wise mean and variance normalization over the
    channel dimension for inputs that have shape (batch_size, channels, height, width).
    https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119
    """

    def __init__(self, normalized_shape, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(normalized_shape))  # learnable per-channel scale
        self.bias = nn.Parameter(torch.zeros(normalized_shape))  # learnable per-channel shift
        self.eps = eps
        self.normalized_shape = (normalized_shape,)

    def forward(self, x):
        u = x.mean(1, keepdim=True)  # per-position mean over the channel dimension
        s = (x - u).pow(2).mean(1, keepdim=True)  # per-position variance over the channel dimension
        x = (x - u) / torch.sqrt(s + self.eps)  # normalize
        x = self.weight[:, None, None] * x + self.bias[:, None, None]  # scale and shift per channel
        return x
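
# Illustration (not part of the original file): VitDetLayerNorm is the "channels-first" LayerNorm used in
# ConvNeXt-style code. It matches nn.LayerNorm applied to the channels-last view, as this check shows,
# assuming the class above is in scope.
import torch
from torch import nn

x = torch.randn(2, 8, 4, 4)  # (batch, channels, height, width)
out_ours = VitDetLayerNorm(8, eps=1e-6)(x)
out_ref = nn.LayerNorm(8, eps=1e-6)(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
print(torch.allclose(out_ours, out_ref, atol=1e-5))  # True
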


class VitDetResBottleneckBlock(nn.Module):
    """
    The standard bottleneck residual block without the last activation layer. It contains 3 conv layers with kernels
    1x1, 3x3, 1x1.
    """

    def __init__(self, config, in_channels, out_channels, bottleneck_channels):
        """
        Args:
            config (`VitDetConfig`):
                Model configuration.
            in_channels (`int`):
                Number of input channels.
            out_channels (`int`):
                Number of output channels.
            bottleneck_channels (`int`):
                Number of output channels for the 3x3 "bottleneck" conv layers.
        """
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, bottleneck_channels, 1, bias=False)  # 1x1 reduction conv
        self.norm1 = VitDetLayerNorm(bottleneck_channels)
        self.act1 = ACT2FN[config.hidden_act]

        self.conv2 = nn.Conv2d(bottleneck_channels, bottleneck_channels, 3, padding=1, bias=False)  # 3x3 bottleneck conv
        self.norm2 = VitDetLayerNorm(bottleneck_channels)
        self.act2 = ACT2FN[config.hidden_act]

        self.conv3 = nn.Conv2d(bottleneck_channels, out_channels, 1, bias=False)  # 1x1 expansion conv
        self.norm3 = VitDetLayerNorm(out_channels)

    def forward(self, x):
        out = x
        for layer in self.children():  # apply conv / norm / activation layers in registration order
            out = layer(out)

        out = x + out  # residual connection
        return out


class VitDetMlp(nn.Module):
    def __init__(self, config, in_features: int, hidden_features: int) -> None:
        super().__init__()
        # First linear layer: expands from in_features to hidden_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        # Activation selected by name from the configuration
        self.act = ACT2FN[config.hidden_act]
        # Second linear layer: projects back from hidden_features to in_features
        self.fc2 = nn.Linear(hidden_features, in_features)
        # Dropout with the probability given in the configuration
        self.drop = nn.Dropout(config.dropout_prob)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)

        return x
def window_partition(hidden_state, window_size):
    """
    Partition into non-overlapping windows with padding if needed.

    Args:
        hidden_state (`torch.Tensor`):
            Input tokens with [batch_size, height, width, num_channels].
        window_size (`int`):
            Window size.

    Returns:
        `tuple(torch.FloatTensor)` comprising various elements:
        - windows: windows after partition with [batch_size * num_windows, window_size, window_size, num_channels].
        - (patch_height, patch_width): padded height and width before partition
    """
    # Input dimensions
    batch_size, height, width, num_channels = hidden_state.shape

    # Amount of padding needed so height and width become multiples of the window size
    pad_height = (window_size - height % window_size) % window_size
    pad_width = (window_size - width % window_size) % window_size

    # Pad on the bottom/right if necessary
    if pad_height > 0 or pad_width > 0:
        hidden_state = nn.functional.pad(hidden_state, (0, 0, 0, pad_width, 0, pad_height))

    # Spatial size after padding
    patch_height, patch_width = height + pad_height, width + pad_width

    # Reshape into a grid of windows
    hidden_state = hidden_state.view(
        batch_size, patch_height // window_size, window_size, patch_width // window_size, window_size, num_channels
    )

    # Move the window dimensions together and flatten them into the batch dimension
    windows = hidden_state.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, num_channels)

    # Return the windows together with the padded spatial size
    return windows, (patch_height, patch_width)


def window_unpartition(windows, window_size, pad_height_width, height_width):
    """
    Window unpartition into original sequences and removing padding.

    Args:
        windows (`torch.Tensor`):
            Input tokens with [batch_size * num_windows, window_size, window_size, num_channels].
        window_size (`int`):
            Window size.
        pad_height_width (`Tuple[int]`):
            Padded height and width (patch_height, patch_width).
        height_width (`Tuple[int]`):
            Original height and width before padding.

    Returns:
        hidden_state: unpartitioned sequences with [batch_size, height, width, num_channels].
    """
    # Padded and original spatial sizes
    patch_height, patch_width = pad_height_width
    height, width = height_width

    # Recover the batch size from the number of windows
    batch_size = windows.shape[0] // (patch_height * patch_width // window_size // window_size)

    # Reshape the windows back into a grid
    hidden_state = windows.view(
        batch_size, patch_height // window_size, patch_width // window_size, window_size, window_size, -1
    )

    # Undo the permutation done in window_partition and merge the grid back into full feature maps
    hidden_state = hidden_state.permute(0, 1, 3, 2, 4, 5).contiguous().view(batch_size, patch_height, patch_width, -1)

    # Crop away the padding if any was added
    if patch_height > height or patch_width > width:
        hidden_state = hidden_state[:, :height, :width, :].contiguous()

    # Return the unpartitioned hidden states
    return hidden_state
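
# Illustration (not part of the original file): window_partition followed by window_unpartition is lossless,
# because padding is added on the bottom/right and then cropped away again.
import torch

hidden_state = torch.randn(2, 15, 17, 32)  # height/width deliberately not multiples of the window size
windows, pad_height_width = window_partition(hidden_state, window_size=7)
print(windows.shape)  # torch.Size([18, 7, 7, 32]): 2 samples * a 3x3 grid of 7x7 windows
restored = window_unpartition(windows, 7, pad_height_width, (15, 17))
print(torch.equal(restored, hidden_state))  # True
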


class VitDetLayer(nn.Module):
    """This corresponds to the Block class in the original implementation."""

    def __init__(
        self, config: VitDetConfig, drop_path_rate: float = 0, window_size: int = 0, use_residual_block: bool = False
    ) -> None:
        super().__init__()

        dim = config.hidden_size  # hidden size of the transformer
        input_size = (config.image_size // config.patch_size, config.image_size // config.patch_size)  # token grid size

        self.norm1 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.attention = VitDetAttention(
            config, input_size=input_size if window_size == 0 else (window_size, window_size)
        )  # attention over the full grid, or over local windows if window_size > 0

        self.drop_path = VitDetDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
        self.norm2 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.mlp = VitDetMlp(config=config, in_features=dim, hidden_features=int(dim * config.mlp_ratio))

        self.window_size = window_size

        self.use_residual_block = use_residual_block
        if self.use_residual_block:
            # Optionally append a bottleneck residual block after the transformer block
            self.residual = VitDetResBottleneckBlock(
                config=config,
                in_channels=dim,
                out_channels=dim,
                bottleneck_channels=dim // 2,
            )

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        hidden_states = hidden_states.permute(0, 2, 3, 1)  # (batch, channels, height, width) -> (batch, height, width, channels)

        shortcut = hidden_states  # keep the input for the residual connection

        hidden_states = self.norm1(hidden_states)

        # Window partition
        if self.window_size > 0:
            height, width = hidden_states.shape[1], hidden_states.shape[2]
            hidden_states, pad_height_width = window_partition(hidden_states, self.window_size)

        self_attention_outputs = self.attention(
            hidden_states,
            output_attentions=output_attentions,
        )

        hidden_states = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # attention probabilities, if requested

        # Reverse window partition
        if self.window_size > 0:
            hidden_states = window_unpartition(hidden_states, self.window_size, pad_height_width, (height, width))

        # First residual connection
        hidden_states = shortcut + self.drop_path(hidden_states)

        # MLP block with the second residual connection
        hidden_states = hidden_states + self.drop_path(self.mlp(self.norm2(hidden_states)))

        hidden_states = hidden_states.permute(0, 3, 1, 2)  # back to (batch, channels, height, width)

        if self.use_residual_block:
            hidden_states = self.residual(hidden_states)  # optional bottleneck residual block

        outputs = (hidden_states,) + outputs

        return outputs

# The encoder: a stack of VitDetLayer blocks
class VitDetEncoder(nn.Module):
    def __init__(self, config: VitDetConfig) -> None:
        super().__init__()
        self.config = config
        depth = config.num_hidden_layers

        # Stochastic depth decay rule: the drop-path rate increases linearly with depth
        drop_path_rate = [x.item() for x in torch.linspace(0, config.drop_path_rate, depth)]

        layers = []
        # Build one VitDetLayer per hidden layer
        for i in range(depth):
            layers.append(
                VitDetLayer(
                    config,
                    drop_path_rate=drop_path_rate[i],
                    window_size=config.window_size if i in config.window_block_indices else 0,
                    use_residual_block=i in config.residual_block_indices,
                )
            )

        # Register all layers as an nn.ModuleList
        self.layer = nn.ModuleList(layers)
        self.gradient_checkpointing = False

    # Forward pass over all encoder layers
    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[tuple, BaseModelOutput]:
        # Containers for the optional hidden-state and attention outputs
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        # Run the input through each layer in turn
        for i, layer_module in enumerate(self.layer):
            # Record the hidden states before this layer, if requested
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # Head mask for the current layer, if one was provided
            layer_head_mask = head_mask[i] if head_mask is not None else None

            # Use gradient checkpointing during training if it is enabled
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    layer_head_mask,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions)

            # The layer's first output becomes the next hidden states
            hidden_states = layer_outputs[0]

            # Collect the attention probabilities, if requested
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        # Record the final hidden states, if requested
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        # Return a plain tuple or a BaseModelOutput, depending on return_dict
        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


# Weight initialization helper mirroring Caffe2's "MSRAFill"
def caffe2_msra_fill(module: nn.Module) -> None:
    """
    Initialize `module.weight` using the "MSRAFill" implemented in Caffe2. Also initializes `module.bias` to 0.

    Source: https://detectron2.readthedocs.io/en/latest/_modules/fvcore/nn/weight_init.html.

    Args:
        module (torch.nn.Module): module to initialize.
    """
    # Kaiming-normal initialization of the weights (fan_out mode, ReLU nonlinearity)
    nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
    # Zero-initialize the bias, if present
    if module.bias is not None:
        nn.init.constant_(module.bias, 0)


# VitDetPreTrainedModel handles weight initialization and the pretrained-model loading interface
class VitDetPreTrainedModel(PreTrainedModel):
    """
    Placeholder for a pre-trained model class.
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # The configuration class associated with this model
    config_class = VitDetConfig
    # Prefix used for the base model when loading checkpoints
    base_model_prefix = "vitdet"
    # Name of the main input expected by the model
    main_input_name = "pixel_values"
    # Gradient checkpointing is supported
    supports_gradient_checkpointing = True
    # No modules need to stay on a single device when the model is split
    _no_split_modules = []

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        # Linear and convolutional layers: truncated-normal initialization of the weights
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # Upcast to float32 before trunc_normal_, since `trunc_normal_cpu` is not implemented for half precision
            module.weight.data = nn.init.trunc_normal_(
                module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
            ).to(module.weight.dtype)
            # Zero-initialize the bias, if present
            if module.bias is not None:
                module.bias.data.zero_()
        # LayerNorm layers: zero bias, unit weight
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

        # Embedding module: truncated-normal initialization of the absolute position embeddings
        elif isinstance(module, VitDetEmbeddings):
            module.position_embeddings.data = nn.init.trunc_normal_(
                module.position_embeddings.data.to(torch.float32),
                mean=0.0,
                std=self.config.initializer_range,
            ).to(module.position_embeddings.dtype)

        # Attention module with relative position embeddings: initialize both relative position tables
        elif isinstance(module, VitDetAttention) and self.config.use_relative_position_embeddings:
            module.rel_pos_h.data = nn.init.trunc_normal_(
                module.rel_pos_h.data.to(torch.float32),
                mean=0.0,
                std=self.config.initializer_range,
            )
            module.rel_pos_w.data = nn.init.trunc_normal_(
                module.rel_pos_w.data.to(torch.float32),
                mean=0.0,
                std=self.config.initializer_range,
            )

        # Residual bottleneck block: MSRA fill for the convolutions, standard init for the norms
        elif isinstance(module, VitDetResBottleneckBlock):
            for layer in [module.conv1, module.conv2, module.conv3]:
                caffe2_msra_fill(layer)
            for layer in [module.norm1, module.norm2]:
                layer.weight.data.fill_(1.0)
                layer.bias.data.zero_()
            # The last norm is zero-initialized so the block starts out as an identity mapping
            module.norm3.weight.data.zero_()
            module.norm3.bias.data.zero_()
"""
The bare VitDet Transformer model outputting raw hidden-states without any specific head on top.
This model is a PyTorch torch.nn.Module subclass. Use it as a regular PyTorch Module and refer to the PyTorch 
documentation for all matter related to general usage and behavior.

Parameters:
    config ([`VitDetConfig`]): Model configuration class with all the parameters of the model.
        Initializing with a config file does not load the weights associated with the model, only the
        configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
@add_start_docstrings(
    "The bare VitDet Transformer model outputting raw hidden-states without any specific head on top.",
    VITDET_START_DOCSTRING,
)
class VitDetModel(VitDetPreTrainedModel):
    """
    VitDetModel class represents the Vision Transformer based model for detection tasks.

    Args:
        config (VitDetConfig): The configuration object that holds all the model hyperparameters.

    Attributes:
        embeddings (VitDetEmbeddings): Instance of the embedding layer for this model.
        encoder (VitDetEncoder): Instance of the transformer encoder for this model.
        config (VitDetConfig): The configuration object that holds all the model hyperparameters.
    """
    def __init__(self, config: VitDetConfig):
        super().__init__(config)
        self.config = config

        # Initialize embeddings and encoder based on the provided configuration
        self.embeddings = VitDetEmbeddings(config)
        self.encoder = VitDetEncoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> VitDetEmbeddings:
        """
        Returns the input embeddings of the model.

        Returns:
            VitDetEmbeddings: The embedding layer used for input embeddings.
        """
        return self.embeddings.projection

    def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel

        Args:
            heads_to_prune (Dict[int, List[int]]): Dictionary mapping layer numbers to lists of head indices to prune.
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(VITDET_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        """
        前向传播函数,用于模型推理阶段或者训练阶段的前向计算。

        Returns:
        返回一个元组或者BaseModelOutput对象,取决于return_dict参数。

        Examples:
        演示如何使用该forward函数进行模型推理:

        ```
        >>> from transformers import VitDetConfig, VitDetModel
        >>> import torch

        >>> config = VitDetConfig()
        >>> model = VitDetModel(config)

        >>> pixel_values = torch.randn(1, 3, 224, 224)

        >>> with torch.no_grad():
        ...     outputs = model(pixel_values)

        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 768, 14, 14]
        ```
        """

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Prepare the head mask if needed.
        # 1.0 in head_mask means the head is kept.
        # attention_probs has shape bsz x n_heads x N x N.
        # The input head_mask has shape [num_heads] or [num_hidden_layers x num_heads],
        # and is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length].
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(pixel_values)

        encoder_outputs = self.encoder(
            embedding_output,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]

        if not return_dict:
            return (sequence_output,) + encoder_outputs[1:]

        return BaseModelOutput(
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
@add_start_docstrings(
    """
    ViTDet backbone, to be used with frameworks like Mask R-CNN.
    """,
    VITDET_START_DOCSTRING,
)
class VitDetBackbone(VitDetPreTrainedModel, BackboneMixin):
    def __init__(self, config):
        super().__init__(config)
        super()._init_backbone(config)

        # Initialize the embedding layer and the encoder
        self.embeddings = VitDetEmbeddings(config)
        self.encoder = VitDetEncoder(config)
        self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)]

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> VitDetEmbeddings:
        # Return the patch projection of the embedding layer
        return self.embeddings.projection

    @add_start_docstrings_to_model_forward(VITDET_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.Tensor,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> BackboneOutput:
        """
        Returns:

        Examples:

        ```
        >>> from transformers import VitDetConfig, VitDetBackbone
        >>> import torch

        >>> config = VitDetConfig()
        >>> model = VitDetBackbone(config)

        >>> pixel_values = torch.randn(1, 3, 224, 224)

        >>> with torch.no_grad():
        ...     outputs = model(pixel_values)

        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 768, 14, 14]
        ```"""

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

        # Embed the input pixel values
        embedding_output = self.embeddings(pixel_values)

        # Run the encoder; hidden states are always requested because the backbone needs the per-stage outputs
        outputs = self.encoder(
            embedding_output,
            output_hidden_states=True,
            output_attentions=output_attentions,
            return_dict=return_dict,
        )

        # With return_dict the hidden states live in the output object, otherwise at index 1 of the tuple
        hidden_states = outputs.hidden_states if return_dict else outputs[1]

        feature_maps = ()
        for stage, hidden_state in zip(self.stage_names, hidden_states):
            if stage in self.out_features:
                feature_maps += (hidden_state,)

        if not return_dict:
            if output_hidden_states:
                output = (feature_maps,) + outputs[1:]
            else:
                output = (feature_maps,) + outputs[2:]
            return output

        # Return a BackboneOutput with the feature maps, hidden states and attentions (when requested)
        return BackboneOutput(
            feature_maps=feature_maps,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=outputs.attentions,
        )

.\models\vitdet\__init__.py

# Copyright and license: the HuggingFace team holds the copyright; the file is distributed under the
# Apache License, Version 2.0, and may not be used except in compliance with that License.
from typing import TYPE_CHECKING

# Import the required symbols from the package-level utils module
from ...utils import (
    OptionalDependencyNotAvailable,  # exception raised when an optional dependency is missing
    _LazyModule,  # lazy module wrapper
    is_torch_available,  # checks whether PyTorch is installed
)

# Import structure: the configuration symbols plus, when available, the model symbols
_import_structure = {"configuration_vitdet": ["VITDET_PRETRAINED_CONFIG_ARCHIVE_MAP", "VitDetConfig"]}

# Check whether PyTorch is available; raise OptionalDependencyNotAvailable if not
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # If PyTorch is available, add the model symbols to the import structure
    _import_structure["modeling_vitdet"] = [
        "VITDET_PRETRAINED_MODEL_ARCHIVE_LIST",
        "VitDetModel",
        "VitDetPreTrainedModel",
        "VitDetBackbone",
    ]

# When type checking, import the symbols directly
if TYPE_CHECKING:
    # Import the configuration symbols, and the model symbols if PyTorch is available
    from .configuration_vitdet import VITDET_PRETRAINED_CONFIG_ARCHIVE_MAP, VitDetConfig

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_vitdet import (
            VITDET_PRETRAINED_MODEL_ARCHIVE_LIST,
            VitDetBackbone,
            VitDetModel,
            VitDetPreTrainedModel,
        )

# Outside of type checking, register the module lazily
else:
    import sys

    # Replace this module in sys.modules with a _LazyModule so submodules are only imported on first access
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

.\models\vitmatte\configuration_vitmatte.py

# File encoding: UTF-8
# Copyright notice
# Licensed under the Apache License, Version 2.0; you may not use this file except in compliance with the License.
# A copy of the License is available at http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed
# on an "AS IS" basis, without warranties or conditions of any kind, either express or implied.
# See the License for the specific language governing permissions and limitations.
""" VitMatte model configuration"""

# Imports
import copy
from typing import List

from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto.configuration_auto import CONFIG_MAPPING

# Module-level logger
logger = logging.get_logger(__name__)

# Map from pretrained checkpoint name to the URL of its config.json
VITMATTE_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "hustvl/vitmatte-small-composition-1k": "https://huggingface.co/hustvl/vitmatte-small-composition-1k/resolve/main/config.json",
}

# VitMatteConfig stores the configuration of a [`VitMatteForImageMatting`] model; instantiating it with the
# defaults yields a configuration similar to the ViTMatte [hustvl/vitmatte-small-composition-1k] architecture.
# Configuration objects inherit from [`PretrainedConfig`]; see its documentation for more information.
class VitMatteConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of [`VitMatteForImageMatting`]. It is used to
    instantiate a ViTMatte model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the ViTMatte
    [hustvl/vitmatte-small-composition-1k](https://huggingface.co/hustvl/vitmatte-small-composition-1k) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    """

    # The model type identifier
    model_type = "vitmatte"

    # Constructor for the ViTMatte configuration
    def __init__(
        self,
        backbone_config: PretrainedConfig = None,  # configuration of the backbone model
        backbone=None,  # name of the backbone to use when backbone_config is None
        use_pretrained_backbone=False,  # whether to use pretrained weights for the backbone
        use_timm_backbone=False,  # whether to load the backbone from the timm library instead of transformers
        backbone_kwargs=None,  # keyword arguments passed to AutoBackbone when loading from a checkpoint, e.g. output indices
        hidden_size: int = 384,  # number of input channels of the decoder
        batch_norm_eps: float = 1e-5,  # epsilon used by the batch normalization layers
        initializer_range: float = 0.02,  # std of the truncated normal initializer for all weight matrices
        convstream_hidden_sizes: List[int] = [48, 96, 192],  # output channels of the ConvStream module
        fusion_hidden_sizes: List[int] = [256, 128, 64, 32],  # output channels of the Fusion blocks
        **kwargs,  # any additional keyword arguments
    ):
        # Forward any remaining keyword arguments to the parent constructor
        super().__init__(**kwargs)

        # Pretrained backbone weights are not supported yet
        if use_pretrained_backbone:
            raise ValueError("Pretrained backbones are not supported yet.")

        # `backbone` and `backbone_config` are mutually exclusive
        if backbone_config is not None and backbone is not None:
            raise ValueError("You can't specify both `backbone` and `backbone_config`.")

        # If neither is given, fall back to a default VitDet backbone configuration
        if backbone_config is None and backbone is None:
            logger.info("`backbone_config` is `None`. Initializing the config with the default `VitDet` backbone.")
            backbone_config = CONFIG_MAPPING["vitdet"](out_features=["stage4"])
        # If backbone_config is a plain dict, build the matching config class from its `model_type`
        elif isinstance(backbone_config, dict):
            backbone_model_type = backbone_config.get("model_type")
            config_class = CONFIG_MAPPING[backbone_model_type]
            backbone_config = config_class.from_dict(backbone_config)

        # `backbone_kwargs` and `backbone_config` are mutually exclusive
        if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None:
            raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.")

        # Store the configuration attributes
        self.backbone_config = backbone_config
        self.backbone = backbone
        self.use_pretrained_backbone = use_pretrained_backbone
        self.use_timm_backbone = use_timm_backbone
        self.backbone_kwargs = backbone_kwargs
        self.batch_norm_eps = batch_norm_eps
        self.hidden_size = hidden_size
        self.initializer_range = initializer_range
        self.convstream_hidden_sizes = convstream_hidden_sizes
        self.fusion_hidden_sizes = fusion_hidden_sizes

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. Returns:
            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
        """
        # Deep-copy all attributes of the instance
        output = copy.deepcopy(self.__dict__)
        # Serialize the nested backbone configuration to a dictionary as well
        output["backbone_config"] = self.backbone_config.to_dict()
        # Record the model type of this configuration class
        output["model_type"] = self.__class__.model_type
        return output
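
Because the backbone configuration is nested, `to_dict` serializes it explicitly, and the resulting dictionary can be used to rebuild an equivalent configuration. A minimal round-trip sketch using only public classes:

from transformers import VitDetConfig, VitMatteConfig

backbone_config = VitDetConfig(num_channels=4, image_size=512, out_features=["stage12"])
config = VitMatteConfig(backbone_config=backbone_config, hidden_size=384)

as_dict = config.to_dict()
print(as_dict["model_type"])                     # 'vitmatte'
print(as_dict["backbone_config"]["model_type"])  # 'vitdet'

# The dict form can be used to re-create an equivalent configuration
restored = VitMatteConfig.from_dict(as_dict)
print(restored.backbone_config.out_features)     # ['stage12']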

.\models\vitmatte\convert_vitmatte_to_hf.py

# argparse for handling command-line arguments
import argparse

# requests for downloading the example images over HTTP
import requests

# PyTorch for loading and running the model
import torch

# hf_hub_download fetches checkpoint files from the Hugging Face Hub
from huggingface_hub import hf_hub_download

# PIL.Image for image handling
from PIL import Image

# Classes needed to build and run the converted model
from transformers import VitDetConfig, VitMatteConfig, VitMatteForImageMatting, VitMatteImageProcessor


def get_config(model_name):
    # The "small" checkpoints use a smaller hidden size and fewer attention heads
    hidden_size = 384 if "small" in model_name else 768
    num_attention_heads = 6 if "small" in model_name else 12

    # Backbone configuration for the ViTDet encoder
    backbone_config = VitDetConfig(
        num_channels=4,
        image_size=512,
        pretrain_image_size=224,
        patch_size=16,
        hidden_size=hidden_size,
        num_attention_heads=num_attention_heads,
        use_absolute_position_embeddings=True,
        use_relative_position_embeddings=True,
        window_size=14,
        # Blocks that use windowed attention (the remaining blocks attend globally)
        window_block_indices=[0, 1, 3, 4, 6, 7, 9, 10],
        # Blocks followed by a residual bottleneck block
        residual_block_indices=[2, 5, 8, 11],
        out_features=["stage12"],
    )

    # Wrap the backbone configuration in a VitMatteConfig with the matching decoder hidden size
    return VitMatteConfig(backbone_config=backbone_config, hidden_size=hidden_size)


# Build the list of (old, new) key pairs that need to be renamed
def create_rename_keys(config):
    rename_keys = []

    # Formatting is turned off here to preserve the indentation of this block
    # stem
    rename_keys.append(("backbone.pos_embed", "backbone.embeddings.position_embeddings"))
    rename_keys.append(("backbone.patch_embed.proj.weight", "backbone.embeddings.projection.weight"))
    rename_keys.append(("backbone.patch_embed.proj.bias", "backbone.embeddings.projection.bias"))

    return rename_keys


# Rename a single key of a state dict
def rename_key(dct, old, new):
    val = dct.pop(old)
    dct[new] = val


def convert_vitmatte_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub):
    # Build the configuration for the requested model
    config = get_config(model_name)

    # Original checkpoint filenames on the Hub
    model_name_to_filename = {
        "vitmatte-small-composition-1k": "ViTMatte_S_Com.pth",
        "vitmatte-base-composition-1k": "ViTMatte_B_Com.pth",
        "vitmatte-small-distinctions-646": "ViTMatte_S_DIS.pth",
        "vitmatte-base-distinctions-646": "ViTMatte_B_DIS.pth",
    }

    filename = model_name_to_filename[model_name]
    # Download the original checkpoint from the Hub
    filepath = hf_hub_download(repo_id="nielsr/vitmatte-checkpoints", filename=filename, repo_type="model")
    # Load the original state dict on CPU
    state_dict = torch.load(filepath, map_location="cpu")

    # Rename the keys to match the Transformers implementation


    # Iterate over a copy of the keys so the dict can be modified while iterating
    for key in state_dict.copy().keys():
        # Remove the entry for the current key and re-insert it under the new name
        val = state_dict.pop(key)
        # "backbone.blocks" -> "backbone.encoder.layer"
        if "backbone.blocks" in key:
            key = key.replace("backbone.blocks", "backbone.encoder.layer")
        # "attn" -> "attention"
        if "attn" in key:
            key = key.replace("attn", "attention")
        # "fusion_blks" -> "fusion_blocks"
        if "fusion_blks" in key:
            key = key.replace("fusion_blks", "fusion_blocks")
        # "bn" -> "batch_norm"
        if "bn" in key:
            key = key.replace("bn", "batch_norm")
        # Store the value under the (possibly renamed) key
        state_dict[key] = val

    # Apply the explicit rename pairs from create_rename_keys
    rename_keys = create_rename_keys(config)
    for src, dest in rename_keys:
        rename_key(state_dict, src, dest)

    # Create the image processor
    processor = VitMatteImageProcessor()
    # Create the VitMatte model and put it in evaluation mode
    model = VitMatteForImageMatting(config)
    model.eval()

    # Load the converted state dict into the model
    model.load_state_dict(state_dict)

    # Download an example image and its trimap
    url = "https://github.com/hustvl/ViTMatte/blob/main/demo/bulb_rgb.png?raw=true"
    image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
    url = "https://github.com/hustvl/ViTMatte/blob/main/demo/bulb_trimap.png?raw=true"
    trimap = Image.open(requests.get(url, stream=True).raw)

    # Preprocess the image and trimap into a pixel-value tensor
    pixel_values = processor(images=image, trimaps=trimap.convert("L"), return_tensors="pt").pixel_values

    # Run the model without gradient tracking and read out the predicted alpha matte
    with torch.no_grad():
        alphas = model(pixel_values).alphas

    # Expected slice of the alpha matte for each checkpoint, used as a sanity check
    if model_name == "vitmatte-small-composition-1k":
        expected_slice = torch.tensor([[0.9977, 0.9987, 0.9990], [0.9980, 0.9998, 0.9998], [0.9983, 0.9998, 0.9998]])
    elif model_name == "vitmatte-base-composition-1k":
        expected_slice = torch.tensor([[0.9972, 0.9971, 0.9981], [0.9948, 0.9987, 0.9994], [0.9963, 0.9992, 0.9995]])
    elif model_name == "vitmatte-small-distinctions-646":
        expected_slice = torch.tensor([[0.9880, 0.9970, 0.9972], [0.9960, 0.9996, 0.9997], [0.9963, 0.9996, 0.9997]])
    elif model_name == "vitmatte-base-distinctions-646":
        expected_slice = torch.tensor([[0.9963, 0.9998, 0.9999], [0.9995, 1.0000, 1.0000], [0.9992, 0.9999, 1.0000]])

    # Verify that the converted model reproduces the expected values
    assert torch.allclose(alphas[0, 0, :3, :3], expected_slice, atol=1e-4)
    print("Looks ok!")

    # 如果指定了 PyTorch 模型保存文件夹路径
    if pytorch_dump_folder_path is not None:
        # 打印保存模型和处理器的消息
        print(f"Saving model and processor of {model_name} to {pytorch_dump_folder_path}")
        # 将模型保存到指定路径
        model.save_pretrained(pytorch_dump_folder_path)
        # 将处理器保存到指定路径
        processor.save_pretrained(pytorch_dump_folder_path)

    # 如果指定推送到 Hub
    if push_to_hub:
        # 打印推送模型和处理器到 Hub 的消息
        print(f"Pushing model and processor for {model_name} to hub")
        # 推送模型到指定 Hub 仓库
        model.push_to_hub(f"hustvl/{model_name}")
        # 推送处理器到指定 Hub 仓库
        processor.push_to_hub(f"hustvl/{model_name}")
if __name__ == "__main__":
    # 如果这个脚本是直接运行的主程序,则执行以下代码块

    parser = argparse.ArgumentParser()
    # 创建一个参数解析器对象

    # 必需的参数
    parser.add_argument(
        "--model_name",
        default="vitmatte-small-composition-1k",
        type=str,
        choices=[
            "vitmatte-small-composition-1k",
            "vitmatte-base-composition-1k",
            "vitmatte-small-distinctions-646",
            "vitmatte-base-distinctions-646",
        ],
        help="Name of the VitMatte model you'd like to convert."
    )
    # 添加一个参数选项,用于指定 VitMatte 模型的名称,有预设的几个选择

    parser.add_argument(
        "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
    )
    # 添加一个参数选项,用于指定输出 PyTorch 模型的目录路径

    parser.add_argument(
        "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
    )
    # 添加一个参数选项,表示是否将转换后的模型推送到 🤗 hub

    # 解析命令行参数
    args = parser.parse_args()

    # 调用函数 convert_vitmatte_checkpoint,传入命令行参数中指定的模型名称、输出目录路径和是否推送到 hub 的选项
    convert_vitmatte_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
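
作为参考,也可以不经命令行、直接在 Python 中调用上面的转换函数(输出目录为示例路径;运行时会从 nielsr/vitmatte-checkpoints 下载原始权重,需要联网):

convert_vitmatte_checkpoint(
    model_name="vitmatte-small-composition-1k",
    pytorch_dump_folder_path="./vitmatte-small-composition-1k",
    push_to_hub=False,
)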

.\models\vitmatte\image_processing_vitmatte.py

# 导入所需模块和类
from typing import List, Optional, Union

import numpy as np  # 导入 NumPy 库,用于处理数组和矩阵操作

# 导入所需的图像处理工具和实用函数
from ...image_processing_utils import BaseImageProcessor, BatchFeature
from ...image_transforms import pad, to_channel_dimension_format
from ...image_utils import (
    IMAGENET_STANDARD_MEAN,  # 导入图像处理时所需的标准均值
    IMAGENET_STANDARD_STD,   # 导入图像处理时所需的标准标准差
    ChannelDimension,        # 导入通道维度枚举类
    ImageInput,              # 导入图像输入类
    get_image_size,          # 导入获取图像尺寸的函数
    infer_channel_dimension_format,  # 推断通道维度格式的函数
    is_scaled_image,         # 判断图像是否为缩放图像的函数
    make_list_of_images,     # 将图像处理为图像列表的函数
    to_numpy_array,          # 将输入转换为 NumPy 数组的函数
    valid_images,            # 验证图像有效性的函数
    validate_kwargs,         # 验证关键字参数的函数
    validate_preprocess_arguments,  # 验证预处理参数的函数
)
from ...utils import TensorType, logging  # 导入张量类型和日志记录工具

logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器对象
    """
    Args:
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
            parameter in the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
            `preprocess` method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method.
        image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
        do_pad (`bool`, *optional*, defaults to `True`):
            Whether to pad the image to make the width and height divisible by `size_divisibility`. Can be overridden
            by the `do_pad` parameter in the `preprocess` method.
        size_divisibility (`int`, *optional*, defaults to 32):
            The width and height of the image will be padded to be divisible by this number.
    """

    # 定义模型输入的名称列表,只包含一个元素 "pixel_values"
    model_input_names = ["pixel_values"]

    def __init__(
        self,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_pad: bool = True,
        size_divisibility: int = 32,
        **kwargs,
    ) -> None:
        # 调用父类的初始化方法
        super().__init__(**kwargs)
        # 初始化类的属性,设置各个参数的默认值或者根据传入的参数进行设置
        self.do_rescale = do_rescale
        self.do_normalize = do_normalize
        self.do_pad = do_pad
        self.rescale_factor = rescale_factor
        # 如果传入的 image_mean 参数不为 None,则使用传入的值;否则使用预设的 IMAGENET_STANDARD_MEAN
        self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
        # 如果传入的 image_std 参数不为 None,则使用传入的值;否则使用预设的 IMAGENET_STANDARD_STD
        self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
        self.size_divisibility = size_divisibility
        # 设置有效的处理器键名列表,用于后续数据处理
        self._valid_processor_keys = [
            "images",
            "trimaps",
            "do_rescale",
            "rescale_factor",
            "do_normalize",
            "image_mean",
            "image_std",
            "do_pad",
            "size_divisibility",
            "return_tensors",
            "data_format",
            "input_data_format",
        ]
    def pad_image(
        self,
        image: np.ndarray,
        size_divisibility: int = 32,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.ndarray:
        """
        Args:
            image (`np.ndarray`):
                Image to pad.
            size_divisibility (`int`, *optional*, defaults to 32):
                The width and height of the image will be padded to be divisible by this number.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        """
        # 推断输入图像的通道维度格式
        if input_data_format is None:
            input_data_format = infer_channel_dimension_format(image)

        # 获取图像的高度和宽度
        height, width = get_image_size(image, input_data_format)

        # 如果图像的高度或宽度不是size_divisibility的整数倍,则进行填充
        if height % size_divisibility != 0 or width % size_divisibility != 0:
            pad_height = size_divisibility - height % size_divisibility
            pad_width = size_divisibility - width % size_divisibility
            padding = ((0, pad_height), (0, pad_width))
            # 对图像进行填充操作,保证其高度和宽度是size_divisibility的整数倍
            image = pad(image, padding=padding, data_format=data_format, input_data_format=input_data_format)

        # 如果指定了输出图像的通道维度格式,则将图像转换为该格式
        if data_format is not None:
            image = to_channel_dimension_format(image, data_format, input_data_format)

        # 返回填充或转换后的图像
        return image

    def preprocess(
        self,
        images: ImageInput,
        trimaps: ImageInput,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_pad: Optional[bool] = None,
        size_divisibility: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ):
        # 预处理主入口:对图像和 trimap 进行缩放、归一化与填充,并在通道维将二者拼接为 4 通道输入,最终返回 BatchFeature

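下面是 VitMatteImageProcessor 的一个最小用法示意(输入为假设的 NumPy 数组):图像与 trimap 在通道维拼接成 4 通道输入,并被填充到 32 的整数倍。

import numpy as np

from transformers import VitMatteImageProcessor

processor = VitMatteImageProcessor()
image = np.zeros((400, 500, 3), dtype=np.uint8)   # 假设的 RGB 图像,高 400、宽 500
trimap = np.zeros((400, 500), dtype=np.uint8)     # 与图像同尺寸的单通道 trimap
inputs = processor(images=image, trimaps=trimap, return_tensors="pt")
print(inputs.pixel_values.shape)  # torch.Size([1, 4, 416, 512]):400→416、500→512,均为 32 的倍数
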
.\models\vitmatte\modeling_vitmatte.py

# 设置文件编码为 UTF-8
# 版权声明及使用条款,详细说明使用限制和免责声明
# 此处定义了 PyTorch ViTMatte 模型

from dataclasses import dataclass
from typing import Optional, Tuple

import torch
from torch import nn

# 导入通用的模型预训练工具函数和类
from ...modeling_utils import PreTrainedModel
# 导入通用的工具函数,包括添加文档字符串等
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    replace_return_docstrings,
)
# 导入特定的后端工具函数,加载背景模型
from ...utils.backbone_utils import load_backbone
# 导入 ViTMatte 模型的配置类
from .configuration_vitmatte import VitMatteConfig

# 定义预训练模型的列表
VITMATTE_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "hustvl/vitmatte-small-composition-1k",
    # 更多预训练模型列表详见 https://huggingface.co/models?filter=vitmatte
]

# 用于文档字符串的通用配置
_CONFIG_FOR_DOC = "VitMatteConfig"

@dataclass
class ImageMattingOutput(ModelOutput):
    """
    用于图像抠像模型输出的类。

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, 当 `labels` 被提供时返回):
            损失值.
        alphas (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
           估计的 alpha 通道值.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, 当 `output_hidden_states=True` 传递或当 `config.output_hidden_states=True` 时返回):
            由 `torch.FloatTensor` 组成的元组 (如果模型有嵌入层则为嵌入层的输出, 每个阶段的输出) 的形状为 `(batch_size, sequence_length, hidden_size)` 的隐藏状态
            (也称为特征映射)。
        attentions (`tuple(torch.FloatTensor)`, *optional*, 当 `output_attentions=True` 传递或当 `config.output_attentions=True` 时返回):
            由 `torch.FloatTensor` 组成的元组 (每个层一个) 的形状为 `(batch_size, num_heads, patch_size, sequence_length)` 的注意力权重。
            在注意力 softmax 后用于计算自注意力头中的加权平均值。

    """

    loss: Optional[torch.FloatTensor] = None
    alphas: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


class VitMattePreTrainedModel(PreTrainedModel):
    """
    一个抽象类,处理权重初始化和下载/加载预训练模型的简单接口。
    """

    # 配置类为 VitMatteConfig
    config_class = VitMatteConfig
    # 定义主要输入名称为 "pixel_values"
    main_input_name = "pixel_values"

    # 启用梯度检查点支持,设置为 True
    supports_gradient_checkpointing = True

    # 定义初始化权重函数 _init_weights,接受一个模块作为参数
    def _init_weights(self, module):
        # 如果传入的模块是 nn.Conv2d 类型
        if isinstance(module, nn.Conv2d):
            # 使用正态分布初始化该卷积层的权重,均值为 0,标准差为 self.config.initializer_range
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            # 如果模块有偏置项
            if module.bias is not None:
                # 将偏置项数据初始化为零
                module.bias.data.zero_()
class VitMatteBasicConv3x3(nn.Module):
    """
    Basic convolution layers including: Conv3x3, BatchNorm2d, ReLU layers.
    """

    def __init__(self, config, in_channels, out_channels, stride=2, padding=1):
        super().__init__()
        # 定义一个3x3卷积层,设置输入通道数、输出通道数、卷积核大小、步长和填充,不使用偏置
        self.conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=stride,
            padding=padding,
            bias=False,
        )
        # 批标准化层,设置输出通道数和epsilon值(用于数值稳定性)
        self.batch_norm = nn.BatchNorm2d(out_channels, eps=config.batch_norm_eps)
        # ReLU激活函数
        self.relu = nn.ReLU()

    def forward(self, hidden_state):
        # 执行卷积操作
        hidden_state = self.conv(hidden_state)
        # 执行批标准化操作
        hidden_state = self.batch_norm(hidden_state)
        # 执行ReLU激活函数操作
        hidden_state = self.relu(hidden_state)

        return hidden_state


class VitMatteConvStream(nn.Module):
    """
    Simple ConvStream containing a series of basic conv3x3 layers to extract detail features.
    """

    def __init__(self, config):
        super().__init__()

        # 获取输入通道数
        in_channels = config.backbone_config.num_channels
        # 获取卷积层隐藏层尺寸列表
        out_channels = config.convstream_hidden_sizes

        self.convs = nn.ModuleList()
        self.conv_chans = [in_channels] + out_channels

        # 根据隐藏层尺寸列表创建一系列VitMatteBasicConv3x3实例
        for i in range(len(self.conv_chans) - 1):
            in_chan_ = self.conv_chans[i]
            out_chan_ = self.conv_chans[i + 1]
            self.convs.append(VitMatteBasicConv3x3(config, in_chan_, out_chan_))

    def forward(self, pixel_values):
        out_dict = {"detailed_feature_map_0": pixel_values}
        embeddings = pixel_values
        # 遍历并应用所有卷积层,将每个输出保存到字典中
        for i in range(len(self.convs)):
            embeddings = self.convs[i](embeddings)
            name_ = "detailed_feature_map_" + str(i + 1)
            out_dict[name_] = embeddings

        return out_dict

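按上面 ConvStream 的结构,可以直观推算各级细节特征图的形状(假设默认 convstream_hidden_sizes 为 [48, 96, 192]、输入为 4×512×512):每个 VitMatteBasicConv3x3 的 stride=2,分辨率逐级减半。

height = width = 512
channels = [4, 48, 96, 192]  # 输入通道数 + 各级输出通道数(假设的默认值)
for i in range(1, len(channels)):
    height, width = height // 2, width // 2
    print(f"detailed_feature_map_{i}: ({channels[i]}, {height}, {width})")
# 依次输出 (48, 256, 256)、(96, 128, 128)、(192, 64, 64)
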

class VitMatteFusionBlock(nn.Module):
    """
    Simple fusion block to fuse features from ConvStream and Plain Vision Transformer.
    """

    def __init__(self, config, in_channels, out_channels):
        super().__init__()
        # 使用VitMatteBasicConv3x3创建一个融合块,设置输入通道数、输出通道数、步长和填充
        self.conv = VitMatteBasicConv3x3(config, in_channels, out_channels, stride=1, padding=1)

    def forward(self, features, detailed_feature_map):
        # 对特征进行上采样
        upscaled_features = nn.functional.interpolate(features, scale_factor=2, mode="bilinear", align_corners=False)
        # 拼接详细特征图和上采样特征
        out = torch.cat([detailed_feature_map, upscaled_features], dim=1)
        # 执行卷积操作
        out = self.conv(out)

        return out

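融合块的输入通道数等于细节特征与上采样后 ViT 特征的通道数之和,下面用假设的张量尺寸演示这一拼接过程:

import torch

features = torch.randn(1, 384, 32, 32)  # 假设的 ViT 特征
detail = torch.randn(1, 192, 64, 64)    # 假设的 ConvStream 细节特征
upscaled = torch.nn.functional.interpolate(features, scale_factor=2, mode="bilinear", align_corners=False)
fused_input = torch.cat([detail, upscaled], dim=1)
print(fused_input.shape)  # torch.Size([1, 576, 64, 64]),之后由 3x3 卷积压缩到 out_channels
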

class VitMatteHead(nn.Module):
    """
    Simple Matting Head, containing only conv3x3 and conv1x1 layers.
    """

    def __init__(self, config):
        super().__init__()

        # 获取融合块隐藏层尺寸列表的最后一个值作为输入通道数
        in_channels = config.fusion_hidden_sizes[-1]
        # 设置中间通道数为16
        mid_channels = 16

        # 创建一个简单的卷积网络序列,包含一个3x3卷积层、批标准化层和ReLU激活函数
        self.matting_convs = nn.Sequential(
            nn.Conv2d(in_channels, mid_channels, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(mid_channels),
            nn.ReLU(True),
            nn.Conv2d(mid_channels, 1, kernel_size=1, stride=1, padding=0),
        )
    # 定义一个方法用于正向传播,接收隐藏状态作为输入参数
    def forward(self, hidden_state):
        # 使用类内部定义的 matting_convs 层对输入的隐藏状态进行变换处理
        hidden_state = self.matting_convs(hidden_state)
        
        # 方法的返回值为经过变换后的隐藏状态
        return hidden_state


VITMATTE_START_DOCSTRING = r"""
    Parameters:
    This model is a PyTorch `torch.nn.Module` sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.
        config ([`UperNetConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

VITMATTE_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            输入的像素值。默认情况下将忽略填充。可以使用 [`AutoImageProcessor`] 获得像素值,详见 [`VitMatteImageProcessor.__call__`]。
        output_attentions (`bool`, *optional*):
            是否返回所有注意力层的注意力张量(如果后端支持)。返回张量中的 `attentions` 字段包含更多细节。
        output_hidden_states (`bool`, *optional*):
            是否返回后端所有层的隐藏状态。返回张量中的 `hidden_states` 字段包含更多细节。
        return_dict (`bool`, *optional*):
            是否返回 [`~utils.ModelOutput`] 而不是普通元组。
"""


# 使用装饰器添加类的文档字符串,描述了 ViTMatte 框架如何利用任意视觉骨干网络(如 ADE20k、CityScapes)进行图像抠图。
@add_start_docstrings(
    """ViTMatte framework leveraging any vision backbone e.g. for ADE20k, CityScapes.""",
    VITMATTE_START_DOCSTRING,
)
class VitMatteForImageMatting(VitMattePreTrainedModel):
    """
    派生自 VitMattePreTrainedModel 的图像抠图模型类。
    """

    def __init__(self, config):
        """
        初始化方法。

        Args:
            config (PretrainedConfig): 模型的配置对象。

        """
        super().__init__(config)
        self.config = config

        # 载入指定的视觉骨干网络
        self.backbone = load_backbone(config)
        
        # 初始化 VitMatteDetailCaptureModule 模块
        self.decoder = VitMatteDetailCaptureModule(config)

        # 初始化权重并进行最终处理
        self.post_init()

    @add_start_docstrings_to_model_forward(VITMATTE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=ImageMattingOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
        return_dict: Optional[bool] = None,
        """
        正向传播方法。

        Args:
            pixel_values (torch.Tensor, optional): 输入像素值张量。默认为 None。
            output_attentions (bool, optional): 是否输出注意力权重。默认为 None。
            output_hidden_states (bool, optional): 是否输出隐藏状态。默认为 None。
            labels (torch.Tensor, optional): 标签张量。默认为 None。
            return_dict (bool, optional): 是否返回字典格式结果。默认为 None。

        Returns:
            依据配置返回的输出类型,通常为 ImageMattingOutput 对象。

        """
        """
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth image matting for computing the loss.

        Returns:
            Returns either a tuple of tensors or an `ImageMattingOutput` object containing loss, alphas,
            hidden states, and attentions.

        Examples:

        ```
        >>> from transformers import VitMatteImageProcessor, VitMatteForImageMatting
        >>> import torch
        >>> from PIL import Image
        >>> from huggingface_hub import hf_hub_download

        >>> processor = VitMatteImageProcessor.from_pretrained("hustvl/vitmatte-small-composition-1k")
        >>> model = VitMatteForImageMatting.from_pretrained("hustvl/vitmatte-small-composition-1k")

        >>> filepath = hf_hub_download(
        ...     repo_id="hf-internal-testing/image-matting-fixtures", filename="image.png", repo_type="dataset"
        ... )
        >>> image = Image.open(filepath).convert("RGB")
        >>> filepath = hf_hub_download(
        ...     repo_id="hf-internal-testing/image-matting-fixtures", filename="trimap.png", repo_type="dataset"
        ... )
        >>> trimap = Image.open(filepath).convert("L")

        >>> # prepare image + trimap for the model
        >>> inputs = processor(images=image, trimaps=trimap, return_tensors="pt")

        >>> with torch.no_grad():
        ...     alphas = model(**inputs).alphas
        >>> print(alphas.shape)
        torch.Size([1, 1, 640, 960])
        ```

        """
        # If return_dict is not provided, use the default from model configuration
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # If output_hidden_states is not provided, use the default from model configuration
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        # If output_attentions is not provided, use the default from model configuration
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

        # Forward pass through the backbone with specified arguments
        outputs = self.backbone.forward_with_filtered_kwargs(
            pixel_values, output_hidden_states=output_hidden_states, output_attentions=output_attentions
        )

        # Retrieve the feature maps from the outputs
        features = outputs.feature_maps[-1]

        # Generate alphas using the decoder with the extracted features and pixel values
        alphas = self.decoder(features, pixel_values)

        # Initialize loss variable
        loss = None

        # If labels are provided, raise NotImplementedError since training is not supported
        if labels is not None:
            raise NotImplementedError("Training is not yet supported")

        # If return_dict is False, return a tuple including alphas and other outputs
        if not return_dict:
            output = (alphas,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        # If return_dict is True, return an `ImageMattingOutput` object containing all relevant outputs
        return ImageMattingOutput(
            loss=loss,
            alphas=alphas,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

.\models\vitmatte\__init__.py

# 版权声明和保留所有权利的声明
# 根据 Apache 许可证 2.0 版本授权,许可证详细信息可以通过给定的 URL 获取
from typing import TYPE_CHECKING

# 从特定的路径导入必要的模块和函数
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_torch_available,
    is_vision_available,
)

# 定义模块导入结构,包含配置和模型定义
_import_structure = {"configuration_vitmatte": ["VITMATTE_PRETRAINED_CONFIG_ARCHIVE_MAP", "VitMatteConfig"]}

# 检查视觉处理模块是否可用,若不可用则抛出异常
try:
    if not is_vision_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果可用,将图像处理模块导入结构中
    _import_structure["image_processing_vitmatte"] = ["VitMatteImageProcessor"]

# 检查 Torch 是否可用,若不可用则抛出异常
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果 Torch 可用,将模型处理模块导入结构中
    _import_structure["modeling_vitmatte"] = [
        "VITMATTE_PRETRAINED_MODEL_ARCHIVE_LIST",
        "VitMattePreTrainedModel",
        "VitMatteForImageMatting",
    ]

# 如果处于类型检查模式,导入特定的配置和模型类
if TYPE_CHECKING:
    from .configuration_vitmatte import VITMATTE_PRETRAINED_CONFIG_ARCHIVE_MAP, VitMatteConfig

    try:
        if not is_vision_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 如果视觉处理可用,导入图像处理器类
        from .image_processing_vitmatte import VitMatteImageProcessor

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 如果 Torch 可用,导入模型相关类
        from .modeling_vitmatte import (
            VITMATTE_PRETRAINED_MODEL_ARCHIVE_LIST,
            VitMatteForImageMatting,
            VitMattePreTrainedModel,
        )

# 如果不处于类型检查模式,使用 LazyModule 来处理模块的延迟导入
else:
    import sys

    # 将当前模块替换为 LazyModule,实现按需导入功能
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
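
得益于上面的延迟导入机制,只有在真正访问相应名称时才会加载对应子模块。一个简单的使用示意(假设已安装 torch):

from transformers.models.vitmatte import VitMatteConfig, VitMatteForImageMatting

config = VitMatteConfig()                 # 触发 configuration_vitmatte 的加载
model = VitMatteForImageMatting(config)   # 触发 modeling_vitmatte 的加载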

.\models\vits\configuration_vits.py

# coding=utf-8
# Copyright 2023 The Kakao Enterprise Authors and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" VITS model configuration"""


# 引入预训练配置类 PretrainedConfig 和日志工具 logging
from ...configuration_utils import PretrainedConfig
from ...utils import logging

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)

# 预训练配置映射字典,指定预训练模型名称及其配置文件的下载链接
VITS_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "facebook/mms-tts-eng": "https://huggingface.co/facebook/mms-tts-eng/resolve/main/config.json",
}


# VitsConfig 类继承自 PretrainedConfig,用于存储 VITS 模型的配置信息
class VitsConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`VitsModel`]. It is used to instantiate a VITS
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the VITS
    [facebook/mms-tts-eng](https://huggingface.co/facebook/mms-tts-eng) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Example:

    ```
    >>> from transformers import VitsModel, VitsConfig

    >>> # Initializing a "facebook/mms-tts-eng" style configuration
    >>> configuration = VitsConfig()

    >>> # Initializing a model (with random weights) from the "facebook/mms-tts-eng" style configuration
    >>> model = VitsModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    
    """

    # model_type 属性指定了模型类型为 "vits"
    model_type = "vits"
    # 定义初始化方法,用于创建和初始化 TransformerTTS 模型
    def __init__(
        self,
        vocab_size=38,  # 词汇表大小,默认为 38
        hidden_size=192,  # 隐藏层大小,默认为 192
        num_hidden_layers=6,  # Transformer 中隐藏层的数量,默认为 6
        num_attention_heads=2,  # 注意力头的数量,默认为 2
        window_size=4,  # 窗口大小,默认为 4
        use_bias=True,  # 是否使用偏置,默认为 True
        ffn_dim=768,  # FeedForward 网络的维度,默认为 768
        layerdrop=0.1,  # 层丢弃率,默认为 0.1
        ffn_kernel_size=3,  # FeedForward 网络的卷积核大小,默认为 3
        flow_size=192,  # 流的大小,默认为 192
        spectrogram_bins=513,  # 频谱图的频率分辨率,默认为 513
        hidden_act="relu",  # 隐藏层激活函数,默认为 ReLU
        hidden_dropout=0.1,  # 隐藏层的 dropout 概率,默认为 0.1
        attention_dropout=0.1,  # 注意力机制的 dropout 概率,默认为 0.1
        activation_dropout=0.1,  # 激活函数的 dropout 概率,默认为 0.1
        initializer_range=0.02,  # 参数初始化范围,默认为 0.02
        layer_norm_eps=1e-5,  # Layer Normalization 的 epsilon,默认为 1e-5
        use_stochastic_duration_prediction=True,  # 是否使用随机时长预测,默认为 True
        num_speakers=1,  # 说话者的数量,默认为 1
        speaker_embedding_size=0,  # 说话者嵌入的维度,默认为 0
        upsample_initial_channel=512,  # 上采样层的初始通道数,默认为 512
        upsample_rates=[8, 8, 2, 2],  # 上采样层的上采样率列表,默认为 [8, 8, 2, 2]
        upsample_kernel_sizes=[16, 16, 4, 4],  # 上采样层的卷积核大小列表,默认为 [16, 16, 4, 4]
        resblock_kernel_sizes=[3, 7, 11],  # ResBlock 的卷积核大小列表,默认为 [3, 7, 11]
        resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],  # ResBlock 的扩张率列表,默认为 [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
        leaky_relu_slope=0.1,  # Leaky ReLU 斜率,默认为 0.1
        depth_separable_channels=2,  # 深度可分离卷积的通道数,默认为 2
        depth_separable_num_layers=3,  # 深度可分离卷积的层数,默认为 3
        duration_predictor_flow_bins=10,  # 时长预测器的流的数量,默认为 10
        duration_predictor_tail_bound=5.0,  # 时长预测器的尾部边界,默认为 5.0
        duration_predictor_kernel_size=3,  # 时长预测器的卷积核大小,默认为 3
        duration_predictor_dropout=0.5,  # 时长预测器的 dropout 概率,默认为 0.5
        duration_predictor_num_flows=4,  # 时长预测器的流的数量,默认为 4
        duration_predictor_filter_channels=256,  # 时长预测器的卷积滤波器通道数,默认为 256
        prior_encoder_num_flows=4,  # 先验编码器的流的数量,默认为 4
        prior_encoder_num_wavenet_layers=4,  # 先验编码器的 WaveNet 层的数量,默认为 4
        posterior_encoder_num_wavenet_layers=16,  # 后验编码器的 WaveNet 层的数量,默认为 16
        wavenet_kernel_size=5,  # WaveNet 的卷积核大小,默认为 5
        wavenet_dilation_rate=1,  # WaveNet 的膨胀率,默认为 1
        wavenet_dropout=0.0,  # WaveNet 的 dropout 概率,默认为 0.0
        speaking_rate=1.0,  # 说话速率,默认为 1.0
        noise_scale=0.667,  # 噪声缩放因子,默认为 0.667
        noise_scale_duration=0.8,  # 时长噪声缩放因子,默认为 0.8
        sampling_rate=16_000,  # 采样率,默认为 16,000
        **kwargs,  # 其它未命名参数
    ):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.window_size = window_size
        self.use_bias = use_bias
        self.ffn_dim = ffn_dim
        self.layerdrop = layerdrop
        self.ffn_kernel_size = ffn_kernel_size
        self.flow_size = flow_size
        self.spectrogram_bins = spectrogram_bins
        self.hidden_act = hidden_act
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.use_stochastic_duration_prediction = use_stochastic_duration_prediction
        self.num_speakers = num_speakers
        self.speaker_embedding_size = speaker_embedding_size
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_rates = upsample_rates
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.leaky_relu_slope = leaky_relu_slope
        self.depth_separable_channels = depth_separable_channels
        self.depth_separable_num_layers = depth_separable_num_layers
        self.duration_predictor_flow_bins = duration_predictor_flow_bins
        self.duration_predictor_tail_bound = duration_predictor_tail_bound
        self.duration_predictor_kernel_size = duration_predictor_kernel_size
        self.duration_predictor_dropout = duration_predictor_dropout
        self.duration_predictor_num_flows = duration_predictor_num_flows
        self.duration_predictor_filter_channels = duration_predictor_filter_channels
        self.prior_encoder_num_flows = prior_encoder_num_flows
        self.prior_encoder_num_wavenet_layers = prior_encoder_num_wavenet_layers
        self.posterior_encoder_num_wavenet_layers = posterior_encoder_num_wavenet_layers
        self.wavenet_kernel_size = wavenet_kernel_size
        self.wavenet_dilation_rate = wavenet_dilation_rate
        self.wavenet_dropout = wavenet_dropout
        self.speaking_rate = speaking_rate
        self.noise_scale = noise_scale
        self.noise_scale_duration = noise_scale_duration
        self.sampling_rate = sampling_rate

        # 检查 `upsample_kernel_sizes` 和 `upsample_rates` 的长度是否一致,不一致则抛出 ValueError 异常
        if len(upsample_kernel_sizes) != len(upsample_rates):
            raise ValueError(
                f"The length of `upsample_kernel_sizes` ({len(upsample_kernel_sizes)}) must match the length of "
                f"`upsample_rates` ({len(upsample_rates)})"
            )

        # 调用父类的初始化方法,传入可能的关键字参数
        super().__init__(**kwargs)

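一个简单的配置用法示意:解码器的总上采样倍数是各级 upsample_rates 的乘积,它决定了每个潜变量帧对应多少个波形采样点。

import math

from transformers import VitsConfig

config = VitsConfig()
print(math.prod(config.upsample_rates))  # 8 * 8 * 2 * 2 = 256
print(config.sampling_rate)              # 16000
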
.\models\vits\convert_original_checkpoint.py

# 设置编码格式为 UTF-8

# 版权声明和许可证信息,指定了 Apache License, Version 2.0 的使用条件和限制
# 您可以通过访问指定的 URL 查看许可证的详细内容:http://www.apache.org/licenses/LICENSE-2.0

"""Convert VITS checkpoint."""

# 导入必要的库和模块
import argparse  # 解析命令行参数的库
import json  # 处理 JSON 格式数据的库
import tempfile  # 创建临时文件和目录的库

import torch  # PyTorch 深度学习库
from huggingface_hub import hf_hub_download  # Hugging Face Hub 下载模块

from transformers import VitsConfig, VitsModel, VitsTokenizer, logging  # Hugging Face Transformers 库中的相关模块

# 设置日志的详细程度为 info 级别
logging.set_verbosity_info()

# 获取或创建名为 "transformers.models.vits" 的日志记录器对象
logger = logging.get_logger("transformers.models.vits")

# 将 VITS 模型中文本编码器相关参数的映射定义为字典
MAPPING_TEXT_ENCODER = {
    "enc_p.emb": "text_encoder.embed_tokens",
    "enc_p.encoder.attn_layers.*.conv_k": "text_encoder.encoder.layers.*.attention.k_proj",
    "enc_p.encoder.attn_layers.*.conv_v": "text_encoder.encoder.layers.*.attention.v_proj",
    "enc_p.encoder.attn_layers.*.conv_q": "text_encoder.encoder.layers.*.attention.q_proj",
    "enc_p.encoder.attn_layers.*.conv_o": "text_encoder.encoder.layers.*.attention.out_proj",
    "enc_p.encoder.attn_layers.*.emb_rel_k": "text_encoder.encoder.layers.*.attention.emb_rel_k",
    "enc_p.encoder.attn_layers.*.emb_rel_v": "text_encoder.encoder.layers.*.attention.emb_rel_v",
    "enc_p.encoder.norm_layers_1.*.gamma": "text_encoder.encoder.layers.*.layer_norm.weight",
    "enc_p.encoder.norm_layers_1.*.beta": "text_encoder.encoder.layers.*.layer_norm.bias",
    "enc_p.encoder.ffn_layers.*.conv_1": "text_encoder.encoder.layers.*.feed_forward.conv_1",
    "enc_p.encoder.ffn_layers.*.conv_2": "text_encoder.encoder.layers.*.feed_forward.conv_2",
    "enc_p.encoder.norm_layers_2.*.gamma": "text_encoder.encoder.layers.*.final_layer_norm.weight",
    "enc_p.encoder.norm_layers_2.*.beta": "text_encoder.encoder.layers.*.final_layer_norm.bias",
    "enc_p.proj": "text_encoder.project",
}

# 将 VITS 模型中随机持续时间预测器相关参数的映射定义为字典
MAPPING_STOCHASTIC_DURATION_PREDICTOR = {
    "dp.pre": "duration_predictor.conv_pre",
    "dp.proj": "duration_predictor.conv_proj",
    "dp.convs.convs_sep.*": "duration_predictor.conv_dds.convs_dilated.*",
    "dp.convs.convs_1x1.*": "duration_predictor.conv_dds.convs_pointwise.*",
    "dp.convs.norms_1.*.gamma": "duration_predictor.conv_dds.norms_1.*.weight",
    "dp.convs.norms_1.*.beta": "duration_predictor.conv_dds.norms_1.*.bias",
    "dp.convs.norms_2.*.gamma": "duration_predictor.conv_dds.norms_2.*.weight",
    "dp.convs.norms_2.*.beta": "duration_predictor.conv_dds.norms_2.*.bias",
    "dp.flows.0.logs": "duration_predictor.flows.0.log_scale",
    "dp.flows.0.m": "duration_predictor.flows.0.translate",
    "dp.flows.*.pre": "duration_predictor.flows.*.conv_pre",
}
    # 将模型参数中的路径映射转换为新的路径,用于模型权重加载和迁移
    "dp.flows.*.proj": "duration_predictor.flows.*.conv_proj",
    # 转换卷积层的路径,将原路径映射到新的路径
    "dp.flows.*.convs.convs_1x1.0": "duration_predictor.flows.*.conv_dds.convs_pointwise.0",
    # 转换卷积层的路径,将原路径映射到新的路径
    "dp.flows.*.convs.convs_1x1.1": "duration_predictor.flows.*.conv_dds.convs_pointwise.1",
    # 转换卷积层的路径,将原路径映射到新的路径
    "dp.flows.*.convs.convs_1x1.2": "duration_predictor.flows.*.conv_dds.convs_pointwise.2",
    # 转换分离卷积层的路径,将原路径映射到新的路径
    "dp.flows.*.convs.convs_sep.0": "duration_predictor.flows.*.conv_dds.convs_dilated.0",
    # 转换分离卷积层的路径,将原路径映射到新的路径
    "dp.flows.*.convs.convs_sep.1": "duration_predictor.flows.*.conv_dds.convs_dilated.1",
    # 转换分离卷积层的路径,将原路径映射到新的路径
    "dp.flows.*.convs.convs_sep.2": "duration_predictor.flows.*.conv_dds.convs_dilated.2",
    # 转换归一化层的 gamma 参数路径,将原路径映射到新的路径
    "dp.flows.*.convs.norms_1.0.gamma": "duration_predictor.flows.*.conv_dds.norms_1.0.weight",
    # 转换归一化层的 beta 参数路径,将原路径映射到新的路径
    "dp.flows.*.convs.norms_1.0.beta": "duration_predictor.flows.*.conv_dds.norms_1.0.bias",
    # 转换归一化层的 gamma 参数路径,将原路径映射到新的路径
    "dp.flows.*.convs.norms_1.1.gamma": "duration_predictor.flows.*.conv_dds.norms_1.1.weight",
    # 转换归一化层的 beta 参数路径,将原路径映射到新的路径
    "dp.flows.*.convs.norms_1.1.beta": "duration_predictor.flows.*.conv_dds.norms_1.1.bias",
    # 转换归一化层的 gamma 参数路径,将原路径映射到新的路径
    "dp.flows.*.convs.norms_1.2.gamma": "duration_predictor.flows.*.conv_dds.norms_1.2.weight",
    # 转换归一化层的 beta 参数路径,将原路径映射到新的路径
    "dp.flows.*.convs.norms_1.2.beta": "duration_predictor.flows.*.conv_dds.norms_1.2.bias",
    # 转换归一化层的 gamma 参数路径,将原路径映射到新的路径
    "dp.flows.*.convs.norms_2.0.gamma": "duration_predictor.flows.*.conv_dds.norms_2.0.weight",
    # 转换归一化层的 beta 参数路径,将原路径映射到新的路径
    "dp.flows.*.convs.norms_2.0.beta": "duration_predictor.flows.*.conv_dds.norms_2.0.bias",
    # 转换归一化层的 gamma 参数路径,将原路径映射到新的路径
    "dp.flows.*.convs.norms_2.1.gamma": "duration_predictor.flows.*.conv_dds.norms_2.1.weight",
    # 转换归一化层的 beta 参数路径,将原路径映射到新的路径
    "dp.flows.*.convs.norms_2.1.beta": "duration_predictor.flows.*.conv_dds.norms_2.1.bias",
    # 转换归一化层的 gamma 参数路径,将原路径映射到新的路径
    "dp.flows.*.convs.norms_2.2.gamma": "duration_predictor.flows.*.conv_dds.norms_2.2.weight",
    # 转换归一化层的 beta 参数路径,将原路径映射到新的路径
    "dp.flows.*.convs.norms_2.2.beta": "duration_predictor.flows.*.conv_dds.norms_2.2.bias",
    # 转换后处理阶段的路径,将原路径映射到新的路径
    "dp.post_pre": "duration_predictor.post_conv_pre",
    # 转换后处理阶段的路径,将原路径映射到新的路径
    "dp.post_proj": "duration_predictor.post_conv_proj",
    # 转换后处理阶段的分离卷积层路径,将原路径映射到新的路径
    "dp.post_convs.convs_sep.*": "duration_predictor.post_conv_dds.convs_dilated.*",
    # 转换后处理阶段的 1x1 卷积层路径,将原路径映射到新的路径
    "dp.post_convs.convs_1x1.*": "duration_predictor.post_conv_dds.convs_pointwise.*",
    # 转换后处理阶段的归一化层 gamma 参数路径,将原路径映射到新的路径
    "dp.post_convs.norms_1.*.gamma": "duration_predictor.post_conv_dds.norms_1.*.weight",
    # 转换后处理阶段的归一化层 beta 参数路径,将原路径映射到新的路径
    "dp.post_convs.norms_1.*.beta": "duration_predictor.post_conv_dds.norms_1.*.bias",
    # 转换后处理阶段的归一化层 gamma 参数路径,将原路径映射到新的路径
    "dp.post_convs.norms_2.*.gamma": "duration_predictor.post_conv_dds.norms_2.*.weight",
    # 转换后处理阶段的归一化层 beta 参数路径,将原路径映射到新的路径
    "dp.post_convs.norms_2.*.beta": "duration_predictor.post_conv_dds.norms_2.*.bias",
    # 转换后处理阶段的 logs 参数路径,将原路径映射到新的路径
    "dp.post_flows.0.logs": "duration_predictor.post_flows.0.log_scale",
    # 转换后处理阶段的 m 参数路径,将原路径映射到新的路径
    "dp.post_flows.0.m": "duration_predictor.post_flows.0.translate",
    # 转换后处理阶段的前处理路径,将原路径映射到新的路径
    "dp.post_flows.*.pre": "duration_predictor.post_flows.*.conv_pre",
    # 转换后处理阶段的投影路径,将原路径映射到新的路径
    "dp.post_flows.*.proj": "duration_predictor.post_flows.*.conv_proj",
    # 转换后处理阶段的卷积层路径,将原路径映射到新的路径
    "dp.post_flows.*.convs.convs_1x1.0": "duration_predictor.post_flows.*.conv_dds.convs_pointwise.0",
    # 转换后处理阶段的卷积层路径,将原路径映射到新的路径
    "dp.post_flows.*.convs.convs_1x1.1": "duration_predictor.post_flows.*.conv_dds.convs_pointwise.1",
    # 转换后处理阶段的卷积层路径,将原路径映射到新的路径
    "dp
    # 定义一组映射关系,将源字符串路径映射到目标字符串路径
    "dp.post_flows.*.convs.convs_sep.0": "duration_predictor.post_flows.*.conv_dds.convs_dilated.0",
    "dp.post_flows.*.convs.convs_sep.1": "duration_predictor.post_flows.*.conv_dds.convs_dilated.1",
    "dp.post_flows.*.convs.convs_sep.2": "duration_predictor.post_flows.*.conv_dds.convs_dilated.2",
    # 映射 gamma 参数的路径
    "dp.post_flows.*.convs.norms_1.0.gamma": "duration_predictor.post_flows.*.conv_dds.norms_1.0.weight",
    "dp.post_flows.*.convs.norms_1.0.beta": "duration_predictor.post_flows.*.conv_dds.norms_1.0.bias",
    "dp.post_flows.*.convs.norms_1.1.gamma": "duration_predictor.post_flows.*.conv_dds.norms_1.1.weight",
    "dp.post_flows.*.convs.norms_1.1.beta": "duration_predictor.post_flows.*.conv_dds.norms_1.1.bias",
    "dp.post_flows.*.convs.norms_1.2.gamma": "duration_predictor.post_flows.*.conv_dds.norms_1.2.weight",
    "dp.post_flows.*.convs.norms_1.2.beta": "duration_predictor.post_flows.*.conv_dds.norms_1.2.bias",
    "dp.post_flows.*.convs.norms_2.0.gamma": "duration_predictor.post_flows.*.conv_dds.norms_2.0.weight",
    "dp.post_flows.*.convs.norms_2.0.beta": "duration_predictor.post_flows.*.conv_dds.norms_2.0.bias",
    "dp.post_flows.*.convs.norms_2.1.gamma": "duration_predictor.post_flows.*.conv_dds.norms_2.1.weight",
    "dp.post_flows.*.convs.norms_2.1.beta": "duration_predictor.post_flows.*.conv_dds.norms_2.1.bias",
    "dp.post_flows.*.convs.norms_2.2.gamma": "duration_predictor.post_flows.*.conv_dds.norms_2.2.weight",
    "dp.post_flows.*.convs.norms_2.2.beta": "duration_predictor.post_flows.*.conv_dds.norms_2.2.bias",
    # 映射条件参数路径
    "dp.cond": "duration_predictor.cond",  # num_speakers > 1

}

# 定义一个映射字典,用于将流模型(flow)的权重键映射到不同的键
MAPPING_FLOW = {
    "flow.flows.*.pre": "flow.flows.*.conv_pre",
    "flow.flows.*.enc.in_layers.0": "flow.flows.*.wavenet.in_layers.0",
    "flow.flows.*.enc.in_layers.1": "flow.flows.*.wavenet.in_layers.1",
    "flow.flows.*.enc.in_layers.2": "flow.flows.*.wavenet.in_layers.2",
    "flow.flows.*.enc.in_layers.3": "flow.flows.*.wavenet.in_layers.3",
    "flow.flows.*.enc.res_skip_layers.0": "flow.flows.*.wavenet.res_skip_layers.0",
    "flow.flows.*.enc.res_skip_layers.1": "flow.flows.*.wavenet.res_skip_layers.1",
    "flow.flows.*.enc.res_skip_layers.2": "flow.flows.*.wavenet.res_skip_layers.2",
    "flow.flows.*.enc.res_skip_layers.3": "flow.flows.*.wavenet.res_skip_layers.3",
    "flow.flows.*.enc.cond_layer": "flow.flows.*.wavenet.cond_layer",  # 当 num_speakers > 1 时使用
    "flow.flows.*.post": "flow.flows.*.conv_post",
}

# 定义一个映射字典,用于将生成器的权重键映射到不同的键
MAPPING_GENERATOR = {
    "dec.conv_pre": "decoder.conv_pre",
    "dec.ups.0": "decoder.upsampler.0",
    "dec.ups.1": "decoder.upsampler.1",
    "dec.ups.2": "decoder.upsampler.2",
    "dec.ups.3": "decoder.upsampler.3",
    "dec.resblocks.*.convs1.0": "decoder.resblocks.*.convs1.0",
    "dec.resblocks.*.convs1.1": "decoder.resblocks.*.convs1.1",
    "dec.resblocks.*.convs1.2": "decoder.resblocks.*.convs1.2",
    "dec.resblocks.*.convs2.0": "decoder.resblocks.*.convs2.0",
    "dec.resblocks.*.convs2.1": "decoder.resblocks.*.convs2.1",
    "dec.resblocks.*.convs2.2": "decoder.resblocks.*.convs2.2",
    "dec.conv_post": "decoder.conv_post",
    "dec.cond": "decoder.cond",  # 当 num_speakers > 1 时使用
}

# 定义一个映射字典,用于将后验编码器的权重键映射到不同的键
MAPPING_POSTERIOR_ENCODER = {
    "enc_q.pre": "posterior_encoder.conv_pre",
    "enc_q.enc.in_layers.*": "posterior_encoder.wavenet.in_layers.*",
    "enc_q.enc.res_skip_layers.*": "posterior_encoder.wavenet.res_skip_layers.*",
    "enc_q.enc.cond_layer": "posterior_encoder.wavenet.cond_layer",  # 当 num_speakers > 1 时使用
    "enc_q.proj": "posterior_encoder.conv_proj",
}

# 合并所有映射字典
MAPPING = {
    **MAPPING_TEXT_ENCODER,
    **MAPPING_STOCHASTIC_DURATION_PREDICTOR,
    **MAPPING_FLOW,
    **MAPPING_GENERATOR,
    **MAPPING_POSTERIOR_ENCODER,
    "emb_g": "embed_speaker",  # 当 num_speakers > 1 时使用
}

# 初始化一个空列表,用于存储顶级键
TOP_LEVEL_KEYS = []
# 初始化一个空列表,用于存储忽略的键
IGNORE_KEYS = []


def set_recursively(hf_pointer, key, value, full_name, weight_type):
    # 递归设置 hf_pointer 中指定的 key 属性值为 value
    for attribute in key.split("."):
        # 通过循环逐级获取属性,直到达到指定的 key 所在的属性位置
        hf_pointer = getattr(hf_pointer, attribute)

    if weight_type is not None:
        # 如果指定了 weight_type,则获取对应的形状信息
        hf_shape = getattr(hf_pointer, weight_type).shape
    else:
        # 否则获取整体的形状信息
        hf_shape = hf_pointer.shape

    # 如果 key 以特定字符串结尾,则压缩掉最后的核心维度(原始权重为 Conv1d)
    if key.endswith(".k_proj") or key.endswith(".v_proj") or key.endswith(".q_proj") or key.endswith(".out_proj"):
        value = value.squeeze(-1)

    # 检查值的形状是否与 hf_pointer 的形状相匹配,如果不匹配则抛出 ValueError
    if hf_shape != value.shape:
        raise ValueError(
            f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be"
            f" {value.shape} for {full_name}"
        )

    # 根据权重类型,将值赋给 hf_pointer 对象的相应属性
    if weight_type == "weight":
        hf_pointer.weight.data = value
    elif weight_type == "weight_g":
        hf_pointer.weight_g.data = value
    elif weight_type == "weight_v":
        hf_pointer.weight_v.data = value
    elif weight_type == "bias":
        hf_pointer.bias.data = value
    elif weight_type == "running_mean":
        hf_pointer.running_mean.data = value
    elif weight_type == "running_var":
        hf_pointer.running_var.data = value
    elif weight_type == "num_batches_tracked":
        hf_pointer.num_batches_tracked.data = value
    else:
        # 如果权重类型不属于以上任何一种情况,则将值直接赋给 data 属性
        hf_pointer.data = value

    # 记录初始化日志信息,描述该权重从哪个原始键(full_name)加载得来
    logger.info(f"{key + ('.' + weight_type if weight_type is not None else '')} was initialized from {full_name}.")

# 检查给定的名称是否应该被忽略,根据忽略规则列表 ignore_keys
def should_ignore(name, ignore_keys):
    for key in ignore_keys:
        # 如果规则以 ".*" 结尾,检查名称是否以去掉最后一个字符的规则开头,如果是则忽略该名称
        if key.endswith(".*"):
            if name.startswith(key[:-1]):
                return True
        # 如果规则中包含 ".*.",则按前缀和后缀进行分割,检查名称中是否同时包含前缀和后缀,如果是则忽略该名称
        elif ".*." in key:
            prefix, suffix = key.split(".*.")
            if prefix in name and suffix in name:
                return True
        # 否则,直接检查名称是否包含规则中指定的字符串,如果是则忽略该名称
        elif key in name:
            return True
    # 如果都不匹配,则不忽略该名称
    return False

# 递归地将 Fairseq 模型的权重加载到 Hugging Face 模型中
def recursively_load_weights(fairseq_dict, hf_model):
    unused_weights = []

    # 遍历 Fairseq 模型字典中的每个名称和对应的值
    for name, value in fairseq_dict.items():
        # 检查是否应该忽略该名称的加载
        if should_ignore(name, IGNORE_KEYS):
            # 如果需要忽略,记录日志并继续下一个名称的处理
            logger.info(f"{name} was ignored")
            continue

        is_used = False
        # 遍历映射规则 MAPPING 中的每对键值对
        for key, mapped_key in MAPPING.items():
            # 如果映射规则以 ".*" 结尾,去掉最后一个字符
            if key.endswith(".*"):
                key = key[:-1]
            # 如果映射规则中包含 "*",按照前缀和后缀进行分割
            elif "*" in key:
                prefix, suffix = key.split(".*.")
                if prefix in name and suffix in name:
                    key = suffix

            # 检查当前名称是否匹配映射规则中的键
            if key in name:
                is_used = True
                # 根据映射规则修改 mapped_key 中的 "*",用名称中的索引替换
                if mapped_key.endswith(".*"):
                    layer_index = name.split(key)[-1].split(".")[0]
                    mapped_key = mapped_key.replace("*", layer_index)
                elif "*" in mapped_key:
                    layer_index = name.split(key)[0].split(".")[-2]

                    # 根据特定规则重新映射层索引
                    if "flow.flows" in mapped_key:
                        layer_index = str(int(layer_index) // 2)
                    if "duration_predictor.flows" in mapped_key or "duration_predictor.post_flows" in mapped_key:
                        layer_index = str(int(layer_index) // 2 + 1)

                    mapped_key = mapped_key.replace("*", layer_index)

                # 根据名称中的标识确定权重类型
                if "weight_g" in name:
                    weight_type = "weight_g"
                elif "weight_v" in name:
                    weight_type = "weight_v"
                elif "bias" in name:
                    weight_type = "bias"
                elif "weight" in name:
                    weight_type = "weight"
                elif "running_mean" in name:
                    weight_type = "running_mean"
                elif "running_var" in name:
                    weight_type = "running_var"
                elif "num_batches_tracked" in name:
                    weight_type = "num_batches_tracked"
                else:
                    weight_type = None

                # 使用递归设置函数将值加载到 Hugging Face 模型中的指定位置
                set_recursively(hf_model, mapped_key, value, name, weight_type)
            continue
        # 如果没有匹配的映射规则,则记录为未使用的权重
        if not is_used:
            unused_weights.append(name)

    # 记录未使用的权重信息到日志中
    logger.warning(f"Unused weights: {unused_weights}")

# 使用 torch.no_grad 装饰器,将 PyTorch 模型权重转换为 Transformers 设计的函数
@torch.no_grad()
def convert_checkpoint(
    pytorch_dump_folder_path,
    checkpoint_path=None,
    config_path=None,
    vocab_path=None,
    language=None,
    num_speakers=None,
    sampling_rate=None,
    repo_id=None,
):
    """
    将模型权重从 PyTorch 复制/粘贴/调整到 Transformers 设计中。
    """
    # 如果提供了配置文件路径,则从预训练配置中加载配置
    if config_path is not None:
        config = VitsConfig.from_pretrained(config_path)
    else:
        # 否则创建一个新的 VitsConfig 对象
        config = VitsConfig()

    # 如果提供了说话人数量,则更新配置中的说话人数量和说话人嵌入大小
    if num_speakers:
        config.num_speakers = num_speakers
        config.speaker_embedding_size = 256

    # 如果提供了采样率,则更新配置中的采样率
    if sampling_rate:
        config.sampling_rate = sampling_rate

    # 如果未提供检查点路径,则下载并准备 Facebook MMS-TTS 模型所需的词汇表、配置文件和检查点路径
    if checkpoint_path is None:
        logger.info(f"***Converting model: facebook/mms-tts {language}***")

        # 下载词汇表
        vocab_path = hf_hub_download(
            repo_id="facebook/mms-tts",
            filename="vocab.txt",
            subfolder=f"models/{language}",
        )
        # 下载配置文件
        config_file = hf_hub_download(
            repo_id="facebook/mms-tts",
            filename="config.json",
            subfolder=f"models/{language}",
        )
        # 下载模型检查点
        checkpoint_path = hf_hub_download(
            repo_id="facebook/mms-tts",
            filename="G_100000.pth",
            subfolder=f"models/{language}",
        )

        # 读取并加载配置文件中的超参数
        with open(config_file, "r") as f:
            data = f.read()
            hps = json.loads(data)

        # 检查模型是否针对 uroman 数据集训练,如果是则发出警告
        is_uroman = hps["data"]["training_files"].split(".")[-1] == "uroman"
        if is_uroman:
            logger.warning("For this checkpoint, you should use `uroman` to convert input text before tokenizing it!")
    else:
        # 如果提供了检查点路径,则记录信息并设置 is_uroman 为 False
        logger.info(f"***Converting model: {checkpoint_path}***")
        is_uroman = False

    # 如果词汇表路径为空,则设置默认的符号列表和符号到索引映射关系
    if vocab_path is None:
        _pad = "_"
        _punctuation = ';:,.!?¡¿—…"«»“” '
        _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
        _letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
        symbols = _pad + _punctuation + _letters + _letters_ipa
        symbol_to_id = {s: i for i, s in enumerate(symbols)}
        phonemize = True
    else:
        # 否则,从给定的词汇表路径读取符号列表,并创建符号到索引映射关系
        symbols = [line.replace("\n", "") for line in open(vocab_path, encoding="utf-8").readlines()]
        symbol_to_id = {s: i for i, s in enumerate(symbols)}
        # MMS-TTS 模型不使用 <pad> 标记,所以将其设置为用于间隔字符的标记
        _pad = symbols[0]
        phonemize = False

    # 创建一个临时文件,将符号到索引映射关系保存为 JSON 格式
    with tempfile.NamedTemporaryFile() as tf:
        with open(tf.name, "w", encoding="utf-8") as f:
            f.write(json.dumps(symbol_to_id, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        # 根据临时文件中的符号到索引映射关系创建一个 VitsTokenizer 对象
        tokenizer = VitsTokenizer(tf.name, language=language, phonemize=phonemize, is_uroman=is_uroman, pad_token=_pad)

    # 设置配置对象中的词汇表大小
    config.vocab_size = len(symbols)

    # 基于配置对象创建 VitsModel 模型
    model = VitsModel(config)

    # 对模型的解码器应用权重归一化
    model.decoder.apply_weight_norm()

    # 加载原始检查点的权重到模型中
    orig_checkpoint = torch.load(checkpoint_path, map_location=torch.device("cpu"))
    recursively_load_weights(orig_checkpoint["model"], model)

    # 移除模型的解码器上的权重归一化
    model.decoder.remove_weight_norm()

    # 将模型和 tokenizer 的预训练权重和词汇表保存到指定路径
    model.save_pretrained(pytorch_dump_folder_path)
    tokenizer.save_pretrained(pytorch_dump_folder_path)

    # 如果 repo_id 存在(即非空),则将分词器和模型推送到 Hub
    if repo_id:
        print("Pushing to the hub...")
        # 将分词器推送到指定的 repo_id
        tokenizer.push_to_hub(repo_id)
        # 将模型推送到指定的 repo_id
        model.push_to_hub(repo_id)

# 主程序入口,用于执行脚本的入口点

if __name__ == "__main__":
    # 创建参数解析器对象
    parser = argparse.ArgumentParser()
    # 添加命令行参数,用于指定原始检查点的本地路径
    parser.add_argument("--checkpoint_path", default=None, type=str, help="Local path to original checkpoint")
    # 添加命令行参数,用于指定vocab.txt文件的路径
    parser.add_argument("--vocab_path", default=None, type=str, help="Path to vocab.txt")
    # 添加命令行参数,用于指定待转换模型的hf config.json文件的路径
    parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
    # 添加命令行参数,用于指定分词器语言的三字母代码
    parser.add_argument("--language", default=None, type=str, help="Tokenizer language (three-letter code)")
    # 添加命令行参数,用于指定说话者的数量
    parser.add_argument("--num_speakers", default=None, type=int, help="Number of speakers")
    # 添加命令行参数,用于指定模型训练时的采样率
    parser.add_argument(
        "--sampling_rate", default=None, type=int, help="Sampling rate on which the model was trained."
    )
    # 添加命令行参数,必需参数,用于指定输出的PyTorch模型的路径
    parser.add_argument(
        "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
    )
    # 添加命令行参数,用于指定转换后模型上传至🤗 hub的位置
    parser.add_argument(
        "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
    )

    # 解析命令行参数
    args = parser.parse_args()

    # 调用函数,将指定参数传递给convert_checkpoint函数进行检查点转换
    convert_checkpoint(
        args.pytorch_dump_folder_path,
        args.checkpoint_path,
        args.config_path,
        args.vocab_path,
        args.language,
        args.num_speakers,
        args.sampling_rate,
        args.push_to_hub,
    )
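
# 补充示意(非原始脚本内容):除命令行外,也可以在 Python 中直接调用上面的 convert_checkpoint,
# 例如转换 facebook/mms-tts 的英语(eng)检查点;输出目录名仅为示例假设:
#
#     convert_checkpoint(
#         pytorch_dump_folder_path="./mms-tts-eng-converted",
#         language="eng",
#     )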

# `.\models\vits\modeling_vits.py`

```py
# coding=utf-8
# Copyright 2023 The Kakao Enterprise Authors and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch VITS model."""

# Import necessary libraries
import math
from dataclasses import dataclass
from typing import Any, Optional, Tuple, Union

import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn

# Import modules from Hugging Face's library
from ...activations import ACT2FN
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
from ...modeling_outputs import (
    BaseModelOutput,
    ModelOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from .configuration_vits import VitsConfig

# Get the logger instance for this module
logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "VitsConfig"

# List of pretrained model names for VITS
VITS_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "facebook/mms-tts-eng",
    # See all VITS models at https://huggingface.co/models?filter=vits
    # and all MMS models at https://huggingface.co/models?sort=trending&search=facebook%2Fmms-tts
]

# Dataclass representing the output structure of VITS model
@dataclass
class VitsModelOutput(ModelOutput):
    """
    Describes the outputs for the VITS model, with potential hidden states and attentions.
    """
    # 定义输入参数和它们的类型注释,这些参数是模型的输出结果
    Args:
        waveform (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            模型预测的最终音频波形。
        sequence_lengths (`torch.FloatTensor` of shape `(batch_size,)`):
            `waveform` 批次中每个元素的样本长度。
        spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`):
            由流模型输出得到的对数梅尔频谱图。此频谱图会传递给 Hi-Fi GAN 解码器模型以生成最终音频波形。
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            包含模型每一层输出的隐藏状态的元组。如果模型具有嵌入层,则还包括初始嵌入输出。
            每个张量的形状为 `(batch_size, sequence_length, hidden_size)`。
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            包含注意力权重的元组,每个张量形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
            这些注意力权重经过注意力 softmax 后使用,用于计算自注意力头中的加权平均值。
    """

    # 初始化各个输入参数为 None,用于后续的赋值操作
    waveform: torch.FloatTensor = None
    sequence_lengths: torch.FloatTensor = None
    spectrogram: Optional[Tuple[torch.FloatTensor]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
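
# 补充示意(非模型源码):VitsModelOutput 在推理时的典型用法,
# 假设本地可以访问 facebook/mms-tts-eng 检查点(见上面的模型列表)。
# from transformers import VitsModel, VitsTokenizer
#
# tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
# model = VitsModel.from_pretrained("facebook/mms-tts-eng")
# inputs = tokenizer("Hello from VITS.", return_tensors="pt")
# with torch.no_grad():
#     outputs = model(**inputs)
# waveform = outputs.waveform            # 形状 (batch_size, sequence_length)
# lengths = outputs.sequence_lengths     # 每个样本的有效采样点数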
@dataclass
class VitsTextEncoderOutput(ModelOutput):
    """
    Describes the outputs for the VITS text encoder model, with potential hidden states and attentions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        prior_means (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            The predicted mean values of the prior distribution for the latent text variables.
        prior_log_variances (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            The predicted log-variance values of the prior distribution for the latent text variables.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    last_hidden_state: torch.FloatTensor = None
    prior_means: torch.FloatTensor = None
    prior_log_variances: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, num_channels):
    """
    Applies a fused operation of addition, tanh, sigmoid, and element-wise multiplication.

    Args:
        input_a (torch.FloatTensor): Input tensor A.
        input_b (torch.FloatTensor): Input tensor B.
        num_channels (int): Number of channels for splitting input tensors.

    Returns:
        torch.FloatTensor: Output tensor after applying the fused operations.
    """
    # Element-wise addition of input tensors A and B
    in_act = input_a + input_b
    # Apply tanh activation to the first `num_channels` channels
    t_act = torch.tanh(in_act[:, :num_channels, :])
    # Apply sigmoid activation to the remaining channels
    s_act = torch.sigmoid(in_act[:, num_channels:, :])
    # Element-wise multiplication of tanh and sigmoid outputs
    acts = t_act * s_act
    return acts
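
# 补充示意(非原始源码):该融合操作等价于
#   tanh((a + b) 的前 num_channels 个通道) * sigmoid((a + b) 的其余通道)。
# 下面用随机张量检查一次输出形状(假设共 8 个通道、num_channels=4):
_demo_a = torch.randn(1, 8, 10)
_demo_b = torch.randn(1, 8, 10)
_demo_acts = fused_add_tanh_sigmoid_multiply(_demo_a, _demo_b, torch.IntTensor([4])[0])
print(_demo_acts.shape)  # torch.Size([1, 4, 10])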


def _unconstrained_rational_quadratic_spline(
    inputs,
    unnormalized_widths,
    unnormalized_heights,
    unnormalized_derivatives,
    reverse=False,
    tail_bound=5.0,
    min_bin_width=1e-3,
    min_bin_height=1e-3,
    min_derivative=1e-3,
):
    """
    This transformation represents a monotonically increasing piecewise rational quadratic function. Outside of the
    `tail_bound`, the transform behaves as an identity function.

    Args:
        inputs (torch.Tensor): Input tensor to be transformed.
        unnormalized_widths (torch.Tensor): Unnormalized widths of the spline segments.
        unnormalized_heights (torch.Tensor): Unnormalized heights of the spline segments.
        unnormalized_derivatives (torch.Tensor): Unnormalized derivatives of the spline segments.
        reverse (bool, optional): If True, applies the transformation in reverse.
        tail_bound (float, optional): Bound beyond which the transform behaves as an identity function.
        min_bin_width (float, optional): Minimum width of each spline bin.
        min_bin_height (float, optional): Minimum height of each spline bin.
        min_derivative (float, optional): Minimum derivative of each spline segment.

    Returns:
        torch.Tensor: Transformed output tensor.
    """
    # Function description continues in the implementation
    # 创建一个布尔掩码,指示哪些输入值在指定的区间内
    inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
    # 通过反转掩码来确定哪些输入值在区间外
    outside_interval_mask = ~inside_interval_mask

    # 初始化输出和对数绝对值行列式的张量,形状与输入相同
    outputs = torch.zeros_like(inputs)
    log_abs_det = torch.zeros_like(inputs)
    # 计算常数值,用于限制分段有理二次函数的行为
    constant = np.log(np.exp(1 - min_derivative) - 1)

    # 在维度上进行填充操作,确保未归一化导数的维度正确
    unnormalized_derivatives = nn.functional.pad(unnormalized_derivatives, pad=(1, 1))
    # 将第一个和最后一个未归一化导数设置为常数值
    unnormalized_derivatives[..., 0] = constant
    unnormalized_derivatives[..., -1] = constant

    # 对区间外的输入值直接赋值为原始输入值
    outputs[outside_interval_mask] = inputs[outside_interval_mask]
    # 对区间外的对数绝对值行列式赋值为零
    log_abs_det[outside_interval_mask] = 0.0
    # 调用 _rational_quadratic_spline 函数计算和更新输出和对数绝对行列式
    outputs[inside_interval_mask], log_abs_det[inside_interval_mask] = _rational_quadratic_spline(
        # 提供在区间内的输入数据
        inputs=inputs[inside_interval_mask],
        # 提供在区间内的未归一化宽度
        unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
        # 提供在区间内的未归一化高度
        unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
        # 提供在区间内的未归一化导数
        unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
        # 指定是否反向处理
        reverse=reverse,
        # 指定尾部边界
        tail_bound=tail_bound,
        # 指定最小箱子宽度
        min_bin_width=min_bin_width,
        # 指定最小箱子高度
        min_bin_height=min_bin_height,
        # 指定最小导数
        min_derivative=min_derivative,
    )
    # 返回更新后的输出和对数绝对行列式
    return outputs, log_abs_det
def _rational_quadratic_spline(
    inputs,
    unnormalized_widths,
    unnormalized_heights,
    unnormalized_derivatives,
    reverse,
    tail_bound,
    min_bin_width,
    min_bin_height,
    min_derivative,
):
    """
    This transformation represents a monotonically increasing piecewise rational quadratic function. Unlike the
    function `_unconstrained_rational_quadratic_spline`, the function behaves the same across the `tail_bound`.

    Args:
        inputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`):
            Second half of the hidden-states input to the Vits convolutional flow module.
        unnormalized_widths (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            First `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_heights (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Second `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_derivatives (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Third `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        reverse (`bool`):
            Whether the model is being run in reverse mode.
        tail_bound (`float`):
            Upper and lower limit bound for the rational quadratic function. Outside of this `tail_bound`, the
            transform behaves as an identity function.
        min_bin_width (`float`):
            Minimum bin value across the width dimension for the piecewise rational quadratic function.
        min_bin_height (`float`):
            Minimum bin value across the height dimension for the piecewise rational quadratic function.
        min_derivative (`float`):
            Minimum bin value across the derivatives for the piecewise rational quadratic function.
    Returns:
        outputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`):
            Hidden-states as transformed by the piecewise rational quadratic function.
        log_abs_det (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Logarithm of the absolute value of the determinants corresponding to the `outputs`.
    """
    # 设置上界和下界为尾部限制
    upper_bound = tail_bound
    lower_bound = -tail_bound

    # 检查输入是否在定义域内
    if torch.min(inputs) < lower_bound or torch.max(inputs) > upper_bound:
        raise ValueError("Input to a transform is not within its domain")

    # 获取宽度维度的数量
    num_bins = unnormalized_widths.shape[-1]

    # 检查最小的 bin 宽度是否过大
    if min_bin_width * num_bins > 1.0:
        raise ValueError(f"Minimal bin width {min_bin_width} too large for the number of bins {num_bins}")
    # 检查最小柱高乘以柱子数量是否大于1.0,如果是则抛出值错误异常
    if min_bin_height * num_bins > 1.0:
        raise ValueError(f"Minimal bin height {min_bin_height} too large for the number of bins {num_bins}")

    # 使用 softmax 函数对未归一化的宽度进行归一化处理
    widths = nn.functional.softmax(unnormalized_widths, dim=-1)
    # 根据公式计算每个柱子的宽度
    widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
    # 计算累积宽度并进行填充,确保第一个元素为 0.0
    cumwidths = torch.cumsum(widths, dim=-1)
    cumwidths = nn.functional.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0)
    # 将累积宽度映射到指定的上下界
    cumwidths = (upper_bound - lower_bound) * cumwidths + lower_bound
    cumwidths[..., 0] = lower_bound  # 设置第一个元素为下界
    cumwidths[..., -1] = upper_bound  # 设置最后一个元素为上界
    # 计算每个柱子的实际宽度
    widths = cumwidths[..., 1:] - cumwidths[..., :-1]

    # 计算导数,使用 softplus 函数对未归一化的导数进行处理
    derivatives = min_derivative + nn.functional.softplus(unnormalized_derivatives)

    # 使用 softmax 函数对未归一化的高度进行归一化处理
    heights = nn.functional.softmax(unnormalized_heights, dim=-1)
    # 根据公式计算每个柱子的高度
    heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
    # 计算累积高度并进行填充,确保第一个元素为 0.0
    cumheights = torch.cumsum(heights, dim=-1)
    cumheights = nn.functional.pad(cumheights, pad=(1, 0), mode="constant", value=0.0)
    # 将累积高度映射到指定的上下界
    cumheights = (upper_bound - lower_bound) * cumheights + lower_bound
    cumheights[..., 0] = lower_bound  # 设置第一个元素为下界
    cumheights[..., -1] = upper_bound  # 设置最后一个元素为上界
    # 计算每个柱子的实际高度
    heights = cumheights[..., 1:] - cumheights[..., :-1]

    # 根据 reverse 参数选择要使用的柱子位置
    bin_locations = cumheights if reverse else cumwidths
    # 在最后一个位置加上一个微小偏移量,保证恰好落在上边界的输入仍被归入最后一个柱子
    bin_locations[..., -1] += 1e-6
    # 根据输入的值确定每个输入点所属的柱子索引
    bin_idx = torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1
    bin_idx = bin_idx[..., None]

    # 获取每个输入点所在柱子的累积宽度和宽度
    input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
    input_bin_widths = widths.gather(-1, bin_idx)[..., 0]

    # 获取每个输入点所在柱子的累积高度和高度
    input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
    # 计算每个柱子的斜率
    delta = heights / widths
    input_delta = delta.gather(-1, bin_idx)[..., 0]

    # 获取每个输入点所在柱子的导数和导数加一
    input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
    input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]

    # 获取每个输入点所在柱子的高度
    input_heights = heights.gather(-1, bin_idx)[..., 0]

    # 计算中间变量1
    intermediate1 = input_derivatives + input_derivatives_plus_one - 2 * input_delta
    # 如果不是反向操作,根据给定的公式计算 theta 值
    if not reverse:
        theta = (inputs - input_cumwidths) / input_bin_widths
        theta_one_minus_theta = theta * (1 - theta)

        # 计算输出值
        numerator = input_heights * (input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta)
        denominator = input_delta + intermediate1 * theta_one_minus_theta
        outputs = input_cumheights + numerator / denominator

        # 计算对数绝对值行列式的值
        derivative_numerator = input_delta.pow(2) * (
            input_derivatives_plus_one * theta.pow(2)
            + 2 * input_delta * theta_one_minus_theta
            + input_derivatives * (1 - theta).pow(2)
        )
        log_abs_det = torch.log(derivative_numerator) - 2 * torch.log(denominator)
        return outputs, log_abs_det
    # 反向模式:通过求解二次方程,从输出反推输入
    else:
        # 计算二次方程的根
        intermediate2 = inputs - input_cumheights
        # 计算中间变量3,即 intermediate2 乘以 intermediate1
        intermediate3 = intermediate2 * intermediate1
        # 计算二次方程的系数 a
        a = input_heights * (input_delta - input_derivatives) + intermediate3
        # 计算二次方程的系数 b
        b = input_heights * input_derivatives - intermediate3
        # 计算二次方程的常数项 c
        c = -input_delta * intermediate2

        # 计算判别式
        discriminant = b.pow(2) - 4 * a * c
        # 如果判别式有任何值小于零,抛出运行时错误
        if not (discriminant >= 0).all():
            raise RuntimeError(f"invalid discriminant {discriminant}")

        # 计算二次方程的一个根
        root = (2 * c) / (-b - torch.sqrt(discriminant))
        # 计算输出值
        outputs = root * input_bin_widths + input_cumwidths

        # 计算 theta * (1 - theta)
        theta_one_minus_theta = root * (1 - root)
        # 计算分母
        denominator = input_delta + intermediate1 * theta_one_minus_theta
        # 计算导数的分子
        derivative_numerator = input_delta.pow(2) * (
            input_derivatives_plus_one * root.pow(2)
            + 2 * input_delta * theta_one_minus_theta
            + input_derivatives * (1 - root).pow(2)
        )
        # 计算对数绝对值行列式的值
        log_abs_det = torch.log(derivative_numerator) - 2 * torch.log(denominator)
        # 返回计算结果:输出值和对数绝对值行列式的负值
        return outputs, -log_abs_det
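
# 补充示意(非原始源码):检查上面的样条变换是可逆的——先正向再反向应当还原输入。
# 假设 _unconstrained_rational_quadratic_spline 已如上定义,参数形状与 VitsConvFlow 中一致。
_num_bins = 10
_spline_inputs = torch.rand(2, 3) * 8.0 - 4.0                     # 落在默认 tail_bound=5.0 的区间内
_widths = torch.randn(2, 3, _num_bins)
_heights = torch.randn(2, 3, _num_bins)
_derivatives = torch.randn(2, 3, _num_bins - 1)                   # 内部节点的导数参数
_forward_out, _ = _unconstrained_rational_quadratic_spline(
    _spline_inputs, _widths, _heights, _derivatives, reverse=False
)
_recovered, _ = _unconstrained_rational_quadratic_spline(
    _forward_out, _widths, _heights, _derivatives, reverse=True
)
print(torch.max((_recovered - _spline_inputs).abs()))             # 预期非常接近 0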
# 定义一个名为 VitsWaveNet 的神经网络模型类,继承自 torch.nn.Module
class VitsWaveNet(torch.nn.Module):
    # 初始化方法,接受两个参数:配置对象 config 和层数 num_layers
    def __init__(self, config: VitsConfig, num_layers: int):
        # 调用父类的初始化方法
        super().__init__()
        # 设置隐藏层大小为 config 中的 hidden_size
        self.hidden_size = config.hidden_size
        # 设置网络层数为传入的 num_layers
        self.num_layers = num_layers

        # 初始化输入层和残差跳跃连接层为 ModuleList,用于存储网络的卷积层
        self.in_layers = torch.nn.ModuleList()
        self.res_skip_layers = torch.nn.ModuleList()
        # 使用 config 中的 wavenet_dropout 设置一个 Dropout 层
        self.dropout = nn.Dropout(config.wavenet_dropout)

        # 根据是否存在 nn.utils.parametrizations.weight_norm 决定 weight_norm 函数的赋值
        if hasattr(nn.utils.parametrizations, "weight_norm"):
            weight_norm = nn.utils.parametrizations.weight_norm
        else:
            weight_norm = nn.utils.weight_norm

        # 如果 config 中的 speaker_embedding_size 不为 0,则创建一个 Conv1d 来处理说话者嵌入
        if config.speaker_embedding_size != 0:
            cond_layer = torch.nn.Conv1d(config.speaker_embedding_size, 2 * config.hidden_size * num_layers, 1)
            # 将 cond_layer 应用 weight_norm
            self.cond_layer = weight_norm(cond_layer, name="weight")

        # 循环创建 num_layers 层的卷积层
        for i in range(num_layers):
            dilation = config.wavenet_dilation_rate**i
            padding = (config.wavenet_kernel_size * dilation - dilation) // 2
            # 创建一个 dilation 卷积层,用于输入数据
            in_layer = torch.nn.Conv1d(
                in_channels=config.hidden_size,
                out_channels=2 * config.hidden_size,
                kernel_size=config.wavenet_kernel_size,
                dilation=dilation,
                padding=padding,
            )
            # 应用 weight_norm 到 in_layer
            in_layer = weight_norm(in_layer, name="weight")
            self.in_layers.append(in_layer)

            # 如果不是最后一层,创建一个残差跳跃连接层
            if i < num_layers - 1:
                res_skip_channels = 2 * config.hidden_size
            else:
                res_skip_channels = config.hidden_size

            # 创建一个 1x1 的卷积层作为残差跳跃连接层
            res_skip_layer = torch.nn.Conv1d(config.hidden_size, res_skip_channels, 1)
            # 应用 weight_norm 到 res_skip_layer
            res_skip_layer = weight_norm(res_skip_layer, name="weight")
            self.res_skip_layers.append(res_skip_layer)
    # 前向传播函数,用于模型的前向计算
    def forward(self, inputs, padding_mask, global_conditioning=None):
        # 初始化输出张量,形状与输入相同
        outputs = torch.zeros_like(inputs)
        # 创建一个张量,包含隐藏大小的整数值
        num_channels_tensor = torch.IntTensor([self.hidden_size])

        # 如果存在全局条件,则通过条件层处理全局条件数据
        if global_conditioning is not None:
            global_conditioning = self.cond_layer(global_conditioning)

        # 遍历每一层
        for i in range(self.num_layers):
            # 使用第i层输入层处理输入数据
            hidden_states = self.in_layers[i](inputs)

            # 如果存在全局条件,则从全局条件中选择对应层的状态
            if global_conditioning is not None:
                cond_offset = i * 2 * self.hidden_size
                global_states = global_conditioning[:, cond_offset : cond_offset + 2 * self.hidden_size, :]
            else:
                # 否则初始化全局状态为与隐藏状态形状相同的零张量
                global_states = torch.zeros_like(hidden_states)

            # 调用融合操作函数,计算激活函数的输出
            acts = fused_add_tanh_sigmoid_multiply(hidden_states, global_states, num_channels_tensor[0])
            # 对激活输出进行dropout处理
            acts = self.dropout(acts)

            # 使用残差连接和跳跃连接层处理激活输出
            res_skip_acts = self.res_skip_layers[i](acts)
            if i < self.num_layers - 1:
                # 如果不是最后一层,则进行残差连接
                res_acts = res_skip_acts[:, : self.hidden_size, :]
                inputs = (inputs + res_acts) * padding_mask
                outputs = outputs + res_skip_acts[:, self.hidden_size :, :]
            else:
                # 如果是最后一层,则仅将输出增加残差跳跃连接层的输出
                outputs = outputs + res_skip_acts

        # 最后将输出乘以填充掩码,返回最终的输出张量
        return outputs * padding_mask

    # 移除所有权重归一化操作
    def remove_weight_norm(self):
        # 如果存在说话者嵌入大小,则移除条件层的权重归一化
        if self.speaker_embedding_size != 0:
            torch.nn.utils.remove_weight_norm(self.cond_layer)
        # 分别对每一层输入层和残差跳跃连接层移除权重归一化
        for layer in self.in_layers:
            torch.nn.utils.remove_weight_norm(layer)
        for layer in self.res_skip_layers:
            torch.nn.utils.remove_weight_norm(layer)
# 定义一个名为 VitsPosteriorEncoder 的神经网络模块
class VitsPosteriorEncoder(nn.Module):
    # 初始化函数,接受一个 VitsConfig 类型的参数 config
    def __init__(self, config: VitsConfig):
        super().__init__()
        # 设置输出通道数为 config.flow_size
        self.out_channels = config.flow_size

        # 使用 1 维卷积定义 conv_pre 层,输入通道数为 config.spectrogram_bins,输出通道数为 config.hidden_size,卷积核大小为 1
        self.conv_pre = nn.Conv1d(config.spectrogram_bins, config.hidden_size, 1)
        # 初始化一个 VitsWaveNet 类型的模型 wavenet,传入参数 config 和 posterior_encoder_num_wavenet_layers
        self.wavenet = VitsWaveNet(config, num_layers=config.posterior_encoder_num_wavenet_layers)
        # 使用 1 维卷积定义 conv_proj 层,输入通道数为 config.hidden_size,输出通道数为 self.out_channels * 2,卷积核大小为 1
        self.conv_proj = nn.Conv1d(config.hidden_size, self.out_channels * 2, 1)

    # 前向传播函数,接受 inputs(输入数据)、padding_mask(填充掩码)、global_conditioning(全局条件)作为参数
    def forward(self, inputs, padding_mask, global_conditioning=None):
        # 对输入数据 inputs 应用 conv_pre 层和 padding_mask,然后将结果赋值给 inputs
        inputs = self.conv_pre(inputs) * padding_mask
        # 将处理后的 inputs 输入到 wavenet 模型中进行处理,同时传入 padding_mask 和 global_conditioning
        inputs = self.wavenet(inputs, padding_mask, global_conditioning)
        # 对处理后的结果应用 conv_proj 层和 padding_mask,然后将结果赋值给 stats
        stats = self.conv_proj(inputs) * padding_mask
        # 将 stats 按照第二个维度(通道维度)拆分为均值 mean 和对数标准差 log_stddev
        mean, log_stddev = torch.split(stats, self.out_channels, dim=1)
        # 使用均值 mean 和随机生成的正态分布数据(标准差为 exp(log_stddev))生成采样数据,并应用 padding_mask
        sampled = (mean + torch.randn_like(mean) * torch.exp(log_stddev)) * padding_mask
        # 返回采样数据 sampled、均值 mean 和对数标准差 log_stddev
        return sampled, mean, log_stddev


# 从 transformers.models.speecht5.modeling_speecht5.HifiGanResidualBlock 复制而来的类
class HifiGanResidualBlock(nn.Module):
    # 初始化函数,接受 channels(通道数)、kernel_size(卷积核大小,默认为 3)、dilation(膨胀率元组,默认为 (1, 3, 5))、leaky_relu_slope(LeakyReLU 斜率,默认为 0.1)作为参数
    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), leaky_relu_slope=0.1):
        super().__init__()
        # 设置 LeakyReLU 的斜率
        self.leaky_relu_slope = leaky_relu_slope

        # 创建多个 1 维卷积层的列表 convs1,每个卷积层的输入输出通道数相同,采用不同的膨胀率
        self.convs1 = nn.ModuleList(
            [
                nn.Conv1d(
                    channels,
                    channels,
                    kernel_size,
                    stride=1,
                    dilation=dilation[i],
                    padding=self.get_padding(kernel_size, dilation[i]),
                )
                for i in range(len(dilation))
            ]
        )
        # 创建多个 1 维卷积层的列表 convs2,每个卷积层的输入输出通道数相同,都采用膨胀率为 1
        self.convs2 = nn.ModuleList(
            [
                nn.Conv1d(
                    channels,
                    channels,
                    kernel_size,
                    stride=1,
                    dilation=1,
                    padding=self.get_padding(kernel_size, 1),
                )
                for _ in range(len(dilation))
            ]
        )

    # 获取给定卷积核大小和膨胀率的填充数
    def get_padding(self, kernel_size, dilation=1):
        return (kernel_size * dilation - dilation) // 2

    # 对 convs1 和 convs2 中的卷积层应用权重归一化
    def apply_weight_norm(self):
        for layer in self.convs1:
            nn.utils.weight_norm(layer)
        for layer in self.convs2:
            nn.utils.weight_norm(layer)

    # 移除 convs1 和 convs2 中的卷积层的权重归一化
    def remove_weight_norm(self):
        for layer in self.convs1:
            nn.utils.remove_weight_norm(layer)
        for layer in self.convs2:
            nn.utils.remove_weight_norm(layer)

    # 前向传播函数,接受 hidden_states(隐藏状态)作为输入
    def forward(self, hidden_states):
        # 遍历 convs1 和 convs2 中的每一对卷积层
        for conv1, conv2 in zip(self.convs1, self.convs2):
            # 将隐藏状态作为残差项保存
            residual = hidden_states
            # 应用 LeakyReLU 激活函数
            hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
            # 应用 conv1 卷积层
            hidden_states = conv1(hidden_states)
            # 再次应用 LeakyReLU 激活函数
            hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
            # 应用 conv2 卷积层
            hidden_states = conv2(hidden_states)
            # 将残差项加到输出上,形成残差连接
            hidden_states = hidden_states + residual
        # 返回最终的隐藏状态
        return hidden_states


# 定义 VitsHifiGan 类,即 VITS 的 HiFi-GAN 解码器,负责把频谱图/潜变量解码为语音波形
class VitsHifiGan(nn.Module):
    # 初始化函数,接受一个VitsConfig类型的配置对象作为参数
    def __init__(self, config: VitsConfig):
        super().__init__()  # 调用父类的初始化函数

        # 将配置对象保存在实例变量中
        self.config = config

        # 计算残差块卷积核数量和上采样率的数量
        self.num_kernels = len(config.resblock_kernel_sizes)
        self.num_upsamples = len(config.upsample_rates)

        # 创建一个1维卷积层,作为初始卷积层
        self.conv_pre = nn.Conv1d(
            config.flow_size,
            config.upsample_initial_channel,
            kernel_size=7,
            stride=1,
            padding=3,
        )

        # 创建一个空的模块列表,用于存放上采样的卷积层
        self.upsampler = nn.ModuleList()
        for i, (upsample_rate, kernel_size) in enumerate(zip(config.upsample_rates, config.upsample_kernel_sizes)):
            # 每次迭代,向模块列表中添加一个转置卷积层
            self.upsampler.append(
                nn.ConvTranspose1d(
                    config.upsample_initial_channel // (2**i),
                    config.upsample_initial_channel // (2 ** (i + 1)),
                    kernel_size=kernel_size,
                    stride=upsample_rate,
                    padding=(kernel_size - upsample_rate) // 2,
                )
            )

        # 创建一个空的模块列表,用于存放残差块
        self.resblocks = nn.ModuleList()
        for i in range(len(self.upsampler)):
            channels = config.upsample_initial_channel // (2 ** (i + 1))
            for kernel_size, dilation in zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes):
                # 每次迭代,向模块列表中添加一个残差块
                self.resblocks.append(HifiGanResidualBlock(channels, kernel_size, dilation, config.leaky_relu_slope))

        # 创建一个1维卷积层,作为后处理卷积层
        self.conv_post = nn.Conv1d(channels, 1, kernel_size=7, stride=1, padding=3, bias=False)

        # 如果配置中指定了说话人嵌入的大小,则创建一个条件卷积层
        if config.speaker_embedding_size != 0:
            self.cond = nn.Conv1d(config.speaker_embedding_size, config.upsample_initial_channel, 1)

    # 对模型中的上采样层应用权重归一化
    def apply_weight_norm(self):
        for layer in self.upsampler:
            nn.utils.weight_norm(layer)
        for layer in self.resblocks:
            layer.apply_weight_norm()

    # 移除模型中的上采样层的权重归一化
    def remove_weight_norm(self):
        for layer in self.upsampler:
            nn.utils.remove_weight_norm(layer)
        for layer in self.resblocks:
            layer.remove_weight_norm()

    # 前向传播函数,接受频谱图和全局条件作为输入
    def forward(
        self, spectrogram: torch.FloatTensor, global_conditioning: Optional[torch.FloatTensor] = None
    ) -> torch.FloatTensor:
        r"""
        Converts a spectrogram into a speech waveform.

        Args:
            spectrogram (`torch.FloatTensor` of shape `(batch_size, config.spectrogram_bins, sequence_length)`):
                Tensor containing the spectrograms.
            global_conditioning (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_size, 1)`, *optional*):
                Tensor containing speaker embeddings, for multispeaker models.

        Returns:
            `torch.FloatTensor`: Tensor of shape shape `(batch_size, 1, num_frames)` containing the speech waveform.
        """
        # 将输入的频谱图通过预处理卷积层转换
        hidden_states = self.conv_pre(spectrogram)

        # 如果提供了全局条件信息,则通过条件模块进行调节
        if global_conditioning is not None:
            hidden_states = hidden_states + self.cond(global_conditioning)

        # 多次进行上采样操作,使用LeakyReLU作为激活函数
        for i in range(self.num_upsamples):
            hidden_states = nn.functional.leaky_relu(hidden_states, self.config.leaky_relu_slope)
            hidden_states = self.upsampler[i](hidden_states)

            # 应用残差块以保留重要信息并减少训练中的梯度消失问题
            res_state = self.resblocks[i * self.num_kernels](hidden_states)
            for j in range(1, self.num_kernels):
                res_state += self.resblocks[i * self.num_kernels + j](hidden_states)
            hidden_states = res_state / self.num_kernels

        # 最终通过LeakyReLU和后处理卷积层处理得到最终的波形数据
        hidden_states = nn.functional.leaky_relu(hidden_states)
        hidden_states = self.conv_post(hidden_states)
        waveform = torch.tanh(hidden_states)
        return waveform
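
# 补充示意(非原始源码):用随机输入检查解码器的输出形状,假设使用默认 VitsConfig。
_hifigan_config = VitsConfig()
_hifigan = VitsHifiGan(_hifigan_config)
_spectrogram = torch.randn(1, _hifigan_config.flow_size, 20)      # conv_pre 的输入通道数为 config.flow_size
with torch.no_grad():
    _waveform = _hifigan(_spectrogram)
print(_waveform.shape)  # (1, 1, 20 * 所有 upsample_rates 的乘积)
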
class VitsResidualCouplingLayer(nn.Module):
    def __init__(self, config: VitsConfig):
        super().__init__()
        # 计算每半通道数,用于定义不同层次的卷积大小
        self.half_channels = config.flow_size // 2

        # 前处理卷积层,将半通道数转换为隐藏层大小
        self.conv_pre = nn.Conv1d(self.half_channels, config.hidden_size, 1)
        # WaveNet 模型,使用给定的配置和层数
        self.wavenet = VitsWaveNet(config, num_layers=config.prior_encoder_num_wavenet_layers)
        # 后处理卷积层,将隐藏层大小转换回半通道数
        self.conv_post = nn.Conv1d(config.hidden_size, self.half_channels, 1)

    def forward(self, inputs, padding_mask, global_conditioning=None, reverse=False):
        # 将输入张量拆分为两半,分别处理
        first_half, second_half = torch.split(inputs, [self.half_channels] * 2, dim=1)
        # 使用前处理卷积层处理第一半数据,同时考虑填充掩码
        hidden_states = self.conv_pre(first_half) * padding_mask
        # 将处理后的数据输入 WaveNet 模型,考虑填充掩码和全局条件
        hidden_states = self.wavenet(hidden_states, padding_mask, global_conditioning)
        # 使用后处理卷积层处理 WaveNet 输出,同时考虑填充掩码
        mean = self.conv_post(hidden_states) * padding_mask
        # 初始化对数标准差为零张量
        log_stddev = torch.zeros_like(mean)

        if not reverse:
            # 如果不是反向模式,则执行如下操作
            # 计算第二半数据的均值修正,并考虑填充掩码和对数标准差
            second_half = mean + second_half * torch.exp(log_stddev) * padding_mask
            # 将修正后的数据拼接起来作为输出
            outputs = torch.cat([first_half, second_half], dim=1)
            # 计算对数行列式,以便可逆层的反向传播使用
            log_determinant = torch.sum(log_stddev, [1, 2])
            return outputs, log_determinant
        else:
            # 如果是反向模式,则执行如下操作
            # 计算第二半数据的均值修正反向,并考虑填充掩码和对数标准差
            second_half = (second_half - mean) * torch.exp(-log_stddev) * padding_mask
            # 将修正后的数据拼接起来作为输出
            outputs = torch.cat([first_half, second_half], dim=1)
            return outputs, None


class VitsResidualCouplingBlock(nn.Module):
    def __init__(self, config: VitsConfig):
        super().__init__()
        # 创建多个 VitsResidualCouplingLayer 层作为流
        self.flows = nn.ModuleList()
        for _ in range(config.prior_encoder_num_flows):
            self.flows.append(VitsResidualCouplingLayer(config))

    def forward(self, inputs, padding_mask, global_conditioning=None, reverse=False):
        if not reverse:
            # 如果不是反向模式,则对每个流执行正向操作
            for flow in self.flows:
                inputs, _ = flow(inputs, padding_mask, global_conditioning)
                inputs = torch.flip(inputs, [1])  # 沿维度 1(通道维度)翻转,使耦合层交替处理另一半通道
        else:
            # 如果是反向模式,则对每个流执行反向操作
            for flow in reversed(self.flows):
                inputs = torch.flip(inputs, [1])  # 沿维度 1(通道维度)翻转,与正向时的翻转相对应
                inputs, _ = flow(inputs, padding_mask, global_conditioning, reverse=True)
        return inputs


# 定义 VitsDilatedDepthSeparableConv 类,实现带膨胀(dilated)卷积的深度可分离卷积堆叠
class VitsDilatedDepthSeparableConv(nn.Module):
    # 初始化方法,接受一个VitsConfig类型的配置和一个可选的dropout_rate参数
    def __init__(self, config: VitsConfig, dropout_rate=0.0):
        # 调用父类的初始化方法
        super().__init__()
        # 获取配置中的参数并赋值给本地变量
        kernel_size = config.duration_predictor_kernel_size
        channels = config.hidden_size
        self.num_layers = config.depth_separable_num_layers

        # 创建一个Dropout层,用于随机丢弃输入数据中的部分神经元
        self.dropout = nn.Dropout(dropout_rate)
        # 初始化一个ModuleList用于存放深度可分离卷积层
        self.convs_dilated = nn.ModuleList()
        # 初始化一个ModuleList用于存放逐点卷积层
        self.convs_pointwise = nn.ModuleList()
        # 初始化一个ModuleList用于存放LayerNorm层1
        self.norms_1 = nn.ModuleList()
        # 初始化一个ModuleList用于存放LayerNorm层2
        self.norms_2 = nn.ModuleList()
        
        # 循环创建num_layers个深度可分离卷积、逐点卷积、LayerNorm层1和LayerNorm层2
        for i in range(self.num_layers):
            # 计算当前层的膨胀系数和填充数
            dilation = kernel_size**i
            padding = (kernel_size * dilation - dilation) // 2
            # 添加一个深度可分离卷积层到ModuleList中
            self.convs_dilated.append(
                nn.Conv1d(
                    in_channels=channels,
                    out_channels=channels,
                    kernel_size=kernel_size,
                    groups=channels,
                    dilation=dilation,
                    padding=padding,
                )
            )
            # 添加一个逐点卷积层到ModuleList中
            self.convs_pointwise.append(nn.Conv1d(channels, channels, 1))
            # 添加一个LayerNorm层1到ModuleList中
            self.norms_1.append(nn.LayerNorm(channels))
            # 添加一个LayerNorm层2到ModuleList中
            self.norms_2.append(nn.LayerNorm(channels))

    # 前向传播方法,接受输入数据、填充遮罩和全局条件(可选),返回处理后的数据
    def forward(self, inputs, padding_mask, global_conditioning=None):
        # 如果有全局条件,则将输入数据和全局条件相加
        if global_conditioning is not None:
            inputs = inputs + global_conditioning

        # 循环进行num_layers次操作
        for i in range(self.num_layers):
            # 应用深度可分离卷积层到输入数据,并乘以填充遮罩
            hidden_states = self.convs_dilated[i](inputs * padding_mask)
            # 对卷积后的隐藏状态应用LayerNorm层1
            hidden_states = self.norms_1[i](hidden_states.transpose(1, -1)).transpose(1, -1)
            # 应用GELU激活函数
            hidden_states = nn.functional.gelu(hidden_states)
            # 应用逐点卷积层到激活后的隐藏状态
            hidden_states = self.convs_pointwise[i](hidden_states)
            # 对逐点卷积后的隐藏状态应用LayerNorm层2
            hidden_states = self.norms_2[i](hidden_states.transpose(1, -1)).transpose(1, -1)
            # 应用GELU激活函数
            hidden_states = nn.functional.gelu(hidden_states)
            # 应用Dropout层到隐藏状态
            hidden_states = self.dropout(hidden_states)
            # 将输入数据和处理后的隐藏状态相加,得到下一层的输入数据
            inputs = inputs + hidden_states

        # 返回处理后的输入数据乘以填充遮罩
        return inputs * padding_mask
# 定义一个名为 VitsConvFlow 的自定义神经网络模块,继承自 nn.Module 类
class VitsConvFlow(nn.Module):
    # 初始化函数,接受一个 VitsConfig 类型的配置对象作为参数
    def __init__(self, config: VitsConfig):
        super().__init__()
        # 设置卷积层的输出通道数为隐藏大小
        self.filter_channels = config.hidden_size
        # 将通道数设置为深度可分离卷积通道数(depth_separable_channels)的一半
        self.half_channels = config.depth_separable_channels // 2
        # 设置持续时间预测流的分箱数
        self.num_bins = config.duration_predictor_flow_bins
        # 设置持续时间预测的尾部边界
        self.tail_bound = config.duration_predictor_tail_bound

        # 定义预卷积层,输入通道数为半通道数,输出通道数为过滤器通道数
        self.conv_pre = nn.Conv1d(self.half_channels, self.filter_channels, 1)
        # 定义扩展的深度可分离卷积层
        self.conv_dds = VitsDilatedDepthSeparableConv(config)
        # 定义投影卷积层,输入通道数为过滤器通道数,输出通道数为半通道数乘以(分箱数乘以3再减1)
        self.conv_proj = nn.Conv1d(self.filter_channels, self.half_channels * (self.num_bins * 3 - 1), 1)

    # 前向传播函数,接受输入、填充掩码、全局条件(可选)、是否反向(可选)作为参数
    def forward(self, inputs, padding_mask, global_conditioning=None, reverse=False):
        # 将输入张量按通道数的一半分割成两部分
        first_half, second_half = torch.split(inputs, [self.half_channels] * 2, dim=1)

        # 对第一部分进行预卷积操作
        hidden_states = self.conv_pre(first_half)
        # 对预卷积结果进行深度可分离卷积操作
        hidden_states = self.conv_dds(hidden_states, padding_mask, global_conditioning)
        # 对深度可分离卷积结果进行投影卷积,并乘以填充掩码
        hidden_states = self.conv_proj(hidden_states) * padding_mask

        # 获取批次大小、通道数和长度
        batch_size, channels, length = first_half.shape
        # 重塑隐藏状态张量的形状,并对维度进行置换
        hidden_states = hidden_states.reshape(batch_size, channels, -1, length).permute(0, 1, 3, 2)

        # 提取未归一化的宽度、高度和导数
        unnormalized_widths = hidden_states[..., : self.num_bins] / math.sqrt(self.filter_channels)
        unnormalized_heights = hidden_states[..., self.num_bins : 2 * self.num_bins] / math.sqrt(self.filter_channels)
        unnormalized_derivatives = hidden_states[..., 2 * self.num_bins :]

        # 使用非约束有理二次样条函数对第二部分进行变换,并返回变换后的结果和对数绝对值行列式
        second_half, log_abs_det = _unconstrained_rational_quadratic_spline(
            second_half,
            unnormalized_widths,
            unnormalized_heights,
            unnormalized_derivatives,
            reverse=reverse,
            tail_bound=self.tail_bound,
        )

        # 将第一部分和变换后的第二部分连接起来,并乘以填充掩码
        outputs = torch.cat([first_half, second_half], dim=1) * padding_mask
        # 如果不是反向传播,则计算对数行列式,并返回结果和对数行列式
        if not reverse:
            log_determinant = torch.sum(log_abs_det * padding_mask, [1, 2])
            return outputs, log_determinant
        # 如果是反向传播,则只返回结果
        else:
            return outputs, None


# 定义一个名为 VitsElementwiseAffine 的自定义神经网络模块,继承自 nn.Module 类
class VitsElementwiseAffine(nn.Module):
    # 初始化函数,接受一个 VitsConfig 类型的配置对象作为参数
    def __init__(self, config: VitsConfig):
        super().__init__()
        # 设置通道数为深度可分离卷积的通道数
        self.channels = config.depth_separable_channels
        # 定义平移参数和对数尺度参数作为可训练参数
        self.translate = nn.Parameter(torch.zeros(self.channels, 1))
        self.log_scale = nn.Parameter(torch.zeros(self.channels, 1))

    # 前向传播函数,接受输入、填充掩码、全局条件(可选)、是否反向(可选)作为参数
    def forward(self, inputs, padding_mask, global_conditioning=None, reverse=False):
        # 如果不是反向传播,则计算输出并乘以填充掩码
        if not reverse:
            outputs = self.translate + torch.exp(self.log_scale) * inputs
            outputs = outputs * padding_mask
            log_determinant = torch.sum(self.log_scale * padding_mask, [1, 2])
            return outputs, log_determinant
        # 如果是反向传播,则计算输出并返回结果
        else:
            outputs = (inputs - self.translate) * torch.exp(-self.log_scale) * padding_mask
            return outputs, None
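
# 补充示意(非原始源码):逐元素仿射流同样可逆(假设默认 VitsConfig,并随机化参数以避免恒等映射)。
_affine = VitsElementwiseAffine(VitsConfig())
_affine.translate.data.normal_()
_affine.log_scale.data.normal_()
_affine_x = torch.randn(1, _affine.channels, 6)
_affine_mask = torch.ones(1, 1, 6)
_affine_y, _ = _affine(_affine_x, _affine_mask)
_affine_x_rec, _ = _affine(_affine_y, _affine_mask, reverse=True)
print(torch.max((_affine_x_rec - _affine_x).abs()))  # 预期非常接近 0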


# 定义 VitsStochasticDurationPredictor 类,即基于归一化流的随机时长预测器
class VitsStochasticDurationPredictor(nn.Module):
    # 初始化函数,用于初始化对象
    def __init__(self, config):
        # 调用父类的初始化方法
        super().__init__()
        # 从配置中获取说话人嵌入大小作为嵌入维度
        embed_dim = config.speaker_embedding_size
        # 从配置中获取隐藏层大小作为卷积滤波器的通道数
        filter_channels = config.hidden_size

        # 定义预处理卷积层,输入和输出通道数都为 filter_channels,卷积核大小为 1
        self.conv_pre = nn.Conv1d(filter_channels, filter_channels, 1)
        # 定义投影卷积层,输入和输出通道数都为 filter_channels,卷积核大小为 1
        self.conv_proj = nn.Conv1d(filter_channels, filter_channels, 1)
        
        # 创建 VitsDilatedDepthSeparableConv 模块,配置中包括 dropout 率
        self.conv_dds = VitsDilatedDepthSeparableConv(
            config,
            dropout_rate=config.duration_predictor_dropout,
        )

        # 如果嵌入维度不为 0,则定义条件卷积层,将嵌入维度映射到 filter_channels
        if embed_dim != 0:
            self.cond = nn.Conv1d(embed_dim, filter_channels, 1)

        # 创建流模块列表,第一个元素是 VitsElementwiseAffine 模块
        self.flows = nn.ModuleList()
        self.flows.append(VitsElementwiseAffine(config))
        
        # 根据配置循环创建多个 VitsConvFlow 模块,用于流模块列表
        for _ in range(config.duration_predictor_num_flows):
            self.flows.append(VitsConvFlow(config))

        # 定义后处理的预处理卷积层,输入通道数为 1,输出通道数为 filter_channels,卷积核大小为 1
        self.post_conv_pre = nn.Conv1d(1, filter_channels, 1)
        # 定义后处理的投影卷积层,输入和输出通道数都为 filter_channels,卷积核大小为 1
        self.post_conv_proj = nn.Conv1d(filter_channels, filter_channels, 1)
        
        # 创建后处理的 VitsDilatedDepthSeparableConv 模块,配置中包括 dropout 率
        self.post_conv_dds = VitsDilatedDepthSeparableConv(
            config,
            dropout_rate=config.duration_predictor_dropout,
        )

        # 创建后处理流模块列表,第一个元素是 VitsElementwiseAffine 模块
        self.post_flows = nn.ModuleList()
        self.post_flows.append(VitsElementwiseAffine(config))
        
        # 根据配置循环创建多个 VitsConvFlow 模块,用于后处理流模块列表
        for _ in range(config.duration_predictor_num_flows):
            self.post_flows.append(VitsConvFlow(config))
class VitsDurationPredictor(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 从配置中获取预测器的参数
        kernel_size = config.duration_predictor_kernel_size
        filter_channels = config.duration_predictor_filter_channels

        # 定义模型的各个层和模块
        self.dropout = nn.Dropout(config.duration_predictor_dropout)
        # 第一个卷积层,用于特征提取
        self.conv_1 = nn.Conv1d(config.hidden_size, filter_channels, kernel_size, padding=kernel_size // 2)
        self.norm_1 = nn.LayerNorm(filter_channels, eps=config.layer_norm_eps)
        # 第二个卷积层,用于进一步提取特征
        self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2)
        self.norm_2 = nn.LayerNorm(filter_channels, eps=config.layer_norm_eps)
        # 最后的投影层,用于预测持续时间
        self.proj = nn.Conv1d(filter_channels, 1, 1)

        # 如果有说话者嵌入的大小,则定义条件卷积层
        if config.speaker_embedding_size != 0:
            self.cond = nn.Conv1d(config.speaker_embedding_size, config.hidden_size, 1)

    def forward(self, inputs, padding_mask, global_conditioning=None):
        # 将输入从计算图中分离(detach),防止时长预测器的梯度回传到上游模块
        inputs = torch.detach(inputs)

        # 如果有全局条件信息,则将其加入到输入中
        if global_conditioning is not None:
            global_conditioning = torch.detach(global_conditioning)
            inputs = inputs + self.cond(global_conditioning)

        # 第一层卷积,激活函数,层归一化和 dropout
        inputs = self.conv_1(inputs * padding_mask)
        inputs = torch.relu(inputs)
        inputs = self.norm_1(inputs.transpose(1, -1)).transpose(1, -1)
        inputs = self.dropout(inputs)

        # 第二层卷积,激活函数,层归一化和 dropout
        inputs = self.conv_2(inputs * padding_mask)
        inputs = torch.relu(inputs)
        inputs = self.norm_2(inputs.transpose(1, -1)).transpose(1, -1)
        inputs = self.dropout(inputs)

        # 最终的投影层,用于生成持续时间预测
        inputs = self.proj(inputs * padding_mask)
        return inputs * padding_mask


class VitsAttention(nn.Module):
    """Multi-headed attention with relative positional representation."""

    def __init__(self, config: VitsConfig):
        super().__init__()
        # 从配置中获取注意力机制的参数
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.dropout = config.attention_dropout
        self.window_size = config.window_size

        # 计算每个头的维度
        self.head_dim = self.embed_dim // self.num_heads
        self.scaling = self.head_dim**-0.5

        # 检查隐藏层维度是否可以被头数整除
        if (self.head_dim * self.num_heads) != self.embed_dim:
            raise ValueError(
                f"hidden_size must be divisible by num_attention_heads (got `hidden_size`: {self.embed_dim}"
                f" and `num_attention_heads`: {self.num_heads})."
            )

        # 定义键、值、查询和输出的线性投影层
        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias)

        # 如果定义了窗口大小,则使用相对位置表示
        if self.window_size:
            self.emb_rel_k = nn.Parameter(torch.randn(1, self.window_size * 2 + 1, self.head_dim) * self.scaling)
            self.emb_rel_v = nn.Parameter(torch.randn(1, self.window_size * 2 + 1, self.head_dim) * self.scaling)
    # 将输入张量重新形状为指定的形状,用于多头注意力机制中
    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    # 实现 Transformer 模型的前向传播
    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ):
        # 这里是前向传播函数,接收多个输入参数并进行计算,返回输出结果

    # 获取相对位置嵌入的方法
    def _get_relative_embeddings(self, relative_embeddings, length):
        pad_length = max(length - (self.window_size + 1), 0)
        if pad_length > 0:
            # 在相对嵌入张量的长度维度上进行填充
            relative_embeddings = nn.functional.pad(relative_embeddings, [0, 0, pad_length, pad_length, 0, 0])

        slice_start_position = max((self.window_size + 1) - length, 0)
        slice_end_position = slice_start_position + 2 * length - 1
        # 切片获取相对位置嵌入的部分
        return relative_embeddings[:, slice_start_position:slice_end_position]

    # 将相对位置转换为绝对位置
    def _relative_position_to_absolute_position(self, x):
        batch_heads, length, _ = x.size()

        # 在最后一列上进行填充,以进行相对索引到绝对索引的转换
        x = nn.functional.pad(x, [0, 1, 0, 0, 0, 0])

        # 扩展额外元素以匹配形状 (len+1, 2*len-1)
        x_flat = x.view([batch_heads, length * 2 * length])
        x_flat = nn.functional.pad(x_flat, [0, length - 1, 0, 0])

        # 重塑并切片去除填充元素
        x_final = x_flat.view([batch_heads, length + 1, 2 * length - 1])
        x_final = x_final[:, :length, length - 1 :]
        return x_final

    # 将绝对位置转换为相对位置
    def _absolute_position_to_relative_position(self, x):
        batch_heads, length, _ = x.size()

        # 沿着列维度进行填充
        x = nn.functional.pad(x, [0, length - 1, 0, 0, 0, 0])
        x_flat = x.view([batch_heads, length * (2 * length - 1)])

        # 在重塑后的元素前面添加 0,以平移元素位置
        x_flat = nn.functional.pad(x_flat, [length, 0, 0, 0])
        x_final = x_flat.view([batch_heads, length, 2 * length])[:, :, 1:]
        return x_final
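
# 补充示意(非原始源码):相对位置 <-> 绝对位置索引变换的形状检查,假设使用默认 VitsConfig。
_attn = VitsAttention(VitsConfig())
_rel_scores = torch.randn(2, 4, 7)        # (batch*heads, length, 2*length-1),这里 length=4
_abs_scores = _attn._relative_position_to_absolute_position(_rel_scores)
print(_abs_scores.shape)                   # torch.Size([2, 4, 4])
_rel_back = _attn._absolute_position_to_relative_position(_abs_scores)
print(_rel_back.shape)                     # torch.Size([2, 4, 7])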
# 定义 VitsFeedForward 类,继承自 nn.Module,用于实现 Vits 模型的前馈网络部分
class VitsFeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 第一个卷积层,输入通道数为 config.hidden_size,输出通道数为 config.ffn_dim,卷积核大小为 config.ffn_kernel_size
        self.conv_1 = nn.Conv1d(config.hidden_size, config.ffn_dim, config.ffn_kernel_size)
        # 第二个卷积层,输入通道数为 config.ffn_dim,输出通道数为 config.hidden_size,卷积核大小为 config.ffn_kernel_size
        self.conv_2 = nn.Conv1d(config.ffn_dim, config.hidden_size, config.ffn_kernel_size)
        # Dropout 层,用于随机失活,参数为 config.activation_dropout
        self.dropout = nn.Dropout(config.activation_dropout)

        # 根据配置文件中的 hidden_act 参数确定激活函数
        if isinstance(config.hidden_act, str):
            self.act_fn = ACT2FN[config.hidden_act]
        else:
            self.act_fn = config.hidden_act

        # 如果卷积核大小大于 1,则设置填充值以保证卷积操作不改变张量的维度
        if config.ffn_kernel_size > 1:
            pad_left = (config.ffn_kernel_size - 1) // 2
            pad_right = config.ffn_kernel_size // 2
            self.padding = [pad_left, pad_right, 0, 0, 0, 0]
        else:
            self.padding = None

    # 前向传播函数,接受 hidden_states 和 padding_mask 作为输入
    def forward(self, hidden_states, padding_mask):
        # 调整 hidden_states 的维度顺序,使得通道维度变为第二维度
        hidden_states = hidden_states.permute(0, 2, 1)
        padding_mask = padding_mask.permute(0, 2, 1)

        # 将 hidden_states 和 padding_mask 进行逐元素乘法
        hidden_states = hidden_states * padding_mask
        
        # 如果有设置填充值,对 hidden_states 进行填充操作
        if self.padding is not None:
            hidden_states = nn.functional.pad(hidden_states, self.padding)

        # 经过第一个卷积层、激活函数、以及 Dropout 层的处理
        hidden_states = self.conv_1(hidden_states)
        hidden_states = self.act_fn(hidden_states)
        hidden_states = self.dropout(hidden_states)

        # 再次经过逐元素乘法操作
        hidden_states = hidden_states * padding_mask

        # 如果有设置填充值,再次对 hidden_states 进行填充操作
        if self.padding is not None:
            hidden_states = nn.functional.pad(hidden_states, self.padding)

        # 经过第二个卷积层的处理
        hidden_states = self.conv_2(hidden_states)

        # 再次经过逐元素乘法操作
        hidden_states = hidden_states * padding_mask

        # 调整 hidden_states 的维度顺序,使得通道维度恢复到最后一维
        hidden_states = hidden_states.permute(0, 2, 1)
        
        # 返回处理后的 hidden_states
        return hidden_states


# 定义 VitsEncoderLayer 类,继承自 nn.Module,用于实现 Vits 模型的编码器层
class VitsEncoderLayer(nn.Module):
    def __init__(self, config: VitsConfig):
        super().__init__()
        # 自注意力机制层
        self.attention = VitsAttention(config)
        # Dropout 层,用于随机失活,参数为 config.hidden_dropout
        self.dropout = nn.Dropout(config.hidden_dropout)
        # LayerNorm 层,用于归一化输入数据
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # 前馈网络层
        self.feed_forward = VitsFeedForward(config)
        # 最终归一化层
        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    # 前向传播函数,接受 hidden_states、padding_mask、attention_mask 和 output_attentions 作为输入
    def forward(
        self,
        hidden_states: torch.Tensor,
        padding_mask: torch.FloatTensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ):
        # 保存残差连接
        residual = hidden_states
        
        # 自注意力机制层的前向传播,返回处理后的 hidden_states 和注意力权重
        hidden_states, attn_weights = self.attention(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
        )

        # 经过 Dropout 层处理
        hidden_states = self.dropout(hidden_states)
        
        # 残差连接和 LayerNorm 层的处理
        hidden_states = self.layer_norm(residual + hidden_states)

        # 保存新的残差连接
        residual = hidden_states
        
        # 前馈网络层的前向传播
        hidden_states = self.feed_forward(hidden_states, padding_mask)
        
        # 经过 Dropout 层处理
        hidden_states = self.dropout(hidden_states)
        
        # 最终归一化层的处理
        hidden_states = self.final_layer_norm(residual + hidden_states)

        # 输出结果保存在 outputs 中
        outputs = (hidden_states,)

        # 如果需要输出注意力权重,则将 attn_weights 加入到 outputs 中
        if output_attentions:
            outputs += (attn_weights,)

        # 返回 outputs
        return outputs


# 定义 VitsEncoder 类,由多个 VitsEncoderLayer 堆叠而成的 Transformer 编码器
class VitsEncoder(nn.Module):
    # 初始化函数,用于创建一个新的VitsEncoder对象
    def __init__(self, config: VitsConfig):
        # 调用父类的初始化函数,确保继承父类的属性和方法
        super().__init__()
        # 将传入的配置对象保存在实例变量中,以便在类中的其他方法中使用
        self.config = config
        # 创建一个包含多个VitsEncoderLayer对象的模块列表,数量由配置中的num_hidden_layers指定
        self.layers = nn.ModuleList([VitsEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        # 设置梯度检查点标志为False,表示不使用梯度检查点功能
        self.gradient_checkpointing = False
        # 设置层丢弃率,从配置中获取
        self.layerdrop = config.layerdrop

    # 前向传播函数,定义了VitsEncoder对象的数据流向
    def forward(
        self,
        hidden_states: torch.FloatTensor,
        padding_mask: torch.FloatTensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    # 定义函数的返回类型为元组或 BaseModelOutput 类型
    ) -> Union[Tuple, BaseModelOutput]:
    
        # 如果需要输出隐藏状态,则初始化为空元组,否则为 None
        all_hidden_states = () if output_hidden_states else None
        # 如果需要输出注意力权重,则初始化为空元组,否则为 None
        all_self_attentions = () if output_attentions else None
    
        # 扩展 attention_mask 到四维张量
        if attention_mask is not None:
            # 将二维注意力掩码扩展为四维张量 [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)
    
        # 对隐藏状态应用填充掩码
        hidden_states = hidden_states * padding_mask
    
        # 检查是否启用了 DeepSpeed Zero3
        deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled()
    
        # 遍历所有的编码器层
        for encoder_layer in self.layers:
            # 如果输出隐藏状态,则将当前隐藏状态添加到 all_hidden_states 中
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)
    
            # 添加 LayerDrop(参见 https://arxiv.org/abs/1909.11556 进行描述)
            dropout_probability = np.random.uniform(0, 1)
    
            # 计算是否跳过当前层
            skip_the_layer = self.training and (dropout_probability < self.layerdrop)
            
            # 如果不跳过当前层或者 DeepSpeed Zero3 已启用
            if not skip_the_layer or deepspeed_zero3_is_enabled:
                # 如果启用了梯度检查点且在训练模式下,则使用梯度检查点函数计算层输出
                if self.gradient_checkpointing and self.training:
                    layer_outputs = self._gradient_checkpointing_func(
                        encoder_layer.__call__,
                        hidden_states,
                        padding_mask,
                        attention_mask,
                        output_attentions,
                    )
                else:
                    # 否则直接调用编码器层计算层输出
                    layer_outputs = encoder_layer(
                        hidden_states,
                        attention_mask=attention_mask,
                        padding_mask=padding_mask,
                        output_attentions=output_attentions,
                    )
                # 更新隐藏状态为当前层的输出的第一个元素
                hidden_states = layer_outputs[0]
    
            # 如果跳过当前层,则设置层输出为 None
            if skip_the_layer:
                layer_outputs = (None, None)
    
            # 如果输出注意力权重,则将当前层的注意力权重添加到 all_self_attentions 中
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
    
        # 对最终的隐藏状态再次应用填充掩码
        hidden_states = hidden_states * padding_mask
    
        # 如果输出隐藏状态,则将最终的隐藏状态添加到 all_hidden_states 中
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)
    
        # 如果不返回字典形式的结果,则返回所有非 None 的结果元组
        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
    
        # 返回 BaseModelOutput 类型的结果
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )
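The loop above implements LayerDrop (https://arxiv.org/abs/1909.11556): during training each encoder layer is skipped with probability `layerdrop`, while at inference (and under DeepSpeed ZeRO-3) every layer runs. A stripped-down sketch of just the skipping logic, with hypothetical `layers` callables:

```python
import numpy as np

def run_layers(hidden_states, layers, layerdrop=0.1, training=True):
    """Apply each layer in turn, randomly skipping whole layers during training (LayerDrop)."""
    for layer in layers:
        skip_the_layer = training and (np.random.uniform(0, 1) < layerdrop)
        if not skip_the_layer:
            hidden_states = layer(hidden_states)
    return hidden_states

# e.g. with trivial callables standing in for encoder layers:
print(run_layers(1.0, [lambda x: x + 1, lambda x: x * 2], layerdrop=0.5))
```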
class VitsTextEncoder(nn.Module):
    """
    Transformer encoder that uses relative positional representation instead of absolute positional encoding.
    """

    def __init__(self, config: VitsConfig):
        super().__init__()
        self.config = config
        # Token embedding layer: vocab_size entries of dimension hidden_size, with pad_token_id as the padding index
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
        # The Transformer encoder
        self.encoder = VitsEncoder(config)
        # Projection layer: a kernel-size-1 Conv1d producing 2 * flow_size channels
        self.project = nn.Conv1d(config.hidden_size, config.flow_size * 2, kernel_size=1)

    def get_input_embeddings(self):
        # Return the token embedding layer
        return self.embed_tokens

    def set_input_embeddings(self, value):
        # Replace the token embedding layer
        self.embed_tokens = value

    def forward(
        self,
        input_ids: torch.Tensor,
        padding_mask: torch.FloatTensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple[torch.Tensor], VitsTextEncoderOutput]:
        # Embed the tokens and scale by sqrt(hidden_size)
        hidden_states = self.embed_tokens(input_ids) * math.sqrt(self.config.hidden_size)

        # Run the encoder over the embedded inputs
        encoder_outputs = self.encoder(
            hidden_states=hidden_states,
            padding_mask=padding_mask,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Take the last hidden state from the encoder outputs
        last_hidden_state = encoder_outputs[0] if not return_dict else encoder_outputs.last_hidden_state

        # Project the last hidden state with the Conv1d and mask out padding positions
        stats = self.project(last_hidden_state.transpose(1, 2)).transpose(1, 2) * padding_mask
        # Split the projected statistics into prior means and prior log-variances
        prior_means, prior_log_variances = torch.split(stats, self.config.flow_size, dim=2)

        if not return_dict:
            # Return a plain tuple when a dict output is not requested
            outputs = (last_hidden_state, prior_means, prior_log_variances) + encoder_outputs[1:]
            return outputs

        # Otherwise build a VitsTextEncoderOutput
        return VitsTextEncoderOutput(
            last_hidden_state=last_hidden_state,
            prior_means=prior_means,
            prior_log_variances=prior_log_variances,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
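The projection at the end maps the encoder output to `2 * flow_size` channels and splits it into the prior means and log-variances used by the flow. A standalone sketch with toy sizes (hidden_size=4 and flow_size=3 are made up for illustration):

```python
import torch
import torch.nn as nn

hidden_size, flow_size = 4, 3                        # made-up sizes for illustration
project = nn.Conv1d(hidden_size, flow_size * 2, kernel_size=1)

last_hidden_state = torch.randn(1, 7, hidden_size)   # (batch, seq_len, hidden)
padding_mask = torch.ones(1, 7, 1)                    # 1 = real token, 0 = padding

# Conv1d expects (batch, channels, seq_len), hence the two transposes
stats = project(last_hidden_state.transpose(1, 2)).transpose(1, 2) * padding_mask
prior_means, prior_log_variances = torch.split(stats, flow_size, dim=2)
print(prior_means.shape, prior_log_variances.shape)  # torch.Size([1, 7, 3]) twice
```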


class VitsPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # Configuration class for this model
    config_class = VitsConfig
    # Prefix used for the base model
    base_model_prefix = "vits"
    # Name of the main input
    main_input_name = "input_ids"
    # Gradient checkpointing is supported
    supports_gradient_checkpointing = True
    # Initialize the weights of a module
    def _init_weights(self, module):
        """Initialize the weights"""
        # Linear layers
        if isinstance(module, nn.Linear):
            # Initialize the weights from a normal distribution
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            # Initialize the bias, if present, to zero
            if module.bias is not None:
                module.bias.data.zero_()

        # LayerNorm layers
        elif isinstance(module, nn.LayerNorm):
            # Initialize the bias to zero
            module.bias.data.zero_()
            # Initialize the weight to one
            module.weight.data.fill_(1.0)

        # 1D convolution layers
        elif isinstance(module, nn.Conv1d):
            # Initialize the weights with Kaiming initialization
            nn.init.kaiming_normal_(module.weight)
            # If there is a bias, derive the uniform bound from the groups, input channels and kernel size
            if module.bias is not None:
                k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
                nn.init.uniform_(module.bias, a=-k, b=k)

        # Embedding layers
        elif isinstance(module, nn.Embedding):
            # Initialize the weights from a normal distribution
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            # Zero out the embedding at the padding index, if one is set
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
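For `nn.Conv1d`, the bias bound `k = sqrt(groups / (in_channels * kernel_size))` mirrors PyTorch's default `1 / sqrt(fan_in)` rule. A quick numeric check with assumed layer sizes:

```python
import math

# Assumed example layer: 192 input channels, no grouping, kernel size 5
in_channels, groups, kernel_size = 192, 1, 5
k = math.sqrt(groups / (in_channels * kernel_size))
print(round(k, 4))  # 0.0323 -> bias initialized from U(-0.0323, 0.0323)
```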
# Docstring shared by the VITS models, describing the inheritance hierarchy and the generic library methods
VITS_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`VitsConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Docstring describing the inputs accepted by the VITS model
VITS_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0,
            1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        speaker_id (`int`, *optional*):
            Which speaker embedding to use. Only used for multispeaker models.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

# Add the model docstring describing the complete VITS model for text-to-speech synthesis
@add_start_docstrings(
    "The complete VITS model, for text-to-speech synthesis.",
    VITS_START_DOCSTRING,
)
class VitsModel(VitsPreTrainedModel):
    # Constructor for the full VITS model
    def __init__(self, config: VitsConfig):
        # Call the parent constructor with the config
        super().__init__(config)
        # Keep the config on the instance
        self.config = config
        # Text encoder
        self.text_encoder = VitsTextEncoder(config)
        # Residual coupling block (the normalizing flow)
        self.flow = VitsResidualCouplingBlock(config)
        # HiFi-GAN decoder
        self.decoder = VitsHifiGan(config)

        # Use either the stochastic or the fixed duration predictor, depending on the config
        if config.use_stochastic_duration_prediction:
            self.duration_predictor = VitsStochasticDurationPredictor(config)
        else:
            self.duration_predictor = VitsDurationPredictor(config)

        # For multi-speaker models, create the speaker embedding
        if config.num_speakers > 1:
            self.embed_speaker = nn.Embedding(config.num_speakers, config.speaker_embedding_size)

        # The posterior encoder is only used during training
        self.posterior_encoder = VitsPosteriorEncoder(config)

        # Synthesis parameters controlling the speaking rate, noise scale and duration noise scale
        self.speaking_rate = config.speaking_rate
        self.noise_scale = config.noise_scale
        self.noise_scale_duration = config.noise_scale_duration

        # Run the post-initialization hook
        self.post_init()

    # Return the text encoder
    def get_encoder(self):
        return self.text_encoder
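For orientation, a typical inference call looks roughly like the following (the checkpoint name and the `waveform` field follow the MMS-TTS usage documented for `VitsModel`; treat this as a sketch rather than the canonical recipe):

```python
import torch
from transformers import AutoTokenizer, VitsModel

model = VitsModel.from_pretrained("facebook/mms-tts-eng")
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")

inputs = tokenizer("Hello, this is a test.", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

waveform = outputs.waveform  # (batch, num_samples) audio sampled at model.config.sampling_rate
print(waveform.shape)
```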

    # Forward pass of the model
    @add_start_docstrings_to_model_forward(VITS_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=VitsModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        speaker_id: Optional[int] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.FloatTensor] = None,

.\models\vits\tokenization_vits.py

# coding=utf-8
# Copyright 2023 The Kakao Enterprise Authors, the MMS-TTS Authors and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization class for VITS."""


import json
import os
import re
from typing import Any, Dict, List, Optional, Tuple, Union

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import is_phonemizer_available, logging


if is_phonemizer_available():
    import phonemizer


logger = logging.get_logger(__name__)

# Name of the vocabulary file
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json"}

# Map from pretrained checkpoints to their vocabulary files
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "facebook/mms-tts-eng": "https://huggingface.co/facebook/mms-tts-eng/resolve/main/vocab.json",
    }
}

# Maximum input sizes for the pretrained checkpoints
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    # The model does not have a hard maximum input length; 4096 is used as a nominal value
    "facebook/mms-tts-eng": 4096,
}


def has_non_roman_characters(input_string):
    # Pattern matching any character outside the ASCII range
    non_roman_pattern = re.compile(r"[^\x00-\x7F]")

    # Search the input string for non-Roman characters
    match = non_roman_pattern.search(input_string)
    has_non_roman = match is not None
    return has_non_roman
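The check is a plain ASCII-range test, so any character outside `\x00-\x7F` counts as "non-Roman", including accented Latin letters. For example:

```python
print(has_non_roman_characters("hello world"))  # False
print(has_non_roman_characters("こんにちは"))     # True
print(has_non_roman_characters("café"))          # True - accented Latin letters are outside ASCII too
```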


class VitsTokenizer(PreTrainedTokenizer):
    """
    Construct a VITS tokenizer. Also supports MMS-TTS.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        language (`str`, *optional*):
            Language identifier.
        add_blank (`bool`, *optional*, defaults to `True`):
            Whether to insert token id 0 in between the other tokens.
        normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the input text by removing all casing and punctuation.
        phonemize (`bool`, *optional*, defaults to `True`):
            Whether to convert the input text into phonemes.
        is_uroman (`bool`, *optional*, defaults to `False`):
            Whether the `uroman` Romanizer needs to be applied to the input text prior to tokenizing.
    """

    # Names of the vocabulary files
    vocab_files_names = VOCAB_FILES_NAMES
    # Map from pretrained checkpoints to their vocabulary files
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    # Maximum input sizes for the pretrained checkpoints
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    # Names of the model inputs
    model_input_names = ["input_ids", "attention_mask"]
    # Constructor creating a new tokenizer instance
    def __init__(
        self,
        vocab_file,                # path to the vocabulary file
        pad_token="<pad>",         # padding token, defaults to "<pad>"
        unk_token="<unk>",         # unknown token, defaults to "<unk>"
        language=None,             # language identifier, defaults to None
        add_blank=True,            # whether to insert blank tokens, defaults to True
        normalize=True,            # whether to normalize the text, defaults to True
        phonemize=True,            # whether to phonemize the text, defaults to True
        is_uroman=False,           # whether the input must be romanized with uroman first, defaults to False
        **kwargs,                  # additional keyword arguments
    ) -> None:
        # Open the vocabulary file and load it as a dict into self.encoder
        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)

        # Build the reverse mapping used to decode ids back into tokens
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.language = language        # language identifier
        self.add_blank = add_blank      # whether blank tokens are inserted between characters
        self.normalize = normalize      # whether the input text is normalized
        self.phonemize = phonemize      # whether the input text is phonemized
        self.is_uroman = is_uroman      # whether the input must be romanized with uroman first

        # Call the parent constructor, forwarding the same arguments plus any extra kwargs
        super().__init__(
            pad_token=pad_token,
            unk_token=unk_token,
            language=language,
            add_blank=add_blank,
            normalize=normalize,
            phonemize=phonemize,
            is_uroman=is_uroman,
            **kwargs,
        )

    @property
    def vocab_size(self):
        # The vocabulary size is the number of entries in self.encoder
        return len(self.encoder)

    def get_vocab(self):
        # Build and return the vocabulary dict mapping tokens to ids
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)  # include any added tokens as well
        return vocab

    def normalize_text(self, input_string):
        """Lowercase the input string, respecting any special token ids that may be part or entirely upper-cased."""
        # Collect every vocabulary entry, including added special tokens
        all_vocabulary = list(self.encoder.keys()) + list(self.added_tokens_encoder.keys())
        filtered_text = ""

        i = 0
        # Walk through the input string character by character
        while i < len(input_string):
            found_match = False
            # Try every vocabulary entry at the current position
            for word in all_vocabulary:
                # If the substring starting here matches the vocabulary entry, keep it verbatim
                if input_string[i : i + len(word)] == word:
                    filtered_text += word  # keep the matched entry as-is
                    i += len(word)          # advance past the matched entry
                    found_match = True
                    break

            # Otherwise lower-case the current character and move on
            if not found_match:
                filtered_text += input_string[i].lower()
                i += 1

        return filtered_text
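The effect is greedy, vocabulary-aware lower-casing: exact vocabulary entries (including upper-cased special tokens) are copied through unchanged, everything else is lower-cased. A toy re-implementation of the same loop over a hypothetical vocabulary:

```python
# Toy re-implementation of the greedy matching above, with a hypothetical vocabulary
vocabulary = ["<unk>", "a", "b", "c"]

def toy_normalize(text):
    out, i = "", 0
    while i < len(text):
        for word in vocabulary:
            if text[i : i + len(word)] == word:
                out += word          # keep exact vocabulary matches unchanged
                i += len(word)
                break
        else:
            out += text[i].lower()   # otherwise lower-case the character
            i += 1
    return out

print(toy_normalize("A<unk>BC"))  # 'a<unk>bc' - the special token survives, letters are lower-cased
```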

    def _preprocess_char(self, text):
        """Special treatment of characters in certain languages"""
        # For Romanian ("ron"), replace the comma-below character with the cedilla variant
        if self.language == "ron":
            text = text.replace("ț", "ţ")  # replace "ț" with "ţ"
        return text

    def prepare_for_tokenization(
        self, text: str,                        # the input text string
        is_split_into_words: bool = False,      # whether the input is already split into words, defaults to False
        normalize: Optional[bool] = None,       # whether to normalize the text, optional, defaults to None
        **kwargs                               # additional keyword arguments
    ):
        """
        Performs any necessary transformations before tokenization.

        This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the
        `kwargs` at the end of the encoding process to be sure all the arguments have been used.

        Args:
            text (`str`):
                The text to prepare.
            is_split_into_words (`bool`, *optional*, defaults to `False`):
                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
                which it will tokenize.
            normalize (`bool`, *optional*, defaults to `None`):
                Whether or not to apply punctuation and casing normalization to the text inputs. Typically, VITS is
                trained on lower-cased and un-punctuated text. Hence, normalization is used to ensure that the input
                text consists only of lower-case characters.
            kwargs (`Dict[str, Any]`, *optional*):
                Keyword arguments to use for the tokenization.

        Returns:
            `Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
        """
        # Determine the normalization setting to use
        normalize = normalize if normalize is not None else self.normalize

        # Normalize the text if required
        if normalize:
            # Normalize text casing and punctuation
            text = self.normalize_text(text)

        # Preprocess text to filter unwanted characters
        filtered_text = self._preprocess_char(text)

        # Check for non-Roman characters if the tokenizer is set to uroman
        if has_non_roman_characters(filtered_text) and self.is_uroman:
            # Issue a warning if non-Roman characters are detected
            logger.warning(
                "Text to the tokenizer contains non-Roman characters. Ensure the `uroman` Romanizer is "
                "applied to the text prior to passing it to the tokenizer. See "
                "`https://github.com/isi-nlp/uroman` for details."
            )

        # Apply phonemization if enabled
        if self.phonemize:
            # Check if phonemizer is available
            if not is_phonemizer_available():
                # Raise an error if phonemizer is not installed
                raise ImportError("Please install the `phonemizer` Python package to use this tokenizer.")

            # Phonemize the filtered text
            filtered_text = phonemizer.phonemize(
                filtered_text,
                language="en-us",
                backend="espeak",
                strip=True,
                preserve_punctuation=True,
                with_stress=True,
            )
            # Replace multiple spaces with a single space
            filtered_text = re.sub(r"\s+", " ", filtered_text)
        elif normalize:
            # Strip characters outside of the vocabulary (punctuation)
            filtered_text = "".join(list(filter(lambda char: char in self.encoder, filtered_text))).strip()

        # Return the processed text and remaining kwargs
        return filtered_text, kwargs
    def _tokenize(self, text: str) -> List[str]:
        """Tokenize a string by inserting the `<pad>` token at the boundary between adjacent characters."""
        # Split the string into a list of individual characters
        tokens = list(text)

        # If blank tokens are enabled, intersperse them between (and around) the characters
        if self.add_blank:
            # Create a list with the token for id 0 in every slot
            interspersed = [self._convert_id_to_token(0)] * (len(tokens) * 2 + 1)
            interspersed[1::2] = tokens  # place the original characters at the odd positions
            tokens = interspersed

        return tokens  # return the processed token list
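With `add_blank=True` the token for id 0 (normally the pad token) is interleaved between and around the characters, so the sequence length becomes `2 * n + 1`. For example, assuming id 0 decodes to `"<pad>"`:

```python
tokens = list("hi")
interspersed = ["<pad>"] * (len(tokens) * 2 + 1)  # assuming id 0 decodes to "<pad>"
interspersed[1::2] = tokens
print(interspersed)  # ['<pad>', 'h', '<pad>', 'i', '<pad>']
```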

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        # When blank tokens were inserted and there is more than one token, keep only the odd positions
        if self.add_blank and len(tokens) > 1:
            tokens = tokens[1::2]
        return "".join(tokens)  # join the tokens into a single string

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        # Look the token up in the vocabulary, falling back to the id of the unknown token
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        # Look the index up in the reverse vocabulary
        return self.decoder.get(index)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Union[Tuple[str], None]:
        # If the save directory does not exist, log an error and return
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return

        # Build the full path of the vocabulary file
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        # Write the vocabulary to the file as JSON
        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        return (vocab_file,)  # return a tuple containing the path of the saved vocabulary file
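A short usage sketch for saving the vocabulary (the directory name is arbitrary, and it must already exist because of the `isdir` check above):

```python
import os
from transformers import VitsTokenizer

tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
os.makedirs("./mms-tts-vocab", exist_ok=True)
(vocab_path,) = tokenizer.save_vocabulary("./mms-tts-vocab")
print(vocab_path)  # ./mms-tts-vocab/vocab.json
```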

.\models\vits\__init__.py

# Copyright notice and license information
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Import the type-checking helper
from typing import TYPE_CHECKING

# Import the required utilities from the local utils module
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_sentencepiece_available,
    is_speech_available,
    is_torch_available,
)

# Define the import structure used for lazy loading
_import_structure = {
    "configuration_vits": [
        "VITS_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "VitsConfig",
    ],
    "tokenization_vits": ["VitsTokenizer"],
}

# Check whether Torch is available
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # If Torch is available, add the model-related imports
    _import_structure["modeling_vits"] = [
        "VITS_PRETRAINED_MODEL_ARCHIVE_LIST",
        "VitsModel",
        "VitsPreTrainedModel",
    ]

# During type checking, import the specific classes and constants from their modules
if TYPE_CHECKING:
    from .configuration_vits import (
        VITS_PRETRAINED_CONFIG_ARCHIVE_MAP,
        VitsConfig,
    )
    from .tokenization_vits import VitsTokenizer

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_vits import (
            VITS_PRETRAINED_MODEL_ARCHIVE_LIST,
            VitsModel,
            VitsPreTrainedModel,
        )

# Otherwise set up lazy loading
else:
    import sys

    # Replace the current module with a lazy-loading module object
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

.\models\vit_hybrid\configuration_vit_hybrid.py

# File encoding: UTF-8
# Copyright notice and license terms
# This file is licensed under the Apache License, Version 2.0;
# it may only be used in compliance with that license.
# A copy of the License is available at http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, the software is
# distributed on an "AS IS" basis, without warranties or conditions of any kind,
# either express or implied. See the License for the governing permissions and limitations.
""" ViT Hybrid model configuration"""

# Import the required classes and functions
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto.configuration_auto import CONFIG_MAPPING
from ..bit import BitConfig

# Get a logger for this module
logger = logging.get_logger(__name__)

# Map from pretrained configuration names to the URLs of their config files
VIT_HYBRID_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "google/vit-hybrid-base-bit-384": "https://huggingface.co/vit-hybrid-base-bit-384/resolve/main/config.json",
    # See all ViT hybrid models at https://huggingface.co/models?filter=vit
}

# ViT Hybrid configuration class, inheriting from PretrainedConfig
class ViTHybridConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`ViTHybridModel`]. It is used to instantiate a ViT
    Hybrid model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the ViT Hybrid
    [google/vit-hybrid-base-bit-384](https://huggingface.co/google/vit-hybrid-base-bit-384) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Example:

    ```
    >>> from transformers import ViTHybridConfig, ViTHybridModel

    >>> # Initializing a ViT Hybrid vit-hybrid-base-bit-384 style configuration
    >>> configuration = ViTHybridConfig()

    >>> # Initializing a model (with random weights) from the vit-hybrid-base-bit-384 style configuration
    >>> model = ViTHybridModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    # The model type is "vit-hybrid"
    model_type = "vit-hybrid"

    # Constructor defining all configuration parameters
    def __init__(
        self,
        backbone_config=None,
        backbone=None,
        use_pretrained_backbone=False,
        use_timm_backbone=False,
        backbone_kwargs=None,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.0,
        attention_probs_dropout_prob=0.0,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        image_size=224,
        patch_size=1,
        num_channels=3,
        backbone_featmap_shape=[1, 1024, 24, 24],
        qkv_bias=True,
        **kwargs,
        ):
        super().__init__(**kwargs)
        # Pretrained backbones are not supported, so raise if one is requested
        if use_pretrained_backbone:
            raise ValueError("Pretrained backbones are not supported yet.")

        # `backbone` and `backbone_config` are mutually exclusive
        if backbone_config is not None and backbone is not None:
            raise ValueError("You can't specify both `backbone` and `backbone_config`.")

        # If neither `backbone_config` nor `backbone` is given, fall back to a default BiT backbone config
        if backbone_config is None and backbone is None:
            logger.info("`backbone_config` is `None`. Initializing the config with a `BiT` backbone.")
            backbone_config = {
                "global_padding": "same",
                "layer_type": "bottleneck",
                "depths": [3, 4, 9],
                "out_features": ["stage3"],
                "embedding_dynamic_padding": True,
            }

        # `backbone_kwargs` and `backbone_config` are mutually exclusive
        if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None:
            raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.")

        # If `backbone_config` is a dict, build the matching backbone configuration class from its `model_type`
        if isinstance(backbone_config, dict):
            if "model_type" in backbone_config:
                backbone_config_class = CONFIG_MAPPING[backbone_config["model_type"]]
            else:
                logger.info(
                    "`model_type` is not found in `backbone_config`. Use `Bit` as the backbone configuration class."
                )
                backbone_config_class = BitConfig
            backbone_config = backbone_config_class(**backbone_config)

        # Store all attributes on the config instance
        self.backbone_featmap_shape = backbone_featmap_shape
        self.backbone_config = backbone_config
        self.backbone = backbone
        self.use_pretrained_backbone = use_pretrained_backbone
        self.use_timm_backbone = use_timm_backbone
        self.backbone_kwargs = backbone_kwargs
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.qkv_bias = qkv_bias
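A sketch of instantiating the config with an explicit BiT backbone dictionary (the values mirror the defaults above and are illustrative only, not a recommended setup):

```python
from transformers import ViTHybridConfig, ViTHybridModel

backbone_config = {
    "model_type": "bit",
    "global_padding": "same",
    "layer_type": "bottleneck",
    "depths": [3, 4, 9],
    "out_features": ["stage3"],
    "embedding_dynamic_padding": True,
}

config = ViTHybridConfig(backbone_config=backbone_config, image_size=384)
model = ViTHybridModel(config)  # randomly initialized weights
print(config.backbone_config.__class__.__name__)  # BitConfig
```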