Transformers Source Code Walkthrough (Part 114)

.\models\udop\__init__.py

# 版权声明和许可证声明,指出此文件受版权保护,并遵循Apache License 2.0
#
# from...utils中导入所需的模块和函数
from typing import TYPE_CHECKING

from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_sentencepiece_available,
    is_tokenizers_available,
    is_torch_available,
)

# 定义导入结构,用于延迟导入模块和函数
_import_structure = {
    "configuration_udop": ["UDOP_PRETRAINED_CONFIG_ARCHIVE_MAP", "UdopConfig"],
    "processing_udop": ["UdopProcessor"],
}

# 尝试导入句子分词器,若不可用则引发OptionalDependencyNotAvailable异常
try:
    if not is_sentencepiece_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["tokenization_udop"] = ["UdopTokenizer"]

# 尝试导入tokenizers,若不可用则引发OptionalDependencyNotAvailable异常
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["tokenization_udop_fast"] = ["UdopTokenizerFast"]

# 尝试导入torch,若不可用则引发OptionalDependencyNotAvailable异常
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_udop"] = [
        "UDOP_PRETRAINED_MODEL_ARCHIVE_LIST",
        "UdopForConditionalGeneration",
        "UdopPreTrainedModel",
        "UdopModel",
        "UdopEncoderModel",
    ]

# 如果在类型检查模式下,导入具体模块和类
if TYPE_CHECKING:
    from .configuration_udop import UDOP_PRETRAINED_CONFIG_ARCHIVE_MAP, UdopConfig
    from .processing_udop import UdopProcessor

    try:
        if not is_sentencepiece_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .tokenization_udop import UdopTokenizer

    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .tokenization_udop_fast import UdopTokenizerFast

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_udop import (
            UDOP_PRETRAINED_MODEL_ARCHIVE_LIST,
            UdopEncoderModel,
            UdopForConditionalGeneration,
            UdopModel,
            UdopPreTrainedModel,
        )

# 若不是类型检查模式,将该模块设置为LazyModule的实例,以支持延迟加载
else:
    import sys

    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
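
# The practical effect of this pattern: `import transformers.models.udop` stays cheap, and the
# heavy submodules (tokenizers, PyTorch models) are only imported when one of their names is
# first accessed. A minimal sketch of the idea behind `_LazyModule` follows (an illustration of
# the mechanism, not the actual implementation in transformers.utils):
import importlib
import types


class LazySubmoduleProxy(types.ModuleType):
    """Toy version of _LazyModule: defer submodule imports until attribute access."""

    def __init__(self, name, import_structure):
        super().__init__(name)
        # Map each public name to the submodule that defines it, e.g. "UdopConfig" -> "configuration_udop"
        self._name_to_module = {attr: sub for sub, attrs in import_structure.items() for attr in attrs}

    def __getattr__(self, attr):
        submodule = self._name_to_module.get(attr)
        if submodule is None:
            raise AttributeError(f"module {self.__name__!r} has no attribute {attr!r}")
        module = importlib.import_module(f".{submodule}", self.__name__)  # imported only now
        value = getattr(module, attr)
        setattr(self, attr, value)  # cache so later lookups bypass __getattr__
        return value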

.\models\umt5\configuration_umt5.py

# coding=utf-8
# 定义文件编码为UTF-8

# 版权声明,版权归2023年T5作者及HuggingFace Inc.所有
#
# 根据Apache许可证2.0版发布;除非符合许可证要求,否则不得使用本文件
# 可以在以下网址获取许可证的副本:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意,本软件是基于“按原样提供”的基础分发的,
# 没有任何明示或暗示的担保或条件。请参阅许可证了解特定语言的权限和限制。
""" UMT5模型配置 """
# 导入所需模块
from typing import Mapping

# 从configuration_utils模块导入预训练配置类PretrainedConfig
from ...configuration_utils import PretrainedConfig
# 从onnx模块导入OnnxSeq2SeqConfigWithPast配置类
from ...onnx import OnnxSeq2SeqConfigWithPast
# 从utils模块导入logging工具
from ...utils import logging

# 获取logger对象
logger = logging.get_logger(__name__)

# UMT5预训练模型配置文件映射表,将模型名称映射到其配置文件的URL
UMT5_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "google/umt5-small": "https://huggingface.co/google/umt5-small/resolve/main/config.json",
    # 查看所有umt5模型,请访问https://huggingface.co/models?filter=umt5
}

# UMT5配置类,继承自PretrainedConfig
class UMT5Config(PretrainedConfig):
    r"""
    这是用于存储[`UMT5Model`]配置的配置类。根据指定的参数实例化UMT5模型,定义模型架构。
    使用默认值实例化配置将生成类似于UMT5 [google/umt5-small](https://huggingface.co/google/umt5-small) 架构的配置。

    配置对象继承自[`PretrainedConfig`],可以用于控制模型的输出。阅读[`PretrainedConfig`]的文档以获取更多信息。
    """
    pass  # 空白的配置类,仅作为文档的说明用途
    # 模型类型为 "umt5"
    model_type = "umt5"
    
    # 推理阶段忽略的关键字列表
    keys_to_ignore_at_inference = ["past_key_values"]
    
    # 属性映射字典,将旧名称映射到新名称
    attribute_map = {"hidden_size": "d_model", "num_attention_heads": "num_heads", "num_hidden_layers": "num_layers"}
    # 初始化方法,设置模型的各种参数和默认值
    def __init__(
        self,
        vocab_size=250112,  # 词汇表大小,默认为250112
        d_model=512,  # 模型的维度,默认为512
        d_kv=64,  # 键值的维度,默认为64
        d_ff=1024,  # 前馈神经网络中间层的维度,默认为1024
        num_layers=8,  # 编码器和解码器层数,默认为8
        num_decoder_layers=None,  # 解码器层数,默认与编码器层数相同
        num_heads=6,  # 注意力头的数量,默认为6
        relative_attention_num_buckets=32,  # 相对注意力的桶数量,默认为32
        relative_attention_max_distance=128,  # 相对注意力的最大距离,默认为128
        dropout_rate=0.1,  # dropout率,默认为0.1
        layer_norm_epsilon=1e-6,  # Layer Normalization的epsilon,默认为1e-6
        initializer_factor=1.0,  # 初始化因子,默认为1.0
        feed_forward_proj="gated-gelu",  # 前馈网络的投影类型,默认为"gated-gelu"
        is_encoder_decoder=True,  # 是否为编码器-解码器模型,默认为True
        use_cache=True,  # 是否使用缓存,默认为True
        tokenizer_class="T5Tokenizer",  # Tokenizer类的名称,默认为"T5Tokenizer"
        tie_word_embeddings=True,  # 是否共享编码器和解码器的词嵌入,默认为True
        pad_token_id=0,  # 填充token的ID,默认为0
        eos_token_id=1,  # 结束token的ID,默认为1
        decoder_start_token_id=0,  # 解码器起始token的ID,默认为0
        classifier_dropout=0.0,  # 分类器的dropout率,默认为0.0
        **kwargs,  # 其他参数,用于接收未命名的关键字参数
    ):
        self.vocab_size = vocab_size  # 初始化词汇表大小
        self.d_model = d_model  # 初始化模型维度
        self.d_kv = d_kv  # 初始化键值维度
        self.d_ff = d_ff  # 初始化前馈神经网络中间层维度
        self.num_layers = num_layers  # 初始化层数
        self.num_decoder_layers = (
            num_decoder_layers if num_decoder_layers is not None else self.num_layers
        )  # 初始化解码器层数,如果未指定则与编码器层数相同
        self.num_heads = num_heads  # 初始化注意力头数量
        self.relative_attention_num_buckets = relative_attention_num_buckets  # 初始化相对注意力的桶数量
        self.relative_attention_max_distance = relative_attention_max_distance  # 初始化相对注意力的最大距离
        self.dropout_rate = dropout_rate  # 初始化dropout率
        self.classifier_dropout = classifier_dropout  # 初始化分类器的dropout率
        self.layer_norm_epsilon = layer_norm_epsilon  # 初始化Layer Normalization的epsilon
        self.initializer_factor = initializer_factor  # 初始化初始化因子
        self.feed_forward_proj = feed_forward_proj  # 初始化前馈网络的投影类型
        self.use_cache = use_cache  # 初始化是否使用缓存
    
        act_info = self.feed_forward_proj.split("-")  # 根据"-"分割前馈网络投影类型
        self.dense_act_fn = act_info[-1]  # 密集层的激活函数名称
        self.is_gated_act = act_info[0] == "gated"  # 判断是否为门控激活函数类型
    
        # 检查前馈网络投影类型是否合法
        if len(act_info) > 1 and act_info[0] != "gated" or len(act_info) > 2:
            raise ValueError(
                f"`feed_forward_proj`: {feed_forward_proj} is not a valid activation function of the dense layer. "
                "Please make sure `feed_forward_proj` is of the format `gated-{ACT_FN}` or `{ACT_FN}`, e.g. "
                "'gated-gelu' or 'relu'"
            )
    
        # 如果前馈网络投影类型为"gated-gelu",则将密集层的激活函数名称设置为"gelu_new"
        if feed_forward_proj == "gated-gelu":
            self.dense_act_fn = "gelu_new"
    
        # 调用父类的初始化方法,传递其他参数
        super().__init__(
            is_encoder_decoder=is_encoder_decoder,
            tokenizer_class=tokenizer_class,
            tie_word_embeddings=tie_word_embeddings,
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
            decoder_start_token_id=decoder_start_token_id,
            **kwargs,
        )
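
# A quick check of how `feed_forward_proj` is parsed by the constructor above (a sketch that
# only relies on the defaults defined in this class):
from transformers import UMT5Config

config = UMT5Config()                   # feed_forward_proj defaults to "gated-gelu"
print(config.is_gated_act)              # True  -> the gated feed-forward variant is used
print(config.dense_act_fn)              # "gelu_new" (special-cased for "gated-gelu")

relu_config = UMT5Config(feed_forward_proj="relu")
print(relu_config.is_gated_act)         # False
print(relu_config.dense_act_fn)         # "relu"

# Anything that is neither "{ACT_FN}" nor "gated-{ACT_FN}" raises a ValueError,
# e.g. UMT5Config(feed_forward_proj="gated-gelu-extra")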

# ONNX export configuration for UMT5, based on OnnxSeq2SeqConfigWithPast
class UMT5OnnxConfig(OnnxSeq2SeqConfigWithPast):
    @property
    # Copied from transformers.models.t5.configuration_t5.T5OnnxConfig.inputs
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        # Dynamic axes shared by all exports: encoder inputs
        common_inputs = {
            "input_ids": {0: "batch", 1: "encoder_sequence"},
            "attention_mask": {0: "batch", 1: "encoder_sequence"},
        }
        # When past key/values are used, the attention mask covers past + current tokens
        # and only a single new decoder token is fed per step
        if self.use_past:
            common_inputs["attention_mask"][1] = "past_encoder_sequence + sequence"
            common_inputs["decoder_input_ids"] = {0: "batch"}
            common_inputs["decoder_attention_mask"] = {0: "batch", 1: "past_decoder_sequence + sequence"}
        else:
            # Without a cache, the full decoder sequence is a dynamic axis
            common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"}
            common_inputs["decoder_attention_mask"] = {0: "batch", 1: "decoder_sequence"}

        # Add one entry per layer for the cached past key/values
        if self.use_past:
            self.fill_with_past_key_values_(common_inputs, direction="inputs")

        return common_inputs

    @property
    # Copied from transformers.models.t5.configuration_t5.T5OnnxConfig.default_onnx_opset
    def default_onnx_opset(self) -> int:
        # Default ONNX opset version used for the export
        return 13

    @property
    def atol_for_validation(self) -> float:
        # Absolute tolerance used when validating the exported model's outputs
        return 5e-4
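
# For the default export (use_past=False), the `inputs` property above resolves to:
#
#     {
#         "input_ids":              {0: "batch", 1: "encoder_sequence"},
#         "attention_mask":         {0: "batch", 1: "encoder_sequence"},
#         "decoder_input_ids":      {0: "batch", 1: "decoder_sequence"},
#         "decoder_attention_mask": {0: "batch", 1: "decoder_sequence"},
#     }
#
# With use_past=True, only a single new decoder token is fed per step (so "decoder_input_ids"
# keeps just the batch axis), the encoder attention_mask axis becomes
# "past_encoder_sequence + sequence", and fill_with_past_key_values_ adds one dynamic-axes
# entry per layer for the cached past key/values.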

.\models\umt5\convert_umt5_checkpoint_to_pytorch.py

# 导入必要的库和模块
import argparse  # 用于解析命令行参数的模块
import collections  # 提供额外的数据结构,如Counter,OrderedDict等

import numpy as np  # 处理数值计算的库
import torch  # PyTorch深度学习框架
from flax import traverse_util  # Flax库,用于遍历和操作参数

from t5x import checkpoints  # 导入T5X库中的checkpoints模块

from transformers import MT5Config, UMT5EncoderModel, UMT5ForConditionalGeneration  # 导入transformers中的模型配置和模型类
from transformers.utils import logging  # 导入transformers中的日志功能模块


logging.set_verbosity_info()  # 设置日志的详细级别为INFO


def t5x_relpos_bias_lookup(params, i, prefix):
    """返回一个层的相对位置偏置参数。不进行转置。"""
    return params[f"{prefix}/{prefix}/relpos_bias/rel_embedding"][:, i, :]


def t5x_attention_lookup(params, i, prefix, layer_name="attention"):
    """返回(自)注意力的KOQV参数。不进行转置。"""
    k_tmp = np.ascontiguousarray(params[f"{prefix}/{prefix}/{layer_name}/key/kernel"][:, i, :, :])
    k = k_tmp.reshape(k_tmp.shape[0], k_tmp.shape[1] * k_tmp.shape[2])
    o_tmp = np.ascontiguousarray(params[f"{prefix}/{prefix}/{layer_name}/out/kernel"][:, i, :, :])
    o = o_tmp.reshape(o_tmp.shape[0] * o_tmp.shape[1], o_tmp.shape[2])
    q_tmp = np.ascontiguousarray(params[f"{prefix}/{prefix}/{layer_name}/query/kernel"][:, i, :, :])
    q = q_tmp.reshape(q_tmp.shape[0], q_tmp.shape[1] * q_tmp.shape[2])
    v_tmp = np.ascontiguousarray(params[f"{prefix}/{prefix}/{layer_name}/value/kernel"][:, i, :, :])
    v = v_tmp.reshape(v_tmp.shape[0], v_tmp.shape[1] * v_tmp.shape[2])
    return k, o, q, v
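
# Shape check for the reshapes above (a sketch with made-up dimensions): the T5X kernels stack
# all layers in one array, and the per-head axes are flattened into the 2-D matrix that a
# PyTorch nn.Linear expects; the transpose to (out_features, in_features) happens later, when
# the weights are assigned (e.g. `k.T`).
import numpy as np

d_model, num_layers, n_heads, d_kv = 512, 8, 6, 64
key_kernel = np.zeros((d_model, num_layers, n_heads, d_kv), dtype=np.float32)

i = 3  # pick one layer
k_tmp = np.ascontiguousarray(key_kernel[:, i, :, :])                 # (d_model, n_heads, d_kv)
k = k_tmp.reshape(k_tmp.shape[0], k_tmp.shape[1] * k_tmp.shape[2])   # (d_model, n_heads * d_kv)

print(k.shape)    # (512, 384)
print(k.T.shape)  # (384, 512) == nn.Linear(d_model, inner_dim).weight.shape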


def t5x_mlp_lookup(params, i, prefix, split_mlp_wi=False):
    """返回一个层的MLP参数。不进行转置。"""
    if split_mlp_wi:
        wi_0 = params[f"{prefix}/{prefix}/mlp/wi_0/kernel"][:, i, :]
        wi_1 = params[f"{prefix}/{prefix}/mlp/wi_1/kernel"][:, i, :]
        wi = (wi_0, wi_1)
    # 如果条件不满足,则取出指定路径下的参数,并选取特定索引对应的数据
    else:
        wi = params[f"{prefix}/{prefix}/mlp/wi/kernel"][:, i, :]

    # 取出指定路径下的参数,并选取特定索引对应的数据
    wo = params[f"{prefix}/{prefix}/mlp/wo/kernel"][:, i, :]
    # 返回获取的参数数据 wi 和 wo
    return wi, wo
# 返回指定层的层归一化参数
def t5x_layer_norm_lookup(params, i, prefix, layer_name):
    """Returns the layer norm param of a layer."""
    return params[f"{prefix}/{prefix}/{layer_name}/scale"][:, i]

# 将 T5X-Flax 模型参数转换为 Transformers-PyTorch 的格式
def convert_t5x_to_pytorch(
    variables: dict, *, num_layers: int, is_encoder_only: bool, scalable_attention: bool = False
):
    """Converts the parameters from T5X-Flax to Transformers-PyTorch."""
    # 将目标参数展开为扁平结构
    old = traverse_util.flatten_dict(variables["target"])
    # 使用斜杠连接键,构建新的字典
    old = {"/".join(k): v for k, v in old.items()}

    # v1.1 模型中的 MLP 使用 wi_0 和 wi_1 替代 wi
    split_mlp_wi = "encoder/encoder/mlp/wi_0/kernel" in old
    # 打印是否分割了 MLP
    print("Split MLP:", split_mlp_wi)

    # 新的参数字典,使用有序字典保持顺序
    new = collections.OrderedDict()

    # 共享的嵌入层权重
    new["shared.weight"] = old["token_embedder/embedding"]

    # 编码器部分
    for i in range(num_layers):
        # Block i, layer 0 (Self Attention).
        # 获取层归一化参数
        layer_norm = t5x_layer_norm_lookup(old, i, "encoder", "pre_attention_layer_norm")
        new[f"encoder.block.{i}.layer.0.layer_norm.weight"] = layer_norm
        # 获取注意力机制中的参数
        k, o, q, v = t5x_attention_lookup(old, i, "encoder", "attention")
        new[f"encoder.block.{i}.layer.0.SelfAttention.k.weight"] = k.T
        new[f"encoder.block.{i}.layer.0.SelfAttention.o.weight"] = o.T
        new[f"encoder.block.{i}.layer.0.SelfAttention.q.weight"] = q.T
        new[f"encoder.block.{i}.layer.0.SelfAttention.v.weight"] = v.T

        # Block i, layer 1 (MLP).
        # 获取层归一化参数
        layer_norm = t5x_layer_norm_lookup(old, i, "encoder", "pre_mlp_layer_norm")
        new[f"encoder.block.{i}.layer.1.layer_norm.weight"] = layer_norm
        # 获取 MLP 的参数
        wi, wo = t5x_mlp_lookup(old, i, "encoder", split_mlp_wi)
        if split_mlp_wi:
            new[f"encoder.block.{i}.layer.1.DenseReluDense.wi_0.weight"] = wi[0].T
            new[f"encoder.block.{i}.layer.1.DenseReluDense.wi_1.weight"] = wi[1].T
        else:
            new[f"encoder.block.{i}.layer.1.DenseReluDense.wi.weight"] = wi.T
        new[f"encoder.block.{i}.layer.1.DenseReluDense.wo.weight"] = wo.T
        # 如果启用可扩展注意力机制,转换每层的相对位置编码
        if scalable_attention:
            new[f"encoder.block.{i}.layer.0.SelfAttention.relative_attention_bias.weight"] = t5x_relpos_bias_lookup(
                old, i, "encoder"
            ).T

    # 最终编码器层的归一化参数
    new["encoder.final_layer_norm.weight"] = old["encoder/encoder_norm/scale"]

    # 如果不使用可扩展注意力机制,转换第一个编码器和解码器块的相对注意力偏置
    if not scalable_attention:
        new["encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight"] = t5x_relpos_bias_lookup(
            old, 0, "encoder"
        ).T
        new["decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight"] = t5x_relpos_bias_lookup(
            old, 0, "decoder"
        ).T
    # 如果不是仅编码器模式,则执行解码器相关操作
    if not is_encoder_only:
        # 解码器部分的循环,遍历每个层次
        for i in range(num_layers):
            # 第 i 块,第 0 层 (自注意力层)
            # 获取预自注意力层规范化权重
            layer_norm = t5x_layer_norm_lookup(old, i, "decoder", "pre_self_attention_layer_norm")
            # 获取自注意力层的 k, o, q, v 权重
            k, o, q, v = t5x_attention_lookup(old, i, "decoder", "self_attention")
            # 更新新模型的权重:层规范化权重、k、o、q、v
            new[f"decoder.block.{i}.layer.0.layer_norm.weight"] = layer_norm
            new[f"decoder.block.{i}.layer.0.SelfAttention.k.weight"] = k.T
            new[f"decoder.block.{i}.layer.0.SelfAttention.o.weight"] = o.T
            new[f"decoder.block.{i}.layer.0.SelfAttention.q.weight"] = q.T
            new[f"decoder.block.{i}.layer.0.SelfAttention.v.weight"] = v.T

            # 第 i 块,第 1 层 (跨注意力层)
            # 获取预交叉注意力层规范化权重
            layer_norm = t5x_layer_norm_lookup(old, i, "decoder", "pre_cross_attention_layer_norm")
            # 获取跨注意力层的 k, o, q, v 权重
            k, o, q, v = t5x_attention_lookup(old, i, "decoder", "encoder_decoder_attention")
            # 更新新模型的权重:层规范化权重、k、o、q、v
            new[f"decoder.block.{i}.layer.1.layer_norm.weight"] = layer_norm
            new[f"decoder.block.{i}.layer.1.EncDecAttention.k.weight"] = k.T
            new[f"decoder.block.{i}.layer.1.EncDecAttention.o.weight"] = o.T
            new[f"decoder.block.{i}.layer.1.EncDecAttention.q.weight"] = q.T
            new[f"decoder.block.{i}.layer.1.EncDecAttention.v.weight"] = v.T

            # 第 i 块,第 2 层 (MLP 层)
            # 获取预MLP层规范化权重
            layer_norm = t5x_layer_norm_lookup(old, i, "decoder", "pre_mlp_layer_norm")
            # 获取MLP层的权重 wi 和 wo
            wi, wo = t5x_mlp_lookup(old, i, "decoder", split_mlp_wi)
            # 更新新模型的权重:层规范化权重、wi、wo
            new[f"decoder.block.{i}.layer.2.layer_norm.weight"] = layer_norm
            if split_mlp_wi:
                new[f"decoder.block.{i}.layer.2.DenseReluDense.wi_0.weight"] = wi[0].T
                new[f"decoder.block.{i}.layer.2.DenseReluDense.wi_1.weight"] = wi[1].T
            else:
                new[f"encoder.block.{i}.layer.2.DenseReluDense.wi.weight"] = wi.T
            new[f"decoder.block.{i}.layer.2.DenseReluDense.wo.weight"] = wo.T

            # 如果可扩展注意力为真,更新相对注意力偏置权重
            if scalable_attention:
                new[
                    f"decoder.block.{i}.layer.0.SelfAttention.relative_attention_bias.weight"
                ] = t5x_relpos_bias_lookup(old, i, "decoder").T

        # 更新最终解码器层规范化权重
        new["decoder.final_layer_norm.weight"] = old["decoder/decoder_norm/scale"]

        # 语言模型头部 (仅在 v1.1 版本的检查点中存在,在 v1.0 版本中使用嵌入)
        if "decoder/logits_dense/kernel" in old:
            # 更新新模型的语言模型头部权重
            new["lm_head.weight"] = old["decoder/logits_dense/kernel"].T

    # 返回更新后的新模型参数
    return new
# 准备一个 PyTorch 模型的状态字典
def make_state_dict(converted_params, is_encoder_only: bool):
    # 使用 torch 张量创建一个有序字典状态字典
    state_dict = collections.OrderedDict([(k, torch.from_numpy(v.copy())) for (k, v) in converted_params.items()])

    # 添加缺失的部分
    if "encoder.embed_tokens.weight" not in state_dict:
        state_dict["encoder.embed_tokens.weight"] = state_dict["shared.weight"]

    if not is_encoder_only:
        if "decoder.embed_tokens.weight" not in state_dict:
            state_dict["decoder.embed_tokens.weight"] = state_dict["shared.weight"]

        if "lm_head.weight" not in state_dict:  # 对于旧的 1.0 版本的模型
            print("Using shared word embeddings as lm_head.")
            state_dict["lm_head.weight"] = state_dict["shared.weight"]

    return state_dict


# 用 T5X 转换的参数替换模型的参数
def load_t5x_weights_in_t5(model, config, t5x_checkpoint_path, is_encoder_only, scalable_attention):
    variables = checkpoints.load_t5x_checkpoint(t5x_checkpoint_path)
    converted = convert_t5x_to_pytorch(
        variables, num_layers=config.num_layers, is_encoder_only=is_encoder_only, scalable_attention=scalable_attention
    )
    state_dict = make_state_dict(converted, is_encoder_only)
    model.load_state_dict(state_dict, strict=True)


# 将 T5X 检查点转换为 PyTorch 检查点
def convert_t5x_checkpoint_to_pytorch(
    t5x_checkpoint_path,
    config_file,
    pytorch_dump_path,
    is_encoder_only: bool = False,
    scalable_attention: bool = False,
):
    # 加载配置和模型,转换 T5X 检查点,保存 PyTorch 检查点
    config = MT5Config.from_json_file(config_file)
    print(f"Building PyTorch model from configuration: {config}")
    # 非 v1.1 检查点也可以使用 T5Model,但这对所有版本都有效
    # V1.0 检查点将简单地有一个LM头部,即词嵌入
    if is_encoder_only:
        model = UMT5EncoderModel(config)
    else:
        model = UMT5ForConditionalGeneration(config)

    # 从 tf 检查点加载权重
    load_t5x_weights_in_t5(model, config, t5x_checkpoint_path, is_encoder_only, scalable_attention)

    # 保存 PyTorch 模型
    print(f"Save PyTorch model to {pytorch_dump_path}")
    model.save_pretrained(pytorch_dump_path)

    # 验证是否可以加载检查点
    model.from_pretrained(pytorch_dump_path)
    print("Done")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Converts a native T5X checkpoint into a PyTorch checkpoint.")
    # 必填参数
    parser.add_argument(
        "--t5x_checkpoint_path", default=None, type=str, required=True, help="Path to the T5X checkpoint."
    )
    parser.add_argument(
        "--config_file",
        default=None,
        type=str,
        required=True,
        help="The config json file corresponding to the pre-trained T5 model.\nThis specifies the model architecture.",
    )
    # 添加一个参数:用于指定输出的 PyTorch 模型的路径
    parser.add_argument(
        "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
    )
    # Flag: set if the checkpoint is an encoder-only model (no decoder)
    parser.add_argument(
        "--is_encoder_only", action="store_true", help="Check if the model is encoder-decoder model", default=False
    )
    # Flag: set if the model uses "scalable attention", i.e. a relative attention bias in every layer (as in UMT5)
    parser.add_argument(
        "--scalable_attention",
        action="store_true",
        help="Whether the model uses scaled attention (umt5 model)",
        default=False,
    )
    # 解析命令行参数
    args = parser.parse_args()
    # 调用函数将 T5X 模型的检查点转换为 PyTorch 模型
    convert_t5x_checkpoint_to_pytorch(
        args.t5x_checkpoint_path,
        args.config_file,
        args.pytorch_dump_path,
        args.is_encoder_only,
        args.scalable_attention,
    )
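
# A typical invocation of this script (a sketch; the paths below are placeholders, not real checkpoints):
#
#   python convert_umt5_checkpoint_to_pytorch.py \
#       --t5x_checkpoint_path /path/to/t5x/checkpoint_1000000 \
#       --config_file /path/to/umt5/config.json \
#       --pytorch_dump_path /path/to/output_dir \
#       --scalable_attention
#
# Running it requires the `t5x` and `flax` packages in addition to `transformers` and `torch`,
# as the imports at the top of the file show.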

.\models\umt5\modeling_umt5.py

# 设置文件编码为 UTF-8
# 版权声明和许可信息,指定代码使用的许可证为 Apache License, Version 2.0
# 不可使用此文件,除非符合 Apache License, Version 2.0 的规定。可以通过上述链接获取许可证副本。
# 根据适用法律或书面同意,本软件按“原样”分发,无任何担保或条件。
# 有关详细信息,请参阅许可证的特定语言,限制和条件
""" PyTorch UMT5 模型."""

import copy  # 导入 copy 模块
import math  # 导入 math 模块
from typing import List, Optional, Tuple, Union  # 导入类型提示相关的模块

import torch  # 导入 PyTorch 库
from torch import nn  # 从 PyTorch 导入神经网络模块
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss  # 从 PyTorch 神经网络模块导入损失函数

from ...activations import ACT2FN  # 导入激活函数映射
from ...modeling_outputs import (  # 导入模型输出相关类
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
    Seq2SeqQuestionAnsweringModelOutput,
    Seq2SeqSequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel  # 导入预训练模型基类
from ...utils import (  # 导入辅助工具函数和常量
    DUMMY_INPUTS,
    DUMMY_MASK,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_torch_fx_proxy,
    logging,
    replace_return_docstrings,
)
from .configuration_umt5 import UMT5Config  # 导入 UMT5 模型的配置类


logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器

_CONFIG_FOR_DOC = "UMT5Config"  # 文档中显示的配置文件名称
_CHECKPOINT_FOR_DOC = "google/umt5-small"  # 文档中显示的检查点名称


# 从 transformers.models.t5.modeling_t5.T5LayerNorm 复制并改为 UMT5
class UMT5LayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        构造一个 UMT5 风格的 LayerNorm 模块。无偏差和无平均值减法。
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))  # 初始化权重参数为全1张量
        self.variance_epsilon = eps  # 方差的小值偏置

    def forward(self, hidden_states):
        # UMT5 使用一个只进行缩放而不进行偏移的 LayerNorm,这也称为均方根层归一化
        # 因此,方差是在没有均值的情况下计算的,而且没有偏差。另外,我们要确保对于半精度输入的累积是在 fp32 中进行的

        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)  # 计算方差
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)  # 归一化操作

        # 如果权重数据类型是半精度浮点数或 BF16,则将隐藏状态转换为相应的数据类型
        if self.weight.dtype in [torch.float16, torch.bfloat16]:
            hidden_states = hidden_states.to(self.weight.dtype)

        return self.weight * hidden_states  # 返回经归一化处理的隐藏状态乘以权重
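
# Numeric sanity check for the RMS-style norm above (a sketch; uses the UMT5LayerNorm class
# defined in this file): only the root mean square is used, with no mean subtraction and no bias.
import torch

x = torch.randn(2, 3, 8)
norm = UMT5LayerNorm(hidden_size=8, eps=1e-6)

manual = norm.weight * (x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-6))
print(torch.allclose(norm(x), manual))  # True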


# 从 transformers.models.t5.modeling_t5.T5DenseActDense 复制并改为 UMT5
class UMT5DenseActDense(nn.Module):
    # 初始化方法,接收一个UMT5Config类型的配置参数
    def __init__(self, config: UMT5Config):
        # 调用父类的初始化方法
        super().__init__()
        # 创建一个线性层,输入维度为config.d_model,输出维度为config.d_ff,无偏置项
        self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
        # 创建一个线性层,输入维度为config.d_ff,输出维度为config.d_model,无偏置项
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
        # 创建一个以config.dropout_rate为丢弃率的Dropout层
        self.dropout = nn.Dropout(config.dropout_rate)
        # 根据配置选择合适的激活函数,并赋值给self.act
        self.act = ACT2FN[config.dense_act_fn]

    # 前向传播方法,接收隐藏状态hidden_states作为输入
    def forward(self, hidden_states):
        # 输入hidden_states经过self.wi线性层
        hidden_states = self.wi(hidden_states)
        # 经过激活函数self.act
        hidden_states = self.act(hidden_states)
        # 经过丢弃层self.dropout
        hidden_states = self.dropout(hidden_states)
        # 如果self.wo.weight是torch.Tensor类型,且hidden_states的数据类型不等于self.wo.weight的数据类型,且self.wo.weight的数据类型不是torch.int8
        if (
            isinstance(self.wo.weight, torch.Tensor)
            and hidden_states.dtype != self.wo.weight.dtype
            and self.wo.weight.dtype != torch.int8
        ):
            # 将hidden_states转换为self.wo.weight的数据类型
            hidden_states = hidden_states.to(self.wo.weight.dtype)
        # 输入hidden_states经过self.wo线性层
        hidden_states = self.wo(hidden_states)
        # 返回经过self.wo线性层后的hidden_states
        return hidden_states
# 从 transformers.models.t5.modeling_t5.T5DenseGatedActDense 复制代码,将 T5 替换为 UMT5
class UMT5DenseGatedActDense(nn.Module):
    def __init__(self, config: UMT5Config):
        super().__init__()
        # 定义一个线性层,将输入维度 config.d_model 映射到 config.d_ff,无偏置
        self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False)
        # 定义另一个线性层,同样将输入维度 config.d_model 映射到 config.d_ff,无偏置
        self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False)
        # 定义一个线性层,将输入维度 config.d_ff 映射回 config.d_model,无偏置
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
        # 定义一个 dropout 层,丢弃概率为 config.dropout_rate
        self.dropout = nn.Dropout(config.dropout_rate)
        # 根据配置选择激活函数 ACT2FN 中的一个作为 self.act
        self.act = ACT2FN[config.dense_act_fn]

    def forward(self, hidden_states):
        # 将 hidden_states 经过 self.wi_0 和激活函数 self.act 处理得到 hidden_gelu
        hidden_gelu = self.act(self.wi_0(hidden_states))
        # 将 hidden_states 经过 self.wi_1 处理得到 hidden_linear
        hidden_linear = self.wi_1(hidden_states)
        # 将 hidden_gelu 和 hidden_linear 逐元素相乘,得到新的 hidden_states
        hidden_states = hidden_gelu * hidden_linear
        # 对 hidden_states 进行 dropout 处理
        hidden_states = self.dropout(hidden_states)

        # 为了使得 8 位量化在 google/flan-t5-xxl 上工作,self.wo 保持为 float32 类型
        # 参考 https://github.com/huggingface/transformers/issues/20287
        # 同时确保权重不是 `int8` 类型,以防用户将 `_keep_in_fp32_modules` 强制设置为 `None`
        if (
            isinstance(self.wo.weight, torch.Tensor)
            and hidden_states.dtype != self.wo.weight.dtype
            and self.wo.weight.dtype != torch.int8
        ):
            # 将 hidden_states 转换为 self.wo.weight 的数据类型
            hidden_states = hidden_states.to(self.wo.weight.dtype)

        # 将 hidden_states 经过 self.wo 处理得到最终的输出
        hidden_states = self.wo(hidden_states)
        return hidden_states


# 从 transformers.models.t5.modeling_t5.T5LayerFF 复制代码,将 T5 替换为 UMT5
class UMT5LayerFF(nn.Module):
    def __init__(self, config: UMT5Config):
        super().__init__()
        # 如果配置为使用 gated activation,则使用 UMT5DenseGatedActDense,否则使用 UMT5DenseActDense
        if config.is_gated_act:
            self.DenseReluDense = UMT5DenseGatedActDense(config)
        else:
            self.DenseReluDense = UMT5DenseActDense(config)

        # 定义层归一化层,输入维度为 config.d_model,epsilon 为 config.layer_norm_epsilon
        self.layer_norm = UMT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        # 定义 dropout 层,丢弃概率为 config.dropout_rate
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states):
        # 对输入的 hidden_states 进行层归一化处理
        forwarded_states = self.layer_norm(hidden_states)
        # 将归一化后的 hidden_states 输入到 self.DenseReluDense 中进行处理
        forwarded_states = self.DenseReluDense(forwarded_states)
        # 将原始的 hidden_states 和经过 dropout 处理后的 forwarded_states 相加作为最终输出
        hidden_states = hidden_states + self.dropout(forwarded_states)
        return hidden_states


class UMT5Attention(nn.Module):
    """
    使用 relative_attention_bias 的 T5 注意力模块。
    """
    # 初始化函数,用于初始化一个注意力头部模型
    def __init__(self, config, has_relative_attention_bias=False):
        # 调用父类构造函数初始化
        super().__init__()
        # 根据配置设置是否为解码器
        self.is_decoder = config.is_decoder
        # 设置是否存在相对注意力偏置
        self.has_relative_attention_bias = has_relative_attention_bias
        # 相对注意力的桶数目
        self.relative_attention_num_buckets = config.relative_attention_num_buckets
        # 相对注意力的最大距离
        self.relative_attention_max_distance = config.relative_attention_max_distance
        # 模型的维度
        self.d_model = config.d_model
        # 键值映射的维度
        self.key_value_proj_dim = config.d_kv
        # 注意力头部的数量
        self.n_heads = config.num_heads
        # 丢弃率
        self.dropout = config.dropout_rate
        # 内部维度,等于头部数量乘以键值映射的维度
        self.inner_dim = self.n_heads * self.key_value_proj_dim

        # 创建线性层,用于查询
        self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
        # 创建线性层,用于键
        self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
        # 创建线性层,用于值
        self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
        # 创建线性层,用于输出
        self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)

        # 如果存在相对注意力偏置,则创建相对注意力偏置的嵌入层
        if self.has_relative_attention_bias:
            self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
        
        # 初始化剪枝的注意力头部集合为空集
        self.pruned_heads = set()

    # 重新形状函数,用于调整注意力头部的投影
    def _shape(self, projection: torch.Tensor) -> torch.Tensor:
        # 计算新的投影形状
        new_projection_shape = projection.size()[:-1] + (self.n_heads, self.key_value_proj_dim)
        # 调整投影的形状,将头部移动到第二个位置 (B, T, H * D) -> (B, T, H, D) -> (B, H, T, D)
        new_projection = projection.view(new_projection_shape).permute(0, 2, 1, 3)
        # 返回调整后的新投影
        return new_projection
    def _relative_position_bucket(self, relative_position):
        """
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        """
        relative_buckets = 0  # 初始化相对位置的桶号为0

        # 获取相对位置的桶数和最大距离
        num_buckets = self.relative_attention_num_buckets
        max_distance = self.relative_attention_max_distance

        # 如果不是解码器模式,调整桶数和相对位置
        if not self.is_decoder:
            num_buckets //= 2  # 桶数减半
            relative_buckets += (relative_position > 0).to(torch.long) * num_buckets  # 根据相对位置正负,选择桶号
            relative_position = torch.abs(relative_position)  # 取相对位置的绝对值
        else:
            relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))  # 如果是解码器模式,调整相对位置

        # 现在相对位置在区间[0, inf)

        # 将一半的桶用于精确增量位置
        max_exact = num_buckets // 2
        is_small = relative_position < max_exact  # 判断相对位置是否小于最大精确值

        # 另一半桶用于对数增量位置,直到最大距离
        log_ratio = torch.log(relative_position.float() / max_exact) / math.log(max_distance / max_exact)
        log_ratio = log_ratio * (num_buckets - max_exact)
        relative_position_if_large = max_exact + log_ratio.to(torch.long)
        relative_position_if_large = torch.min(
            relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
        )

        relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)  # 根据相对位置大小选择最终的桶号
        return relative_buckets  # 返回计算出的相对位置的桶号
    # 计算相对位置偏置
    def compute_bias(self, query_length, key_length, device=None):
        """Compute binned relative position bias"""
        # 如果设备未指定,则使用相对注意力偏置张量的设备
        if device is None:
            device = self.relative_attention_bias.weight.device
        # 创建上下文位置张量,包含长度为 query_length 的序列,dtype 为 long,设备为指定设备
        context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
        # 创建记忆位置张量,包含长度为 key_length 的序列,dtype 为 long,设备为指定设备
        memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
        # 计算相对位置,形状为 (query_length, key_length)
        relative_position = memory_position - context_position
        # 将相对位置转换为桶索引
        relative_position_bucket = self._relative_position_bucket(relative_position)
        # 使用相对注意力偏置张量计算偏置值,形状为 (query_length, key_length, num_heads)
        values = self.relative_attention_bias(relative_position_bucket)
        # 调整维度顺序,形状变为 (1, num_heads, query_length, key_length)
        values = values.permute([2, 0, 1]).unsqueeze(0)
        # 返回计算得到的偏置值张量
        return values
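
    # A quick way to inspect the bucketing and the bias shape (a sketch; uses the classes
    # defined in this file, with an encoder-style config, i.e. is_decoder=False):
    #
    #     config = UMT5Config(d_model=16, d_kv=8, num_heads=2)
    #     attn = UMT5Attention(config, has_relative_attention_bias=True)
    #     rel = torch.tensor([[-3, 0, 3, 20]])   # memory_position - query_position
    #     attn._relative_position_bucket(rel)    # tensor([[ 3,  0, 19, 26]])
    #     attn.compute_bias(4, 4).shape          # torch.Size([1, 2, 4, 4])
    #
    # In the bidirectional (encoder) case, leftward distances land in buckets [0, num_buckets/2)
    # and rightward distances in [num_buckets/2, num_buckets); within each half, small distances
    # get exact buckets and larger ones are binned logarithmically up to relative_attention_max_distance.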

    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
# UMT5 模型中的自注意力层定义,用于处理自注意力机制
class UMT5LayerSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 初始化自注意力层,配置是否包含相对注意力偏置
        self.SelfAttention = UMT5Attention(config, has_relative_attention_bias=True)
        # 初始化层归一化(Layer Normalization),输入维度为 config.d_model,epsilon 设置为 config.layer_norm_epsilon
        self.layer_norm = UMT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        # 初始化 dropout,丢弃率为 config.dropout_rate
        self.dropout = nn.Dropout(config.dropout_rate)

    # 定义前向传播函数
    def forward(
        self,
        hidden_states,
        attention_mask=None,
        layer_head_mask=None,
        past_key_value=None,
    ):
        # 对输入的 hidden_states 进行层归一化
        normed_hidden_states = self.layer_norm(hidden_states)
        # 将归一化后的 hidden_states 输入到 SelfAttention 层中进行自注意力计算
        attention_output = self.SelfAttention(
            normed_hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
        )
        # 将原始的 hidden_states 和经过 dropout 处理的 attention_output 相加,作为最终输出的 hidden_states
        hidden_states = hidden_states + self.dropout(attention_output[0])
        # 构建输出元组,包含更新后的 hidden_states 和可能的 attention 情况(如果有的话)
        outputs = (hidden_states,) + attention_output[1:]  # 如果有的话,添加 attention
        return outputs


# UMT5 模型中的编码-解码注意力层定义
class UMT5LayerCrossAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 初始化编码-解码注意力层,配置不包含相对注意力偏置
        self.EncDecAttention = UMT5Attention(config, has_relative_attention_bias=False)
        # 初始化层归一化(Layer Normalization),输入维度为 config.d_model,epsilon 设置为 config.layer_norm_epsilon
        self.layer_norm = UMT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        # 初始化 dropout,丢弃率为 config.dropout_rate
        self.dropout = nn.Dropout(config.dropout_rate)

    # 定义前向传播函数
    def forward(
        self,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        layer_head_mask=None,
        past_key_value=None,
    ):
        # 对输入的 hidden_states 进行层归一化
        normed_hidden_states = self.layer_norm(hidden_states)
        # 将归一化后的 hidden_states 输入到 EncDecAttention 层中进行编码-解码注意力计算
        attention_output = self.EncDecAttention(
            normed_hidden_states,
            encoder_hidden_states=encoder_hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
        )
        # 将原始的 hidden_states 和经过 dropout 处理的 attention_output 相加,作为最终输出的 hidden_states
        layer_output = hidden_states + self.dropout(attention_output[0])
        # 构建输出元组,包含更新后的 hidden_states 和可能的 attention 情况(如果有的话)
        outputs = (layer_output,) + attention_output[1:]  # 如果有的话,添加 attention
        return outputs


# UMT5 模型中的单个块定义,包含自注意力层、可能的编码-解码注意力层和前馈神经网络层
class UMT5Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 标志是否为解码器
        self.is_decoder = config.is_decoder
        # 层列表,用于存放块内的各层
        self.layer = nn.ModuleList()
        # 添加自注意力层到层列表中
        self.layer.append(UMT5LayerSelfAttention(config))
        # 如果是解码器,添加编码-解码注意力层到层列表中
        if self.is_decoder:
            self.layer.append(UMT5LayerCrossAttention(config))
        # 添加前馈神经网络层到层列表中
        self.layer.append(UMT5LayerFF(config))

    # 定义前向传播函数
    def forward(
        self,
        hidden_states,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        layer_head_mask=None,
        cross_attn_layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        output_attentions=False,
    ):
        # Self Attention
        # 如果过去的键/值对不为 None,则取其前两个元素作为当前自注意力层的缓存键/值对
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None

        # 调用第一个层的自注意力机制,处理隐藏状态
        hidden_states, self_attn_weights, present_key_value = self.layer[0](
            hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            past_key_value=self_attn_past_key_value,
        )

        # 如果隐藏状态的数据类型为 torch.float16,则将无穷大的值 clamp 到一个较小的值,以支持 fp16 训练
        if hidden_states.dtype == torch.float16:
            max_dtype = torch.finfo(hidden_states.dtype).max
            clamp_value = torch.where(torch.isinf(hidden_states).any(), max_dtype - 1000, max_dtype)
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        # Cross-Attention Block
        cross_attn_present_key_value = None
        cross_attn_weights = None
        # 如果模型是解码器且 encoder_hidden_states 不为 None,则进行交叉注意力计算
        do_cross_attention = self.is_decoder and encoder_hidden_states is not None
        if do_cross_attention:
            # 如果过去的键/值对不为 None,则取其后两个元素作为当前交叉注意力层的缓存键/值对
            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
            # 调用第二个层的交叉注意力机制,处理隐藏状态和编码器的隐藏状态
            hidden_states, cross_attn_weights, cross_attn_present_key_value = self.layer[1](
                hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=cross_attn_past_key_value,
            )
            # 如果隐藏状态的数据类型为 torch.float16,则将无穷大的值 clamp 到一个较小的值,以支持 fp16 训练
            if hidden_states.dtype == torch.float16:
                max_dtype = torch.finfo(hidden_states.dtype).max
                clamp_value = torch.where(torch.isinf(hidden_states).any(), max_dtype - 1000, max_dtype)
                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

            # 更新当前的键/值对,加上交叉注意力的结果
            present_key_value += cross_attn_present_key_value

        # 应用 Feed Forward 层
        hidden_states = self.layer[-1](hidden_states)

        # 如果隐藏状态的数据类型为 torch.float16,则将无穷大的值 clamp 到一个较小的值,以支持 fp16 训练
        if hidden_states.dtype == torch.float16:
            max_dtype = torch.finfo(hidden_states.dtype).max
            clamp_value = torch.where(torch.isinf(hidden_states).any(), max_dtype - 1000, max_dtype)
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        # 设置输出内容
        outputs = (
            hidden_states,  # 最终的隐藏状态
            present_key_value,  # 当前键/值对
        )

        # 如果需要输出注意力权重,则将自注意力和交叉注意力的权重也加入输出
        if output_attentions:
            outputs += (self_attn_weights, cross_attn_weights)

        # 返回最终输出结果
        return outputs
# Copied from transformers.models.t5.modeling_t5.T5ClassificationHead with T5->UMT5
# 在 T5ClassificationHead 的基础上复制并修改为 UMT5ClassificationHead

class UMT5ClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""
    # 用于句子级分类任务的头部模块

    def __init__(self, config: UMT5Config):
        super().__init__()
        # 调用父类构造函数初始化模块
        self.dense = nn.Linear(config.d_model, config.d_model)
        # 全连接层,输入和输出维度为 config.d_model
        self.dropout = nn.Dropout(p=config.classifier_dropout)
        # Dropout 层,使用概率为 config.classifier_dropout 的概率丢弃神经元
        self.out_proj = nn.Linear(config.d_model, config.num_labels)
        # 全连接层,将维度为 config.d_model 的输入映射到 config.num_labels 的输出

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dropout(hidden_states)
        # 对输入 hidden_states 进行 Dropout 处理
        hidden_states = self.dense(hidden_states)
        # 将经过 Dropout 处理的 hidden_states 输入全连接层 self.dense
        hidden_states = torch.tanh(hidden_states)
        # 对全连接层的输出应用 Tanh 激活函数
        hidden_states = self.dropout(hidden_states)
        # 再次对输出进行 Dropout 处理
        hidden_states = self.out_proj(hidden_states)
        # 将处理后的 hidden_states 输入全连接层 self.out_proj
        return hidden_states
        # 返回全连接层的输出作为模型的输出结果
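
# Usage sketch for the classification head above (hypothetical sizes; in the T5-style
# sequence-classification models this head is applied to the decoder hidden state of the
# final <eos> token of each sequence):
import torch

config = UMT5Config(d_model=16, num_labels=3, classifier_dropout=0.0)
head = UMT5ClassificationHead(config)

eos_hidden = torch.randn(4, 16)   # (batch_size, d_model)
print(head(eos_hidden).shape)     # torch.Size([4, 3])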


class UMT5PreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """
    # 处理权重初始化以及预训练模型下载和加载的抽象类

    config_class = UMT5Config
    # 使用 UMT5Config 类配置模型参数
    base_model_prefix = "transformer"
    # 基础模型前缀名为 "transformer"
    supports_gradient_checkpointing = True
    # 支持梯度检查点

    _no_split_modules = ["UMT5Block"]
    # 不拆分的模块列表,包含 "UMT5Block"
    _keep_in_fp32_modules = ["wo"]
    # 在 FP32 精度下保持的模块列表,包含 "wo"

    @property
    def dummy_inputs(self):
        input_ids = torch.tensor(DUMMY_INPUTS)
        input_mask = torch.tensor(DUMMY_MASK)
        dummy_inputs = {
            "decoder_input_ids": input_ids,
            "input_ids": input_ids,
            "decoder_attention_mask": input_mask,
        }
        return dummy_inputs
        # 返回用于测试的虚拟输入数据字典 dummy_inputs

    def _shift_right(self, input_ids):
        decoder_start_token_id = self.config.decoder_start_token_id
        pad_token_id = self.config.pad_token_id

        if decoder_start_token_id is None:
            raise ValueError(
                "self.model.config.decoder_start_token_id has to be defined. In UMT5 it is usually set to the pad_token_id. "
                "See UMT5 docs for more information."
            )
        # 如果 decoder_start_token_id 未定义,则抛出 ValueError

        # shift inputs to the right
        if is_torch_fx_proxy(input_ids):
            # Item assignment is not supported natively for proxies.
            # 对于代理对象,不支持原生的项目分配
            shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id)
            shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1)
        else:
            shifted_input_ids = input_ids.new_zeros(input_ids.shape)
            shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
            shifted_input_ids[..., 0] = decoder_start_token_id
            # 将输入向右移动一位,并在开头插入 decoder_start_token_id

        if pad_token_id is None:
            raise ValueError("self.model.config.pad_token_id has to be defined.")
            # 如果 pad_token_id 未定义,则抛出 ValueError
        # replace possible -100 values in labels by `pad_token_id`
        shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
        # 将标签中可能的 -100 值替换为 pad_token_id

        return shifted_input_ids
        # 返回右移后的输入张量
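
# Worked example of the shifting above (a sketch with made-up label ids): labels are moved one
# position to the right, decoder_start_token_id is prepended, and ignored positions (-100) are
# replaced by pad_token_id so the decoder never sees -100 as an input id.
import torch

labels = torch.tensor([[6, 7, 8, -100, -100]])
decoder_start_token_id, pad_token_id = 0, 0

shifted = labels.new_zeros(labels.shape)
shifted[..., 1:] = labels[..., :-1].clone()
shifted[..., 0] = decoder_start_token_id
shifted.masked_fill_(shifted == -100, pad_token_id)

print(shifted)  # tensor([[0, 6, 7, 8, 0]])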


class UMT5Stack(UMT5PreTrainedModel):
    # 初始化方法,接受配置和嵌入标记作为参数
    def __init__(self, config, embed_tokens=None):
        # 调用父类初始化方法,传入配置
        super().__init__(config)
        # 设置嵌入标记属性
        self.embed_tokens = embed_tokens
        # 根据配置设置解码器标志
        self.is_decoder = config.is_decoder
        # 创建一个由多个UMT5Block组成的模块列表,列表长度为配置中指定的层数
        self.block = nn.ModuleList([UMT5Block(config) for i in range(config.num_layers)])
        # 创建一个最终层归一化对象,将模型维度和epsilon作为参数
        self.final_layer_norm = UMT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        # 创建一个dropout层,使用配置中的dropout率
        self.dropout = nn.Dropout(config.dropout_rate)

        # 初始化权重并应用最终处理
        # 设置梯度检查点为False
        self.gradient_checkpointing = False
        # 执行后初始化操作
        self.post_init()

    # 返回输入嵌入对象的方法
    def get_input_embeddings(self):
        return self.embed_tokens

    # 设置新的输入嵌入对象的方法
    def set_input_embeddings(self, new_embeddings):
        self.embed_tokens = new_embeddings

    # 前向传播方法,接收多个输入参数
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        inputs_embeds=None,
        head_mask=None,
        cross_attn_head_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
UMT5_START_DOCSTRING = r"""
    The UMT5 model was proposed in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text
    Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel, Noam Shazeer, Adam Roberts et al. It is an
    encoder-decoder transformer pre-trained in a text-to-text denoising generative setting.

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving a model, resizing the input embeddings,
    pruning heads, etc.).

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general
    usage and behavior.

    Parameters:
        config ([`UMT5Config`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

UMT5_INPUTS_DOCSTRING = r"""
    The body of this docstring is not reproduced in this excerpt.
"""

UMT5_ENCODER_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            # 输入序列标记的索引,形状为(batch_size, sequence_length)。
            # UMT5 是一个具有相对位置嵌入的模型,因此可以在输入的右侧和左侧进行填充。

            # 可以使用 `AutoTokenizer` 获取这些索引。详见 `PreTrainedTokenizer.encode` 和 `PreTrainedTokenizer.__call__`。

            # 要了解如何为预训练准备 `input_ids`,请查看 [UMT5 Training](./umt5#training)。
        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            # 遮盖掩码,避免在填充的标记索引上执行注意力操作。掩码值在 `[0, 1]` 之间选择:

            # - 1 表示 **未被遮盖** 的标记,
            # - 0 表示 **被遮盖** 的标记。

            # [什么是注意力遮盖?](../glossary#attention-mask)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            # 用于取消选择自注意力模块中特定头部的掩码。掩码值在 `[0, 1]` 之间选择:

            # - 1 表示头部 **未被遮盖**,
            # - 0 表示头部 **被遮盖**。

        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            # 可选参数,可以直接传递嵌入表示,而不是传递 `input_ids`。如果希望更多控制如何将 `input_ids` 索引转换为相关联向量,
            # 这非常有用,而不是使用模型内部的嵌入查找矩阵。

        output_attentions (`bool`, *optional*):
            # 是否返回所有注意力层的注意力张量。查看返回的张量中的 `attentions` 以获取更多细节。

        output_hidden_states (`bool`, *optional*):
            # 是否返回所有层的隐藏状态。查看返回的张量中的 `hidden_states` 以获取更多细节。

        return_dict (`bool`, *optional*):
            # 是否返回 `~utils.ModelOutput` 而不是普通的元组。
"""
@add_start_docstrings(
    "The bare UMT5 Model transformer outputting raw hidden-states without any specific head on top.",
    UMT5_START_DOCSTRING,
)
class UMT5Model(UMT5PreTrainedModel):
    r"""
    Examples:

    ```
    >>> from transformers import UMT5Model, AutoTokenizer

    >>> model = UMT5Model.from_pretrained("google/umt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
    >>> noisy_text = "UN Offizier sagt, dass weiter <extra_id_0> werden muss in Syrien."
    >>> label = "<extra_id_0> verhandelt"
    >>> inputs = tokenizer(noisy_text, return_tensors="pt")
    >>> labels = tokenizer(text_target=label, return_tensors="pt")

    >>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"])
    >>> hidden_states = outputs.last_hidden_state
    ```

    Initializes UMT5 model with configuration parameters and shared embeddings.

    Args:
        config (UMT5Config): Configuration object defining model parameters.

    Attributes:
        model_type (str): Type of the model ("umt5").
        config_class (UMT5Config): Class defining model configuration settings.
        _tied_weights_keys (List[str]): List of keys for tied weights between encoder and decoder embeddings.
        shared (nn.Embedding): Shared embeddings across encoder and decoder.
        encoder (UMT5Stack): Encoder stack of the UMT5 model.
        decoder (UMT5Stack): Decoder stack of the UMT5 model.
    """
    
    model_type = "umt5"
    config_class = UMT5Config
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        # Initialize encoder with modified configuration
        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = UMT5Stack(encoder_config, self.shared)

        # Initialize decoder with modified configuration
        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = UMT5Stack(decoder_config, self.shared)

        # Initialize weights and apply final processing
        self.post_init()

    # Copied from transformers.models.t5.modeling_t5.T5Model.get_input_embeddings
    def get_input_embeddings(self):
        """
        Returns the shared input embeddings used by the model.
        """
        return self.shared

    # Copied from transformers.models.t5.modeling_t5.T5Model.set_input_embeddings
    def set_input_embeddings(self, new_embeddings):
        """
        Sets new shared input embeddings for the model and propagates them to encoder and decoder.
        """
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    # Copied from transformers.models.t5.modeling_t5.T5Model._tie_weights
    def _tie_weights(self):
        """
        Ties the weights between encoder and decoder embeddings if configured to do so.
        """
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
            self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)

    # Copied from transformers.models.t5.modeling_t5.T5Model.get_encoder
    def get_encoder(self):
        """
        Returns the encoder stack of the model.
        """
        return self.encoder

    # Copied from transformers.models.t5.modeling_t5.T5Model.get_decoder
    def get_decoder(self):
        """
        Returns the decoder stack of the model.
        """
        return self.decoder

    # Copied from transformers.models.t5.modeling_t5.T5Model._prune_heads
    # 定义模型的前向传播函数,用于执行模型的前向计算过程
    @add_start_docstrings_to_model_forward(UMT5_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,  # 输入的 token IDs,类型为可选的长整型张量
        attention_mask: Optional[torch.FloatTensor] = None,  # 注意力遮罩,类型为可选的浮点张量
        decoder_input_ids: Optional[torch.LongTensor] = None,  # 解码器的输入 token IDs,类型为可选的长整型张量
        decoder_attention_mask: Optional[torch.BoolTensor] = None,  # 解码器的注意力遮罩,类型为可选的布尔张量
        head_mask: Optional[torch.FloatTensor] = None,  # 注意力头部的遮罩,类型为可选的浮点张量
        decoder_head_mask: Optional[torch.FloatTensor] = None,  # 解码器注意力头部的遮罩,类型为可选的浮点张量
        cross_attn_head_mask: Optional[torch.Tensor] = None,  # 跨注意力头部的遮罩,类型为可选的张量
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,  # 编码器的输出,类型为可选的元组
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,  # 过去的键值对,类型为可选的元组
        inputs_embeds: Optional[torch.Tensor] = None,  # 输入的嵌入向量,类型为可选的张量
        decoder_inputs_embeds: Optional[torch.Tensor] = None,  # 解码器输入的嵌入向量,类型为可选的张量
        use_cache: Optional[bool] = None,  # 是否使用缓存,类型为可选的布尔值
        output_attentions: Optional[bool] = None,  # 是否输出注意力,类型为可选的布尔值
        output_hidden_states: Optional[bool] = None,  # 是否输出隐藏状态,类型为可选的布尔值
        return_dict: Optional[bool] = None,  # 是否返回字典形式的输出,类型为可选的布尔值
# 用于给 UMT5ForConditionalGeneration 类添加文档字符串,说明其在语言建模上的应用
@add_start_docstrings("""UMT5 Model with a `language modeling` head on top.""", UMT5_START_DOCSTRING)
class UMT5ForConditionalGeneration(UMT5PreTrainedModel):
    r"""
    Examples:

    ```
    >>> from transformers import UMT5ForConditionalGeneration, AutoTokenizer

    >>> model = UMT5ForConditionalGeneration.from_pretrained("google/umt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
    >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
    >>> summary = "Weiter Verhandlung in Syrien."
    >>> inputs = tokenizer(article, text_target=summary, return_tensors="pt")

    >>> outputs = model(**inputs)
    >>> loss = outputs.loss
    ```"""

    # 模型类型标识符
    model_type = "umt5"
    # 被绑定权重的键列表
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]

    # 初始化函数,接收一个配置对象并进行初始化
    def __init__(self, config):
        super().__init__(config)
        # 设置模型维度
        self.model_dim = config.d_model

        # 共享的嵌入层,使用 nn.Embedding 初始化
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        # 复制配置对象用于编码器
        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        # 初始化编码器实例
        self.encoder = UMT5Stack(encoder_config, self.shared)

        # 复制配置对象用于解码器
        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        # 初始化解码器实例
        self.decoder = UMT5Stack(decoder_config, self.shared)

        # 语言建模头部,线性层,将模型维度映射到词汇表大小
        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)

        # 初始化权重并应用最终处理
        self.post_init()

    # 从 transformers.models.t5.modeling_t5.T5ForConditionalGeneration 中复制的方法
    # 返回共享的嵌入层对象
    def get_input_embeddings(self):
        return self.shared

    # 从 transformers.models.t5.modeling_t5.T5ForConditionalGeneration 中复制的方法
    # 设置共享的嵌入层对象
    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    # 从 transformers.models.t5.modeling_t5.T5ForConditionalGeneration 中复制的方法
    # 如果配置要求,将权重绑定到共享的嵌入层上
    def _tie_weights(self):
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
            self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)

    # 从 transformers.models.t5.modeling_t5.T5ForConditionalGeneration 中复制的方法
    # 设置输出嵌入层对象
    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    # 从 transformers.models.t5.modeling_t5.T5ForConditionalGeneration 中复制的方法
    # 返回输出嵌入层对象
    def get_output_embeddings(self):
        return self.lm_head

    # 从 transformers.models.t5.modeling_t5.T5ForConditionalGeneration 中复制的方法
    # 返回编码器对象
    def get_encoder(self):
        return self.encoder
    # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.get_decoder
    def get_decoder(self):
        return self.decoder
    # 返回模型的解码器对象
    
    @add_start_docstrings_to_model_forward(UMT5_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
    # 前向传播方法:根据上述输入执行编码器-解码器计算,在提供 labels 时还会计算语言建模损失,返回 Seq2SeqLMOutput 或等价的元组
    
    # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.prepare_inputs_for_generation
    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        decoder_attention_mask=None,
        cross_attn_head_mask=None,
        use_cache=None,
        encoder_outputs=None,
        **kwargs,
    ):
        # 如果使用过去的键值对,根据其长度截断输入的序列
        if past_key_values is not None:
            past_length = past_key_values[0][0].shape[2]
    
            # 一些生成方法可能只传递最后一个输入 ID
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                # 默认行为:保留最后一个 ID
                remove_prefix_length = input_ids.shape[1] - 1
    
            input_ids = input_ids[:, remove_prefix_length:]
    
        return {
            "decoder_input_ids": input_ids,
            "past_key_values": past_key_values,
            "encoder_outputs": encoder_outputs,
            "attention_mask": attention_mask,
            "head_mask": head_mask,
            "decoder_head_mask": decoder_head_mask,
            "decoder_attention_mask": decoder_attention_mask,
            "cross_attn_head_mask": cross_attn_head_mask,
            "use_cache": use_cache,
        }
    # 准备生成过程中需要的输入,根据传入的参数返回一个包含各种输入信息的字典
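
# 独立小示例(非库源码):演示上面 prepare_inputs_for_generation 中基于 past_key_values 长度截断 input_ids 的逻辑,
# 其中张量形状与数值均为假设值。
import torch

past_length = 5                               # 假设缓存中已保存 5 个解码步的 key/value
input_ids = torch.arange(6).unsqueeze(0)      # 形状 (1, 6):完整的解码器序列,含最新生成的 token

if input_ids.shape[1] > past_length:
    remove_prefix_length = past_length
else:
    remove_prefix_length = input_ids.shape[1] - 1

print(input_ids[:, remove_prefix_length:])    # tensor([[5]]),只需把最后一个 token 送入模型
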
    
    # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.prepare_decoder_input_ids_from_labels
    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        return self._shift_right(labels)
    # 根据标签生成解码器的输入序列,通过右移操作来实现
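
# 独立小示例(非库源码):示意 `_shift_right` 的右移效果。这里是简化实现,
# 省略了把 labels 中的 -100 替换为 pad_token_id 等细节;decoder_start_token_id 取假设值 0。
import torch

decoder_start_token_id = 0
labels = torch.tensor([[6, 7, 8]])

decoder_input_ids = labels.new_zeros(labels.shape)
decoder_input_ids[:, 1:] = labels[:, :-1].clone()
decoder_input_ids[:, 0] = decoder_start_token_id

print(decoder_input_ids)  # tensor([[0, 6, 7]])
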
    # 定义静态方法 `_reorder_cache`,在 beam search 时按 beam_idx 重新排列缓存中的历史键值
    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        # 初始化一个空元组,用于存储重新排列后的历史键值
        reordered_past = ()
        # 遍历每个层的历史键值
        for layer_past in past_key_values:
            # 对每个层的历史状态按照给定的索引 `beam_idx` 进行重新排序,并转移到对应的设备上
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        # 返回重新排列后的历史键值
        return reordered_past
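
# 独立小示例(非库源码):演示 `_reorder_cache` 中用 index_select 按 beam_idx 重排缓存的方式,
# 缓存的层数、beam 数与张量形状均为假设值。
import torch

past_key_values = tuple(
    (torch.randn(2, 4, 3, 8), torch.randn(2, 4, 3, 8))  # 每层一组 (key, value),第 0 维是 beam 维
    for _ in range(2)
)
beam_idx = torch.tensor([1, 1])  # 两条新 beam 都延续原来的第 1 条 beam

reordered_past = ()
for layer_past in past_key_values:
    reordered_past += (
        tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),
    )

print(reordered_past[0][0].shape)  # torch.Size([2, 4, 3, 8]),形状不变,内容按 beam_idx 重排
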
# 在 UMT5 模型的基础上定义了一个编码器模型 UMT5EncoderModel,用于输出编码器的原始隐藏状态,没有额外的特定头部结构。
# 继承自 UMT5PreTrainedModel,这是一个预训练模型基类。

@add_start_docstrings(
    "The bare UMT5 Model transformer outputting encoder's raw hidden-states without any specific head on top.",
    UMT5_START_DOCSTRING,
)
class UMT5EncoderModel(UMT5PreTrainedModel):
    r"""
    Examples:

    ```
    >>> from transformers import UMT5EncoderModel, AutoTokenizer

    >>> model = UMT5EncoderModel.from_pretrained("google/umt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
    >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
    >>> input_ids = tokenizer(article, return_tensors="pt").input_ids
    >>> outputs = model(input_ids)
    >>> hidden_state = outputs.last_hidden_state
    ```"""

    model_type = "umt5"
    # config_class = UMT5Config
    _tied_weights_keys = ["encoder.embed_tokens.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        # 创建编码器配置的深层副本,确保不使用缓存,且不是编码器-解码器结构
        encoder_config = copy.deepcopy(config)
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        # 使用共享的嵌入层初始化编码器堆栈
        self.encoder = UMT5Stack(encoder_config, self.shared)

        # 初始化权重并应用最终处理
        self.post_init()

    # 从 transformers.models.t5.modeling_t5.T5EncoderModel.get_input_embeddings 复制过来
    def get_input_embeddings(self):
        return self.shared

    # 从 transformers.models.t5.modeling_t5.T5EncoderModel.set_input_embeddings 复制过来
    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)

    # 从 transformers.models.t5.modeling_t5.T5EncoderModel._tie_weights 复制过来
    def _tie_weights(self):
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)

    # 从 transformers.models.t5.modeling_t5.T5EncoderModel.get_encoder 复制过来
    def get_encoder(self):
        return self.encoder

    # 从 transformers.models.t5.modeling_t5.T5EncoderModel._prune_heads 复制过来
    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.block[layer].layer[0].SelfAttention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(UMT5_ENCODER_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
    # 从 transformers.models.t5.modeling_t5.T5EncoderModel.forward 复制过来,将 T5 替换为 UMT5,google-t5/t5-small 替换为 google/umt5-small
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,  # 输入的token ID序列,可以为空
        attention_mask: Optional[torch.FloatTensor] = None,  # 注意力遮罩,用于指示模型应该关注哪些token
        head_mask: Optional[torch.FloatTensor] = None,  # 头部遮罩,控制每个注意力头的掩盖
        inputs_embeds: Optional[torch.FloatTensor] = None,  # 可选的嵌入输入,代替输入ID
        output_attentions: Optional[bool] = None,  # 是否输出注意力权重
        output_hidden_states: Optional[bool] = None,  # 是否输出隐藏状态
        return_dict: Optional[bool] = None,  # 是否以字典形式返回输出
    ) -> Union[Tuple[torch.FloatTensor], BaseModelOutput]:
        r"""
        Returns:
            返回值的类型可以是包含Tensor的元组,或者BaseModelOutput对象

        Example:

        ```
        >>> from transformers import AutoTokenizer, UMT5EncoderModel

        >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
        >>> model = UMT5EncoderModel.from_pretrained("google/umt5-small")
        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # 确定是否使用配置中的返回字典选项

        encoder_outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # 使用encoder处理输入,返回编码器的输出

        return encoder_outputs
"""
UMT5 model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
tasks.
"""
# UMT5 序列分类模型,顶部有一个序列分类头部(在汇总输出之上的线性层),例如用于 GLUE 任务。
@add_start_docstrings(
    """
    UMT5 Encoder Model with a token classification head on top (a linear layer on top of the hidden-states output)
    e.g. for Named-Entity-Recognition (NER) tasks.
    """,
    UMT5_START_DOCSTRING,
)
# UMT5 编码器模型,顶部有一个标记分类头部(在隐藏状态输出之上的线性层),例如用于命名实体识别(NER)任务。
class UMT5ForTokenClassification(UMT5PreTrainedModel):
    # Keys to ignore when loading unexpected elements during model loading
    _keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
    # Keys indicating tied weights between encoder and decoder
    _tied_weights_keys = ["transformer.encoder.embed_tokens.weight"]

    # Copied from transformers.models.t5.modeling_t5.T5ForTokenClassification.__init__ with T5->UMT5
    def __init__(self, config: UMT5Config):
        super().__init__(config)
        self.num_labels = config.num_labels

        # UMT5 编码器模型
        self.transformer = UMT5EncoderModel(config)
        # Dropout layer
        self.dropout = nn.Dropout(config.classifier_dropout)
        # Linear layer for classification
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(UMT5_INPUTS_DOCSTRING)
"""
UMT5 model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
tasks.
"""
# UMT5 序列分类模型,顶部有一个序列分类头部(在汇总输出之上的线性层),例如用于 GLUE 任务。
@add_start_docstrings(
    """
    UMT5 model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    """,
    UMT5_START_DOCSTRING,
)
class UMT5ForSequenceClassification(UMT5PreTrainedModel):
    # Keys to ignore when loading unexpected elements during model loading
    _keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
    # Keys indicating tied weights between encoder and decoder
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

    # Copied from transformers.models.t5.modeling_t5.T5ForSequenceClassification.__init__ with T5->UMT5
    def __init__(self, config: UMT5Config):
        super().__init__(config)
        # UMT5 模型的变换器
        self.transformer = UMT5Model(config)
        # UMT5 模型的分类头部
        self.classification_head = UMT5ClassificationHead(config)

        # Initialize weights and apply final processing
        self.post_init()

        # Model parallelism setting
        self.model_parallel = False

    @add_start_docstrings_to_model_forward(UMT5_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
    # 替换返回值文档字符串,输出类型为 Seq2SeqSequenceClassifierOutput,配置类为 _CONFIG_FOR_DOC
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
    @replace_return_docstrings(output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC)
    # 使用装饰器替换返回值文档字符串,指定输出类型为TokenClassifierOutput,配置类为_CONFIG_FOR_DOC
    # 从transformers.models.t5.modeling_t5.T5ForTokenClassification.forward复制而来,将T5替换为UMT5
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        标签 (`torch.LongTensor`,形状为 `(batch_size, sequence_length)`,*可选*):
            用于计算标记分类损失的标签。索引应在 `[0, ..., config.num_labels - 1]` 范围内。
        返回:
        """
        # 如果 return_dict 不为 None,则使用给定值;否则使用 self.config.use_return_dict 的值
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 将输入传递给 Transformer 模型
        outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 获取隐藏状态并进行 dropout 处理
        hidden_states = outputs[0]
        hidden_states = self.dropout(hidden_states)

        # 将隐藏状态传递给分类器得到 logits
        logits = self.classifier(hidden_states)

        # 如果提供了标签,则计算损失
        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        # 如果不返回字典格式的结果,则返回元组格式的输出
        if not return_dict:
            output = (logits, outputs[2:-1])  # 排除最后一个元素
            return ((loss,) + output) if loss is not None else output

        # 返回 TokenClassifierOutput 类型的结果,包括损失、logits、隐藏状态和注意力
        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
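
# 独立小示例(非库源码):UMT5ForTokenClassification 的最小用法示意。
# "google/umt5-small" 只提供主干权重,分类头为随机初始化,labels 仅作演示。
import torch
from transformers import AutoTokenizer, UMT5ForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
model = UMT5ForTokenClassification.from_pretrained("google/umt5-small", num_labels=3)

inputs = tokenizer("HuggingFace lives in New York", return_tensors="pt")
labels = torch.zeros_like(inputs["input_ids"])            # 形状 (1, seq_len) 的演示标签
outputs = model(**inputs, labels=labels)
print(outputs.loss, outputs.logits.shape)                 # logits 形状为 (1, seq_len, 3)
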
# 为 UMT5ForQuestionAnswering 类添加文档字符串,描述其作为 UMT5 模型的问题回答器的用途和结构
@add_start_docstrings(
    """
    UMT5 Model with a span classification head on top for extractive question-answering tasks like SQuAD (linear layers
    on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    UMT5_START_DOCSTRING,
)
class UMT5ForQuestionAnswering(UMT5PreTrainedModel):
    # 定义一个列表,包含与权重绑定相关的键
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

    # 初始化方法,接受一个配置对象作为参数
    def __init__(self, config):
        # 调用父类的初始化方法
        super().__init__(config)
        # 设置模型维度为配置对象中的模型维度
        self.model_dim = config.d_model

        # 创建一个共享的嵌入层,用于共享词汇表和模型维度的嵌入
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        # 复制编码器配置,设置为非解码器模式,并禁用缓存,创建编码器对象
        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = UMT5Stack(encoder_config, self.shared)

        # 复制解码器配置,设置为解码器模式,并创建解码器对象
        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = UMT5Stack(decoder_config, self.shared)

        # 设置模型输出标签数量和一个线性层用于问题回答任务的输出
        self.num_labels = config.num_labels
        self.qa_outputs = nn.Linear(config.d_model, config.num_labels)

        # 初始化权重并应用最终处理
        self.post_init()

    # 从 transformers 库中 T5ForQuestionAnswering 类的方法复制,返回共享的嵌入层
    def get_input_embeddings(self):
        return self.shared

    # 从 transformers 库中 T5ForQuestionAnswering 类的方法复制,设置新的输入嵌入层
    def set_input_embeddings(self, new_embeddings):
        # 更新共享的嵌入层
        self.shared = new_embeddings
        # 更新编码器和解码器的输入嵌入层
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    # 从 transformers 库中 T5ForQuestionAnswering 类的方法复制,用于绑定权重
    def _tie_weights(self):
        # 如果配置指定要绑定词嵌入权重,则将编码器和解码器的词嵌入权重绑定到共享的嵌入层上
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
            self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)

    # 从 transformers 库中 T5ForQuestionAnswering 类的方法复制,返回编码器对象
    def get_encoder(self):
        return self.encoder

    # 从 transformers 库中 T5ForQuestionAnswering 类的方法复制,返回解码器对象
    def get_decoder(self):
        return self.decoder

    # 使用装饰器添加模型前向方法的文档字符串,描述输入和输出的结构和用途
    @add_start_docstrings_to_model_forward(UMT5_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC)
    # 定义一个方法用于模型的前向传播,接受多个可选参数
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,  # 输入序列的token IDs,可以为None
        attention_mask: Optional[torch.FloatTensor] = None,  # 输入序列的注意力掩码,可以为None
        decoder_input_ids: Optional[torch.LongTensor] = None,  # 解码器输入序列的token IDs,可以为None
        decoder_attention_mask: Optional[torch.BoolTensor] = None,  # 解码器输入序列的注意力掩码,可以为None
        head_mask: Optional[torch.FloatTensor] = None,  # 多头注意力机制的掩码,可以为None
        decoder_head_mask: Optional[torch.FloatTensor] = None,  # 解码器的多头注意力机制的掩码,可以为None
        cross_attn_head_mask: Optional[torch.Tensor] = None,  # 交叉注意力的多头掩码,可以为None
        encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,  # 编码器的输出,可以为None
        start_positions: Optional[torch.LongTensor] = None,  # 开始位置的token ID,可以为None
        end_positions: Optional[torch.LongTensor] = None,  # 结束位置的token ID,可以为None
        inputs_embeds: Optional[torch.FloatTensor] = None,  # 嵌入的输入张量,可以为None
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,  # 解码器的嵌入输入张量,可以为None
        use_cache: Optional[bool] = None,  # 是否使用缓存,可以为None
        output_attentions: Optional[bool] = None,  # 是否输出注意力权重,可以为None
        output_hidden_states: Optional[bool] = None,  # 是否输出隐藏状态,可以为None
        return_dict: Optional[torch.FloatTensor] = None,  # 是否以字典的形式返回结果,可以为None
    ):
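
# 独立小示例(非库源码):UMT5ForQuestionAnswering 的最小用法示意。
# "google/umt5-small" 只提供主干权重,qa_outputs 层为随机初始化;
# 按照该类的实现(复制自 T5ForQuestionAnswering),未显式传入 decoder_input_ids 时会自动由 input_ids 右移得到。
from transformers import AutoTokenizer, UMT5ForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
model = UMT5ForQuestionAnswering.from_pretrained("google/umt5-small")

question, context = "Who lives in New York?", "HuggingFace lives in New York."
inputs = tokenizer(question, context, return_tensors="pt")
outputs = model(**inputs)
print(outputs.start_logits.shape, outputs.end_logits.shape)  # 均为 (1, seq_len)
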

.\models\umt5\__init__.py

# 版权声明和许可信息
#
# 版权 2023 年由 HuggingFace 团队所有。保留所有权利。
#
# 根据 Apache 许可证版本 2.0 授权。
# 除非符合许可证的要求,否则您不得使用此文件。
# 您可以在以下网址获取许可证的副本:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意,否则按“原样”分发软件,
# 没有任何明示或暗示的担保或条件。
# 有关详细信息,请参阅许可证。

# 导入所需的类型检查工具
from typing import TYPE_CHECKING

# 引入自定义的异常和模块加载延迟工具
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available

# 定义要导入的结构化模块的映射字典
_import_structure = {"configuration_umt5": ["UMT5Config", "UMT5OnnxConfig"]}

# 尝试检查是否有 Torch 库可用,如果不可用则抛出异常
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果 Torch 可用,则定义需要导入的模型相关模块列表
    _import_structure["modeling_umt5"] = [
        "UMT5EncoderModel",
        "UMT5ForConditionalGeneration",
        "UMT5ForQuestionAnswering",
        "UMT5ForSequenceClassification",
        "UMT5ForTokenClassification",
        "UMT5Model",
        "UMT5PreTrainedModel",
    ]

# 如果是类型检查模式
if TYPE_CHECKING:
    # 导入配置相关的类型
    from .configuration_umt5 import UMT5Config, UMT5OnnxConfig

    # 再次尝试检查 Torch 是否可用,如果不可用则忽略
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 导入模型相关的类型
        from .modeling_umt5 import (
            UMT5EncoderModel,
            UMT5ForConditionalGeneration,
            UMT5ForQuestionAnswering,
            UMT5ForSequenceClassification,
            UMT5ForTokenClassification,
            UMT5Model,
            UMT5PreTrainedModel,
        )
# 如果不是类型检查模式
else:
    import sys

    # 将当前模块设置为延迟加载模块
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

.\models\unispeech\configuration_unispeech.py

# coding=utf-8
# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
UniSpeech model configuration
"""

import functools  # 导入 functools 模块,用于高阶函数操作
import operator   # 导入 operator 模块,用于函数式编程中的操作符函数

from ...configuration_utils import PretrainedConfig  # 导入预训练配置基类
from ...utils import logging  # 导入日志记录模块

logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器实例

UNISPEECH_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "microsoft/unispeech-large-1500h-cv": (
        "https://huggingface.co/microsoft/unispeech-large-1500h-cv/resolve/main/config.json"
    ),
    # 查看所有 UniSpeech 模型,请访问 https://huggingface.co/models?filter=unispeech
}


class UniSpeechConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`UniSpeechModel`]. It is used to instantiate an
    UniSpeech model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the UniSpeech
    [microsoft/unispeech-large-1500h-cv](https://huggingface.co/microsoft/unispeech-large-1500h-cv) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Example:

    ```
    >>> from transformers import UniSpeechConfig, UniSpeechModel

    >>> # Initializing a UniSpeech facebook/unispeech-base-960h style configuration
    >>> configuration = UniSpeechConfig()

    >>> # Initializing a model (with random weights) from the facebook/unispeech-base-960h style configuration
    >>> model = UniSpeechModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
    
    model_type = "unispeech"  # 定义模型类型为 unispeech
    # 初始化函数,用于创建一个新的实例
    def __init__(
        self,
        vocab_size=32,  # 词汇表大小,默认为32
        hidden_size=768,  # 隐藏层大小,默认为768
        num_hidden_layers=12,  # Transformer模型中隐藏层的数量,默认为12
        num_attention_heads=12,  # Transformer模型中注意力头的数量,默认为12
        intermediate_size=3072,  # Transformer模型中间层的大小,默认为3072
        hidden_act="gelu",  # 隐藏层激活函数,默认为GELU
        hidden_dropout=0.1,  # 隐藏层dropout比例,默认为0.1
        activation_dropout=0.1,  # 激活函数dropout比例,默认为0.1
        attention_dropout=0.1,  # 注意力机制dropout比例,默认为0.1
        feat_proj_dropout=0.0,  # 特征投影层dropout比例,默认为0.0
        feat_quantizer_dropout=0.0,  # 特征量化器dropout比例,默认为0.0
        final_dropout=0.1,  # 最终输出层dropout比例,默认为0.1
        layerdrop=0.1,  # LayerDrop比例,默认为0.1
        initializer_range=0.02,  # 初始化权重的范围,默认为0.02
        layer_norm_eps=1e-5,  # Layer normalization的epsilon值,默认为1e-5
        feat_extract_norm="group",  # 特征提取层的归一化类型,默认为"group"
        feat_extract_activation="gelu",  # 特征提取层的激活函数,默认为GELU
        conv_dim=(512, 512, 512, 512, 512, 512, 512),  # 卷积层的维度,默认为(512, 512, 512, 512, 512, 512, 512)
        conv_stride=(5, 2, 2, 2, 2, 2, 2),  # 卷积层的步长,默认为(5, 2, 2, 2, 2, 2, 2)
        conv_kernel=(10, 3, 3, 3, 3, 2, 2),  # 卷积层的核大小,默认为(10, 3, 3, 3, 3, 2, 2)
        conv_bias=False,  # 是否使用卷积层的偏置,默认为False
        num_conv_pos_embeddings=128,  # 卷积位置嵌入的数量,默认为128
        num_conv_pos_embedding_groups=16,  # 卷积位置嵌入的分组数量,默认为16
        do_stable_layer_norm=False,  # 是否进行稳定的层归一化,默认为False
        apply_spec_augment=True,  # 是否应用语音数据增强,默认为True
        mask_time_prob=0.05,  # 时间掩码的概率,默认为0.05
        mask_time_length=10,  # 时间掩码的长度,默认为10
        mask_time_min_masks=2,  # 时间掩码的最小掩码数,默认为2
        mask_feature_prob=0.0,  # 特征掩码的概率,默认为0.0
        mask_feature_length=10,  # 特征掩码的长度,默认为10
        mask_feature_min_masks=0,  # 特征掩码的最小掩码数,默认为0
        num_codevectors_per_group=320,  # 每组的编码向量数量,默认为320
        num_codevector_groups=2,  # 编码向量的分组数量,默认为2
        contrastive_logits_temperature=0.1,  # 对比损失的温度参数,默认为0.1
        num_negatives=100,  # 对比损失中的负样本数量,默认为100
        codevector_dim=256,  # 编码向量的维度,默认为256
        proj_codevector_dim=256,  # 投影编码向量的维度,默认为256
        diversity_loss_weight=0.1,  # 多样性损失的权重,默认为0.1
        ctc_loss_reduction="mean",  # CTC损失的减少方式,默认为"mean"
        ctc_zero_infinity=False,  # CTC损失中是否使用零和无穷,默认为False
        use_weighted_layer_sum=False,  # 是否使用加权层求和,默认为False
        classifier_proj_size=256,  # 分类器投影层的大小,默认为256
        num_ctc_classes=80,  # CTC损失中的类别数量,默认为80
        pad_token_id=0,  # 填充标记的ID,默认为0
        bos_token_id=1,  # 起始标记的ID,默认为1
        eos_token_id=2,  # 终止标记的ID,默认为2
        replace_prob=0.5,  # 替换概率,默认为0.5
        **kwargs,  # 其他关键字参数
    ):
    # 返回输入到 logits 比例的属性值,计算 conv_stride 列表中所有元素的乘积
    @property
    def inputs_to_logits_ratio(self):
        return functools.reduce(operator.mul, self.conv_stride, 1)
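
# 独立小示例(非库源码):按默认 conv_stride 手动计算 inputs_to_logits_ratio。
import functools
import operator

conv_stride = (5, 2, 2, 2, 2, 2, 2)                      # 默认配置中的卷积步长
ratio = functools.reduce(operator.mul, conv_stride, 1)
print(ratio)  # 320:每 320 个输入采样点对应一帧输出(16 kHz 采样率下约 20 ms)
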

.\models\unispeech\convert_unispeech_original_pytorch_checkpoint_to_pytorch.py

# 设置编码格式为 UTF-8
# 版权声明和许可协议信息,指明代码版权及使用条款
# 导入 argparse 用于命令行解析,json 和 os 用于文件操作
import argparse
import json
import os

# 导入 fairseq 和 torch 库
import fairseq
import torch
# 从 fairseq 库中导入 Dictionary 类

from fairseq.data import Dictionary

# 从 transformers 库中导入以下模块和函数
from transformers import (
    UniSpeechConfig,                  # 导入 UniSpeechConfig 类
    UniSpeechForCTC,                  # 导入 UniSpeechForCTC 类
    UniSpeechForPreTraining,          # 导入 UniSpeechForPreTraining 类
    Wav2Vec2FeatureExtractor,         # 导入 Wav2Vec2FeatureExtractor 类
    Wav2Vec2PhonemeCTCTokenizer,      # 导入 Wav2Vec2PhonemeCTCTokenizer 类
    Wav2Vec2Processor,                # 导入 Wav2Vec2Processor 类
    logging,                          # 导入 logging 模块
)

# 设置 logging 的详细级别为 info
logging.set_verbosity_info()
# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)

# 定义映射关系字典,用于转换 UniSpeech 模型的参数名称
MAPPING = {
    "post_extract_proj": "feature_projection.projection",
    "encoder.pos_conv.0": "encoder.pos_conv_embed.conv",
    "self_attn.k_proj": "encoder.layers.*.attention.k_proj",
    "self_attn.v_proj": "encoder.layers.*.attention.v_proj",
    "self_attn.q_proj": "encoder.layers.*.attention.q_proj",
    "self_attn.out_proj": "encoder.layers.*.attention.out_proj",
    "self_attn_layer_norm": "encoder.layers.*.layer_norm",
    "fc1": "encoder.layers.*.feed_forward.intermediate_dense",
    "fc2": "encoder.layers.*.feed_forward.output_dense",
    "final_layer_norm": "encoder.layers.*.final_layer_norm",
    "encoder.layer_norm": "encoder.layer_norm",
    "w2v_model.layer_norm": "feature_projection.layer_norm",
    "quantizer.weight_proj": "quantizer.weight_proj",
    "quantizer.vars": "quantizer.codevectors",
    "project_q": "project_q",
    "final_proj": "project_hid",
    "w2v_encoder.proj": "ctc_proj",
    "mask_emb": "masked_spec_embed",
}

# 定义顶层键列表,列出需要处理的顶层参数名称
TOP_LEVEL_KEYS = [
    "ctc_proj",
    "quantizer.weight_proj",
    "quantizer.codevectors",
    "project_q",
    "project_hid",
]

# 定义函数 set_recursively,用于递归设置参数值
def set_recursively(hf_pointer, key, value, full_name, weight_type, is_finetuned):
    # 根据键名逐级获取 hf_pointer 对象的属性
    for attribute in key.split("."):
        # 如果是微调模型并且当前属性属于需要跳过的层,则直接返回
        if is_finetuned:
            if attribute in ["quantizer", "project_q", "project_hid"]:
                return

            # 对于微调的音素模型,将 `ctc_proj` 重命名为 `lm_head`
            if attribute == "ctc_proj":
                attribute = "lm_head"

        # 获取 hf_pointer 对象的下一级属性
        hf_pointer = getattr(hf_pointer, attribute)

    # 如果 weight_type 不为空,则获取 hf_pointer 对应属性的形状
    if weight_type is not None:
        hf_shape = getattr(hf_pointer, weight_type).shape
    else:
        hf_shape = hf_pointer.shape

    # 断言判断当前参数的形状与预期值 value.shape 是否相符
    assert hf_shape == value.shape, (
        f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be"
        f" {value.shape} for {full_name}"
    )
    # 如果 weight_type 是 "weight",则将 value 赋给 hf_pointer 的权重数据
    if weight_type == "weight":
        hf_pointer.weight.data = value
    # 如果 weight_type 是 "weight_g",则将 value 赋给 hf_pointer 的权重归一化幅度参数 weight_g
    elif weight_type == "weight_g":
        hf_pointer.weight_g.data = value
    # 如果 weight_type 是 "weight_v",则将 value 赋给 hf_pointer 的权重归一化方向参数 weight_v
    elif weight_type == "weight_v":
        hf_pointer.weight_v.data = value
    # 如果 weight_type 是 "bias",则将 value 赋给 hf_pointer 的偏置数据
    elif weight_type == "bias":
        hf_pointer.bias.data = value
    else:
        # 如果 weight_type 为空或未知,则直接将 value 赋给 hf_pointer
        hf_pointer.data = value

    # 记录初始化信息,包括 key、weight_type(如果存在)、和 full_name
    logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.")
# 递归加载权重函数,将 Fairseq 模型的权重加载到 Hugging Face 模型中
def recursively_load_weights(fairseq_model, hf_model, is_finetuned):
    # 存储未使用的权重名称列表
    unused_weights = []
    # 获取 Fairseq 模型的状态字典
    fairseq_dict = fairseq_model.state_dict()

    # 获取 Hugging Face 模型的特征提取器
    feature_extractor = hf_model.unispeech.feature_extractor

    # 遍历 Fairseq 模型的状态字典
    for name, value in fairseq_dict.items():
        # 标记当前权重是否被使用的布尔值
        is_used = False
        
        # 如果权重名称中包含 "conv_layers",则加载卷积层权重
        if "conv_layers" in name:
            load_conv_layer(
                name,
                value,
                feature_extractor,
                unused_weights,
                hf_model.config.feat_extract_norm == "group",
            )
            is_used = True
        else:
            # 否则,遍历 MAPPING 字典,查找是否有匹配的键值对应的权重名称
            for key, mapped_key in MAPPING.items():
                # 如果权重名称中包含键值或者与键值相关的名称,则标记为已使用
                mapped_key = "unispeech." + mapped_key if mapped_key not in TOP_LEVEL_KEYS else mapped_key
                if key in name or key.split("w2v_model.")[-1] == name.split(".")[0]:
                    is_used = True
                    # 如果 mapped_key 中包含通配符 "*",则替换为层索引
                    if "*" in mapped_key:
                        layer_index = name.split(key)[0].split(".")[-2]
                        mapped_key = mapped_key.replace("*", layer_index)
                    # 确定权重类型
                    if "weight_g" in name:
                        weight_type = "weight_g"
                    elif "weight_v" in name:
                        weight_type = "weight_v"
                    elif "bias" in name:
                        weight_type = "bias"
                    elif "weight" in name:
                        weight_type = "weight"
                    else:
                        weight_type = None
                    # 递归设置 Hugging Face 模型的权重
                    set_recursively(hf_model, mapped_key, value, name, weight_type, is_finetuned)
                continue
        
        # 如果该权重未被使用,则将其名称添加到未使用的权重列表中
        if not is_used:
            unused_weights.append(name)

    # 输出未使用的权重名称列表的警告信息
    logger.warning(f"Unused weights: {unused_weights}")


# 加载卷积层权重的函数
def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm):
    # 提取卷积层名称
    name = full_name.split("conv_layers.")[-1]
    items = name.split(".")
    layer_id = int(items[0])
    type_id = int(items[1])

    # 根据卷积层类型加载权重
    if type_id == 0:
        # 如果权重名称中包含 "bias",则加载偏置项权重
        if "bias" in name:
            # 断言当前权重的形状与特征提取器中对应卷积层的偏置项数据形状一致
            assert value.shape == feature_extractor.conv_layers[layer_id].conv.bias.data.shape, (
                f"{full_name} has size {value.shape}, but"
                f" {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found."
            )
            # 将权重值赋予特征提取器中对应卷积层的偏置项数据,并输出信息
            feature_extractor.conv_layers[layer_id].conv.bias.data = value
            logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
        # 如果权重名称中包含 "weight",则加载权重矩阵
        elif "weight" in name:
            # 断言当前权重的形状与特征提取器中对应卷积层的权重矩阵数据形状一致
            assert value.shape == feature_extractor.conv_layers[layer_id].conv.weight.data.shape, (
                f"{full_name} has size {value.shape}, but"
                f" {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found."
            )
            # 将权重值赋予特征提取器中对应卷积层的权重矩阵数据,并输出信息
            feature_extractor.conv_layers[layer_id].conv.weight.data = value
            logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
    # 如果 type_id 等于 2 并且不使用组规范,或者 type_id 等于 2 且 layer_id 等于 0 并且使用组规范
    elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm):
        # 如果变量名中包含 "bias"
        if "bias" in name:
            # 断言当前值的形状与特征提取器中指定卷积层的层归一化偏置数据形状相同
            assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape, (
                f"{full_name} has size {value.shape}, but {feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape} was"
                " found."
            )
            # 将当前值赋给特征提取器中指定卷积层的层归一化偏置数据
            feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value
            # 记录日志,指示从哪里初始化了特征提取器中指定卷积层的层归一化权重
            logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
        # 如果变量名中包含 "weight"
        elif "weight" in name:
            # 断言当前值的形状与特征提取器中指定卷积层的层归一化权重数据形状相同
            assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape, (
                f"{full_name} has size {value.shape}, but"
                f" {feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape} was found."
            )
            # 将当前值赋给特征提取器中指定卷积层的层归一化权重数据
            feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value
            # 记录日志,指示从哪里初始化了特征提取器中指定卷积层的层归一化权重
            logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
    else:
        # 将未使用的权重名称添加到未使用的权重列表中
        unused_weights.append(full_name)
# 使用 torch.no_grad() 装饰器,确保在此函数执行期间不会进行梯度计算
@torch.no_grad()
# 定义函数 convert_unispeech_checkpoint,用于将模型权重从 UniSpeech 转换到 transformers 设计
def convert_unispeech_checkpoint(
    checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True
):
    """
    Copy/paste/tweak model's weights to transformers design.
    """
    # 如果提供了 config_path,则从预训练模型加载 UniSpeechConfig
    if config_path is not None:
        config = UniSpeechConfig.from_pretrained(config_path)
    else:
        # 否则创建一个新的 UniSpeechConfig 实例
        config = UniSpeechConfig()

    # 如果是 finetuned 模型
    if is_finetuned:
        # 如果提供了 dict_path,则加载 Dictionary 对象
        if dict_path:
            target_dict = Dictionary.load_from_json(dict_path)

            # 重要的更改:由于 CTC 符号为 <pad> 而非 <s>(与 fairseq 不同),修改 bos & pad token id
            config.bos_token_id = target_dict.pad_index
            config.pad_token_id = target_dict.bos_index
            config.eos_token_id = target_dict.eos_index
            config.vocab_size = len(target_dict.symbols)
            vocab_path = os.path.join(pytorch_dump_folder_path, "vocab.json")

            # 检查 pytorch_dump_folder_path 是否是一个目录,如果不是则记录错误并返回
            if not os.path.isdir(pytorch_dump_folder_path):
                logger.error("--pytorch_dump_folder_path ({}) should be a directory".format(pytorch_dump_folder_path))
                return

            # 创建 pytorch_dump_folder_path 目录(如果不存在)
            os.makedirs(pytorch_dump_folder_path, exist_ok=True)
            vocab_dict = target_dict.indices

            # fairseq 中 <pad> 和 <s> 被交换了,重新设置 vocab_dict
            vocab_dict["<pad>"] = 42
            vocab_dict["<s>"] = 43

            # 将 vocab_dict 写入 vocab_path 文件中
            with open(vocab_path, "w", encoding="utf-8") as vocab_handle:
                json.dump(vocab_dict, vocab_handle)

            # 使用 Wav2Vec2PhonemeCTCTokenizer 初始化 tokenizer
            tokenizer = Wav2Vec2PhonemeCTCTokenizer(
                vocab_path,
                unk_token=target_dict.unk_word,
                pad_token=target_dict.pad_word,
                bos_token=target_dict.bos_word,
                eos_token=target_dict.eos_word,
                word_delimiter_token="|",
                do_lower_case=False,
            )

            # 根据 config 中 feat_extract_norm 的设置确定 return_attention_mask 的值
            return_attention_mask = True if config.feat_extract_norm == "layer" else False

            # 使用 Wav2Vec2FeatureExtractor 初始化 feature_extractor
            feature_extractor = Wav2Vec2FeatureExtractor(
                feature_size=1,
                sampling_rate=16000,
                padding_value=0,
                do_normalize=True,
                return_attention_mask=return_attention_mask,
            )

            # 使用初始化的 feature_extractor 和 tokenizer 创建 processor
            processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

            # 将 processor 的预训练模型保存到 pytorch_dump_folder_path 中
            processor.save_pretrained(pytorch_dump_folder_path)

        # 初始化 hf_unispeech 为 UniSpeechForCTC 实例
        hf_unispeech = UniSpeechForCTC(config)
    else:
        # 初始化 hf_unispeech 为 UniSpeechForPreTraining 实例
        hf_unispeech = UniSpeechForPreTraining(config)

    # 如果是 finetuned 模型
    if is_finetuned:
        # 使用 fairseq.checkpoint_utils.load_model_ensemble_and_task 加载模型及其任务
        model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
            [checkpoint_path], arg_overrides={"data": "/".join(dict_path.split("/")[:-1]), "w2v_path": checkpoint_path}
        )
    else:
        # 使用 fairseq.checkpoint_utils.load_model_ensemble_and_task 加载模型及其任务
        model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([checkpoint_path])

    # 将模型设置为 evaluation 模式
    model = model[0].eval()

    # 递归加载模型权重到 hf_unispeech
    recursively_load_weights(model, hf_unispeech, is_finetuned)

    # 将 hf_unispeech 的预训练模型保存到 pytorch_dump_folder_path 中
    hf_unispeech.save_pretrained(pytorch_dump_folder_path)


if __name__ == "__main__":
    # 创建参数解析器对象
    parser = argparse.ArgumentParser()
    # 添加命令行参数,指定输出 PyTorch 模型的路径
    parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
    # 添加命令行参数,指定 fairseq 检查点的路径
    parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint")
    # 添加命令行参数,指定微调模型的字典路径
    parser.add_argument("--dict_path", default=None, type=str, help="Path to dict of fine-tuned model")
    # 添加命令行参数,指定要转换的模型的 hf config.json 路径
    parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
    # 添加命令行参数,指定要转换的模型是否为微调模型
    parser.add_argument(
        "--not_finetuned", action="store_true", help="Whether the model to convert is a fine-tuned model or not"
    )
    # 解析命令行参数,并将其存储在 args 对象中
    args = parser.parse_args()
    # 调用函数 convert_unispeech_checkpoint,传递解析后的参数来执行模型转换操作
    convert_unispeech_checkpoint(
        args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.dict_path, not args.not_finetuned
    )
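
# 独立小示例(非库源码):在 Python 中直接调用上面的转换函数,路径均为假设的占位值。
# 大致等价于命令行:
#   python convert_unispeech_original_pytorch_checkpoint_to_pytorch.py \
#       --checkpoint_path /path/to/unispeech.pt --pytorch_dump_folder_path /path/to/output --not_finetuned
convert_unispeech_checkpoint(
    checkpoint_path="/path/to/unispeech.pt",      # 假设的 fairseq 检查点路径
    pytorch_dump_folder_path="/path/to/output",   # 假设的输出目录
    config_path=None,
    dict_path=None,
    is_finetuned=False,                           # 预训练(非微调)检查点,对应 --not_finetuned
)
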

.\models\unispeech\modeling_unispeech.py

# 设置文件编码为UTF-8
# 版权声明
# 2021年由Fairseq作者和HuggingFace团队保留所有权利。
#
# 根据Apache许可证2.0版(“许可证”)许可;
# 除非符合许可证,否则不得使用此文件。
# 您可以获取许可证的副本,详见
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意,否则按“原样”分发软件
# 没有任何形式的担保或条件,明示或暗示。
# 有关更多详细信息,请参阅许可证。
""" PyTorch UniSpeech模型。"""

import math  # 导入数学模块
import warnings  # 导入警告模块
from dataclasses import dataclass  # 导入dataclass用于数据类
from typing import Optional, Tuple, Union  # 导入类型提示相关

import numpy as np  # 导入NumPy库
import torch  # 导入PyTorch库
import torch.utils.checkpoint  # 导入PyTorch的checkpoint模块
from torch import nn  # 从PyTorch中导入神经网络模块
from torch.nn import CrossEntropyLoss  # 导入交叉熵损失函数

from ...activations import ACT2FN  # 导入激活函数
from ...integrations.deepspeed import is_deepspeed_zero3_enabled  # 导入DeepSpeed集成函数
from ...modeling_outputs import (  # 导入模型输出相关类
    BaseModelOutput,
    CausalLMOutput,
    SequenceClassifierOutput,
    Wav2Vec2BaseModelOutput,
)
from ...modeling_utils import PreTrainedModel  # 导入预训练模型工具类
from ...utils import (  # 导入通用实用程序功能
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_unispeech import UniSpeechConfig  # 导入UniSpeech模型配置

logger = logging.get_logger(__name__)  # 获取日志记录器


_HIDDEN_STATES_START_POSITION = 2  # 隐藏状态的起始位置索引

# 通用文档字符串
_CONFIG_FOR_DOC = "UniSpeechConfig"

# 基础文档字符串
_CHECKPOINT_FOR_DOC = "patrickvonplaten/unispeech-large-1500h-cv-timit"
_EXPECTED_OUTPUT_SHAPE = [1, 292, 1024]

# CTC(连接时序分类)文档字符串
_CTC_EXPECTED_OUTPUT = "'mister quilter is the apposl of the midle classes and weare glad to welcom his gosepl'"
_CTC_EXPECTED_LOSS = 17.17

UNISPEECH_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "microsoft/unispeech-large-1500h-cv",
    "microsoft/unispeech-large-multi-lingual-1500h-cv",
    # 查看所有UniSpeech模型:https://huggingface.co/models?filter=unispeech
]


@dataclass
class UniSpeechForPreTrainingOutput(ModelOutput):
    """
    [`UniSpeechForPreTrainingOutput`]的输出类型,包含潜在的隐藏状态和注意力。
    """
    # 定义函数的参数及其类型注解
    Args:
        loss (*optional*, returned when model is in train mode, `torch.FloatTensor` of shape `(1,)`):
            训练模式下返回的损失,是对比损失(L_m)和多样性损失(L_d)的总和,详见官方论文[https://arxiv.org/pdf/2006.11477.pdf]。
        projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
            模型隐藏状态投影到 `config.proj_codevector_dim` 维度后的结果,可以用来预测掩码后的量化投影状态。
        projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
            量化提取的特征向量投影到 `config.proj_codevector_dim` 维度后的结果,代表对比损失的正向目标向量。
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            包含模型每层输出的隐藏状态的元组,每个元素为 `torch.FloatTensor`,形状为 `(batch_size, sequence_length, hidden_size)`。
            在 `output_hidden_states=True` 或 `config.output_hidden_states=True` 时返回。
    
            包括每层输出以及初始嵌入输出的隐藏状态。
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            包含自注意力每层注意力权重的元组,每个元素为 `torch.FloatTensor`,形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
            在 `output_attentions=True` 或 `config.output_attentions=True` 时返回。
    
            经过注意力 softmax 后的注意力权重,用于计算自注意力头中的加权平均值。
# Copied from transformers.models.wav2vec2.modeling_wav2vec2._compute_mask_indices
def _compute_mask_indices(
    shape: Tuple[int, int],
    mask_prob: float,
    mask_length: int,
    attention_mask: Optional[torch.LongTensor] = None,
    min_masks: int = 0,
) -> np.ndarray:
    """
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    """
    batch_size, sequence_length = shape  # 解包形状元组为批次大小和序列长度

    if mask_length < 1:
        raise ValueError("`mask_length` has to be bigger than 0.")  # 抛出值错误异常,如果掩码长度小于1

    if mask_length > sequence_length:
        raise ValueError(
            f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
            f" and `sequence_length`: {sequence_length}`"
        )  # 抛出值错误异常,如果掩码长度大于序列长度

    # epsilon is used for probabilistic rounding
    epsilon = np.random.rand(1).item()  # 生成一个随机浮点数作为 epsilon,用于概率舍入

    def compute_num_masked_span(input_length):
        """Given input length, compute how many spans should be masked"""
        num_masked_span = int(mask_prob * input_length / mask_length + epsilon)  # 计算应该掩盖的跨度数量
        num_masked_span = max(num_masked_span, min_masks)  # 取较大值,确保掩盖的跨度数量不小于最小要求

        # make sure num masked span <= sequence_length
        if num_masked_span * mask_length > sequence_length:
            num_masked_span = sequence_length // mask_length  # 确保掩盖的跨度数量不超过序列长度

        # make sure num_masked span is also <= input_length - (mask_length - 1)
        if input_length - (mask_length - 1) < num_masked_span:
            num_masked_span = max(input_length - (mask_length - 1), 0)  # 确保掩盖的跨度数量不超过输入长度减去 (掩码长度 - 1)

        return num_masked_span

    # compute number of masked spans in batch
    input_lengths = (
        attention_mask.sum(-1).detach().tolist()
        if attention_mask is not None
        else [sequence_length for _ in range(batch_size)]
    )  # 计算批次中每个序列的输入长度,如果有注意力掩码则使用其求和,否则使用序列长度

    # SpecAugment mask to fill
    spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)  # 创建一个布尔类型的全零数组,用于存储 SpecAugment 掩码
    spec_aug_mask_idxs = []  # 创建一个空列表,用于存储 SpecAugment 掩码的索引

    max_num_masked_span = compute_num_masked_span(sequence_length)  # 计算应该掩盖的最大跨度数量
    # 如果最大的被屏蔽段数为0,则直接返回原始的spec_aug_mask
    if max_num_masked_span == 0:
        return spec_aug_mask

    # 遍历输入长度列表
    for input_length in input_lengths:
        # 计算当前输入的被屏蔽段数
        num_masked_span = compute_num_masked_span(input_length)

        # 随机选择要屏蔽的索引位置
        spec_aug_mask_idx = np.random.choice(
            np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
        )

        # 由于概率性舍入,各样本的屏蔽段数可能不同;选取第一个采样到的索引作为虚拟索引,
        # 把索引向量填充到相同长度。选第一个采样点只意味着这些位置会被重复屏蔽一次。
        if len(spec_aug_mask_idx) == 0:
            # 如果长度为0,表示input_length严格小于sequence_length,最后一个标记应该是填充标记,我们可以使用它作为虚拟屏蔽id
            dummy_mask_idx = sequence_length - 1
        else:
            dummy_mask_idx = spec_aug_mask_idx[0]

        # 将dummy_mask_idx重复填充到数组,以保证数组长度为max_num_masked_span
        spec_aug_mask_idx = np.concatenate(
            [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
        )
        spec_aug_mask_idxs.append(spec_aug_mask_idx)

    # 将列表转换为numpy数组
    spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)

    # 将屏蔽索引扩展为屏蔽段
    spec_aug_mask_idxs = np.broadcast_to(
        spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
    )
    # 将多维数组展平为一维数组
    spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)

    # 在起始索引上加入 0..mask_length-1 的偏移量,把每个起始索引扩展为完整的屏蔽段
    offsets = np.arange(mask_length)[None, None, :]
    offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
        batch_size, max_num_masked_span * mask_length
    )
    spec_aug_mask_idxs = spec_aug_mask_idxs + offsets

    # 确保索引不超过sequence_length - 1
    if spec_aug_mask_idxs.max() > sequence_length - 1:
        spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1

    # 将屏蔽标记散布到数组中的索引位置
    np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)

    # 返回生成的spec_aug_mask
    return spec_aug_mask
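
# 独立小示例(非库源码):调用上面的 _compute_mask_indices,参数均为假设值。
import numpy as np

np.random.seed(0)  # 固定随机种子,便于复现
mask = _compute_mask_indices(shape=(2, 100), mask_prob=0.05, mask_length=10, min_masks=2)
print(mask.shape)            # (2, 100)
print(mask.sum(axis=-1))     # 每个样本被屏蔽的帧数(此例中为 2 个跨度,至多约 20 帧)
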
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2NoLayerNormConvLayer with Wav2Vec2->UniSpeech
class UniSpeechNoLayerNormConvLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()
        # 从配置中获取输入和输出的卷积维度
        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
        self.out_conv_dim = config.conv_dim[layer_id]

        # 创建一个一维卷积层,指定输入、输出维度、卷积核大小、步长和是否有偏置
        self.conv = nn.Conv1d(
            self.in_conv_dim,
            self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            stride=config.conv_stride[layer_id],
            bias=config.conv_bias,
        )
        # 根据配置选择激活函数
        self.activation = ACT2FN[config.feat_extract_activation]

    def forward(self, hidden_states):
        # 对输入的隐藏状态进行一维卷积
        hidden_states = self.conv(hidden_states)
        # 应用预先选择的激活函数
        hidden_states = self.activation(hidden_states)
        return hidden_states


# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2LayerNormConvLayer with Wav2Vec2->UniSpeech
class UniSpeechLayerNormConvLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()
        # 从配置中获取输入和输出的卷积维度
        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
        self.out_conv_dim = config.conv_dim[layer_id]

        # 创建一个一维卷积层,指定输入、输出维度、卷积核大小、步长和是否有偏置
        self.conv = nn.Conv1d(
            self.in_conv_dim,
            self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            stride=config.conv_stride[layer_id],
            bias=config.conv_bias,
        )
        # 创建一个层归一化层,归一化输出特征向量并保留可学习的仿射变换
        self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True)
        # 根据配置选择激活函数
        self.activation = ACT2FN[config.feat_extract_activation]

    def forward(self, hidden_states):
        # 对输入的隐藏状态进行一维卷积
        hidden_states = self.conv(hidden_states)

        # 将卷积输出的维度转置,为了适应层归一化的输入要求
        hidden_states = hidden_states.transpose(-2, -1)
        # 应用层归一化到转置后的隐藏状态
        hidden_states = self.layer_norm(hidden_states)
        # 再次转置以恢复原始维度
        hidden_states = hidden_states.transpose(-2, -1)

        # 应用预先选择的激活函数
        hidden_states = self.activation(hidden_states)
        return hidden_states


# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2GroupNormConvLayer with Wav2Vec2->UniSpeech
class UniSpeechGroupNormConvLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()
        # 从配置中获取输入和输出的卷积维度
        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
        self.out_conv_dim = config.conv_dim[layer_id]

        # 创建一个一维卷积层,指定输入、输出维度、卷积核大小、步长和是否有偏置
        self.conv = nn.Conv1d(
            self.in_conv_dim,
            self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            stride=config.conv_stride[layer_id],
            bias=config.conv_bias,
        )
        # 根据配置选择激活函数
        self.activation = ACT2FN[config.feat_extract_activation]

        # 创建一个分组归一化层,根据输出卷积维度进行分组归一化
        self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True)

    def forward(self, hidden_states):
        # 对输入的隐藏状态进行一维卷积
        hidden_states = self.conv(hidden_states)
        # 应用分组归一化到卷积输出
        hidden_states = self.layer_norm(hidden_states)
        # 应用预先选择的激活函数
        hidden_states = self.activation(hidden_states)
        return hidden_states
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2PositionalConvEmbedding 复制代码,并将 Wav2Vec2 替换为 UniSpeech
class UniSpeechPositionalConvEmbedding(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 定义一个 1 维卷积层,用于位置编码
        self.conv = nn.Conv1d(
            config.hidden_size,  # 输入通道数和输出通道数都是 hidden_size
            config.hidden_size,
            kernel_size=config.num_conv_pos_embeddings,  # 卷积核大小为 num_conv_pos_embeddings
            padding=config.num_conv_pos_embeddings // 2,  # 填充大小为卷积核大小的一半
            groups=config.num_conv_pos_embedding_groups,  # 分组卷积的组数
        )

        # 使用权重归一化;若启用了 DeepSpeed ZeRO-3,需要先收集被分片的参数再进行归一化
        weight_norm = nn.utils.weight_norm
        if hasattr(nn.utils.parametrizations, "weight_norm"):
            weight_norm = nn.utils.parametrizations.weight_norm

        if is_deepspeed_zero3_enabled():
            import deepspeed

            # 使用 deepspeed.zero.GatheredParameters 确保权重的全局收集和归一化处理
            with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):
                self.conv = weight_norm(self.conv, name="weight", dim=2)
            deepspeed.zero.register_external_parameter(self, self.conv.weight_v)
            deepspeed.zero.register_external_parameter(self, self.conv.weight_g)
        else:
            # 否则,直接对卷积层的权重进行归一化处理
            self.conv = weight_norm(self.conv, name="weight", dim=2)

        # 创建用于对输入进行填充的 UniSpeechSamePadLayer 层
        self.padding = UniSpeechSamePadLayer(config.num_conv_pos_embeddings)
        # 选择激活函数,根据 config 中的 feat_extract_activation 选择对应的激活函数
        self.activation = ACT2FN[config.feat_extract_activation]

    def forward(self, hidden_states):
        # 将输入张量进行维度转换,调整为 Conv1d 的输入格式
        hidden_states = hidden_states.transpose(1, 2)

        # 经过卷积层处理
        hidden_states = self.conv(hidden_states)
        # 经过填充层处理
        hidden_states = self.padding(hidden_states)
        # 经过激活函数处理
        hidden_states = self.activation(hidden_states)

        # 最后再次进行维度转换,调整回原始输入的格式
        hidden_states = hidden_states.transpose(1, 2)
        return hidden_states


# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2SamePadLayer 复制代码,并将 Wav2Vec2 替换为 UniSpeech
class UniSpeechSamePadLayer(nn.Module):
    def __init__(self, num_conv_pos_embeddings):
        super().__init__()
        # 根据 num_conv_pos_embeddings 的奇偶性决定是否移除最后一列填充
        self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0

    def forward(self, hidden_states):
        # 如果需要移除填充,则进行相应的操作
        if self.num_pad_remove > 0:
            hidden_states = hidden_states[:, :, :-self.num_pad_remove]
        return hidden_states
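
# 独立小示例(非库源码):演示偶数大小卷积核配合 padding=kernel_size//2 会多出一帧,
# UniSpeechSamePadLayer 将其裁掉以保持序列长度不变;通道数等均为假设值。
import torch
from torch import nn

conv = nn.Conv1d(4, 4, kernel_size=128, padding=64, groups=4)
x = torch.randn(1, 4, 50)
y = conv(x)
print(y.shape)              # torch.Size([1, 4, 51]):多出 1 帧
print(y[:, :, :-1].shape)   # torch.Size([1, 4, 50]):等价于 num_pad_remove=1 的裁剪
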


# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder 复制代码,并将 Wav2Vec2 替换为 UniSpeech
class UniSpeechFeatureEncoder(nn.Module):
    """从原始音频波形构造特征"""

    # 该类根据配置堆叠若干一维卷积层,将原始波形逐步下采样为帧级特征
    # 初始化函数,接受一个配置对象作为参数
    def __init__(self, config):
        # 调用父类的初始化方法
        super().__init__()

        # 根据配置中的特征提取规范选择不同的卷积层列表
        if config.feat_extract_norm == "group":
            # 如果特征提取规范是"group",则创建使用组归一化的卷积层列表
            conv_layers = [UniSpeechGroupNormConvLayer(config, layer_id=0)] + [
                UniSpeechNoLayerNormConvLayer(config, layer_id=i + 1)
                for i in range(config.num_feat_extract_layers - 1)
            ]
        elif config.feat_extract_norm == "layer":
            # 如果特征提取规范是"layer",则创建使用层归一化的卷积层列表
            conv_layers = [
                UniSpeechLayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)
            ]
        else:
            # 如果特征提取规范既不是"group"也不是"layer",则抛出值错误异常
            raise ValueError(
                f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
            )
        
        # 将卷积层列表转换为nn.ModuleList并赋值给对象的conv_layers属性
        self.conv_layers = nn.ModuleList(conv_layers)
        
        # 设置梯度检查点标志为False
        self.gradient_checkpointing = False
        
        # 设置_requires_grad属性为True
        self._requires_grad = True

    # 冻结参数的私有方法
    def _freeze_parameters(self):
        # 遍历所有参数,并将其requires_grad属性设为False
        for param in self.parameters():
            param.requires_grad = False
        
        # 将对象的_requires_grad属性设为False
        self._requires_grad = False

    # 前向传播函数,接受输入值作为参数
    def forward(self, input_values):
        # 将输入值的维度扩展为二维,保留第一维
        hidden_states = input_values[:, None]

        # 如果_requires_grad为True并且当前处于训练模式
        if self._requires_grad and self.training:
            # 设置hidden_states需要计算梯度,以便于梯度检查点
            hidden_states.requires_grad = True

        # 遍历所有卷积层进行前向传播
        for conv_layer in self.conv_layers:
            # 如果_requires_grad为True、gradient_checkpointing为True,并且当前处于训练模式
            if self._requires_grad and self.gradient_checkpointing and self.training:
                # 使用梯度检查点函数对当前卷积层进行前向传播
                hidden_states = self._gradient_checkpointing_func(
                    conv_layer.__call__,
                    hidden_states,
                )
            else:
                # 否则,直接对当前卷积层进行前向传播
                hidden_states = conv_layer(hidden_states)

        # 返回最终的隐藏状态
        return hidden_states
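
# 补充演示(草稿,不属于源码):按照默认的 conv_kernel/conv_stride 估算特征编码器的下采样程度。
# 公式来自 Conv1d 的输出长度:(L - kernel) // stride + 1。
conv_kernel = (10, 3, 3, 3, 3, 2, 2)
conv_stride = (5, 2, 2, 2, 2, 2, 2)

def demo_feat_extract_output_length(input_length):
    # 逐层套用卷积输出长度公式
    for k, s in zip(conv_kernel, conv_stride):
        input_length = (input_length - k) // s + 1
    return input_length

print(demo_feat_extract_output_length(16000))  # 1 秒 16kHz 音频 -> 49 帧,约每 20ms 一帧
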
class UniSpeechFeatureExtractor(UniSpeechFeatureEncoder):
    # UniSpeechFeatureExtractor 类继承自 UniSpeechFeatureEncoder 类
    def __init__(self, config):
        # 初始化函数,调用父类 UniSpeechFeatureEncoder 的初始化方法
        super().__init__(config)
        # 发出警告信息,提示该类已被弃用,并在 Transformers v5 中将被移除,建议使用其父类名称替代
        warnings.warn(
            f"The class `{self.__class__.__name__}` has been depreciated "
            "and will be removed in Transformers v5. "
            f"Use `{self.__class__.__bases__[0].__name__}` instead.",
            FutureWarning,
        )


# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureProjection 复制而来,将 Wav2Vec2 替换为 UniSpeech
class UniSpeechFeatureProjection(nn.Module):
    def __init__(self, config):
        super().__init__()
        # LayerNorm 对象,对最后一个卷积维度进行归一化,使用的 epsilon 为 config 中的 layer_norm_eps
        self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps)
        # 线性映射,将最后一个卷积维度映射到隐藏大小,config.hidden_size 为隐藏层大小
        self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size)
        # 随机失活层,使用的丢弃率为 config 中的 feat_proj_dropout
        self.dropout = nn.Dropout(config.feat_proj_dropout)

    def forward(self, hidden_states):
        # 对隐藏状态进行 LayerNorm 归一化
        norm_hidden_states = self.layer_norm(hidden_states)
        # 将归一化后的隐藏状态进行线性映射投影到隐藏大小
        hidden_states = self.projection(norm_hidden_states)
        # 对投影后的隐藏状态进行随机失活
        hidden_states = self.dropout(hidden_states)
        return hidden_states, norm_hidden_states


# 从 transformers.models.bart.modeling_bart.BartAttention 复制而来,将 Bart 替换为 UniSpeech
class UniSpeechAttention(nn.Module):
    """来自 'Attention Is All You Need' 论文的多头注意力"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        is_causal: bool = False,
        config: Optional[UniSpeechConfig] = None,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        self.config = config

        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        # 缩放因子,根据头维度进行初始化
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder
        self.is_causal = is_causal

        # 线性映射对象,用于处理 key、value、query 和输出
        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        # 对张量进行形状变换,用于注意力计算
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        ):
        # 前向传播函数,执行注意力计算和映射
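
# 补充演示(草稿,不属于源码):_shape 的形状变换,把 (batch, seq_len, embed_dim)
# 重排为 (batch, num_heads, seq_len, head_dim),便于多头注意力的批量矩阵乘法。
import torch

x = torch.randn(2, 7, 768)  # (bsz, seq_len, embed_dim)
bsz, seq_len, _ = x.shape
num_heads, head_dim = 12, 768 // 12
y = x.view(bsz, seq_len, num_heads, head_dim).transpose(1, 2).contiguous()
print(y.shape)  # torch.Size([2, 12, 7, 64])
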
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeedForward with Wav2Vec2->UniSpeech
class UniSpeechFeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 定义中间层的 Dropout 操作,使用配置中的激活函数的 dropout 率
        self.intermediate_dropout = nn.Dropout(config.activation_dropout)

        # 定义中间层的全连接层,输入大小为隐藏大小,输出大小为中间大小
        self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # 根据配置中的激活函数类型选择对应的激活函数
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

        # 定义输出层的全连接层,输入大小为中间大小,输出大小为隐藏大小
        self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size)
        # 定义输出层的 Dropout 操作,使用配置中的隐藏层 dropout 率
        self.output_dropout = nn.Dropout(config.hidden_dropout)

    def forward(self, hidden_states):
        # 中间层的全连接操作
        hidden_states = self.intermediate_dense(hidden_states)
        # 应用中间层的激活函数
        hidden_states = self.intermediate_act_fn(hidden_states)
        # 应用中间层的 Dropout 操作
        hidden_states = self.intermediate_dropout(hidden_states)

        # 输出层的全连接操作
        hidden_states = self.output_dense(hidden_states)
        # 应用输出层的 Dropout 操作
        hidden_states = self.output_dropout(hidden_states)
        return hidden_states


# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2EncoderLayer with Wav2Vec2->UniSpeech
class UniSpeechEncoderLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 定义自注意力层,使用 UniSpeechAttention 模块,设置 embed_dim 和 num_heads,关闭解码器模式
        self.attention = UniSpeechAttention(
            embed_dim=config.hidden_size,
            num_heads=config.num_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=False,
        )
        # 定义 Dropout 操作
        self.dropout = nn.Dropout(config.hidden_dropout)
        # 定义层归一化操作,设置输入大小为隐藏大小,epsilon 为配置中的 layer_norm_eps
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # 定义前馈网络层,使用 UniSpeechFeedForward 模块
        self.feed_forward = UniSpeechFeedForward(config)
        # 定义最终的层归一化操作,设置输入大小为隐藏大小,epsilon 为配置中的 layer_norm_eps
        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states, attention_mask=None, output_attentions=False):
        # 复制注意力前的隐藏状态
        attn_residual = hidden_states
        # 进行自注意力计算,获取输出隐藏状态、注意力权重和其他信息(根据输出_attentions 参数)
        hidden_states, attn_weights, _ = self.attention(
            hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
        )
        # 应用 Dropout 操作
        hidden_states = self.dropout(hidden_states)
        # 添加自注意力前的隐藏状态,形成残差连接
        hidden_states = attn_residual + hidden_states

        # 应用层归一化操作
        hidden_states = self.layer_norm(hidden_states)
        # 应用前馈网络层
        hidden_states = hidden_states + self.feed_forward(hidden_states)
        # 最终的层归一化操作
        hidden_states = self.final_layer_norm(hidden_states)

        outputs = (hidden_states,)

        # 如果需要输出注意力权重,则添加到输出中
        if output_attentions:
            outputs += (attn_weights,)

        return outputs


# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2AttnAdapterLayer with Wav2Vec2->UniSpeech
class UniSpeechAttnAdapterLayer(nn.Module):
    # 初始化方法,接受一个配置对象作为参数
    def __init__(self, config):
        """
        Implements adapter modules directly with 3D tensor weight as parameters and without using ModuleList to speed
        up training throughput.
        """
        # 调用父类的初始化方法
        super().__init__()
        # 设置输入维度为配置对象中的适配器注意力维度
        self.input_dim = config.adapter_attn_dim
        # 设置隐藏维度为配置对象中的隐藏大小
        self.hidden_dim = config.hidden_size

        # 初始化 LayerNorm 层,标准化隐藏状态
        self.norm = nn.LayerNorm(self.hidden_dim)
        # 第一个线性层,将隐藏状态映射到适配器注意力维度
        self.linear_1 = nn.Linear(self.hidden_dim, self.input_dim)
        # 激活函数 ReLU
        self.act_fn = nn.ReLU()
        # 第二个线性层,将适配器注意力维度映射回隐藏状态维度
        self.linear_2 = nn.Linear(self.input_dim, self.hidden_dim)

    # 前向传播方法,接收一个形状为 [batch_size, seq_length, hidden_size] 的张量作为输入
    def forward(self, hidden_states: torch.FloatTensor):
        # 对输入的隐藏状态进行 LayerNorm 处理
        hidden_states = self.norm(hidden_states)

        # 经过第一个线性层的变换
        hidden_states = self.linear_1(hidden_states)
        # 应用 ReLU 激活函数
        hidden_states = self.act_fn(hidden_states)
        # 经过第二个线性层的变换
        hidden_states = self.linear_2(hidden_states)

        # 返回变换后的隐藏状态张量
        return hidden_states
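
# 补充演示(草稿,不属于源码):瓶颈式 adapter 的参数量远小于同维度的全连接层,
# 这里以 hidden_size=1024、adapter_attn_dim=16 为假设值进行粗略估算。
hidden_size, adapter_attn_dim = 1024, 16
adapter_params = (
    hidden_size * adapter_attn_dim + adapter_attn_dim   # linear_1
    + adapter_attn_dim * hidden_size + hidden_size      # linear_2
    + 2 * hidden_size                                    # LayerNorm 的 weight 和 bias
)
full_linear_params = hidden_size * hidden_size + hidden_size
print(adapter_params, full_linear_params)  # 35856 vs 1049600
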
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2EncoderLayerStableLayerNorm 复制并修改为 UniSpeechEncoderLayerStableLayerNorm
class UniSpeechEncoderLayerStableLayerNorm(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 初始化自注意力模块 UniSpeechAttention
        self.attention = UniSpeechAttention(
            embed_dim=config.hidden_size,
            num_heads=config.num_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=False,
        )
        # 初始化 Dropout 模块
        self.dropout = nn.Dropout(config.hidden_dropout)
        # 初始化层归一化模块
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # 初始化前馈神经网络模块 UniSpeechFeedForward
        self.feed_forward = UniSpeechFeedForward(config)
        # 初始化最终层归一化模块
        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        # 如果配置中存在适配器注意力维度,则初始化适配器层 UniSpeechAttnAdapterLayer;否则设为 None
        if getattr(config, "adapter_attn_dim", None) is not None:
            self.adapter_layer = UniSpeechAttnAdapterLayer(config)
        else:
            self.adapter_layer = None

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ):
        # 保留注意力残差连接
        attn_residual = hidden_states
        # 应用层归一化到隐藏状态
        hidden_states = self.layer_norm(hidden_states)
        # 使用自注意力模块进行注意力计算,并返回注意力权重(如果设置输出注意力的话)
        hidden_states, attn_weights, _ = self.attention(
            hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
        )
        # 应用 Dropout 到注意力输出上
        hidden_states = self.dropout(hidden_states)
        # 添加注意力残差到处理后的隐藏状态上
        hidden_states = attn_residual + hidden_states
        # 应用前馈神经网络,并在最终层归一化后加到处理后的隐藏状态上
        hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states))

        # 如果适配器层不为 None,则将适配器层应用到处理后的隐藏状态上
        if self.adapter_layer is not None:
            hidden_states = hidden_states + self.adapter_layer(hidden_states)

        # 返回处理后的隐藏状态,可能包含注意力权重(取决于输出设置)
        outputs = (hidden_states,)

        # 如果设置输出注意力,将注意力权重添加到输出中
        if output_attentions:
            outputs += (attn_weights,)

        return outputs
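
# 补充演示(草稿,不属于源码):两种残差/归一化顺序的差异。
# UniSpeechEncoderLayer 为 post-norm(先残差相加再 LayerNorm),
# UniSpeechEncoderLayerStableLayerNorm 为 pre-norm(先 LayerNorm 再进入子层,残差直接相加)。
import torch
import torch.nn as nn

ln, ffn = nn.LayerNorm(8), nn.Linear(8, 8)
x = torch.randn(2, 3, 8)
post_norm = ln(x + ffn(x))   # post-norm:残差路径经过 LayerNorm
pre_norm = x + ffn(ln(x))    # pre-norm:残差路径不经过 LayerNorm,深层训练更稳定
print(post_norm.shape, pre_norm.shape)
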


# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Encoder 复制并修改为 UniSpeechEncoder
class UniSpeechEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        # 初始化位置卷积嵌入模块 UniSpeechPositionalConvEmbedding
        self.pos_conv_embed = UniSpeechPositionalConvEmbedding(config)
        # 初始化层归一化模块
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # 初始化 Dropout 模块
        self.dropout = nn.Dropout(config.hidden_dropout)
        # 初始化编码器层模块列表,数量为配置中指定的隐藏层数
        self.layers = nn.ModuleList([UniSpeechEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        # 梯度检查点标志设为 False
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
        ):
            all_hidden_states = () if output_hidden_states else None
            all_self_attentions = () if output_attentions else None

            if attention_mask is not None:
                # 确保填充的标记输出为0
                expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
                hidden_states[~expand_attention_mask] = 0

                # 扩展 attention_mask
                attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)
                attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min
                attention_mask = attention_mask.expand(
                    attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]
                )

            # 通过位置卷积嵌入层处理位置嵌入
            position_embeddings = self.pos_conv_embed(hidden_states)
            hidden_states = hidden_states + position_embeddings
            hidden_states = self.layer_norm(hidden_states)
            hidden_states = self.dropout(hidden_states)

            deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled()

            for layer in self.layers:
                if output_hidden_states:
                    all_hidden_states = all_hidden_states + (hidden_states,)

                # 添加 LayerDrop(参见 https://arxiv.org/abs/1909.11556 进行描述)
                dropout_probability = torch.rand([])

                skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False
                if not skip_the_layer or deepspeed_zero3_is_enabled:
                    # 在 DeepSpeed ZeRO-3 下,所有 GPU 必须同步运行
                    if self.gradient_checkpointing and self.training:
                        layer_outputs = self._gradient_checkpointing_func(
                            layer.__call__,
                            hidden_states,
                            attention_mask,
                            output_attentions,
                        )
                    else:
                        layer_outputs = layer(
                            hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
                        )
                    hidden_states = layer_outputs[0]

                if skip_the_layer:
                    layer_outputs = (None, None)

                if output_attentions:
                    all_self_attentions = all_self_attentions + (layer_outputs[1],)

            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if not return_dict:
                return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
            return BaseModelOutput(
                last_hidden_state=hidden_states,
                hidden_states=all_hidden_states,
                attentions=all_self_attentions,
            )
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2EncoderStableLayerNorm 复制而来,将 Wav2Vec2 替换为 UniSpeech
class UniSpeechEncoderStableLayerNorm(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        # 初始化 UniSpeechPositionalConvEmbedding 对象,用于位置编码和卷积嵌入
        self.pos_conv_embed = UniSpeechPositionalConvEmbedding(config)
        # 初始化 LayerNorm 层,用于层归一化,eps 参数为配置中的层归一化 epsilon 值
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # 初始化 Dropout 层,用于随机失活,丢弃率为配置中的隐藏层丢弃率
        self.dropout = nn.Dropout(config.hidden_dropout)
        # 使用列表推导式创建多个 UniSpeechEncoderLayerStableLayerNorm 层,层数为配置中的隐藏层数量
        self.layers = nn.ModuleList(
            [UniSpeechEncoderLayerStableLayerNorm(config) for _ in range(config.num_hidden_layers)]
        )
        # 是否启用梯度检查点,默认为 False
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
    # 如果设置了输出所有隐藏状态,则初始化一个空元组;否则设为 None
    all_hidden_states = () if output_hidden_states else None
    # 如果设置了输出所有自注意力头,则初始化一个空元组;否则设为 None
    all_self_attentions = () if output_attentions else None
    
    # 如果存在注意力遮罩,则扩展注意力遮罩以确保填充的令牌不被关注
    if attention_mask is not None:
        # 将注意力遮罩扩展为与隐藏状态的最后一个维度相同
        expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
        # 将不需要关注的位置的隐藏状态置为0
        hidden_states[~expand_attention_mask] = 0
    
        # 扩展注意力遮罩,将其用于层间关注权重
        attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)
        attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min
        attention_mask = attention_mask.expand(
            attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]
        )
    
    # 通过位置卷积嵌入层处理隐藏状态
    position_embeddings = self.pos_conv_embed(hidden_states)
    # 将位置嵌入加到隐藏状态上
    hidden_states = hidden_states + position_embeddings
    # 对隐藏状态应用丢弃(Dropout)
    hidden_states = self.dropout(hidden_states)
    
    # 检查是否启用了 Deepspeed Zero3
    deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled()
    
    # 遍历每一个层
    for layer in self.layers:
        # 如果要输出所有隐藏状态,则将当前层的隐藏状态添加到 all_hidden_states 中
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)
    
        # 添加层丢弃(LayerDrop)机制,根据配置随机决定是否跳过该层
        dropout_probability = torch.rand([])
        skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False
    
        # 如果不跳过该层或者启用了 Deepspeed Zero3,则执行该层的前向传播
        if not skip_the_layer or deepspeed_zero3_is_enabled:
            # 如果启用了梯度检查点且正在训练,则使用梯度检查点函数执行该层的前向传播
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer.__call__,
                    hidden_states,
                    attention_mask,
                    output_attentions,
                )
            else:
                # 否则直接调用层对象进行前向传播
                layer_outputs = layer(
                    hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
                )
            hidden_states = layer_outputs[0]
    
        # 如果跳过了该层,则输出设置为 None
        if skip_the_layer:
            layer_outputs = (None, None)
    
        # 如果要输出所有自注意力头,则将当前层的自注意力权重添加到 all_self_attentions 中
        if output_attentions:
            all_self_attentions = all_self_attentions + (layer_outputs[1],)
    
    # 对最终的隐藏状态应用层归一化
    hidden_states = self.layer_norm(hidden_states)
    
    # 如果要输出所有隐藏状态,则将最终的隐藏状态添加到 all_hidden_states 中
    if output_hidden_states:
        all_hidden_states = all_hidden_states + (hidden_states,)
    
    # 如果不返回字典形式的输出,则返回元组形式的结果
    if not return_dict:
        return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
    
    # 返回基础模型输出对象,包含最终的隐藏状态、所有隐藏状态和所有自注意力权重
    return BaseModelOutput(
        last_hidden_state=hidden_states,
        hidden_states=all_hidden_states,
        attentions=all_self_attentions,
    )
# 定义一个名为 UniSpeechGumbelVectorQuantizer 的类,继承自 nn.Module,用于实现使用 Gumbel Softmax 进行向量量化的功能。
class UniSpeechGumbelVectorQuantizer(nn.Module):
    """
    Vector quantization using gumbel softmax. See [CATEGORICAL REPARAMETERIZATION WITH
    GUMBEL-SOFTMAX](https://arxiv.org/pdf/1611.01144.pdf) for more information.
    """

    def __init__(self, config):
        # 调用父类构造方法进行初始化
        super().__init__()
        # 设置向量量化的组数和每组的变量数目
        self.num_groups = config.num_codevector_groups
        self.num_vars = config.num_codevectors_per_group

        # 检查配置中的 codevector_dim 是否可以被 num_groups 整除
        if config.codevector_dim % self.num_groups != 0:
            raise ValueError(
                f"`config.codevector_dim {config.codevector_dim} must be divisible by `config.num_codevector_groups`"
                f" {self.num_groups} for concatenation"
            )

        # 创建一个可训练的参数,用于存储码书(codebook)变量(码字)
        self.codevectors = nn.Parameter(
            torch.FloatTensor(1, self.num_groups * self.num_vars, config.codevector_dim // self.num_groups)
        )
        
        # 定义一个线性层,用于将卷积的最后一个维度映射到 num_groups * num_vars 的大小
        self.weight_proj = nn.Linear(config.conv_dim[-1], self.num_groups * self.num_vars)

        # 设定温度参数,用于 Gumbel Softmax 分布
        self.temperature = 2

    @staticmethod
    def _compute_perplexity(probs):
        # 计算概率分布的困惑度
        marginal_probs = probs.mean(dim=0)
        perplexity = torch.exp(-torch.sum(marginal_probs * torch.log(marginal_probs + 1e-7), dim=-1)).sum()
        return perplexity
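
# 补充演示(草稿,不属于源码):困惑度衡量码本的利用率。
# 对均匀分布,困惑度约等于“组数 × 每组码字数”;对塌缩成 one-hot 的分布,困惑度约等于组数。
import torch

def demo_perplexity(probs):
    marginal = probs.mean(dim=0)
    return torch.exp(-torch.sum(marginal * torch.log(marginal + 1e-7), dim=-1)).sum()

uniform = torch.full((8, 2, 320), 1 / 320)  # (样本数, 组数, 每组码字数)
one_hot = torch.zeros(8, 2, 320)
one_hot[..., 0] = 1.0
print(demo_perplexity(uniform))  # ≈ 640,说明码本被充分使用
print(demo_perplexity(one_hot))  # ≈ 2,说明所有样本都塌缩到同一个码字
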
    def forward(self, hidden_states):
        # 获取输入张量的批大小、序列长度和隐藏大小
        batch_size, sequence_length, hidden_size = hidden_states.shape

        # 将隐藏状态投影到代码向量维度
        hidden_states = self.weight_proj(hidden_states)
        # 将投影后的张量形状转换为(batch_size * sequence_length * self.num_groups, -1)
        hidden_states = hidden_states.view(batch_size * sequence_length * self.num_groups, -1)

        if self.training:
            # 使用 gumbel_softmax 方法在可微分的方式中采样代码向量概率
            codevector_probs = nn.functional.gumbel_softmax(
                hidden_states.float(), tau=self.temperature, hard=True
            ).type_as(hidden_states)

            # 计算困惑度
            codevector_soft_dist = torch.softmax(
                hidden_states.view(batch_size * sequence_length, self.num_groups, -1).float(), dim=-1
            )
            perplexity = self._compute_perplexity(codevector_soft_dist)
        else:
            # 非训练状态下,以非可微分方式取 argmax
            # 计算硬代码向量分布(one hot 编码)
            codevector_idx = hidden_states.argmax(dim=-1)
            codevector_probs = hidden_states.new_zeros(*hidden_states.shape).scatter_(
                -1, codevector_idx.view(-1, 1), 1.0
            )
            codevector_probs = codevector_probs.view(batch_size * sequence_length, self.num_groups, -1)

            perplexity = self._compute_perplexity(codevector_probs)

        # 将代码向量概率张量重新调整为(batch_size * sequence_length, -1)的形状
        codevector_probs = codevector_probs.view(batch_size * sequence_length, -1)
        # 使用概率从代码向量集中检索代码向量
        codevectors_per_group = codevector_probs.unsqueeze(-1) * self.codevectors
        # 将每个组中的代码向量求和,形状为(batch_size, sequence_length, -1)
        codevectors = codevectors_per_group.view(batch_size * sequence_length, self.num_groups, self.num_vars, -1)
        codevectors = codevectors.sum(-2).view(batch_size, sequence_length, -1)

        # 返回结果:代码向量和困惑度
        return codevectors, perplexity
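
# 补充演示(草稿,不属于源码):gumbel_softmax 在 hard=True 时前向输出 one-hot,
# 反向传播仍通过软概率回传梯度(straight-through 技巧),这正是训练阶段采样码字的方式。
import torch
import torch.nn as nn

logits = torch.randn(4, 320, requires_grad=True)
probs = nn.functional.gumbel_softmax(logits, tau=2.0, hard=True)
print(probs.sum(dim=-1))          # 每行之和为 1
print((probs == 1).sum(dim=-1))   # 每行恰有一个 1(one-hot)
probs.sum().backward()
print(logits.grad is not None)    # True:梯度可以回传到 logits
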
class UniSpeechPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # 指定配置类为UniSpeechConfig
    config_class = UniSpeechConfig
    # 模型的基础名称前缀
    base_model_prefix = "unispeech"
    # 主要输入名称
    main_input_name = "input_values"
    # 支持梯度检查点
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        # 如果module是UniSpeechGumbelVectorQuantizer类型,初始化其权重
        if isinstance(module, UniSpeechGumbelVectorQuantizer):
            module.weight_proj.weight.data.normal_(mean=0.0, std=1)
            module.weight_proj.bias.data.zero_()
            nn.init.uniform_(module.codevectors)
        # 如果module是UniSpeechPositionalConvEmbedding类型,初始化其权重
        elif isinstance(module, UniSpeechPositionalConvEmbedding):
            nn.init.normal_(
                module.conv.weight,
                mean=0,
                std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
            )
            nn.init.constant_(module.conv.bias, 0)
        # 如果module是UniSpeechFeatureProjection类型,初始化其权重
        elif isinstance(module, UniSpeechFeatureProjection):
            k = math.sqrt(1 / module.projection.in_features)
            nn.init.uniform_(module.projection.weight, a=-k, b=k)
            nn.init.uniform_(module.projection.bias, a=-k, b=k)
        # 如果module是nn.Linear类型,初始化其权重和偏置
        elif isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)

            if module.bias is not None:
                module.bias.data.zero_()
        # 如果module是nn.LayerNorm或nn.GroupNorm类型,初始化其偏置和权重
        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        # 如果module是nn.Conv1d类型,使用Kaiming正态分布初始化其权重
        elif isinstance(module, nn.Conv1d):
            nn.init.kaiming_normal_(module.weight)

            if module.bias is not None:
                # 计算适当的均匀分布边界并初始化偏置
                k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
                nn.init.uniform_(module.bias, a=-k, b=k)

    def _get_feat_extract_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
        """
        Computes the output length of the convolutional layers
        """

        def _conv_out_length(input_length, kernel_size, stride):
            # 从https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html获取的一维卷积层输出长度公式
            return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1

        # 根据配置中的卷积核大小和步长计算卷积层的输出长度
        for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
            input_lengths = _conv_out_length(input_lengths, kernel_size, stride)

        return input_lengths
    def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: torch.LongTensor):
        # 计算非填充部分的长度,即每个样本中非零元素的累积和的最后一个值
        non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]

        # 根据非填充长度计算输出长度,转换为长整型并移至对应设备
        output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths).to(torch.long)

        # 获取批次大小
        batch_size = attention_mask.shape[0]

        # 创建一个与注意力掩码相同大小的零张量,但使用指定的数据类型和设备
        attention_mask = torch.zeros(
            (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
        )

        # 将输出长度对应位置的值设为1,确保在这些位置之前的所有值都受到注意
        attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1

        # 反转张量并对每行进行累积求和,然后再次反转,最后转换为布尔类型
        attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()

        # 返回处理后的注意力掩码张量
        return attention_mask
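
# 补充演示(草稿,不属于源码):上面 flip/cumsum/flip 技巧的效果,
# 即把“每个样本的有效帧数”转换成特征级别的布尔注意力掩码。
import torch

feature_vector_length = 6
output_lengths = torch.tensor([3, 5])            # 两个样本的有效帧数
mask = torch.zeros((2, feature_vector_length), dtype=torch.long)
mask[(torch.arange(2), output_lengths - 1)] = 1  # 在最后一个有效位置写 1
mask = mask.flip([-1]).cumsum(-1).flip([-1]).bool()
print(mask)
# tensor([[ True,  True,  True, False, False, False],
#         [ True,  True,  True,  True,  True, False]])
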
# UNISPEECH_START_DOCSTRING 是一个长字符串,包含了有关 UniSpeech 模型的详细介绍和引用的论文信息
UNISPEECH_START_DOCSTRING = r"""
    UniSpeech was proposed in [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled
    Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei,
    Michael Zeng, Xuedong Huang.

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving etc.).

    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`UniSpeechConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# UNISPEECH_INPUTS_DOCSTRING 描述 UniSpeech 模型前向传播的输入参数
UNISPEECH_INPUTS_DOCSTRING = r"""
    Args:
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            # 输入的原始语音波形的浮点数值。可以通过加载 `.flac` 或 `.wav` 音频文件并转换成类型为 `List[float]` 或 `numpy.ndarray` 的数组来获得这些值,例如可以使用 `soundfile` 库 (`pip install soundfile`)。使用 [`AutoProcessor`] 进行填充和转换成 `torch.FloatTensor` 类型的张量。详见 [`Wav2Vec2Processor.__call__`] 的详细信息。
        attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            # 遮罩,用于避免在填充标记索引上执行卷积和注意力操作。遮罩中的值选自 `[0, 1]`:

            # - 1 表示**未屏蔽**的标记,
            # - 0 表示**已屏蔽**的标记。

            # [什么是注意力遮罩?](../glossary#attention-mask)

            <Tip warning={true}>
            如果对应的处理器具有 `config.return_attention_mask == True`,则应传递 `attention_mask`。对于所有处理器具有 `config.return_attention_mask == False` 的模型,在进行批处理推理时,应**不要**传递 `attention_mask`,以避免性能下降。对于这些模型,`input_values` 应简单地填充为 0 并在不传递 `attention_mask` 的情况下传递。请注意,这些模型的结果也会根据 `input_values` 是否填充而略有不同。
            </Tip>

        output_attentions (`bool`, *optional*):
            # 是否返回所有注意力层的注意力张量。详见返回的张量中的 `attentions` 获取更多细节。
        output_hidden_states (`bool`, *optional*):
            # 是否返回所有层的隐藏状态。详见返回的张量中的 `hidden_states` 获取更多细节。
        return_dict (`bool`, *optional*):
            # 是否返回 [`~utils.ModelOutput`] 而不是普通元组。
"""
@add_start_docstrings(
    "The bare UniSpeech Model transformer outputting raw hidden-states without any specific head on top.",
    UNISPEECH_START_DOCSTRING,
)
"""
# 使用装饰器添加文档字符串,描述这是一个裸的 UniSpeech 模型,输出未经特定顶部头处理的原始隐藏状态。

class UniSpeechModel(UniSpeechPreTrainedModel):
    def __init__(self, config: UniSpeechConfig):
        super().__init__(config)
        self.config = config
        # 初始化特征提取器和特征投影器
        self.feature_extractor = UniSpeechFeatureEncoder(config)
        self.feature_projection = UniSpeechFeatureProjection(config)

        # 如果配置中包含时间或特征掩码概率,则初始化掩码的频谱嵌入
        if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
            self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_())

        # 根据配置选择稳定层归一化编码器或一般编码器
        if config.do_stable_layer_norm:
            self.encoder = UniSpeechEncoderStableLayerNorm(config)
        else:
            self.encoder = UniSpeechEncoder(config)

        # 初始化权重并进行最终处理
        self.post_init()

    # 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model._mask_hidden_states 复制而来
    def _mask_hidden_states(
        self,
        hidden_states: torch.FloatTensor,
        mask_time_indices: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
    ):
        """
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://arxiv.org/abs/1904.08779).
        """

        # `config.apply_spec_augment` can set masking to False
        if not getattr(self.config, "apply_spec_augment", True):
            return hidden_states

        # generate indices & apply SpecAugment along time axis
        batch_size, sequence_length, hidden_size = hidden_states.size()

        if mask_time_indices is not None:
            # apply SpecAugment along time axis with given mask_time_indices
            hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
        elif self.config.mask_time_prob > 0 and self.training:
            # Compute mask indices for time axis based on configuration parameters
            mask_time_indices = _compute_mask_indices(
                (batch_size, sequence_length),
                mask_prob=self.config.mask_time_prob,
                mask_length=self.config.mask_time_length,
                attention_mask=attention_mask,
                min_masks=self.config.mask_time_min_masks,
            )
            # Convert mask indices to boolean tensor on the same device as hidden_states
            mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
            # Apply SpecAugment along time axis using masked_spec_embed
            hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)

        if self.config.mask_feature_prob > 0 and self.training:
            # generate indices & apply SpecAugment along feature axis
            mask_feature_indices = _compute_mask_indices(
                (batch_size, hidden_size),
                mask_prob=self.config.mask_feature_prob,
                mask_length=self.config.mask_feature_length,
                min_masks=self.config.mask_feature_min_masks,
            )
            # Convert mask indices to boolean tensor on the same device as hidden_states
            mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
            # Expand feature mask indices to match the shape of hidden_states
            mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
            # Apply SpecAugment along feature axis by setting masked values to 0
            hidden_states[mask_feature_indices] = 0

        return hidden_states

    @add_start_docstrings_to_model_forward(UNISPEECH_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=Wav2Vec2BaseModelOutput,
        config_class=_CONFIG_FOR_DOC,
        modality="audio",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        mask_time_indices: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    # 定义函数的返回类型为 Tuple 或 Wav2Vec2BaseModelOutput
    ) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
        # 如果未显式指定输出注意力权重,则使用配置中的默认值
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        # 如果未显式指定输出隐藏状态,则使用配置中的默认值
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        # 如果未显式指定返回字典,则使用配置中的默认值
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 提取特征向量
        extract_features = self.feature_extractor(input_values)
        # 调整特征向量的维度顺序
        extract_features = extract_features.transpose(1, 2)

        # 如果存在注意力掩码,则计算对应于特征向量的减少的注意力掩码
        if attention_mask is not None:
            attention_mask = self._get_feature_vector_attention_mask(extract_features.shape[1], attention_mask)

        # 使用特征投影层对特征向量进行投影
        hidden_states, extract_features = self.feature_projection(extract_features)
        # 根据时间索引掩码和注意力掩码对隐藏状态进行掩码处理
        hidden_states = self._mask_hidden_states(
            hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
        )

        # 使用编码器处理隐藏状态和注意力掩码等参数
        encoder_outputs = self.encoder(
            hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 获取编码器的最后隐藏状态
        hidden_states = encoder_outputs[0]

        # 如果不需要返回字典,则返回元组形式的输出
        if not return_dict:
            return (hidden_states, extract_features) + encoder_outputs[1:]

        # 如果需要返回字典,则使用 Wav2Vec2BaseModelOutput 类包装输出
        return Wav2Vec2BaseModelOutput(
            last_hidden_state=hidden_states,
            extract_features=extract_features,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
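
# 补充用法示例(草稿,不属于源码):用随机初始化的配置演示 UniSpeechModel 的前向调用,
# 以避免联网下载权重;真实使用时应改为 from_pretrained 加载已训练的检查点。
import numpy as np
import torch
from transformers import UniSpeechConfig, UniSpeechModel, Wav2Vec2FeatureExtractor

config = UniSpeechConfig()
model = UniSpeechModel(config).eval()
feature_extractor = Wav2Vec2FeatureExtractor()  # 默认 16kHz、单通道

waveform = np.random.randn(16000).astype("float32")  # 1 秒伪造音频,仅作演示
inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # torch.Size([1, 49, 768])
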
@add_start_docstrings(
    """UniSpeech Model with a vector-quantization module and ctc loss for pre-training.""", UNISPEECH_START_DOCSTRING
)
class UniSpeechForPreTraining(UniSpeechPreTrainedModel):
    def __init__(self, config: UniSpeechConfig):
        super().__init__(config)
        # 初始化 UniSpeech 模型
        self.unispeech = UniSpeechModel(config)
        # 特征量化器的 dropout
        self.dropout_features = nn.Dropout(config.feat_quantizer_dropout)

        # 初始化量化器
        self.quantizer = UniSpeechGumbelVectorQuantizer(config)
        # 将编码向量维度映射到投影编码向量维度
        self.project_q = nn.Linear(config.codevector_dim, config.proj_codevector_dim)
        # 将投影编码向量维度映射到隐藏层大小
        self.project_hid = nn.Linear(config.proj_codevector_dim, config.hidden_size)

        # CTC 层,将隐藏状态映射到 CTC 类别数
        self.ctc_proj = nn.Linear(config.hidden_size, config.num_ctc_classes)
        # 最终 dropout
        self.dropout = nn.Dropout(config.final_dropout)

        # 初始化权重并应用最终处理
        self.post_init()

    def set_gumbel_temperature(self, temperature: int):
        """
        Set the Gumbel softmax temperature to a given value. Only necessary for training
        """
        # 设置 Gumbel softmax 温度值,仅在训练时需要
        self.quantizer.temperature = temperature

    def freeze_feature_extractor(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        """
        # 弃用警告,禁止特征提取器的梯度计算,使其参数在训练期间不更新
        warnings.warn(
            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
            "Please use the equivalent `freeze_feature_encoder` method instead.",
            FutureWarning,
        )
        self.freeze_feature_encoder()

    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        # 禁止特征编码器的梯度计算,使其参数在训练期间不更新
        self.unispeech.feature_extractor._freeze_parameters()

    @staticmethod
    def compute_contrastive_logits(
        target_features: torch.FloatTensor,
        negative_features: torch.FloatTensor,
        predicted_features: torch.FloatTensor,
        temperature: int = 1,
    ):
        """
        Compute logits for contrastive loss based using cosine similarity as the distance measure between
        `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied.
        """
        # 将目标特征和负样本特征连接在一起
        target_features = torch.cat([target_features, negative_features], dim=0)

        # 计算余弦相似度作为距离度量的对比损失的 logits
        logits = torch.cosine_similarity(predicted_features.float(), target_features.float(), dim=-1)
        logits = logits.type_as(target_features)

        # 应用温度
        logits = logits / temperature
        return logits
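
# 补充演示(草稿,不属于源码):用余弦相似度构造对比学习 logits,
# 正样本拼接在负样本之前,因此正确“类别”恒为第 0 行。
import torch

predicted = torch.randn(1, 4, 16)                          # 预测特征
positive = predicted + 0.01 * torch.randn_like(predicted)  # 与预测非常接近的正样本
negatives = torch.randn(5, 4, 16)                          # 5 组随机负样本
targets = torch.cat([positive, negatives], dim=0)
logits = torch.cosine_similarity(predicted.float(), targets.float(), dim=-1) / 0.1
print(logits.shape)          # torch.Size([6, 4])
print(logits.argmax(dim=0))  # 几乎全为 0:正样本得分最高
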

    @add_start_docstrings_to_model_forward(UNISPEECH_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=UniSpeechForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
    # 定义模型的前向传播函数,接受输入值、注意力掩码、输出注意力权重、输出隐藏状态、是否返回字典格式结果等参数
    def forward(
        self,
        input_values: Optional[torch.Tensor],  # 输入值,类型为可选的 PyTorch 张量
        attention_mask: Optional[torch.Tensor] = None,  # 注意力掩码,类型为可选的 PyTorch 张量,默认为 None
        output_attentions: Optional[bool] = None,  # 是否输出注意力权重,类型为可选的布尔值,默认为 None
        output_hidden_states: Optional[bool] = None,  # 是否输出隐藏状态,类型为可选的布尔值,默认为 None
        return_dict: Optional[bool] = None,  # 是否返回字典格式结果,类型为可选的布尔值,默认为 None
    ) -> Union[Tuple, UniSpeechForPreTrainingOutput]:
@add_start_docstrings(
    """UniSpeech Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
    UNISPEECH_START_DOCSTRING,
    """
        target_lang (`str`, *optional*):
            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
            adapter.<lang>.bin. Only relevant when using an instance of [`UniSpeechForCTC`] with adapters. Uses 'eng'
            by default.
    """,
)
# 定义了一个新的类UniSpeechForCTC,继承自UniSpeechPreTrainedModel类,用于基于CTC进行语言建模
# UniSpeechForCTC类扩展了UniSpeech模型,并添加了用于CTC的语言建模头部

class UniSpeechForCTC(UniSpeechPreTrainedModel):
    def __init__(self, config, target_lang: Optional[str] = None):
        super().__init__(config)

        # 初始化UniSpeech模型
        self.unispeech = UniSpeechModel(config)
        # 添加一个dropout层
        self.dropout = nn.Dropout(config.final_dropout)

        # 设置目标语言,默认为'eng'
        self.target_lang = target_lang

        # 检查配置是否定义了语言模型头的词汇表大小
        if config.vocab_size is None:
            raise ValueError(
                f"You are trying to instantiate {self.__class__} with a configuration that "
                "does not define the vocabulary size of the language model head. Please "
                "instantiate the model as follows: `UniSpeechForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
                "or define `vocab_size` of your model's configuration."
            )
        
        # 根据配置设置输出隐藏层的大小
        output_hidden_size = (
            config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
        )
        # 添加一个线性层,用于语言建模头部
        self.lm_head = nn.Linear(output_hidden_size, config.vocab_size)

        # 初始化权重并应用最终处理
        self.post_init()

    def tie_weights(self):
        """
        This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
        passing `target_lang=...` to `from_pretrained(...)`.

        This method is **not** supposed to be called by the user and is prone to be changed in the future.
        """

        # 重写`~PreTrainedModel.tie_weights`方法,以便在通过`from_pretrained(...)`传递`target_lang=...`时能够正确加载适配器权重
        # 这个方法不应该由用户调用,并且可能在将来被更改

        target_lang = self.target_lang

        # 如果定义了target_lang但未定义config.adapter_attn_dim,则抛出值错误异常
        if target_lang is not None and getattr(self.config, "adapter_attn_dim", None) is None:
            raise ValueError(f"Cannot pass `target_lang`: {target_lang} if `config.adapter_attn_dim` is not defined.")
        # 如果未定义target_lang但定义了config.adapter_attn_dim,则记录默认情况下target_lang被设置为'eng'
        elif target_lang is None and getattr(self.config, "adapter_attn_dim", None) is not None:
            logger.info("By default `target_lang` is set to 'eng'.")
        # 如果定义了target_lang,则加载适配器
        elif target_lang is not None:
            self.load_adapter(target_lang, force_load=True)
    # 警告用户方法已过时,并将在 Transformers v5 版本中移除。建议使用 `freeze_feature_encoder` 方法代替。
    def freeze_feature_extractor(self):
        warnings.warn(
            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
            "Please use the equivalent `freeze_feature_encoder` method instead.",
            FutureWarning,
        )
        # 调用 `freeze_feature_encoder` 方法来冻结特征编码器,禁止其在训练期间更新参数。
        self.freeze_feature_encoder()

    # 禁止特征编码器的梯度计算,防止其在训练期间更新参数。
    def freeze_feature_encoder(self):
        self.unispeech.feature_extractor._freeze_parameters()

    # 禁止基础模型的梯度计算,使其参数在训练期间不会被更新。只有分类头会被更新。
    def freeze_base_model(self):
        for param in self.unispeech.parameters():
            param.requires_grad = False

    # 添加模型前向传播方法的文档字符串,包括输入、输出等信息。
    @add_start_docstrings_to_model_forward(UNISPEECH_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=CausalLMOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_CTC_EXPECTED_OUTPUT,
        expected_loss=_CTC_EXPECTED_LOSS,
    )
    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
        ) -> Union[Tuple, CausalLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size - 1]`.
        """
        # 默认情况下,如果未提供 return_dict,则使用模型配置中的 use_return_dict 设置
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 使用 unispeech 模型生成输出
        outputs = self.unispeech(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 获取模型输出的隐藏状态
        hidden_states = outputs[0]
        # 对隐藏状态进行 dropout 处理
        hidden_states = self.dropout(hidden_states)

        # 使用语言模型头部生成 logits
        logits = self.lm_head(hidden_states)

        # 初始化 loss
        loss = None
        # 如果提供了标签
        if labels is not None:
            # 检查标签是否超出词汇表大小
            if labels.max() >= self.config.vocab_size:
                raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")

            # 从 attention_mask 中获取输入长度
            attention_mask = (
                attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
            )
            input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)

            # 假设填充的标记用 -100 填充,当没有被注意到时
            labels_mask = labels >= 0
            target_lengths = labels_mask.sum(-1)
            flattened_targets = labels.masked_select(labels_mask)

            # 使用 log_softmax 转换 logits,以备 ctc_loss 使用
            log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)

            # 关闭 cudnn 加速,因为 ctc_loss 不支持 fp16
            with torch.backends.cudnn.flags(enabled=False):
                loss = nn.functional.ctc_loss(
                    log_probs,
                    flattened_targets,
                    input_lengths,
                    target_lengths,
                    blank=self.config.pad_token_id,
                    reduction=self.config.ctc_loss_reduction,
                    zero_infinity=self.config.ctc_zero_infinity,
                )

        # 如果不要求返回字典形式的输出
        if not return_dict:
            # 组装输出元组
            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
            return ((loss,) + output) if loss is not None else output

        # 返回 CausalLMOutput 对象,包含 loss、logits、隐藏状态和注意力权重
        return CausalLMOutput(
            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
        )
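
# 补充演示(草稿,不属于源码):与上面 forward 中一致的 CTC 损失计算方式的简化版。
import torch
import torch.nn as nn

vocab_size, seq_len, batch = 32, 20, 2
logits = torch.randn(batch, seq_len, vocab_size)
log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)  # (T, B, V)
labels = torch.randint(1, vocab_size, (batch, 5))                  # 0 作为 blank(对应 pad_token_id)
input_lengths = torch.full((batch,), seq_len, dtype=torch.long)
target_lengths = torch.full((batch,), 5, dtype=torch.long)
loss = nn.functional.ctc_loss(
    log_probs, labels, input_lengths, target_lengths,
    blank=0, reduction="mean", zero_infinity=True,
)
print(loss)  # 标量损失
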
@add_start_docstrings(
    """
    UniSpeech Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
    SUPERB Keyword Spotting.
    """,
    UNISPEECH_START_DOCSTRING,
)
class UniSpeechForSequenceClassification(UniSpeechPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        if hasattr(config, "add_adapter") and config.add_adapter:
            raise ValueError(
                "Sequence classification does not support the use of UniSpeech adapters (config.add_adapter=True)"
            )
        self.unispeech = UniSpeechModel(config)  # 初始化UniSpeech模型
        num_layers = config.num_hidden_layers + 1  # 计算层数:transformer层 + 输入嵌入层
        if config.use_weighted_layer_sum:
            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)  # 初始化层权重
        self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)  # 线性投影层
        self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)  # 分类器线性层

        # Initialize weights and apply final processing
        self.post_init()  # 执行后初始化操作

    # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.freeze_feature_extractor
    def freeze_feature_extractor(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        """
        warnings.warn(
            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
            "Please use the equivalent `freeze_feature_encoder` method instead.",
            FutureWarning,
        )
        self.freeze_feature_encoder()  # 冻结特征编码器的参数

    # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.freeze_feature_encoder with wav2vec2->unispeech
    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        self.unispeech.feature_extractor._freeze_parameters()  # 冻结特征编码器的参数

    # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.freeze_base_model with wav2vec2->unispeech
    def freeze_base_model(self):
        """
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        """
        for param in self.unispeech.parameters():
            param.requires_grad = False  # 冻结UniSpeech模型的所有参数

    @add_start_docstrings_to_model_forward(UNISPEECH_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        modality="audio",
    )
    # 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.forward 复制,将Wav2Vec2->UniSpeech,wav2vec2->unispeech
    def forward(
        self,
        input_values: Optional[torch.Tensor],  # 输入值,可以是张量的可选类型
        attention_mask: Optional[torch.Tensor] = None,  # 注意力掩码,可以是张量的可选类型,默认为None
        output_attentions: Optional[bool] = None,  # 是否输出注意力权重,可以是布尔值的可选类型,默认为None
        output_hidden_states: Optional[bool] = None,  # 是否输出隐藏状态,可以是布尔值的可选类型,默认为None
        return_dict: Optional[bool] = None,  # 是否返回字典形式的输出,可以是布尔值的可选类型,默认为None
        labels: Optional[torch.Tensor] = None,  # 标签,用于计算序列分类/回归损失的张量,可选类型,默认为None
    ) -> Union[Tuple, SequenceClassifierOutput]:  # 返回值类型注释,可以是元组或SequenceClassifierOutput

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict  # 确定是否使用返回字典,如果未提供,则使用配置中的默认值
        output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states  # 根据配置决定是否输出加权层的隐藏状态

        outputs = self.unispeech(  # 使用UniSpeech模型进行前向传播
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if self.config.use_weighted_layer_sum:  # 如果配置指定使用加权层求和
            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]  # 获取隐藏状态的起始位置
            hidden_states = torch.stack(hidden_states, dim=1)  # 在维度1上堆叠隐藏状态
            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)  # 对层权重进行softmax归一化
            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)  # 加权求和隐藏状态
        else:
            hidden_states = outputs[0]  # 否则直接获取第一个输出作为隐藏状态

        hidden_states = self.projector(hidden_states)  # 将隐藏状态投影到指定维度

        if attention_mask is None:  # 如果没有提供注意力掩码
            pooled_output = hidden_states.mean(dim=1)  # 对隐藏状态进行平均池化
        else:
            padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)  # 获取特征向量注意力掩码
            hidden_states[~padding_mask] = 0.0  # 将非填充部分的隐藏状态置为0
            pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)  # 使用注意力掩码进行池化

        logits = self.classifier(pooled_output)  # 使用分类器预测逻辑回归

        loss = None  # 初始化损失为None
        if labels is not None:  # 如果提供了标签
            loss_fct = CrossEntropyLoss()  # 使用交叉熵损失函数
            loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))  # 计算损失

        if not return_dict:  # 如果不要求返回字典形式的输出
            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]  # 组装输出元组
            return ((loss,) + output) if loss is not None else output  # 返回带有损失的输出元组或仅输出元组

        return SequenceClassifierOutput(  # 返回SequenceClassifierOutput对象,包含损失、逻辑回归、隐藏状态和注意力权重
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
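
# 补充演示(草稿,不属于源码):上面 forward 中基于 padding_mask 的平均池化逻辑。
import torch

hidden_states = torch.randn(2, 6, 4)
padding_mask = torch.tensor([[1, 1, 1, 0, 0, 0],
                             [1, 1, 1, 1, 1, 0]]).bool()
hidden_states[~padding_mask] = 0.0
pooled = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
print(pooled.shape)  # torch.Size([2, 4]):只对有效帧求平均
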

.\models\unispeech\__init__.py

# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 从 typing 模块导入 TYPE_CHECKING
from typing import TYPE_CHECKING

# 从 ...utils 中导入必要的类和函数
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_flax_available,
    is_tf_available,
    is_torch_available,
)

# 定义导入结构
_import_structure = {"configuration_unispeech": ["UNISPEECH_PRETRAINED_CONFIG_ARCHIVE_MAP", "UniSpeechConfig"]}

# 检查是否为静态类型检查
try:
    # 如果 Torch 不可用则抛出 OptionalDependencyNotAvailable 异常
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果 Torch 可用,则添加相关模型到 _import_structure 中
    _import_structure["modeling_unispeech"] = [
        "UNISPEECH_PRETRAINED_MODEL_ARCHIVE_LIST",
        "UniSpeechForCTC",
        "UniSpeechForPreTraining",
        "UniSpeechForSequenceClassification",
        "UniSpeechModel",
        "UniSpeechPreTrainedModel",
    ]

# 如果在静态类型检查环境中
if TYPE_CHECKING:
    # 从 .configuration_unispeech 中导入必要的类和函数
    from .configuration_unispeech import UNISPEECH_PRETRAINED_CONFIG_ARCHIVE_MAP, UniSpeechConfig

    try:
        # 如果 Torch 不可用则抛出 OptionalDependencyNotAvailable 异常
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 从 .modeling_unispeech 中导入必要的类和函数
        from .modeling_unispeech import (
            UNISPEECH_PRETRAINED_MODEL_ARCHIVE_LIST,
            UniSpeechForCTC,
            UniSpeechForPreTraining,
            UniSpeechForSequenceClassification,
            UniSpeechModel,
            UniSpeechPreTrainedModel,
        )

# 如果不在静态类型检查环境中
else:
    import sys

    # 将当前模块设为懒加载模块 _LazyModule 的实例
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

.\models\unispeech_sat\configuration_unispeech_sat.py

# 定义 UniSpeechSatConfig 类,用于存储 UniSpeechSat 模型的配置信息
class UniSpeechSatConfig(PretrainedConfig):
    r"""
    这是一个配置类,用于存储 [`UniSpeechSatModel`] 的配置信息。根据指定的参数实例化 UniSpeechSat 模型,定义模型架构。
    使用默认配置实例化将产生与 UniSpeechSat [microsoft/unispeech-sat-base-100h-libri-ft](https://huggingface.co/microsoft/unispeech-sat-base-100h-libri-ft)
    架构类似的配置。

    配置对象继承自 [`PretrainedConfig`],可用于控制模型输出。阅读 [`PretrainedConfig`] 的文档以获取更多信息。

    示例:

    ```
    >>> from transformers import UniSpeechSatModel, UniSpeechSatConfig

    >>> # 初始化一个 UniSpeechSat microsoft/unispeech-sat-base-100h-libri-ft 风格的配置
    >>> configuration = UniSpeechSatConfig()

    >>> # 从 microsoft/unispeech-sat-base-100h-libri-ft 风格的配置初始化一个模型
    >>> model = UniSpeechSatModel(configuration)

    >>> # 访问模型配置
    >>> configuration = model.config
    ```
    # 初始化函数,用于创建一个新的对象实例,设置各种模型参数和配置
    def __init__(
        self,
        vocab_size=32,  # 词汇表大小,默认为32
        hidden_size=768,  # 隐藏层大小,默认为768
        num_hidden_layers=12,  # 隐藏层的数量,默认为12
        num_attention_heads=12,  # 注意力头的数量,默认为12
        intermediate_size=3072,  # 中间层大小,默认为3072
        hidden_act="gelu",  # 隐藏层激活函数,默认为gelu
        hidden_dropout=0.1,  # 隐藏层的dropout率,默认为0.1
        activation_dropout=0.1,  # 激活函数的dropout率,默认为0.1
        attention_dropout=0.1,  # 注意力层的dropout率,默认为0.1
        feat_proj_dropout=0.0,  # 特征投影层的dropout率,默认为0.0
        feat_quantizer_dropout=0.0,  # 特征量化器的dropout率,默认为0.0
        final_dropout=0.1,  # 最终输出层的dropout率,默认为0.1
        layerdrop=0.1,  # 层级丢弃率,默认为0.1
        initializer_range=0.02,  # 初始化范围,默认为0.02
        layer_norm_eps=1e-5,  # 层归一化的epsilon值,默认为1e-5
        feat_extract_norm="group",  # 特征提取层的归一化方式,默认为"group"
        feat_extract_activation="gelu",  # 特征提取层的激活函数,默认为gelu
        conv_dim=(512, 512, 512, 512, 512, 512, 512),  # 卷积层的维度,默认为(512, 512, 512, 512, 512, 512, 512)
        conv_stride=(5, 2, 2, 2, 2, 2, 2),  # 卷积层的步幅,默认为(5, 2, 2, 2, 2, 2, 2)
        conv_kernel=(10, 3, 3, 3, 3, 2, 2),  # 卷积核大小,默认为(10, 3, 3, 3, 3, 2, 2)
        conv_bias=False,  # 是否使用卷积层的偏置,默认为False
        num_conv_pos_embeddings=128,  # 卷积位置嵌入的数量,默认为128
        num_conv_pos_embedding_groups=16,  # 卷积位置嵌入的分组数,默认为16
        do_stable_layer_norm=False,  # 是否进行稳定的层归一化,默认为False
        apply_spec_augment=True,  # 是否应用语音数据增强,默认为True
        mask_time_prob=0.05,  # 时间掩码的概率,默认为0.05
        mask_time_length=10,  # 时间掩码的长度,默认为10
        mask_time_min_masks=2,  # 时间掩码的最小数量,默认为2
        mask_feature_prob=0.0,  # 特征掩码的概率,默认为0.0
        mask_feature_length=10,  # 特征掩码的长度,默认为10
        mask_feature_min_masks=0,  # 特征掩码的最小数量,默认为0
        num_codevectors_per_group=320,  # 每组码向量的数量,默认为320
        num_codevector_groups=2,  # 码向量的组数,默认为2
        contrastive_logits_temperature=0.1,  # 对比损失的温度参数,默认为0.1
        num_negatives=100,  # 负样本数量,默认为100
        codevector_dim=256,  # 码向量的维度,默认为256
        proj_codevector_dim=256,  # 投影码向量的维度,默认为256
        diversity_loss_weight=0.1,  # 多样性损失的权重,默认为0.1
        ctc_loss_reduction="mean",  # CTC损失的减少方式,默认为"mean"
        ctc_zero_infinity=False,  # CTC损失是否将无穷值设为零,默认为False
        use_weighted_layer_sum=False,  # 是否使用加权层求和,默认为False
        classifier_proj_size=256,  # 分类器投影大小,默认为256
        tdnn_dim=(512, 512, 512, 512, 1500),  # TDNN层的维度,默认为(512, 512, 512, 512, 1500)
        tdnn_kernel=(5, 3, 3, 1, 1),  # TDNN层的卷积核大小,默认为(5, 3, 3, 1, 1)
        tdnn_dilation=(1, 2, 3, 1, 1),  # TDNN层的膨胀率,默认为(1, 2, 3, 1, 1)
        xvector_output_dim=512,  # X向量的输出维度,默认为512
        pad_token_id=0,  # 填充token的ID,默认为0
        bos_token_id=1,  # 起始token的ID,默认为1
        eos_token_id=2,  # 结束token的ID,默认为2
        num_clusters=504,  # 聚类中心的数量,默认为504
        **kwargs,  # 其他可选参数
    ):
    # 计算输入到 logits 的比率,即各卷积层步幅的乘积
    @property
    def inputs_to_logits_ratio(self):
        return functools.reduce(operator.mul, self.conv_stride, 1)
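
下面给出一个最小示例(仅作示意,假设使用上面默认的 `conv_stride`),说明 `inputs_to_logits_ratio` 的含义:它是各卷积层步幅的乘积,表示每多少个输入采样点对应一帧输出 logits。

```
import functools
import operator

conv_stride = (5, 2, 2, 2, 2, 2, 2)  # 默认配置中的卷积步幅
ratio = functools.reduce(operator.mul, conv_stride, 1)
print(ratio)  # 320,即每 320 个输入采样点对应一帧 logits
```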

.\models\unispeech_sat\convert_unispeech_original_s3prl_checkpoint_to_pytorch.py

# 设置脚本的编码格式为 UTF-8
# 版权声明和许可信息,此处使用的是 Apache License 2.0
# 只允许在符合许可证的情况下使用此文件
# 可以在以下网址获取许可证的副本:http://www.apache.org/licenses/LICENSE-2.0
# 除非适用法律要求或书面同意,否则按“现状”分发软件
# 没有明示或暗示的任何保证或条件。详见许可证条款。

"""Convert Hubert checkpoint."""

# 导入必要的库和模块
import argparse  # 用于解析命令行参数

import torch  # PyTorch 库

from transformers import (  # 从 transformers 库中导入以下模块和类
    UniSpeechSatConfig,  # UniSpeechSatConfig 配置类
    UniSpeechSatForAudioFrameClassification,  # 用于音频帧分类的模型类
    UniSpeechSatForSequenceClassification,  # 用于序列分类的模型类
    UniSpeechSatForXVector,  # 用于生成 x-vector 的模型类
    Wav2Vec2FeatureExtractor,  # Wav2Vec2 的特征提取器类
    logging,  # 日志记录模块
)

logging.set_verbosity_info()  # 设置日志记录的详细程度为 info
logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器


def convert_classification(base_model_name, hf_config, downstream_dict):
    # 从预训练模型和配置创建 UniSpeechSatForSequenceClassification 模型
    model = UniSpeechSatForSequenceClassification.from_pretrained(base_model_name, config=hf_config)
    # 设置模型的投影层权重和偏置
    model.projector.weight.data = downstream_dict["projector.weight"]
    model.projector.bias.data = downstream_dict["projector.bias"]
    # 设置模型的分类器权重和偏置
    model.classifier.weight.data = downstream_dict["model.post_net.linear.weight"]
    model.classifier.bias.data = downstream_dict["model.post_net.linear.bias"]
    return model  # 返回转换后的模型


def convert_diarization(base_model_name, hf_config, downstream_dict):
    # 从预训练模型和配置创建 UniSpeechSatForAudioFrameClassification 模型
    model = UniSpeechSatForAudioFrameClassification.from_pretrained(base_model_name, config=hf_config)
    # 设置模型的分类器权重和偏置
    model.classifier.weight.data = downstream_dict["model.linear.weight"]
    model.classifier.bias.data = downstream_dict["model.linear.bias"]
    return model  # 返回转换后的模型


def convert_xvector(base_model_name, hf_config, downstream_dict):
    # 从预训练模型和配置创建 UniSpeechSatForXVector 模型
    model = UniSpeechSatForXVector.from_pretrained(base_model_name, config=hf_config)
    # 设置模型的投影层权重和偏置
    model.projector.weight.data = downstream_dict["connector.weight"]
    model.projector.bias.data = downstream_dict["connector.bias"]
    
    # 遍历模型中的每个 TDNN 层,设置其权重和偏置
    for i, kernel_size in enumerate(hf_config.tdnn_kernel):
        model.tdnn[i].kernel.weight.data = downstream_dict[
            f"model.framelevel_feature_extractor.module.{i}.kernel.weight"
        ]
        model.tdnn[i].kernel.bias.data = downstream_dict[f"model.framelevel_feature_extractor.module.{i}.kernel.bias"]

    # 设置特征提取器的权重和偏置
    model.feature_extractor.weight.data = downstream_dict["model.utterancelevel_feature_extractor.linear1.weight"]
    model.feature_extractor.bias.data = downstream_dict["model.utterancelevel_feature_extractor.linear1.bias"]
    # 设置分类器的权重和偏置
    model.classifier.weight.data = downstream_dict["model.utterancelevel_feature_extractor.linear2.weight"]
    model.classifier.bias.data = downstream_dict["model.utterancelevel_feature_extractor.linear2.bias"]
    # 设置目标函数的权重
    model.objective.weight.data = downstream_dict["objective.W"]
    return model  # 返回转换后的模型


@torch.no_grad()
# 定义函数,用于将 S3PRL 模型的检查点转换为 transformers 设计的模型
def convert_s3prl_checkpoint(base_model_name, config_path, checkpoint_path, model_dump_path):
    """
    Copy/paste/tweak model's weights to transformers design.
    """
    # 加载检查点文件,将其映射到 CPU 上
    checkpoint = torch.load(checkpoint_path, map_location="cpu")

    # 从检查点中获取 Downstream 字典
    downstream_dict = checkpoint["Downstream"]

    # 从预训练配置文件加载 UniSpeechSatConfig
    hf_config = UniSpeechSatConfig.from_pretrained(config_path)
    
    # 从预训练模型加载 Wav2Vec2FeatureExtractor
    hf_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
        base_model_name, return_attention_mask=True, do_normalize=False
    )

    # 获取模型的架构名称
    arch = hf_config.architectures[0]
    
    # 根据模型架构选择相应的转换函数,转换成 Hugging Face 的模型
    if arch.endswith("ForSequenceClassification"):
        hf_model = convert_classification(base_model_name, hf_config, downstream_dict)
    elif arch.endswith("ForAudioFrameClassification"):
        hf_model = convert_diarization(base_model_name, hf_config, downstream_dict)
    elif arch.endswith("ForXVector"):
        hf_model = convert_xvector(base_model_name, hf_config, downstream_dict)
    else:
        # 如果架构不被支持,抛出未实现错误
        raise NotImplementedError(f"S3PRL weights conversion is not supported for {arch}")

    # 如果配置要求使用加权层求和,则加载 Featurizer 中的权重数据
    if hf_config.use_weighted_layer_sum:
        hf_model.layer_weights.data = checkpoint["Featurizer"]["weights"]

    # 将特征提取器的配置保存到模型导出路径
    hf_feature_extractor.save_pretrained(model_dump_path)
    
    # 将转换后的 Hugging Face 模型保存到模型导出路径
    hf_model.save_pretrained(model_dump_path)


# 主程序入口
if __name__ == "__main__":
    # 创建参数解析器
    parser = argparse.ArgumentParser()
    
    # 添加命令行参数:预训练基础模型名称
    parser.add_argument(
        "--base_model_name", default=None, type=str, help="Name of the huggingface pretrained base model."
    )
    
    # 添加命令行参数:分类器配置文件路径
    parser.add_argument("--config_path", default=None, type=str, help="Path to the huggingface classifier config.")
    
    # 添加命令行参数:S3PRL 检查点文件路径
    parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to the s3prl checkpoint.")
    
    # 添加命令行参数:转换后模型保存路径
    parser.add_argument("--model_dump_path", default=None, type=str, help="Path to the final converted model.")
    
    # 解析命令行参数
    args = parser.parse_args()
    
    # 调用转换函数,传入命令行参数指定的参数
    convert_s3prl_checkpoint(args.base_model_name, args.config_path, args.checkpoint_path, args.model_dump_path)
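
下面是一个假设的调用示例(模型名与路径仅为占位,需替换为实际文件),展示上面 `convert_s3prl_checkpoint` 函数的典型用法:

```
# 仅作示意:参数均为假设的占位值
convert_s3prl_checkpoint(
    base_model_name="microsoft/unispeech-sat-base",  # 假设的基础模型名
    config_path="./config.json",                     # 假设的分类器配置文件路径
    checkpoint_path="./s3prl_checkpoint.ckpt",       # 假设的 S3PRL 检查点路径
    model_dump_path="./converted_model",             # 转换后模型的输出目录
)
```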

.\models\unispeech_sat\convert_unispeech_sat_original_pytorch_checkpoint_to_pytorch.py

# 设置文件编码为 UTF-8

# 引用必要的库和模块
import argparse  # 导入命令行参数解析库 argparse
import fairseq   # 导入 fairseq 库
import torch     # 导入 PyTorch 库

# 从 transformers 库中导入 UniSpeechSatConfig, UniSpeechSatForCTC, UniSpeechSatForPreTraining 和 logging
from transformers import UniSpeechSatConfig, UniSpeechSatForCTC, UniSpeechSatForPreTraining, logging

# 设置日志的详细程度为 info
logging.set_verbosity_info()

# 获取当前模块的 logger 对象
logger = logging.get_logger(__name__)

# 定义一个映射字典,用于将 UniSpeechSat 模型的参数名映射到 HuggingFace 模型的参数名
MAPPING = {
    "post_extract_proj": "feature_projection.projection",
    "encoder.pos_conv.0": "encoder.pos_conv_embed.conv",
    "self_attn.k_proj": "encoder.layers.*.attention.k_proj",
    "self_attn.v_proj": "encoder.layers.*.attention.v_proj",
    "self_attn.q_proj": "encoder.layers.*.attention.q_proj",
    "self_attn.out_proj": "encoder.layers.*.attention.out_proj",
    "self_attn_layer_norm": "encoder.layers.*.layer_norm",
    "fc1": "encoder.layers.*.feed_forward.intermediate_dense",
    "fc2": "encoder.layers.*.feed_forward.output_dense",
    "final_layer_norm": "encoder.layers.*.final_layer_norm",
    "encoder.layer_norm": "encoder.layer_norm",
    "encoder.layer_norm_for_extract": "layer_norm_for_extract",
    "w2v_model.layer_norm": "feature_projection.layer_norm",
    "quantizer.weight_proj": "quantizer.weight_proj",
    "quantizer.vars": "quantizer.codevectors",
    "project_q": "project_q",
    "final_proj": "project_hid",
    "w2v_encoder.proj": "lm_head",
    "label_embs_concat": "label_embeddings_concat",
    "mask_emb": "masked_spec_embed",
    "spk_proj": "speaker_proj",
}

# 定义顶层键列表,列出需要逐层设置的顶层参数名
TOP_LEVEL_KEYS = [
    "lm_head",
    "quantizer.weight_proj",
    "quantizer.codevectors",
    "project_q",
    "project_hid",
    "label_embeddings_concat",
    "speaker_proj",
    "layer_norm_for_extract",
]
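
下面用一个小例子(键取自上面的 MAPPING,权重名为假设示例)演示通配符 `*` 如何被具体的层号替换,这正是后面 `recursively_load_weights` 中的做法:

```
# 仅作示意:演示 MAPPING 中 "*" 的替换逻辑,name 为假设的 fairseq 权重名
name = "encoder.layers.3.self_attn.k_proj.weight"
key = "self_attn.k_proj"
mapped_key = "unispeech_sat." + MAPPING[key]      # 不在 TOP_LEVEL_KEYS 中,需加前缀
layer_index = name.split(key)[0].split(".")[-2]   # 取出层号 "3"
print(mapped_key.replace("*", layer_index))
# 输出: unispeech_sat.encoder.layers.3.attention.k_proj
```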

# 定义一个函数,递归设置 HuggingFace 模型的参数
def set_recursively(hf_pointer, key, value, full_name, weight_type):
    # 按照 key 的路径逐级获取 hf_pointer 的属性
    for attribute in key.split("."):
        hf_pointer = getattr(hf_pointer, attribute)

    # 根据 weight_type 确定需要设置的参数的形状
    if weight_type is not None:
        hf_shape = getattr(hf_pointer, weight_type).shape
    else:
        hf_shape = hf_pointer.shape

    # 检查待设置参数的形状是否与传入 value 的形状一致,若不一致则抛出异常
    if hf_shape != value.shape:
        raise ValueError(
            f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be"
            f" {value.shape} for {full_name}"
        )

    # 根据 weight_type 类型设置不同的参数值
    if weight_type == "weight":
        hf_pointer.weight.data = value
    elif weight_type == "weight_g":
        hf_pointer.weight_g.data = value
    elif weight_type == "weight_v":
        hf_pointer.weight_v.data = value
    # 如果权重类型是偏置(bias),设置模型指针的偏置数据为给定的值
    elif weight_type == "bias":
        hf_pointer.bias.data = value
    # 否则,设置模型指针的数据为给定的值
    else:
        hf_pointer.data = value

    # 记录初始化信息到日志,包括模型中的键(如果存在)、权重类型(如果存在)、以及从哪里初始化的信息
    logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.")
# 递归加载权重到 HF 模型中
def recursively_load_weights(fairseq_model, hf_model):
    # 未使用的权重列表
    unused_weights = []
    # 获取 Fairseq 模型的状态字典
    fairseq_dict = fairseq_model.state_dict()

    # 获取 HF 模型的特征提取器
    feature_extractor = hf_model.unispeech_sat.feature_extractor

    # 遍历 Fairseq 模型的状态字典中的每个键值对
    for name, value in fairseq_dict.items():
        # 是否被使用的标志
        is_used = False
        # 如果名称中包含 "conv_layers"
        if "conv_layers" in name:
            # 调用加载卷积层的函数
            load_conv_layer(
                name,
                value,
                feature_extractor,
                unused_weights,
                hf_model.config.feat_extract_norm == "group",
            )
            is_used = True
        else:
            # 遍历映射字典 MAPPING 中的每个键值对
            for key, mapped_key in MAPPING.items():
                # 将 mapped_key 添加前缀 "unispeech_sat.",如果它不在 TOP_LEVEL_KEYS 中
                mapped_key = "unispeech_sat." + mapped_key if mapped_key not in TOP_LEVEL_KEYS else mapped_key
                # 如果名称中包含 key,或者 key 的最后一部分等于名称的第一个部分
                if key in name or key.split("w2v_model.")[-1] == name.split(".")[0]:
                    # 如果名称包含 "layer_norm_for_extract" 并且名称不完全匹配 key,则继续下一个循环
                    if "layer_norm_for_extract" in name and (".".join(name.split(".")[:-1]) != key):
                        continue
                    is_used = True
                    # 如果 mapped_key 包含通配符 "*", 则替换为名称中的层索引
                    if "*" in mapped_key:
                        layer_index = name.split(key)[0].split(".")[-2]
                        mapped_key = mapped_key.replace("*", layer_index)
                    # 根据名称的后缀确定权重类型
                    if "weight_g" in name:
                        weight_type = "weight_g"
                    elif "weight_v" in name:
                        weight_type = "weight_v"
                    elif "bias" in name:
                        weight_type = "bias"
                    elif "weight" in name:
                        # TODO: 不匹配 quantizer.weight_proj
                        weight_type = "weight"
                    else:
                        weight_type = None
                    # 递归设置 HF 模型的权重
                    set_recursively(hf_model, mapped_key, value, name, weight_type)
                continue
        # 如果没有使用,则将名称添加到未使用的权重列表中
        if not is_used:
            unused_weights.append(name)

    # 记录未使用的权重列表
    logger.warning(f"Unused weights: {unused_weights}")


# 加载卷积层
def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm):
    # 获取卷积层名称
    name = full_name.split("conv_layers.")[-1]
    # 将名称拆分为项目列表
    items = name.split(".")
    # 提取层 ID 和类型 ID
    layer_id = int(items[0])
    type_id = int(items[1])
    # 如果类型ID为0
    if type_id == 0:
        # 如果变量名包含"bias"
        if "bias" in name:
            # 检查值的形状是否与卷积层偏置数据的形状相匹配,若不匹配则引发数值错误
            if value.shape != feature_extractor.conv_layers[layer_id].conv.bias.data.shape:
                raise ValueError(
                    f"{full_name} has size {value.shape}, but"
                    f" {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found."
                )
            # 将值赋给特征提取器的卷积层偏置数据,并记录日志
            feature_extractor.conv_layers[layer_id].conv.bias.data = value
            logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
        # 如果变量名包含"weight"
        elif "weight" in name:
            # 检查值的形状是否与卷积层权重数据的形状相匹配,若不匹配则引发数值错误
            if value.shape != feature_extractor.conv_layers[layer_id].conv.weight.data.shape:
                raise ValueError(
                    f"{full_name} has size {value.shape}, but"
                    f" {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found."
                )
            # 将值赋给特征提取器的卷积层权重数据,并记录日志
            feature_extractor.conv_layers[layer_id].conv.weight.data = value
            logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
    
    # 如果类型ID为2且不使用组规范,或者类型ID为2且为第一层且使用组规范
    elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm):
        # 如果变量名包含"bias"
        if "bias" in name:
            # 检查值的形状是否与特征提取器的层归一化偏置数据的形状相匹配,若不匹配则引发数值错误
            if value.shape != feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape:
                raise ValueError(
                    f"{full_name} has size {value.shape}, but"
                    f" {feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape} was found."
                )
            # 将值赋给特征提取器的层归一化偏置数据,并记录日志
            feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value
            logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
        # 如果变量名包含"weight"
        elif "weight" in name:
            # 检查值的形状是否与特征提取器的层归一化权重数据的形状相匹配,若不匹配则引发数值错误
            if value.shape != feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape:
                raise ValueError(
                    f"{full_name} has size {value.shape}, but"
                    f" {feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape} was found."
                )
            # 将值赋给特征提取器的层归一化权重数据,并记录日志
            feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value
            logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
    
    # 如果以上条件都不满足,则将变量名添加到未使用的权重列表中
    else:
        unused_weights.append(full_name)
# 声明一个装饰器,表示在该函数执行时不需要计算梯度信息
@torch.no_grad()
# 定义一个函数,将 UniSpeech 模型的检查点转换为 Transformers 设计
def convert_unispeech_sat_checkpoint(
    checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True
):
    """
    Copy/paste/tweak model's weights to transformers design.
    """
    # 如果提供了配置文件路径,则使用预训练配置文件创建 UniSpeechSatConfig 对象
    if config_path is not None:
        config = UniSpeechSatConfig.from_pretrained(config_path)
    else:
        # 否则,创建一个空的 UniSpeechSatConfig 对象
        config = UniSpeechSatConfig()

    # 重置 dict_path 变量为空字符串
    dict_path = ""

    # 根据是否微调标志,选择不同类型的 UniSpeechSat 模型
    if is_finetuned:
        hf_wav2vec = UniSpeechSatForCTC(config)
    else:
        hf_wav2vec = UniSpeechSatForPreTraining(config)

    # 使用 fairseq 提供的工具加载模型集合和任务信息
    model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
        [checkpoint_path], arg_overrides={"data": "/".join(dict_path.split("/")[:-1])}
    )
    # 将加载的模型设置为评估模式,即不计算梯度
    model = model[0].eval()

    # 递归地加载模型的权重到 hf_wav2vec 模型中
    recursively_load_weights(model, hf_wav2vec)

    # 将转换后的 hf_wav2vec 模型保存到指定的 PyTorch 输出文件夹中
    hf_wav2vec.save_pretrained(pytorch_dump_folder_path)


# 如果该脚本作为主程序运行,则执行以下代码
if __name__ == "__main__":
    # 解析命令行参数
    parser = argparse.ArgumentParser()
    parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
    parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint")
    parser.add_argument("--dict_path", default=None, type=str, help="Path to dict of fine-tuned model")
    parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
    parser.add_argument(
        "--not_finetuned", action="store_true", help="Whether the model to convert is a fine-tuned model or not"
    )
    args = parser.parse_args()
    
    # 调用 convert_unispeech_sat_checkpoint 函数,并根据命令行参数决定是否微调模型
    convert_unispeech_sat_checkpoint(
        args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.dict_path, not args.not_finetuned
    )
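
下面是一个假设的等价 Python 调用(路径仅为占位),对应上面的命令行入口;注意 `is_finetuned` 与命令行中的 `--not_finetuned` 互为取反:

```
# 仅作示意:路径均为假设的占位值
convert_unispeech_sat_checkpoint(
    checkpoint_path="./fairseq_checkpoint.pt",     # 假设的 fairseq 检查点
    pytorch_dump_folder_path="./converted_model",  # 输出目录
    config_path="./config.json",                   # 可选的 HF 配置文件
    dict_path=None,
    is_finetuned=True,                             # 未传 --not_finetuned 时为 True
)
```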

.\models\unispeech_sat\modeling_unispeech_sat.py

# 指定编码方式为 UTF-8,确保脚本可以正确处理 Unicode 字符
# 版权声明,版权归 Fairseq 作者和 HuggingFace Inc. 团队所有,保留所有权利
#
# 根据 Apache 许可证版本 2.0 许可使用本文件,除非符合许可证的规定,否则不得使用本文件
# 您可以在以下网址获取许可证的副本:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意,本软件是基于"原样"提供的,不提供任何明示或暗示的担保或条件
# 有关特定语言的详细信息,请参阅许可证
""" PyTorch UniSpeechSat model."""

import math  # 导入数学模块
import warnings  # 导入警告模块
from dataclasses import dataclass  # 导入数据类装饰器
from typing import Optional, Tuple, Union  # 导入类型提示

import numpy as np  # 导入 NumPy 库
import torch  # 导入 PyTorch 库
import torch.utils.checkpoint  # 导入 PyTorch 检查点工具
from torch import nn  # 导入 PyTorch 神经网络模块
from torch.nn import CrossEntropyLoss  # 导入交叉熵损失函数

from ...activations import ACT2FN  # 导入激活函数
from ...integrations.deepspeed import is_deepspeed_zero3_enabled  # 导入深度加速相关函数
from ...modeling_outputs import (  # 导入模型输出类
    BaseModelOutput,
    CausalLMOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
    Wav2Vec2BaseModelOutput,
    XVectorOutput,
)
from ...modeling_utils import PreTrainedModel  # 导入预训练模型类
from ...utils import (  # 导入实用函数
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_peft_available,
    logging,
    replace_return_docstrings,
)
from .configuration_unispeech_sat import UniSpeechSatConfig  # 导入 UniSpeechSat 配置类


logger = logging.get_logger(__name__)  # 获取记录器

_HIDDEN_STATES_START_POSITION = 2  # 隐藏状态的起始位置

# 通用文档字符串
_CONFIG_FOR_DOC = "UniSpeechSatConfig"

# 基础文档字符串
_CHECKPOINT_FOR_DOC = "microsoft/unispeech-sat-base-100h-libri-ft"
_EXPECTED_OUTPUT_SHAPE = [1, 292, 768]

# CTC(连续文本转录)文档字符串
_CTC_EXPECTED_OUTPUT = "'MISTER QUILDER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'"
_CTC_EXPECTED_LOSS = 39.88

# 帧级分类文档字符串
_FRAME_CLASS_CHECKPOINT = "microsoft/unispeech-sat-base-plus-sd"
_FRAME_EXPECTED_OUTPUT = [0, 0]

# 说话人验证文档字符串
_XVECTOR_CHECKPOINT = "microsoft/unispeech-sat-base-plus-sv"
_XVECTOR_EXPECTED_OUTPUT = 0.97

UNISPEECH_SAT_PRETRAINED_MODEL_ARCHIVE_LIST = [
    # 查看所有 UniSpeechSat 模型的详细信息,请访问 https://huggingface.co/models?filter=unispeech_sat
]


@dataclass
class UniSpeechSatForPreTrainingOutput(ModelOutput):
    """
    [`UniSpeechSatForPreTrainingOutput`] 的输出类型,包括潜在的隐藏状态和注意力。
    """
    # 定义函数的参数和返回值的类型注解,以及可选的描述信息
    Args:
        loss (*optional*, returned when model is in train mode, `torch.FloatTensor` of shape `(1,)`):
            在训练模式下返回的总损失,为对比损失(L_m)与多样性损失(L_d)之和,详见官方论文 [https://arxiv.org/pdf/2006.11477.pdf]。
        projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
            模型隐藏状态投影到 `config.proj_codevector_dim` 维度,可用于预测掩码后的量化投影状态。
        projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
            量化提取的特征向量投影到 `config.proj_codevector_dim` 维度,代表对比损失的正样本向量。
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            一个元组,包含 `torch.FloatTensor` 类型的张量(一个用于嵌入层输出,每层一个用于层输出),形状为 `(batch_size, sequence_length, hidden_size)`。

            每层模型的隐藏状态以及初始嵌入层输出。
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            一个元组,包含 `torch.FloatTensor` 类型的张量(每层一个),形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。

            注意力机制 softmax 后的注意力权重,用于计算自注意力头部的加权平均值。
    """

    # 定义可选的变量,用于存储不同类型的模型输出
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    projected_states: torch.FloatTensor = None
    projected_quantized_states: torch.FloatTensor = None
    codevector_perplexity: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
# Copied from transformers.models.wav2vec2.modeling_wav2vec2._compute_mask_indices

def _compute_mask_indices(
    shape: Tuple[int, int],                            # 定义函数 _compute_mask_indices 的参数 shape,是一个二元组,表示输入的形状
    mask_prob: float,                                  # 概率参数,确定要屏蔽的轴的百分比
    mask_length: int,                                  # 屏蔽长度
    attention_mask: Optional[torch.LongTensor] = None,  # 可选的注意力掩码,用于在每个批次维度上独立地缩短特征轴
    min_masks: int = 0,                                # 最小屏蔽数量
) -> np.ndarray:                                       # 返回一个 NumPy 数组,表示生成的屏蔽索引

    """
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    """

    batch_size, sequence_length = shape               # 解包形状参数,分别得到批次大小和序列长度

    if mask_length < 1:
        raise ValueError("`mask_length` has to be bigger than 0.")  # 如果 mask_length 小于 1,抛出值错误异常

    if mask_length > sequence_length:
        raise ValueError(
            f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
            f" and `sequence_length`: {sequence_length}`"
        )                                               # 如果 mask_length 大于 sequence_length,抛出值错误异常

    # epsilon is used for probabilistic rounding
    epsilon = np.random.rand(1).item()                # 生成一个随机数作为 epsilon 用于概率舍入

    def compute_num_masked_span(input_length):
        """Given input length, compute how many spans should be masked"""
        num_masked_span = int(mask_prob * input_length / mask_length + epsilon)  # 计算应屏蔽的 span 数量
        num_masked_span = max(num_masked_span, min_masks)  # 确保屏蔽的 span 数量不小于 min_masks

        # make sure num masked span <= sequence_length
        if num_masked_span * mask_length > sequence_length:
            num_masked_span = sequence_length // mask_length  # 确保屏蔽的 span 数量不超过 sequence_length

        # make sure num_masked span is also <= input_length - (mask_length - 1)
        if input_length - (mask_length - 1) < num_masked_span:
            num_masked_span = max(input_length - (mask_length - 1), 0)  # 确保屏蔽的 span 数量不超过 input_length - (mask_length - 1)

        return num_masked_span

    # compute number of masked spans in batch
    input_lengths = (
        attention_mask.sum(-1).detach().tolist()      # 如果 attention_mask 存在,计算每个批次维度的特征轴的长度并转换为列表
        if attention_mask is not None
        else [sequence_length for _ in range(batch_size)]  # 否则,默认为每个批次维度都是 sequence_length
    )

    # SpecAugment mask to fill
    spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)  # 创建一个全为 False 的布尔数组,形状为 (batch_size, sequence_length)
    spec_aug_mask_idxs = []                         # 初始化用于存储屏蔽索引的列表

    max_num_masked_span = compute_num_masked_span(sequence_length)  # 计算序列长度中的最大屏蔽 span 数量
    # 如果最大被遮蔽跨度为0,则直接返回特定的遮蔽掩码
    if max_num_masked_span == 0:
        return spec_aug_mask

    # 对每个输入长度进行循环处理
    for input_length in input_lengths:
        # 计算当前输入的被遮蔽跨度的数量
        num_masked_span = compute_num_masked_span(input_length)

        # 随机选择要遮蔽的索引位置
        spec_aug_mask_idx = np.random.choice(
            np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
        )

        # 如果没有选择任何索引,则说明输入长度严格小于序列长度,
        # 此时最后一个标记必须是填充标记,我们将其作为虚拟遮蔽标识符
        if len(spec_aug_mask_idx) == 0:
            dummy_mask_idx = sequence_length - 1
        else:
            dummy_mask_idx = spec_aug_mask_idx[0]

        # 将虚拟遮蔽标识符添加到遮蔽索引列表中,以保证所有批次具有相同的维度
        spec_aug_mask_idx = np.concatenate(
            [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
        )
        spec_aug_mask_idxs.append(spec_aug_mask_idx)

    # 将遮蔽索引列表转换为NumPy数组
    spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)

    # 将遮蔽索引扩展为遮蔽跨度
    spec_aug_mask_idxs = np.broadcast_to(
        spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
    )
    spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)

    # 添加偏移量以创建遮蔽跨度的起始索引
    offsets = np.arange(mask_length)[None, None, :]
    offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
        batch_size, max_num_masked_span * mask_length
    )
    spec_aug_mask_idxs = spec_aug_mask_idxs + offsets

    # 确保索引不超过序列长度
    if spec_aug_mask_idxs.max() > sequence_length - 1:
        spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1

    # 在遮蔽掩码中根据索引设置遮蔽
    np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)

    # 返回生成的遮蔽掩码
    return spec_aug_mask
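
下面用一组小规模参数调用上面的 `_compute_mask_indices`(仅作示意),观察生成的 SpecAugment 掩码的形状和含义:

```
# 仅作示意:batch_size=2、序列长度 20、掩码概率 0.3、掩码长度 4
mask = _compute_mask_indices(shape=(2, 20), mask_prob=0.3, mask_length=4, min_masks=1)
print(mask.shape)         # (2, 20),布尔数组,True 表示被遮蔽
print(mask.sum(axis=-1))  # 每个样本被遮蔽的帧数(跨度可能重叠,因此不超过理论上界)
```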
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2NoLayerNormConvLayer 复制而来,修改为 UniSpeechSatNoLayerNormConvLayer
class UniSpeechSatNoLayerNormConvLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()
        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1  # 获取输入通道维度
        self.out_conv_dim = config.conv_dim[layer_id]  # 获取输出通道维度

        # 定义一维卷积层
        self.conv = nn.Conv1d(
            self.in_conv_dim,
            self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],  # 卷积核大小
            stride=config.conv_stride[layer_id],      # 卷积步长
            bias=config.conv_bias,                    # 是否使用偏置
        )
        self.activation = ACT2FN[config.feat_extract_activation]  # 激活函数

    def forward(self, hidden_states):
        hidden_states = self.conv(hidden_states)  # 执行卷积操作
        hidden_states = self.activation(hidden_states)  # 应用激活函数
        return hidden_states


# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2LayerNormConvLayer 复制而来,修改为 UniSpeechSatLayerNormConvLayer
class UniSpeechSatLayerNormConvLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()
        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1  # 获取输入通道维度
        self.out_conv_dim = config.conv_dim[layer_id]  # 获取输出通道维度

        # 定义一维卷积层
        self.conv = nn.Conv1d(
            self.in_conv_dim,
            self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],  # 卷积核大小
            stride=config.conv_stride[layer_id],      # 卷积步长
            bias=config.conv_bias,                    # 是否使用偏置
        )
        self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True)  # 层归一化
        self.activation = ACT2FN[config.feat_extract_activation]  # 激活函数

    def forward(self, hidden_states):
        hidden_states = self.conv(hidden_states)  # 执行卷积操作

        hidden_states = hidden_states.transpose(-2, -1)  # 转置操作
        hidden_states = self.layer_norm(hidden_states)  # 执行层归一化
        hidden_states = hidden_states.transpose(-2, -1)  # 转置操作

        hidden_states = self.activation(hidden_states)  # 应用激活函数
        return hidden_states


# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2GroupNormConvLayer 复制而来,修改为 UniSpeechSatGroupNormConvLayer
class UniSpeechSatGroupNormConvLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()
        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1  # 获取输入通道维度
        self.out_conv_dim = config.conv_dim[layer_id]  # 获取输出通道维度

        # 定义一维卷积层
        self.conv = nn.Conv1d(
            self.in_conv_dim,
            self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],  # 卷积核大小
            stride=config.conv_stride[layer_id],      # 卷积步长
            bias=config.conv_bias,                    # 是否使用偏置
        )
        self.activation = ACT2FN[config.feat_extract_activation]  # 激活函数

        self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True)  # 分组归一化

    def forward(self, hidden_states):
        hidden_states = self.conv(hidden_states)  # 执行卷积操作
        hidden_states = self.layer_norm(hidden_states)  # 执行分组归一化
        hidden_states = self.activation(hidden_states)  # 应用激活函数
        return hidden_states
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2PositionalConvEmbedding 复制并修改为 UniSpeechSatPositionalConvEmbedding
class UniSpeechSatPositionalConvEmbedding(nn.Module):
    def __init__(self, config):
        super().__init__()
        
        # 创建一个一维卷积层,用于位置编码
        self.conv = nn.Conv1d(
            config.hidden_size,
            config.hidden_size,
            kernel_size=config.num_conv_pos_embeddings,
            padding=config.num_conv_pos_embeddings // 2,
            groups=config.num_conv_pos_embedding_groups,
        )
        
        # 设置权重归一化方法为 weight_norm
        weight_norm = nn.utils.weight_norm
        if hasattr(nn.utils.parametrizations, "weight_norm"):
            weight_norm = nn.utils.parametrizations.weight_norm
        
        # 如果启用了 deepspeed zero3 加速,使用 gathered parameter 和 weight normalization
        if is_deepspeed_zero3_enabled():
            import deepspeed
            
            # 使用 deepspeed 的 gathered parameter 来管理权重
            with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):
                self.conv = weight_norm(self.conv, name="weight", dim=2)
            deepspeed.zero.register_external_parameter(self, self.conv.weight_v)
            deepspeed.zero.register_external_parameter(self, self.conv.weight_g)
        else:
            # 否则使用常规的 weight normalization
            self.conv = weight_norm(self.conv, name="weight", dim=2)
        
        # 创建一个用于填充的层,用于处理卷积后的输出
        self.padding = UniSpeechSatSamePadLayer(config.num_conv_pos_embeddings)
        
        # 激活函数由配置中的 feat_extract_activation 决定
        self.activation = ACT2FN[config.feat_extract_activation]

    def forward(self, hidden_states):
        # 调整输入张量的维度顺序以适应卷积层的要求
        hidden_states = hidden_states.transpose(1, 2)
        
        # 通过卷积层进行位置编码
        hidden_states = self.conv(hidden_states)
        
        # 使用填充层处理卷积后的张量
        hidden_states = self.padding(hidden_states)
        
        # 应用激活函数
        hidden_states = self.activation(hidden_states)
        
        # 调整输出张量的维度顺序,返回最终的隐藏状态张量
        hidden_states = hidden_states.transpose(1, 2)
        return hidden_states


# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2SamePadLayer 复制并修改为 UniSpeechSatSamePadLayer
class UniSpeechSatSamePadLayer(nn.Module):
    def __init__(self, num_conv_pos_embeddings):
        super().__init__()
        
        # 根据 num_conv_pos_embeddings 的奇偶性确定要移除的填充数目
        self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0

    def forward(self, hidden_states):
        # 如果需要移除填充,则截取相应长度的张量
        if self.num_pad_remove > 0:
            hidden_states = hidden_states[:, :, : -self.num_pad_remove]
        return hidden_states
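
下面用一个独立的小例子(仅作示意)说明为什么需要这个裁剪:当卷积核为偶数且 `padding = kernel_size // 2` 时,`nn.Conv1d` 的输出会比输入多一帧,`SamePadLayer` 负责把多出的最后一帧去掉。

```
import torch
from torch import nn

conv = nn.Conv1d(4, 4, kernel_size=128, padding=64)  # 偶数卷积核,padding 为其一半
x = torch.randn(1, 4, 200)
y = conv(x)
print(y.shape)            # torch.Size([1, 4, 201]),比输入多一帧
print(y[..., :-1].shape)  # torch.Size([1, 4, 200]),裁掉最后一帧后与输入等长
```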


# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder 复制并修改为 UniSpeechSatFeatureEncoder
class UniSpeechSatFeatureEncoder(nn.Module):
    """从原始音频波形构建特征"""
    # 初始化函数,接受一个配置参数对象 config
    def __init__(self, config):
        # 调用父类的初始化方法
        super().__init__()

        # 根据配置参数选择不同的特征提取层规范化方式
        if config.feat_extract_norm == "group":
            # 如果配置为 "group",创建一个包含 GroupNorm 的卷积层列表
            conv_layers = [UniSpeechSatGroupNormConvLayer(config, layer_id=0)] + [
                UniSpeechSatNoLayerNormConvLayer(config, layer_id=i + 1)
                for i in range(config.num_feat_extract_layers - 1)
            ]
        elif config.feat_extract_norm == "layer":
            # 如果配置为 "layer",创建一个包含 LayerNorm 的卷积层列表
            conv_layers = [
                UniSpeechSatLayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)
            ]
        else:
            # 如果配置不是预期的 "group" 或 "layer",抛出数值错误异常
            raise ValueError(
                f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
            )
        
        # 将卷积层列表转换为模块列表,并存储在 self.conv_layers 中
        self.conv_layers = nn.ModuleList(conv_layers)
        
        # 初始化梯度检查点标志为 False
        self.gradient_checkpointing = False
        
        # 初始化需要梯度计算标志为 True
        self._requires_grad = True

    # 冻结模型参数的函数
    def _freeze_parameters(self):
        # 遍历模型的所有参数,并将其 requires_grad 属性设为 False
        for param in self.parameters():
            param.requires_grad = False
        
        # 将模型的自定义需要梯度计算标志设为 False
        self._requires_grad = False

    # 前向传播函数
    def forward(self, input_values):
        # 将输入值的维度扩展为 (batch_size, 1, ...),用于卷积层的输入
        hidden_states = input_values[:, None]

        # 如果模型需要梯度计算且处于训练状态,确保 hidden_states 需要梯度
        if self._requires_grad and self.training:
            hidden_states.requires_grad = True

        # 遍历所有卷积层,对 hidden_states 进行卷积操作
        for conv_layer in self.conv_layers:
            # 如果模型需要梯度计算且启用了梯度检查点功能且处于训练状态
            if self._requires_grad and self.gradient_checkpointing and self.training:
                # 使用梯度检查点函数来执行卷积操作
                hidden_states = self._gradient_checkpointing_func(
                    conv_layer.__call__,  # 调用卷积层的 __call__ 方法
                    hidden_states,  # 当前的 hidden_states
                )
            else:
                # 直接调用卷积层进行前向传播计算
                hidden_states = conv_layer(hidden_states)

        # 返回最终的隐藏状态结果
        return hidden_states
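
特征编码器中每个卷积层(无 padding)都会按 `(L - kernel) // stride + 1` 的规则缩短时间维度。下面是一个示意性的计算(假设使用默认的 conv_kernel 与 conv_stride),估算 1 秒 16kHz 音频对应的特征帧数:

```
# 仅作示意:按默认卷积核与步幅逐层计算输出长度
def feat_extract_output_length(input_length, kernels=(10, 3, 3, 3, 3, 2, 2), strides=(5, 2, 2, 2, 2, 2, 2)):
    for k, s in zip(kernels, strides):
        input_length = (input_length - k) // s + 1
    return input_length

print(feat_extract_output_length(16000))  # 49,即 1 秒音频约对应 49 帧特征
```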
class UniSpeechSatFeatureExtractor(UniSpeechSatFeatureEncoder):
    # 继承自UniSpeechSatFeatureEncoder类,用于提取特征
    def __init__(self, config):
        # 调用父类构造函数初始化
        super().__init__(config)
        # 发出警告,提醒该类已被弃用,将在Transformers v5中移除,建议使用父类UniSpeechSatFeatureEncoder
        warnings.warn(
            f"The class `{self.__class__.__name__}` has been depreciated "
            "and will be removed in Transformers v5. "
            f"Use `{self.__class__.__bases__[0].__name__}` instead.",
            FutureWarning,
        )


# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureProjection with Wav2Vec2->UniSpeechSat
class UniSpeechSatFeatureProjection(nn.Module):
    # 用于特征投影的类,继承自nn.Module
    def __init__(self, config):
        # 初始化函数
        super().__init__()
        # LayerNorm层,用于归一化最后一个卷积维度的特征
        self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps)
        # 线性映射层,将卷积维度映射到隐藏层维度
        self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size)
        # Dropout层,用于特征投影的dropout操作
        self.dropout = nn.Dropout(config.feat_proj_dropout)

    def forward(self, hidden_states):
        # 对未投影的隐藏状态执行LayerNorm操作
        norm_hidden_states = self.layer_norm(hidden_states)
        # 执行特征投影,将归一化后的隐藏状态映射到隐藏层维度
        hidden_states = self.projection(norm_hidden_states)
        # 应用dropout操作
        hidden_states = self.dropout(hidden_states)
        return hidden_states, norm_hidden_states


# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->UniSpeechSat
class UniSpeechSatAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        is_causal: bool = False,
        config: Optional[UniSpeechSatConfig] = None,
    ):
        # 初始化函数
        super().__init__()
        # 设置注意力机制的维度和头数
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        # 设置dropout概率
        self.dropout = dropout
        # 计算每个头的维度
        self.head_dim = embed_dim // num_heads
        # 存储配置
        self.config = config

        # 检查embed_dim必须能被num_heads整除
        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        # 设置缩放因子
        self.scaling = self.head_dim**-0.5
        # 设置是否为解码器注意力
        self.is_decoder = is_decoder
        # 设置是否为因果注意力
        self.is_causal = is_causal

        # 线性映射层,用于查询、键、值的投影
        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        # 输出映射层,用于最终输出的线性映射
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        # 对张量进行形状变换,用于多头注意力计算
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeedForward with Wav2Vec2->UniSpeechSat
class UniSpeechSatFeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 使用给定的激活函数的 dropout
        self.intermediate_dropout = nn.Dropout(config.activation_dropout)

        # 中间层的全连接层,输入维度是 hidden_size,输出维度是 intermediate_size
        self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # 根据配置选择相应的激活函数
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

        # 输出层的全连接层,输入维度是 intermediate_size,输出维度是 hidden_size
        self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size)
        # 输出层的 dropout
        self.output_dropout = nn.Dropout(config.hidden_dropout)

    def forward(self, hidden_states):
        # 中间层的全连接操作
        hidden_states = self.intermediate_dense(hidden_states)
        # 中间层的激活函数
        hidden_states = self.intermediate_act_fn(hidden_states)
        # 中间层的 dropout
        hidden_states = self.intermediate_dropout(hidden_states)

        # 输出层的全连接操作
        hidden_states = self.output_dense(hidden_states)
        # 输出层的 dropout
        hidden_states = self.output_dropout(hidden_states)
        return hidden_states


# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2EncoderLayer with Wav2Vec2->UniSpeechSat
class UniSpeechSatEncoderLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        # UniSpeechSatEncoderLayer 中使用的自定义注意力层
        self.attention = UniSpeechSatAttention(
            embed_dim=config.hidden_size,
            num_heads=config.num_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=False,
        )
        # Encoder 层的 dropout
        self.dropout = nn.Dropout(config.hidden_dropout)
        # Layer normalization 层,输入维度是 hidden_size
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # FeedForward 层,使用 UniSpeechSatFeedForward 初始化
        self.feed_forward = UniSpeechSatFeedForward(config)
        # 最终的 layer normalization 层
        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states, attention_mask=None, output_attentions=False):
        # 注意力层前的残差连接
        attn_residual = hidden_states
        # 调用注意力层的前向传播
        hidden_states, attn_weights, _ = self.attention(
            hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
        )
        # 对注意力输出进行 dropout
        hidden_states = self.dropout(hidden_states)
        # 残差连接加上注意力输出
        hidden_states = attn_residual + hidden_states

        # Layer normalization
        hidden_states = self.layer_norm(hidden_states)
        # 加上 FeedForward 层的输出
        hidden_states = hidden_states + self.feed_forward(hidden_states)
        # 最终的 layer normalization
        hidden_states = self.final_layer_norm(hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2AttnAdapterLayer with Wav2Vec2->UniSpeechSat
class UniSpeechSatAttnAdapterLayer(nn.Module):
    # 这里省略部分代码
    def __init__(self, config):
        """
        Implements adapter modules directly with 3D tensor weight as parameters and without using ModuleList to speed
        up training throughput.
        """
        # 调用父类的初始化方法
        super().__init__()
        # 从配置中获取适配器注意力维度和隐藏层大小
        self.input_dim = config.adapter_attn_dim
        self.hidden_dim = config.hidden_size

        # 初始化层归一化层
        self.norm = nn.LayerNorm(self.hidden_dim)
        # 初始化线性层1,将隐藏状态映射到适配器注意力维度
        self.linear_1 = nn.Linear(self.hidden_dim, self.input_dim)
        # 初始化激活函数ReLU
        self.act_fn = nn.ReLU()
        # 初始化线性层2,将适配器注意力维度映射回隐藏层大小
        self.linear_2 = nn.Linear(self.input_dim, self.hidden_dim)

    def forward(self, hidden_states: torch.FloatTensor):
        # 应用层归一化到隐藏状态
        hidden_states = self.norm(hidden_states)

        # 应用线性层1
        hidden_states = self.linear_1(hidden_states)
        # 应用ReLU激活函数
        hidden_states = self.act_fn(hidden_states)
        # 应用线性层2
        hidden_states = self.linear_2(hidden_states)

        # 返回处理后的隐藏状态
        return hidden_states
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2EncoderLayerStableLayerNorm 复制并修改为 UniSpeechSatEncoderLayerStableLayerNorm 类
class UniSpeechSatEncoderLayerStableLayerNorm(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 初始化注意力机制,使用 UniSpeechSatAttention 类
        self.attention = UniSpeechSatAttention(
            embed_dim=config.hidden_size,
            num_heads=config.num_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=False,
        )
        # 初始化 dropout 层
        self.dropout = nn.Dropout(config.hidden_dropout)
        # 初始化层归一化层
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # 初始化前馈神经网络层,使用 UniSpeechSatFeedForward 类
        self.feed_forward = UniSpeechSatFeedForward(config)
        # 初始化最终层归一化层
        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        # 如果配置中定义了 adapter_attn_dim 属性,则初始化适配器层
        if getattr(config, "adapter_attn_dim", None) is not None:
            self.adapter_layer = UniSpeechSatAttnAdapterLayer(config)
        else:
            self.adapter_layer = None

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ):
        # 复制隐藏状态以用于注意力残差连接
        attn_residual = hidden_states
        # 应用层归一化到隐藏状态
        hidden_states = self.layer_norm(hidden_states)
        # 使用注意力层计算新的隐藏状态、注意力权重,并可能返回注意力权重
        hidden_states, attn_weights, _ = self.attention(
            hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
        )
        # 应用 dropout 到隐藏状态
        hidden_states = self.dropout(hidden_states)
        # 添加注意力残差到新的隐藏状态
        hidden_states = attn_residual + hidden_states
        # 添加前馈神经网络到最终归一化的隐藏状态
        hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states))

        # 如果存在适配器层,则将其应用到隐藏状态
        if self.adapter_layer is not None:
            hidden_states = hidden_states + self.adapter_layer(hidden_states)

        # 构建输出元组
        outputs = (hidden_states,)

        # 如果需要输出注意力权重,则将其添加到输出元组中
        if output_attentions:
            outputs += (attn_weights,)

        # 返回输出元组
        return outputs


# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Encoder 复制并修改为 UniSpeechSatEncoder 类
class UniSpeechSatEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 存储配置
        self.config = config
        # 初始化位置卷积嵌入层,使用 UniSpeechSatPositionalConvEmbedding 类
        self.pos_conv_embed = UniSpeechSatPositionalConvEmbedding(config)
        # 初始化层归一化层
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # 初始化 dropout 层
        self.dropout = nn.Dropout(config.hidden_dropout)
        # 初始化编码器层列表,每层使用 UniSpeechSatEncoderLayer 类
        self.layers = nn.ModuleList([UniSpeechSatEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        # 默认关闭梯度检查点
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
        ):
            # 初始化隐藏状态和自注意力向量的存储,根据需要选择是否输出
            all_hidden_states = () if output_hidden_states else None
            all_self_attentions = () if output_attentions else None

            # 如果存在注意力掩码,则确保填充的令牌输出为0
            if attention_mask is not None:
                expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
                hidden_states[~expand_attention_mask] = 0

                # 扩展注意力掩码以匹配模型输出维度
                attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)
                attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min
                attention_mask = attention_mask.expand(
                    attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]
                )

            # 计算位置嵌入
            position_embeddings = self.pos_conv_embed(hidden_states)
            hidden_states = hidden_states + position_embeddings
            hidden_states = self.layer_norm(hidden_states)
            hidden_states = self.dropout(hidden_states)

            # 检查是否启用了DeepSpeed zero3
            deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled()

            # 遍历每一层
            for layer in self.layers:
                if output_hidden_states:
                    # 如果需要输出隐藏状态,则记录当前层的隐藏状态
                    all_hidden_states = all_hidden_states + (hidden_states,)

                # 应用LayerDrop技术(参见论文https://arxiv.org/abs/1909.11556)
                dropout_probability = torch.rand([])
                skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False

                # 如果不跳过当前层或者启用了DeepSpeed zero3
                if not skip_the_layer or deepspeed_zero3_is_enabled:
                    # 使用梯度检查点函数来计算当前层的输出(仅在训练时)
                    if self.gradient_checkpointing and self.training:
                        layer_outputs = self._gradient_checkpointing_func(
                            layer.__call__,
                            hidden_states,
                            attention_mask,
                            output_attentions,
                        )
                    else:
                        layer_outputs = layer(
                            hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
                        )
                    hidden_states = layer_outputs[0]

                # 如果跳过当前层,则输出设为None
                if skip_the_layer:
                    layer_outputs = (None, None)

                # 如果需要输出自注意力向量,则记录当前层的自注意力向量
                if output_attentions:
                    all_self_attentions = all_self_attentions + (layer_outputs[1],)

            # 如果需要输出隐藏状态,则记录最终的隐藏状态
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # 根据return_dict的值返回模型输出
            if not return_dict:
                return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
            return BaseModelOutput(
                last_hidden_state=hidden_states,
                hidden_states=all_hidden_states,
                attentions=all_self_attentions,
            )
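
编码器中把 0/1 的 padding 掩码转换成加性注意力掩码的做法可以用下面的小例子(仅作示意)来理解:有效位置变成 0,填充位置变成该 dtype 的最小值,加到注意力分数上后经 softmax 即被忽略。

```
import torch

attention_mask = torch.tensor([[1, 1, 1, 0]])                # 1 表示有效帧,0 表示填充
dtype = torch.float32
inverted = 1.0 - attention_mask[:, None, None, :].to(dtype)  # 填充位置为 1,有效位置为 0
additive = inverted * torch.finfo(dtype).min                 # 填充位置变为极小负数
print(additive.shape)  # torch.Size([1, 1, 1, 4])
print(additive)
```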
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2EncoderStableLayerNorm 复制而来,修改为 UniSpeechSatEncoderStableLayerNorm
class UniSpeechSatEncoderStableLayerNorm(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config  # 初始化模型配置信息
        self.pos_conv_embed = UniSpeechSatPositionalConvEmbedding(config)  # 使用 UniSpeechSatPositionalConvEmbedding 初始化位置卷积嵌入层
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)  # 初始化层归一化层
        self.dropout = nn.Dropout(config.hidden_dropout)  # 初始化 dropout 层,用于随机失活
        # 使用 UniSpeechSatEncoderLayerStableLayerNorm 复制 config.num_hidden_layers 次,形成编码器层列表
        self.layers = nn.ModuleList(
            [UniSpeechSatEncoderLayerStableLayerNorm(config) for _ in range(config.num_hidden_layers)]
        )
        self.gradient_checkpointing = False  # 初始化梯度检查点标志为 False

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        if attention_mask is not None:
            # 确保不对填充的 token 进行注意力计算
            expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
            hidden_states[~expand_attention_mask] = 0

            # 扩展 attention_mask
            attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)
            attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min
            attention_mask = attention_mask.expand(
                attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]
            )

        # 使用位置卷积嵌入层对隐藏状态进行处理
        position_embeddings = self.pos_conv_embed(hidden_states)
        hidden_states = hidden_states + position_embeddings
        hidden_states = self.dropout(hidden_states)

        # 检查是否启用了 DeepSpeed Zero3
        deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled()

        # 遍历每一层进行 Transformer 的前向传播
        for layer in self.layers:
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # 添加 LayerDrop(参考 https://arxiv.org/abs/1909.11556)
            dropout_probability = torch.rand([])

            # 根据 LayerDrop 的概率决定是否跳过当前层
            skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False
            if not skip_the_layer or deepspeed_zero3_is_enabled:
                # 如果启用了梯度检查点和处于训练模式,则使用梯度检查点技术来计算层的输出
                if self.gradient_checkpointing and self.training:
                    layer_outputs = self._gradient_checkpointing_func(
                        layer.__call__,
                        hidden_states,
                        attention_mask,
                        output_attentions,
                    )
                else:
                    # 否则直接调用层的 __call__ 方法计算输出
                    layer_outputs = layer(
                        hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
                    )
                hidden_states = layer_outputs[0]

            # 如果跳过当前层,则输出设为 None
            if skip_the_layer:
                layer_outputs = (None, None)

            # 如果需要输出自注意力机制的结果,则记录每层的注意力矩阵
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        # 对最终的隐藏状态进行 LayerNorm 处理
        hidden_states = self.layer_norm(hidden_states)

        # 如果需要输出所有隐藏状态,则记录最终的隐藏状态
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        # 根据返回值的类型,返回不同的结果格式
        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )
# 定义一个使用 Gumbel softmax 进行向量量化的类,详见[CATEGORICAL REPARAMETERIZATION WITH GUMBEL-SOFTMAX](https://arxiv.org/pdf/1611.01144.pdf)
class UniSpeechSatGumbelVectorQuantizer(nn.Module):
    """
    Vector quantization using gumbel softmax. See [CATEGORICAL REPARAMETERIZATION WITH
    GUMBEL-SOFTMAX](https://arxiv.org/pdf/1611.01144.pdf) for more information.
    """

    def __init__(self, config):
        super().__init__()
        self.num_groups = config.num_codevector_groups  # 从配置中获取码矢组数
        self.num_vars = config.num_codevectors_per_group  # 从配置中获取每组的码矢数

        # 确保码矢的维度能够被码矢组数整除
        if config.codevector_dim % self.num_groups != 0:
            raise ValueError(
                f"`config.codevector_dim {config.codevector_dim} must be divisible by `config.num_codevector_groups`"
                f" {self.num_groups} for concatenation"
            )

        # 存储码矢变量(码字)的容器
        self.codevectors = nn.Parameter(
            torch.FloatTensor(1, self.num_groups * self.num_vars, config.codevector_dim // self.num_groups)
        )
        # 权重投影层,将隐藏状态投影到码矢变量空间
        self.weight_proj = nn.Linear(config.hidden_size, self.num_groups * self.num_vars)

        # 训练中可以衰减的温度参数
        self.temperature = 2

    @staticmethod
    def _compute_perplexity(probs, mask=None):
        # 计算概率分布的复杂度(perplexity)
        marginal_probs = probs.mean(dim=0)
        perplexity = torch.exp(-torch.sum(marginal_probs * torch.log(marginal_probs + 1e-7), dim=-1)).sum()
        return perplexity
    def forward(self, hidden_states):
        # 获取输入张量的批大小、序列长度和隐藏大小
        batch_size, sequence_length, hidden_size = hidden_states.shape

        # 将隐藏状态投影到编码向量维度
        hidden_states = self.weight_proj(hidden_states)
        hidden_states = hidden_states.view(batch_size * sequence_length * self.num_groups, -1)

        if self.training:
            # 在可微分的方式中使用 Gumbel Softmax 对隐藏状态进行采样,生成编码向量的概率分布
            codevector_probs = nn.functional.gumbel_softmax(
                hidden_states.float(), tau=self.temperature, hard=True
            ).type_as(hidden_states)

            # 计算 perplexity(复杂度指数)
            codevector_soft_dist = torch.softmax(
                hidden_states.view(batch_size * sequence_length, self.num_groups, -1).float(), dim=-1
            )
            perplexity = self._compute_perplexity(codevector_soft_dist)
        else:
            # 在非可微分的方式中取最大值,生成硬编码向量分布(one-hot)
            codevector_idx = hidden_states.argmax(dim=-1)
            codevector_probs = hidden_states.new_zeros(*hidden_states.shape).scatter_(
                -1, codevector_idx.view(-1, 1), 1.0
            )
            codevector_probs = codevector_probs.view(batch_size * sequence_length, self.num_groups, -1)

            perplexity = self._compute_perplexity(codevector_probs)

        # 将编码向量的概率分布重新调整形状,用于检索编码向量
        codevector_probs = codevector_probs.view(batch_size * sequence_length, -1)
        # 使用概率分布检索编码向量
        codevectors_per_group = codevector_probs.unsqueeze(-1) * self.codevectors
        codevectors = codevectors_per_group.view(batch_size * sequence_length, self.num_groups, self.num_vars, -1)
        codevectors = codevectors.sum(-2).view(batch_size, sequence_length, -1)

        # 返回编码向量和 perplexity
        return codevectors, perplexity
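
下面用一段最小示意(超参数尺寸均为假设值)演示上述量化器在训练态的核心步骤:先用 Gumbel-Softmax 得到近似 one-hot 且可微的分布,再据此加权检索每组码矢并按组拼接:

```python
import torch
import torch.nn as nn

# 假设的小规模超参数:num_groups=2, num_vars=4, codevector_dim=8
batch, seq, num_groups, num_vars, dim = 2, 3, 2, 4, 8
codevectors = torch.randn(1, num_groups * num_vars, dim // num_groups)
logits = torch.randn(batch * seq * num_groups, num_vars)  # 相当于 weight_proj 的输出

# 训练态:Gumbel-Softmax 得到近似 one-hot 且可微的分布
codevector_probs = nn.functional.gumbel_softmax(logits, tau=2.0, hard=True)

# 用该分布加权检索每组的码矢,再按组拼接成最终输出
codevector_probs = codevector_probs.view(batch * seq, -1)
codevectors_per_group = codevector_probs.unsqueeze(-1) * codevectors
codevectors_out = codevectors_per_group.view(batch * seq, num_groups, num_vars, -1).sum(-2)
codevectors_out = codevectors_out.view(batch, seq, -1)
print(codevectors_out.shape)  # torch.Size([2, 3, 8])
```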

# 定义 UniSpeechSat 预训练模型的抽象基类,用于处理权重初始化和下载/加载预训练模型的简单接口
class UniSpeechSatPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """
# 配置类,用于加载模型的配置信息
config_class = UniSpeechSatConfig
# 模型前缀,用于关键参数的命名
base_model_prefix = "unispeech_sat"
# 主要输入名称,用于模型输入的指定
main_input_name = "input_values"
# 支持梯度检查点的保存和恢复功能
supports_gradient_checkpointing = True

# 初始化权重的方法
def _init_weights(self, module):
    """初始化模型权重"""
    # Gumbel Softmax 在初始化权重时需要特殊处理
    if isinstance(module, UniSpeechSatGumbelVectorQuantizer):
        # 初始化权重和偏置,确保分布满足模型需求
        module.weight_proj.weight.data.normal_(mean=0.0, std=1)
        module.weight_proj.bias.data.zero_()
        # 对码矢参数 codevectors 使用均匀分布初始化
        nn.init.uniform_(module.codevectors)
    # 卷积嵌入位置组件的初始化
    elif isinstance(module, UniSpeechSatPositionalConvEmbedding):
        # 正态分布初始化卷积权重,确保权重满足模型结构要求
        nn.init.normal_(
            module.conv.weight,
            mean=0,
            std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
        )
        # 初始化偏置项为0
        nn.init.constant_(module.conv.bias, 0)
    # 特定特征处理的投影组件的初始化
    elif isinstance(module, UniSpeechSatFeatureProjection):
        # 分布均匀初始化权重,确保范围符合模型设计
        k = math.sqrt(1 / module.projection.in_features)
        nn.init.uniform_(module.projection.weight, a=-k, b=k)
        # 同样初始化偏置项,采用均匀分布
        nn.init.uniform_(module.projection.bias, a=-k, b=k)
    # 普通线性层(nn.Linear)的权重初始化
    elif isinstance(module, nn.Linear):
        # 正态分布初始化权重,初始化范围可根据config参数自定义
        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        # 偏置项设为0
        if module.bias is not None:
            module.bias.data.zero_()
    # 对于标准化操作,初始化各项为0和1,分别对应偏置和权重的情况
    elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
        module.bias.data.zero_()
        module.weight.data.fill_(1.0)
    # 对于一维卷积层,使用高斯分布进行权重初始化
    elif isinstance(module, nn.Conv1d):
        nn.init.kaiming_normal_(
            # 1D卷积层的权重初始化通常使用kaiming方法,保证非线性组件的有效激活
            module.weight
        )
        # 初始化偏置项时,可根据特定规则(通常是与权重初始化一致或特殊化处理)进行
        if module.bias is not None:
            k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
            nn.init.uniform_(module.bias, a=-k, b=k)

# 计算卷积层输出长度的方法
def _get_feat_extract_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
    """
    根据输入长度计算卷积层的输出长度
    """
    # 定义用于输出长度计算的函数,以下是关键公式的应用示例。
    def _conv_out_length(input_length, kernel_size, stride):
        # 计算1D卷积层输出长度的方法,通常用于响应某类特定结构(如神经网络)输入输出长度变换的需要。
        return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1

    # 遍历模型配置中的核参数和步长,逐个计算同时考虑到多组参数时的输出长度。
    for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
        input_lengths = _conv_out_length(input_lengths, kernel_size, stride)

    # 返回最终的输出长度信息,这通常用于指导后续层的结构或绑定输入与输出长度关系的简化。
    return input_lengths
# 根据特征向量长度和原始 attention_mask,计算卷积特征级别的注意力掩码(方法签名按 wav2vec2 中的同名方法补全)
def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: torch.LongTensor):
    # 计算每个样本非填充部分的长度,即每个样本中有效部分的长度
    non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]

    # 根据非填充长度计算特征提取器的输出长度,转换为长整型
    output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths).to(torch.long)

    # 获取当前批次的大小
    batch_size = attention_mask.shape[0]

    # 初始化一个全零张量,用于构建注意力掩码
    attention_mask = torch.zeros(
        (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
    )

    # 将每个样本输出长度对应的最后一个有效位置设为 1,确保该位置之前的所有帧都被注意到
    attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1

    # 反转张量并在最后一个维度上累加,然后再次反转,最终转换为布尔类型
    attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()

    # 返回生成的注意力掩码张量
    return attention_mask
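
下面的示意(卷积配置为假设的 wav2vec2 风格默认值)演示了上面两个辅助方法的配合:先由原始波形的有效采样点数推出卷积特征的帧数,再用 scatter + flip/cumsum/flip 的技巧得到特征级别的注意力掩码:

```python
import torch

# 假设的卷积配置(wav2vec2 风格的默认 kernel/stride)
conv_kernel = (10, 3, 3, 3, 3, 2, 2)
conv_stride = (5, 2, 2, 2, 2, 2, 2)

def conv_out_length(length, kernel, stride):
    return (length - kernel) // stride + 1

# 由原始波形的有效采样点数推出卷积特征的帧数
input_lengths = torch.tensor([16000, 8000])
for k, s in zip(conv_kernel, conv_stride):
    input_lengths = conv_out_length(input_lengths, k, s)

# 构建特征级别的注意力掩码:在最后一个有效帧处写 1,再用 flip/cumsum/flip 把它之前的位置全部填满
feature_vector_length = int(input_lengths.max())
mask = torch.zeros(2, feature_vector_length, dtype=torch.long)
mask[torch.arange(2), input_lengths - 1] = 1
mask = mask.flip([-1]).cumsum(-1).flip([-1]).bool()
print(input_lengths, mask.sum(-1))  # 有效帧数应与 input_lengths 一致
```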

# UniSpeechSat 的开始文档字符串,提供了关于该模型的介绍和参考文献链接
UNISPEECH_SAT_START_DOCSTRING = r"""
UniSpeechSat was proposed in [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech
Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael
Auli.

This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving etc.).

This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.

Parameters:
    config ([`UniSpeechSatConfig`]): Model configuration class with all the parameters of the model.
        Initializing with a config file does not load the weights associated with the model, only the
        configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.

"""

# UniSpeechSat 的输入文档字符串,描述了模型 forward 方法的输入参数及其期望格式
UNISPEECH_SAT_INPUTS_DOCSTRING = r"""
    Args:
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio
            file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library
            (`pip install soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used
            for padding and conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`]
            for details.
        attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
            `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)

        <Tip warning={true}>

        `attention_mask` should only be passed if the corresponding processor has `config.return_attention_mask ==
        True`. For all models whose processor has `config.return_attention_mask == False`, such as
        [microsoft/unispeech-sat-base-100h-libri-ft](https://huggingface.co/microsoft/unispeech-sat-base-100h-libri-ft),
        `attention_mask` should **not** be passed to avoid degraded performance when doing batched inference. For
        such models `input_values` should simply be padded with 0 and passed without `attention_mask`. Be aware
        that these models also yield slightly different results depending on whether `input_values` is padded or
        not.

        </Tip>

    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
        tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
        more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

"""
# 用于为 UniSpeechSatModel 添加文档字符串的装饰器,说明它是不带特定输出头部、直接输出原始隐藏状态(raw hidden-states)的模型
@add_start_docstrings(
    "The bare UniSpeechSat Model transformer outputting raw hidden-states without any specific head on top.",
    UNISPEECH_SAT_START_DOCSTRING,
)
# 定义 UniSpeechSat 模型类,继承自 UniSpeechSatPreTrainedModel
class UniSpeechSatModel(UniSpeechSatPreTrainedModel):

def __init__(self, config: UniSpeechSatConfig):
    # 初始化方法,接收 UniSpeechSatConfig 对象作为参数,先调用父类的初始化方法
    super().__init__(config)
    # 将配置参数保存在 self.config 中
    self.config = config
    # 创建 UniSpeechSatFeatureEncoder 对象,用于特征提取
    self.feature_extractor = UniSpeechSatFeatureEncoder(config)
    # 创建 UniSpeechSatFeatureProjection 对象,用于特征投影
    self.feature_projection = UniSpeechSatFeatureProjection(config)

    # 创建形状为 (config.hidden_size,) 的可学习参数张量,用于替换被掩码的语谱帧
    self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_())

    # 根据配置参数判断是否启用稳定层归一化版本的编码器
    if config.do_stable_layer_norm:
        self.encoder = UniSpeechSatEncoderStableLayerNorm(config)
    else:
        # 否则创建普通的 UniSpeechSatEncoder 对象作为编码器
        self.encoder = UniSpeechSatEncoder(config)

    # Initialize weights and apply final processing
    # 调用 post_init 方法,用于初始化权重和应用最终处理
    self.post_init()

# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model._mask_hidden_states
# _mask_hidden_states 方法从 wav2vec2 模型中复制而来,用于对隐藏状态施加 SpecAugment 掩码
def _mask_hidden_states(
    self,
    hidden_states: torch.FloatTensor,
    mask_time_indices: Optional[torch.FloatTensor] = None,
    attention_mask: Optional[torch.LongTensor] = None,
):
    """
    Masks extracted features along time axis and/or along feature axis according to
    [SpecAugment](https://arxiv.org/abs/1904.08779).
    """

    # `config.apply_spec_augment` can set masking to False
    if not getattr(self.config, "apply_spec_augment", True):
        return hidden_states

    # generate indices & apply SpecAugment along time axis
    batch_size, sequence_length, hidden_size = hidden_states.size()

    if mask_time_indices is not None:
        # apply SpecAugment along time axis with given mask_time_indices
        hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
    elif self.config.mask_time_prob > 0 and self.training:
        # compute mask indices for time axis if not provided
        mask_time_indices = _compute_mask_indices(
            (batch_size, sequence_length),
            mask_prob=self.config.mask_time_prob,
            mask_length=self.config.mask_time_length,
            attention_mask=attention_mask,
            min_masks=self.config.mask_time_min_masks,
        )
        mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
        hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)

    if self.config.mask_feature_prob > 0 and self.training:
        # generate indices & apply SpecAugment along feature axis
        mask_feature_indices = _compute_mask_indices(
            (batch_size, hidden_size),
            mask_prob=self.config.mask_feature_prob,
            mask_length=self.config.mask_feature_length,
            min_masks=self.config.mask_feature_min_masks,
        )
        mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
        # expand feature mask indices to match the dimensions of hidden_states
        mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
        hidden_states[mask_feature_indices] = 0

    return hidden_states
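
下面给出一个关于时间维 SpecAugment 掩码的最小示意(张量均为随机假设值),展示被掩码的时间步如何被统一替换为可学习的 masked_spec_embed 向量:

```python
import torch

# 假设 batch=2、seq=6、hidden=4 的隐藏状态,以及一个代替 masked_spec_embed 的随机向量
hidden_states = torch.randn(2, 6, 4)
masked_spec_embed = torch.randn(4)

# 假设的时间掩码:每条样本遮蔽第 2、3 帧
mask_time_indices = torch.zeros(2, 6, dtype=torch.bool)
mask_time_indices[:, 2:4] = True

# 被遮蔽的时间步统一替换为同一个可学习向量(这里用随机向量示意)
hidden_states[mask_time_indices] = masked_spec_embed.to(hidden_states.dtype)
print(torch.equal(hidden_states[0, 2], masked_spec_embed))  # True
```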
# 定义 UniSpeechSatModel 的前向传播方法(参数列表与 wav2vec2 的 Wav2Vec2Model.forward 一致,此处按其补全被截断的签名)
def forward(
    self,
    input_values: Optional[torch.Tensor],
    attention_mask: Optional[torch.Tensor] = None,
    mask_time_indices: Optional[torch.FloatTensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
    # 如果未指定output_attentions参数,则使用配置中的默认值
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    # 如果未指定output_hidden_states参数,则使用配置中的默认值
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )
    # 如果未指定return_dict参数,则使用配置中的默认值
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    # 提取输入特征向量
    extract_features = self.feature_extractor(input_values)
    # 调整特征向量的维度顺序
    extract_features = extract_features.transpose(1, 2)

    if attention_mask is not None:
        # 计算与特征向量对应的减少的attention_mask
        attention_mask = self._get_feature_vector_attention_mask(extract_features.shape[1], attention_mask)

    # 特征投影
    hidden_states, extract_features = self.feature_projection(extract_features)
    # 对隐藏状态进行掩码处理
    hidden_states = self._mask_hidden_states(
        hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
    )

    # 编码器前向传播
    encoder_outputs = self.encoder(
        hidden_states,
        attention_mask=attention_mask,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    # 获取编码器的隐藏状态
    hidden_states = encoder_outputs[0]

    # 如果不使用return_dict格式,则返回元组形式的结果
    if not return_dict:
        return (hidden_states, extract_features) + encoder_outputs[1:]

    # 使用return_dict格式返回结果
    return Wav2Vec2BaseModelOutput(
        last_hidden_state=hidden_states,
        extract_features=extract_features,
        hidden_states=encoder_outputs.hidden_states,
        attentions=encoder_outputs.attentions,
    )
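
作为补充,下面是调用 UniSpeechSatModel 的一个简单用法示意(假设使用 microsoft/unispeech-sat-base 检查点;音频为随机噪声,仅用于说明输入输出形状):

```python
import torch
from transformers import AutoFeatureExtractor, UniSpeechSatModel

# 假设使用 microsoft/unispeech-sat-base 检查点
feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/unispeech-sat-base")
model = UniSpeechSatModel.from_pretrained("microsoft/unispeech-sat-base")

waveform = torch.randn(16000).numpy()  # 约 1 秒的 16 kHz 假音频
inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

print(outputs.last_hidden_state.shape)  # (1, 帧数, hidden_size)
```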

# 使用装饰器添加文档字符串,描述 UniSpeechSat 模型在顶部带有量化器和 VQ 头
@add_start_docstrings("""UniSpeechSat Model with a quantizer and VQ head on top.""", UNISPEECH_SAT_START_DOCSTRING)
class UniSpeechSatForPreTraining(UniSpeechSatPreTrainedModel):
def __init__(self, config: UniSpeechSatConfig):
    # 调用父类构造函数初始化模型
    super().__init__(config)

    # 初始化 UniSpeechSat 模型
    self.unispeech_sat = UniSpeechSatModel(config)
    
    # 创建用于特征量化的 dropout 层
    self.dropout_features = nn.Dropout(config.feat_quantizer_dropout)

    # 初始化量化器,使用 Gumbel softmax 方法
    self.quantizer = UniSpeechSatGumbelVectorQuantizer(config)
    
    # 线性变换,将 codevector_dim 维度投影到 proj_codevector_dim 维度
    self.project_q = nn.Linear(config.codevector_dim, config.proj_codevector_dim)
    
    # 线性变换,将 hidden_size 维度投影到 proj_codevector_dim 维度
    self.project_hid = nn.Linear(config.hidden_size, config.proj_codevector_dim)

    # 初始化 dropout 层,用于最终处理
    self.dropout = nn.Dropout(config.final_dropout)

    # 线性变换,将 hidden_size 维度投影到 codevector_dim 维度,用于说话人投影
    self.speaker_proj = nn.Linear(config.hidden_size, config.codevector_dim)
    
    # 初始化标签嵌入参数,维度为 num_clusters x codevector_dim,并将其初始化为零
    self.label_embeddings_concat = nn.Parameter(torch.FloatTensor(config.num_clusters, config.codevector_dim))
    self.label_embeddings_concat.data.zero_()

    # 初始化 LayerNorm 层,用于特征提取器的输出,设定 epsilon 为 layer_norm_eps
    self.layer_norm_for_extract = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
    
    # 如果设置了 do_stable_layer_norm,则固定 LayerNorm 的参数不更新
    if self.config.do_stable_layer_norm:
        self.layer_norm_for_extract.requires_grad = False

    # 调用初始化函数 post_init(),用于初始化权重和应用最终处理
    self.post_init()

def set_gumbel_temperature(self, temperature: int):
    """
    设置 Gumbel softmax 温度为指定值。仅在训练时使用。
    """
    self.quantizer.temperature = temperature

def freeze_feature_extractor(self):
    """
    调用此函数将禁用特征编码器的梯度计算,使其在训练期间不更新参数。
    """
    # 引发警告,表明此方法将在 Transformers v5 中移除,建议使用 freeze_feature_encoder 方法代替
    warnings.warn(
        "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
        "Please use the equivalent `freeze_feature_encoder` method instead.",
        FutureWarning,
    )
    self.freeze_feature_encoder()

def freeze_feature_encoder(self):
    """
    调用此函数将禁用特征编码器的梯度计算,使其在训练期间不更新参数。
    """
    # 冻结 UniSpeechSat 模型的特征提取器参数(此处的属性名应为 self.unispeech_sat)
    self.unispeech_sat.feature_extractor._freeze_parameters()

@staticmethod
def compute_contrastive_logits(
    target_features: torch.FloatTensor,
    negative_features: torch.FloatTensor,
    predicted_features: torch.FloatTensor,
    temperature: int = 1,
):
    """
    计算对比损失的对数概率,使用余弦相似度作为距离度量,比较 `[positive_feature, negative_features]` 和 `[predicted_features]`。
    此外,可以应用温度参数调节。

    Args:
        target_features (torch.Tensor): 包含正负样本特征的张量。
        negative_features (torch.Tensor): 负样本特征的张量。
        predicted_features (torch.Tensor): 预测特征的张量。
        temperature (float): 温度参数,用于调节对数概率的尺度。

    Returns:
        torch.Tensor: 经过温度调节后的对数概率张量。
    """
    
    # 将目标特征和负样本特征连接起来
    target_features = torch.cat([target_features, negative_features], dim=0)

    # 计算预测特征与目标特征的余弦相似度
    logits = torch.cosine_similarity(predicted_features.float(), target_features.float(), dim=-1)
    # 将 logits 转换为与目标特征相同的数据类型
    logits = logits.type_as(target_features)

    # 应用温度参数进行尺度调整
    logits = logits / temperature
    return logits
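
下面用几行示意代码(特征维度与温度均为假设值)说明该函数的计算方式:把正样本和负样本拼在一起后,与预测特征逐一计算余弦相似度,再除以温度:

```python
import torch

# 假设 1 个正样本、2 个负样本、特征维度 8,温度取 0.1
predicted = torch.randn(1, 8)
positive = torch.randn(1, 8)
negatives = torch.randn(2, 8)

# 正负样本拼接后,与预测特征逐一计算余弦相似度,再做温度缩放
targets = torch.cat([positive, negatives], dim=0)              # (3, 8)
logits = torch.cosine_similarity(predicted, targets, dim=-1)   # (3,)
logits = logits / 0.1
print(logits)  # 第 0 个元素对应正样本
```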

@add_start_docstrings_to_model_forward(UNISPEECH_SAT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=UniSpeechSatForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
def forward(
    self,
    input_values: Optional[torch.Tensor],
    attention_mask: Optional[torch.Tensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple, UniSpeechSatForPreTrainingOutput]:
    r"""
    Returns:

    Example:

    ```
    >>> import torch
    >>> from transformers import AutoFeatureExtractor, UniSpeechSatForPreTraining
    >>> from transformers.models.unispeech_sat.modeling_unispeech_sat import _compute_mask_indices

    >>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/unispeech-sat-base")
    >>> model = UniSpeechSatForPreTraining.from_pretrained("microsoft/unispeech-sat-base")
    >>> # TODO: Add full pretraining example
    ```"""

    # Determine if the function should return a dictionary format as specified by the `return_dict` parameter
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    # Perform UniSpeechSat model inference
    outputs = self.unispeech_sat(
        input_values,
        attention_mask=attention_mask,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    transformer_features = outputs[0]  # Extract the transformer features from the model outputs

    # Quantize all extracted features (unmasked) and apply dropout
    extract_features = self.dropout_features(outputs[1])

    # Placeholder variables for future implementations
    logits = extract_features
    loss = quantized_features = codevector_perplexity = None

    # Below are commented-out sections which may be used for future logic implementation:
    # layer normalization (has no effect when `config.do_stable_layer_norm == False`)
    #        extract_features = self.layer_norm_for_extract(extract_features)
    #        quantized_features, codevector_perplexity = self.quantizer(extract_features)
    #
    # project quantized features twice
    #        quantized_features = self.project_q(quantized_features)
    #        quantized_features = self.project_hid(quantized_features)
    #
    #        loss = None
    #        logits = quantized_features

    # If return_dict is False, construct tuple output without loss
    if not return_dict:
        if loss is not None:
            return (loss, logits, transformer_features, quantized_features, codevector_perplexity) + outputs[2:]
        return (logits, transformer_features, quantized_features, codevector_perplexity) + outputs[2:]

    # If return_dict is True, construct UniSpeechSatForPreTrainingOutput object with specified attributes
    return UniSpeechSatForPreTrainingOutput(
        loss=loss,
        logits=logits,
        projected_states=transformer_features,
        projected_quantized_states=quantized_features,
        codevector_perplexity=codevector_perplexity,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )

@add_start_docstrings(
    """UniSpeechSat Model with a language modeling head on top for Connectionist Temporal Classification (CTC).""",
    UNISPEECH_SAT_START_DOCSTRING,
    """
        target_lang (`str`, *optional*):
            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
            adapter.<lang>.bin. Only relevant when using an instance of [`UniSpeechSatForCTC`] with adapters. Uses
            'eng' by default.
    """,
)
# 定义 UniSpeechSatForCTC 类,继承自 UniSpeechSatPreTrainedModel
# 该类用于基于 CTC 的语言建模任务,在 UniSpeechSat 模型之上结合了一个线性输出层(lm_head)
class UniSpeechSatForCTC(UniSpeechSatPreTrainedModel):
def __init__(self, config, target_lang: Optional[str] = None):
    # 调用父类的初始化方法
    super().__init__(config)

    # 初始化 UniSpeechSat 模型和一个 dropout 层
    self.unispeech_sat = UniSpeechSatModel(config)
    self.dropout = nn.Dropout(config.final_dropout)

    # 设置目标语言(适配器权重的语言标识)
    self.target_lang = target_lang

    # 检查配置中是否定义了词汇表大小,如果未定义则抛出异常
    if config.vocab_size is None:
        raise ValueError(
            f"You are trying to instantiate {self.__class__} with a configuration that "
            "does not define the vocabulary size of the language model head. Please "
            "instantiate the model as follows: `UniSpeechSatForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
            "or define `vocab_size` of your model's configuration."
        )

    # 根据配置设置线性输出层的输入和输出大小
    output_hidden_size = (
        config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
    )
    self.lm_head = nn.Linear(output_hidden_size, config.vocab_size)

    # 初始化权重并进行后续处理
    self.post_init()

def tie_weights(self):
    """
    重写 [`~PreTrainedModel.tie_weights`] 方法,以便在 `from_pretrained(...)` 中传递 `target_lang=...` 时正确加载适配器权重。

    用户不应调用此方法,因为未来可能会更改。

    该方法通常用于绑定输入和输出嵌入权重。在这里,我们重新利用它来正确加载 UniSpeechSat 的适配器层,避免引入新的 `PreTrainedModel` API。
    虽然有点 hacky,但 UniSpeechSat 永远不必绑定输入和输出嵌入,因此在这里重新用这个函数是可以的。
    """
    
    # 获取目标语言
    target_lang = self.target_lang

    # 如果 target_lang 不为 None,且配置中未定义 adapter_attn_dim,则抛出 ValueError
    if target_lang is not None and getattr(self.config, "adapter_attn_dim", None) is None:
        raise ValueError(f"Cannot pass `target_lang`: {target_lang} if `config.adapter_attn_dim` is not defined.")
    # 如果 target_lang 为 None,且配置中定义了 adapter_attn_dim,则记录警告信息
    elif target_lang is None and getattr(self.config, "adapter_attn_dim", None) is not None:
        logger.info("By default `target_lang` is set to 'eng'.")
    # 如果 target_lang 不为 None,则加载相应的适配器
    elif target_lang is not None:
        self.load_adapter(target_lang, force_load=True)
# 调用此函数将禁用特征编码器的梯度计算,使其参数在训练期间不会更新
def freeze_feature_extractor(self):
    # 发出警告,提示该函数将在 Transformers v5 中删除,建议使用等效的 freeze_feature_encoder 方法
    warnings.warn(
        "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
        "Please use the equivalent `freeze_feature_encoder` method instead.",
        FutureWarning,
    )
    # 调用 freeze_feature_encoder 方法来冻结特征编码器的参数
    self.freeze_feature_encoder()

# 调用此函数将禁用特征编码器的梯度计算,使其参数在训练期间不会更新
def freeze_feature_encoder(self):
    # 调用内部的 _freeze_parameters 方法来冻结特征编码器的参数
    self.unispeech_sat.feature_extractor._freeze_parameters()

# 调用此函数将禁用基础模型的梯度计算,使其参数在训练期间不会更新,只有分类头部会被更新
def freeze_base_model(self):
    # 遍历 unispeech_sat 模型的所有参数,并将其 requires_grad 属性设为 False,从而禁用梯度计算
    for param in self.unispeech_sat.parameters():
        param.requires_grad = False

# 将该函数装饰为模型的前向传播方法,并添加相应的文档字符串注释
@add_start_docstrings_to_model_forward(UNISPEECH_SAT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
    checkpoint=_CHECKPOINT_FOR_DOC,
    output_type=CausalLMOutput,
    config_class=_CONFIG_FOR_DOC,
    expected_output=_CTC_EXPECTED_OUTPUT,
    expected_loss=_CTC_EXPECTED_LOSS,
)
def forward(
    self,
    input_values: Optional[torch.Tensor],
    attention_mask: Optional[torch.Tensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    labels: Optional[torch.Tensor] = None,
) -> Union[Tuple, CausalLMOutput]:
    r"""
    labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
        Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
        the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
        All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
        config.vocab_size - 1]`.
    """

    # Determine if return_dict is provided; if not, use the model's default setting
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    # Pass input_values through unispeech_sat model, with optional outputs controlled by parameters
    outputs = self.unispeech_sat(
        input_values,
        attention_mask=attention_mask,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    # Retrieve the hidden states from the outputs and apply dropout
    hidden_states = outputs[0]
    hidden_states = self.dropout(hidden_states)

    # Compute logits from the language model head based on the processed hidden states
    logits = self.lm_head(hidden_states)

    # Initialize loss as None
    loss = None
    if labels is not None:
        # Check if any label index exceeds the vocabulary size; raise ValueError if so
        if labels.max() >= self.config.vocab_size:
            raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")

        # Calculate input_lengths based on attention_mask if provided; otherwise assume all inputs are attended
        attention_mask = (
            attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
        )
        input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)

        # Create a mask to ignore padded tokens and compute target_lengths
        labels_mask = labels >= 0
        target_lengths = labels_mask.sum(-1)
        flattened_targets = labels.masked_select(labels_mask)

        # Apply log softmax to logits and transpose for CTC loss calculation
        log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)

        # Disable cudnn optimization flags for CTC loss calculation
        with torch.backends.cudnn.flags(enabled=False):
            loss = nn.functional.ctc_loss(
                log_probs,
                flattened_targets,
                input_lengths,
                target_lengths,
                blank=self.config.pad_token_id,
                reduction=self.config.ctc_loss_reduction,
                zero_infinity=self.config.ctc_zero_infinity,
            )

    # If return_dict is False, prepare output as a tuple of logits and optional hidden states
    if not return_dict:
        output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
        return ((loss,) + output) if loss is not None else output

    # If return_dict is True, return CausalLMOutput object containing loss, logits, hidden states, and attentions
    return CausalLMOutput(
        loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
    )
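
下面是 UniSpeechSatForCTC 的一个推理用法示意(假设使用前文提到的 microsoft/unispeech-sat-base-100h-libri-ft 检查点;此处输入为随机噪声,仅演示调用流程,真实场景应传入 16 kHz 语音):

```python
import torch
from transformers import AutoProcessor, UniSpeechSatForCTC

# 假设使用 microsoft/unispeech-sat-base-100h-libri-ft 检查点
processor = AutoProcessor.from_pretrained("microsoft/unispeech-sat-base-100h-libri-ft")
model = UniSpeechSatForCTC.from_pretrained("microsoft/unispeech-sat-base-100h-libri-ft")

# 用随机噪声代替真实语音,仅演示调用流程
waveform = torch.randn(16000).numpy()
inputs = processor(waveform, sampling_rate=16000, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

# 贪心解码:每帧取概率最大的 token,再由 processor 合并重复并去掉空白符
predicted_ids = torch.argmax(logits, dim=-1)
print(processor.batch_decode(predicted_ids))
```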

# UniSpeechSat 序列分类模型:在池化输出之上添加一个线性层作为分类头部,用于 SUPERB 关键词检测等任务
@add_start_docstrings(
    """
    UniSpeechSat Model with a sequence classification head on top (a linear layer over the pooled output) for tasks
    like SUPERB Keyword Spotting.
    """,
    UNISPEECH_SAT_START_DOCSTRING,
)
class UniSpeechSatForSequenceClassification(UniSpeechSatPreTrainedModel):
def __init__(self, config):
    super().__init__(config)

    # 检查配置是否支持适配器,如果支持则引发错误
    if hasattr(config, "add_adapter") and config.add_adapter:
        raise ValueError(
            "Sequence classification does not support the use of UniSpeechSat adapters (config.add_adapter=True)"
        )
    
    # 初始化UniSpeechSat模型
    self.unispeech_sat = UniSpeechSatModel(config)
    
    # 确定层数(变压器层 + 输入嵌入)
    num_layers = config.num_hidden_layers + 1
    
    # 如果配置使用加权层求和,则初始化层权重
    if config.use_weighted_layer_sum:
        self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
    
    # 定义投影层,将隐藏状态投影到分类器投影尺寸
    self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
    
    # 定义分类器层,将投影后的特征映射到类别数量
    self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)

    # 初始化权重并应用最终处理
    self.post_init()

# 从transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification中复制过来的方法,冻结特征提取器
def freeze_feature_extractor(self):
    """
    Calling this function will disable the gradient computation for the feature encoder so that its parameters will
    not be updated during training.
    """
    warnings.warn(
        "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
        "Please use the equivalent `freeze_feature_encoder` method instead.",
        FutureWarning,
    )
    self.freeze_feature_encoder()

# 从transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification中复制过来的方法,冻结特征编码器
def freeze_feature_encoder(self):
    """
    Calling this function will disable the gradient computation for the feature encoder so that its parameter will
    not be updated during training.
    """
    self.unispeech_sat.feature_extractor._freeze_parameters()

# 从transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification中复制过来的方法,冻结基础模型
def freeze_base_model(self):
    """
    Calling this function will disable the gradient computation for the base model so that its parameters will not
    be updated during training. Only the classification head will be updated.
    """
    for param in self.unispeech_sat.parameters():
        param.requires_grad = False

# 根据UNISPEECH_SAT_INPUTS_DOCSTRING和其他参数添加文档字符串和代码示例文档字符串
@add_start_docstrings_to_model_forward(UNISPEECH_SAT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
    checkpoint=_CHECKPOINT_FOR_DOC,
    output_type=SequenceClassifierOutput,
    config_class=_CONFIG_FOR_DOC,
    modality="audio",
)
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.forward 复制而来,将 Wav2Vec2 修改为 UniSpeechSat,将 wav2vec2 修改为 unispeech_sat
def forward(
    self,
    input_values: Optional[torch.Tensor],
    attention_mask: Optional[torch.Tensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    labels: Optional[torch.Tensor] = None,
) -> Union[Tuple, SequenceClassifierOutput]:
    r"""
    labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
        Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
        config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
        `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
    """

    # 确定是否返回字典格式的输出,默认为 self.config.use_return_dict
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
    # 根据 self.config.use_weighted_layer_sum 确定是否输出隐藏状态
    output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states

    # 将输入 input_values 传递给 UniSpeechSat 模型进行前向传播
    outputs = self.unispeech_sat(
        input_values,
        attention_mask=attention_mask,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    # 如果使用加权层求和,则计算加权隐藏状态
    if self.config.use_weighted_layer_sum:
        hidden_states = outputs[_HIDDEN_STATES_START_POSITION]  # 获取隐藏状态起始位置的输出
        hidden_states = torch.stack(hidden_states, dim=1)  # 在第1维度上堆叠隐藏状态
        norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)  # 对层权重进行 softmax 归一化
        hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)  # 加权求和隐藏状态
    else:
        hidden_states = outputs[0]  # 否则直接使用第一个输出作为隐藏状态

    hidden_states = self.projector(hidden_states)  # 投影隐藏状态
    if attention_mask is None:
        pooled_output = hidden_states.mean(dim=1)  # 如果没有注意力掩码,则对隐藏状态进行均值池化
    else:
        padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
        hidden_states[~padding_mask] = 0.0  # 使用特征向量注意力掩码设置隐藏状态为0
        pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)  # 汇总隐藏状态

    logits = self.classifier(pooled_output)  # 使用分类器生成 logits

    loss = None
    if labels is not None:
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))  # 计算交叉熵损失

    if not return_dict:
        output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]  # 如果不返回字典,则输出 logits 和隐藏状态起始位置后的输出
        return ((loss,) + output) if loss is not None else output

    # 返回 SequenceClassifierOutput 对象,包括 loss、logits、隐藏状态和注意力
    return SequenceClassifierOutput(
        loss=loss,
        logits=logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )
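
下面的最小示意(层数与维度为假设值)单独演示了 use_weighted_layer_sum 分支的核心逻辑:对所有层的隐藏状态按可学习权重做 softmax 加权求和:

```python
import torch
import torch.nn as nn

# 假设 13 层隐藏状态(12 层 transformer + 输入嵌入),batch=2, seq=5, hidden=8
num_layers, batch, seq, hidden = 13, 2, 5, 8
hidden_states = tuple(torch.randn(batch, seq, hidden) for _ in range(num_layers))
layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)

# 堆叠后对层维做 softmax 加权求和,得到融合后的表示
stacked = torch.stack(hidden_states, dim=1)                    # (batch, num_layers, seq, hidden)
norm_weights = nn.functional.softmax(layer_weights, dim=-1)
weighted = (stacked * norm_weights.view(-1, 1, 1)).sum(dim=1)  # (batch, seq, hidden)
print(weighted.shape)  # torch.Size([2, 5, 8])
```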

@add_start_docstrings(
    """
    UniSpeech-SAT Model with a frame classification head on top for tasks like Speaker Diarization.
    """,
    UNISPEECH_SAT_START_DOCSTRING,
)
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForAudioFrameClassification 复制过来,将 Wav2Vec2 替换为 UniSpeechSat,将 wav2vec2 替换为 unispeech_sat,将 WAV_2_VEC_2 替换为 UNISPEECH_SAT
class UniSpeechSatForAudioFrameClassification(UniSpeechSatPreTrainedModel):
def __init__(self, config):
    super().__init__(config)

    # 如果配置中存在 add_adapter 属性且为 True,则抛出 ValueError
    if hasattr(config, "add_adapter") and config.add_adapter:
        raise ValueError(
            "Audio frame classification does not support the use of UniSpeechSat adapters (config.add_adapter=True)"
        )
    
    # 初始化 UniSpeechSatModel,并将其赋值给 self.unispeech_sat
    self.unispeech_sat = UniSpeechSatModel(config)
    
    # 计算 transformer 层数加上输入嵌入层的总数
    num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
    
    # 如果配置中使用加权层求和,则初始化权重参数为均匀分布
    if config.use_weighted_layer_sum:
        self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
    
    # 初始化分类器线性层,输入大小为 config.hidden_size,输出大小为 config.num_labels
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)
    
    # 将 config.num_labels 赋值给 self.num_labels
    self.num_labels = config.num_labels

    # 初始化模型权重
    self.init_weights()

# 将 freeze_feature_extractor 方法标记为过时,未来将在 Transformers v5 中移除,请改用 freeze_feature_encoder 方法
def freeze_feature_extractor(self):
    """
    Calling this function will disable the gradient computation for the feature encoder so that its parameter will
    not be updated during training.
    """
    warnings.warn(
        "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
        "Please use the equivalent `freeze_feature_encoder` method instead.",
        FutureWarning,
    )
    self.freeze_feature_encoder()

# 禁用特征编码器的梯度计算,使其在训练过程中不会更新参数
def freeze_feature_encoder(self):
    """
    Calling this function will disable the gradient computation for the feature encoder so that its parameter will
    not be updated during training.
    """
    self.unispeech_sat.feature_extractor._freeze_parameters()

# 禁用基础模型的梯度计算,使其在训练过程中不会更新参数,只会更新分类头
def freeze_base_model(self):
    """
    Calling this function will disable the gradient computation for the base model so that its parameters will not
    be updated during training. Only the classification head will be updated.
    """
    for param in self.unispeech_sat.parameters():
        param.requires_grad = False

@add_start_docstrings_to_model_forward(UNISPEECH_SAT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
    checkpoint=_FRAME_CLASS_CHECKPOINT,
    output_type=TokenClassifierOutput,
    config_class=_CONFIG_FOR_DOC,
    modality="audio",
    expected_output=_FRAME_EXPECTED_OUTPUT,
)
# 前向传播函数,接受输入值 input_values、注意力掩码 attention_mask、标签 labels,输出是否返回注意力、隐藏状态、返回字典等
def forward(
    self,
    input_values: Optional[torch.Tensor],
    attention_mask: Optional[torch.Tensor] = None,
    labels: Optional[torch.Tensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,

) -> Union[Tuple, TokenClassifierOutput]:
    r"""
    labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
        Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
        config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
        `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
    """

    # 确定是否返回字典格式的输出结果,如果未指定,则使用配置中的默认设置
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
    # 根据配置决定是否输出隐藏状态,如果使用加权层求和,则输出隐藏状态
    output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states

    # 调用UniSpeech模型,传入输入值和其他参数,并返回输出结果
    outputs = self.unispeech_sat(
        input_values,
        attention_mask=attention_mask,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    # 如果配置中使用加权层求和,则计算加权后的隐藏状态表示
    if self.config.use_weighted_layer_sum:
        hidden_states = outputs[_HIDDEN_STATES_START_POSITION]  # 提取隐藏状态的起始位置
        hidden_states = torch.stack(hidden_states, dim=1)  # 在指定维度上堆叠张量
        norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)  # 对层权重进行softmax归一化
        hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)  # 加权求和隐藏状态表示
    else:
        hidden_states = outputs[0]  # 否则直接使用第一个输出作为隐藏状态表示

    logits = self.classifier(hidden_states)  # 使用分类器对隐藏状态进行分类

    loss = None
    if labels is not None:
        loss_fct = CrossEntropyLoss()  # 使用交叉熵损失函数
        # 计算损失,将logits展平为(batch_size, num_labels),并计算预测标签的交叉熵损失
        loss = loss_fct(logits.view(-1, self.num_labels), torch.argmax(labels.view(-1, self.num_labels), axis=1))

    if not return_dict:
        output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]  # 构建输出元组
        return output

    # 返回TokenClassifierOutput对象,包括损失、logits、隐藏状态和注意力分布等输出结果
    return TokenClassifierOutput(
        loss=loss,
        logits=logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )

# Copied from transformers.models.wav2vec2.modeling_wav2vec2.AMSoftmaxLoss
# AMSoftmaxLoss 类定义,用于实现 AM-Softmax 损失函数
class AMSoftmaxLoss(nn.Module):
def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4):
    super(AMSoftmaxLoss, self).__init__()
    self.scale = scale
    self.margin = margin
    self.num_labels = num_labels
    # 定义可学习的权重参数,用于计算余弦相似度
    self.weight = nn.Parameter(torch.randn(input_dim, num_labels), requires_grad=True)
    # 使用交叉熵损失函数
    self.loss = nn.CrossEntropyLoss()

def forward(self, hidden_states, labels):
    labels = labels.flatten()
    # 对权重参数进行归一化处理
    weight = nn.functional.normalize(self.weight, dim=0)
    # 对输入的隐藏状态进行归一化处理
    hidden_states = nn.functional.normalize(hidden_states, dim=1)
    # 计算余弦相似度
    cos_theta = torch.mm(hidden_states, weight)
    # 计算 AM Softmax 损失中的 psi 值
    psi = cos_theta - self.margin

    # 将标签转换为 one-hot 编码
    onehot = nn.functional.one_hot(labels, self.num_labels)
    # 计算最终的预测 logits
    logits = self.scale * torch.where(onehot.bool(), psi, cos_theta)
    # 计算损失值
    loss = self.loss(logits, labels)

    return loss
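
下面给出 AM-Softmax 的一个独立数值示意(维度与超参数为假设值),可以看到只有真实类别对应的余弦相似度被减去了 margin:

```python
import torch
import torch.nn as nn

# 假设 batch=4、输入维度 8、3 个说话人类别,scale/margin 取与上面相同的默认值
scale, margin = 30.0, 0.4
hidden_states = nn.functional.normalize(torch.randn(4, 8), dim=1)
weight = nn.functional.normalize(torch.randn(8, 3), dim=0)
labels = torch.tensor([0, 2, 1, 0])

cos_theta = hidden_states @ weight            # 余弦相似度,值域约为 [-1, 1]
psi = cos_theta - margin                      # 只会作用在真实类别上
onehot = nn.functional.one_hot(labels, 3)
logits = scale * torch.where(onehot.bool(), psi, cos_theta)
loss = nn.CrossEntropyLoss()(logits, labels)
print(loss)
```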

# Copied from transformers.models.wav2vec2.modeling_wav2vec2.TDNNLayer
# TDNNLayer 类定义,用于实现时间延迟神经网络(TDNN)层
class TDNNLayer(nn.Module):
def __init__(self, config, layer_id=0):
    super().__init__()
    # 初始化 TDNN 层的参数
    self.in_conv_dim = config.tdnn_dim[layer_id - 1] if layer_id > 0 else config.tdnn_dim[layer_id]
    self.out_conv_dim = config.tdnn_dim[layer_id]
    self.kernel_size = config.tdnn_kernel[layer_id]
    self.dilation = config.tdnn_dilation[layer_id]

    # 定义线性层作为卷积核
    self.kernel = nn.Linear(self.in_conv_dim * self.kernel_size, self.out_conv_dim)
    # 定义激活函数为 ReLU
    self.activation = nn.ReLU()

def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
    # 检查是否有 peft 可用,如果有,则导入 LoraLayer
    if is_peft_available():
        from peft.tuners.lora import LoraLayer

        # 如果卷积核是 LoraLayer 类型,则发出警告,因为不会应用 LoRA 权重
        if isinstance(self.kernel, LoraLayer):
            warnings.warn(
                "Detected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. "
                "You should exclude TDNNLayer from LoRA's target modules.",
            )

    # 将输入的隐藏状态转置,为了与 conv1d 函数兼容
    hidden_states = hidden_states.transpose(1, 2)
    # 将线性层的权重重塑为卷积核的形状,并进行转置
    weight = self.kernel.weight.view(self.out_conv_dim, self.kernel_size, self.in_conv_dim).transpose(1, 2)
    # 使用 conv1d 函数进行卷积操作
    hidden_states = nn.functional.conv1d(hidden_states, weight, self.kernel.bias, dilation=self.dilation)
    # 再次将隐藏状态转置回原来的形状
    hidden_states = hidden_states.transpose(1, 2)

    # 应用激活函数
    hidden_states = self.activation(hidden_states)
    return hidden_states

@add_start_docstrings(
    """
    UniSpeech-SAT Model with an XVector feature extraction head on top for tasks like Speaker Verification.
    """,
    UNISPEECH_SAT_START_DOCSTRING,
)
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForXVector with Wav2Vec2->UniSpeechSat, wav2vec2->unispeech_sat, WAV_2_VEC_2->UNISPEECH_SAT
# UniSpeechSatForXVector 类定义:在 UniSpeech-SAT 模型之上添加 XVector 特征提取头部,用于说话人验证等任务
class UniSpeechSatForXVector(UniSpeechSatPreTrainedModel):
def __init__(self, config):
    super().__init__(config)

    self.unispeech_sat = UniSpeechSatModel(config)  # 初始化UniSpeechSatModel模型
    num_layers = config.num_hidden_layers + 1  # 计算层数,包括transformer层和输入嵌入层
    if config.use_weighted_layer_sum:
        self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)  # 如果配置使用加权层求和,则初始化权重参数
    self.projector = nn.Linear(config.hidden_size, config.tdnn_dim[0])  # 初始化线性投影层

    # 创建TDNN层列表
    tdnn_layers = [TDNNLayer(config, i) for i in range(len(config.tdnn_dim))]
    self.tdnn = nn.ModuleList(tdnn_layers)  # 将TDNN层列表封装成ModuleList

    self.feature_extractor = nn.Linear(config.tdnn_dim[-1] * 2, config.xvector_output_dim)  # 初始化特征提取器的线性层
    self.classifier = nn.Linear(config.xvector_output_dim, config.xvector_output_dim)  # 初始化分类器的线性层

    self.objective = AMSoftmaxLoss(config.xvector_output_dim, config.num_labels)  # 初始化AMSoftmax损失函数

    self.init_weights()  # 调用初始化权重方法

def freeze_feature_extractor(self):
    """
    Calling this function will disable the gradient computation for the feature encoder so that its parameter will
    not be updated during training.
    """
    warnings.warn(
        "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
        "Please use the equivalent `freeze_feature_encoder` method instead.",
        FutureWarning,
    )
    self.freeze_feature_encoder()  # 调用freeze_feature_encoder方法冻结特征编码器的参数更新

def freeze_feature_encoder(self):
    """
    Calling this function will disable the gradient computation for the feature encoder so that its parameter will
    not be updated during training.
    """
    self.unispeech_sat.feature_extractor._freeze_parameters()  # 冻结特征提取器的参数更新

def freeze_base_model(self):
    """
    Calling this function will disable the gradient computation for the base model so that its parameters will not
    be updated during training. Only the classification head will be updated.
    """
    for param in self.unispeech_sat.parameters():  # 遍历UniSpeechSatModel的所有参数
        param.requires_grad = False  # 将参数的梯度计算设为False,不更新这些参数的梯度

def _get_tdnn_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
    """
    Computes the output length of the TDNN layers
    """

    def _conv_out_length(input_length, kernel_size, stride):
        # 1D convolutional layer output length formula taken
        # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
        return (input_length - kernel_size) // stride + 1  # 计算1D卷积层的输出长度公式

    for kernel_size in self.config.tdnn_kernel:
        input_lengths = _conv_out_length(input_lengths, kernel_size, 1)  # 循环计算TDNN层的输出长度

    return input_lengths
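
下面用一个小示意(kernel 配置为假设的默认值 (5, 3, 3, 1, 1),stride 固定为 1)演示 TDNN 层输出长度的逐层计算:

```python
# 假设的 TDNN 配置:kernel 依次为 (5, 3, 3, 1, 1),stride 固定为 1
tdnn_kernel = (5, 3, 3, 1, 1)

def conv_out_length(input_length, kernel_size, stride=1):
    return (input_length - kernel_size) // stride + 1

length = 100  # 特征提取器输出的帧数
for k in tdnn_kernel:
    length = conv_out_length(length, k)
print(length)  # 100 -> 96 -> 94 -> 92 -> 92 -> 92
```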

@add_start_docstrings_to_model_forward(UNISPEECH_SAT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
    checkpoint=_XVECTOR_CHECKPOINT,
    output_type=XVectorOutput,
    config_class=_CONFIG_FOR_DOC,
    modality="audio",
    expected_output=_XVECTOR_EXPECTED_OUTPUT,
)
# 定义前向传播方法,用于模型推断阶段
def forward(
    self,
    # 输入值,类型为可选的 PyTorch 张量
    input_values: Optional[torch.Tensor],
    # 注意力掩码,类型为可选的 PyTorch 张量,默认为 None
    attention_mask: Optional[torch.Tensor] = None,
    # 是否输出注意力权重,类型为可选的布尔值,默认为 None
    output_attentions: Optional[bool] = None,
    # 是否输出隐藏状态,类型为可选的布尔值,默认为 None
    output_hidden_states: Optional[bool] = None,
    # 是否返回字典类型的输出,类型为可选的布尔值,默认为 None
    return_dict: Optional[bool] = None,
    # 标签,类型为可选的 PyTorch 张量,默认为 None
    labels: Optional[torch.Tensor] = None,
) -> Union[Tuple, XVectorOutput]:
    r"""
    labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
        Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
        config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
        `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
    """
    # 初始化返回字典,如果未提供则根据配置决定是否使用返回字典
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
    # 根据配置决定是否输出隐藏状态
    output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states

    # 调用unispeech_sat模型,传入输入值和其他参数,获取输出
    outputs = self.unispeech_sat(
        input_values,
        attention_mask=attention_mask,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    # 如果配置中指定使用加权层求和,则计算加权后的隐藏状态
    if self.config.use_weighted_layer_sum:
        # 从输出中提取隐藏状态的起始位置索引
        hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
        # 将所有隐藏状态堆叠在一起
        hidden_states = torch.stack(hidden_states, dim=1)
        # 计算层权重的softmax值
        norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
        # 使用权重加权求和隐藏状态
        hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
    else:
        # 否则直接使用输出的第一个元素作为隐藏状态
        hidden_states = outputs[0]

    # 将隐藏状态投影到新的空间
    hidden_states = self.projector(hidden_states)

    # 对每个TDNN层进行循环处理隐藏状态
    for tdnn_layer in self.tdnn:
        hidden_states = tdnn_layer(hidden_states)

    # 统计池化操作
    if attention_mask is None:
        # 如果没有注意力掩码,则对整个隐藏状态进行平均和标准差计算
        mean_features = hidden_states.mean(dim=1)
        std_features = hidden_states.std(dim=1)
    else:
        # 否则,根据注意力掩码计算特征提取的输出长度
        feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(dim=1))
        tdnn_output_lengths = self._get_tdnn_output_lengths(feat_extract_output_lengths)
        mean_features = []
        std_features = []
        # 对每个TDNN层的输出长度进行循环处理
        for i, length in enumerate(tdnn_output_lengths):
            mean_features.append(hidden_states[i, :length].mean(dim=0))
            std_features.append(hidden_states[i, :length].std(dim=0))
        # 将计算得到的均值和标准差特征堆叠起来
        mean_features = torch.stack(mean_features)
        std_features = torch.stack(std_features)
    # 将均值和标准差特征拼接在一起作为统计池化结果
    statistic_pooling = torch.cat([mean_features, std_features], dim=-1)

    # 使用特征提取器对统计池化结果进行处理
    output_embeddings = self.feature_extractor(statistic_pooling)
    # 使用分类器对处理后的特征进行分类得到logits
    logits = self.classifier(output_embeddings)

    # 初始化损失为None
    loss = None
    # 如果提供了标签,则计算损失
    if labels is not None:
        loss = self.objective(logits, labels)

    # 如果不需要返回字典,则直接返回输出元组
    if not return_dict:
        output = (logits, output_embeddings) + outputs[_HIDDEN_STATES_START_POSITION:]
        return ((loss,) + output) if loss is not None else output

    # 如果需要返回字典,则创建XVectorOutput对象并返回
    return XVectorOutput(
        loss=loss,
        logits=logits,
        embeddings=output_embeddings,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )
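
最后给出一个说话人验证的用法示意(假设使用 microsoft/unispeech-sat-base-plus-sv 检查点;音频为随机假数据,仅演示流程):提取两段语音的 x-vector 嵌入后比较余弦相似度:

```python
import torch
from transformers import AutoFeatureExtractor, UniSpeechSatForXVector

# 假设使用 microsoft/unispeech-sat-base-plus-sv 检查点
feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/unispeech-sat-base-plus-sv")
model = UniSpeechSatForXVector.from_pretrained("microsoft/unispeech-sat-base-plus-sv")

wave1 = torch.randn(16000).numpy()
wave2 = torch.randn(16000).numpy()
inputs = feature_extractor([wave1, wave2], sampling_rate=16000, return_tensors="pt", padding=True)

with torch.no_grad():
    embeddings = model(**inputs).embeddings

# 比较两条语音的 x-vector 余弦相似度,超过设定阈值即可判定为同一说话人
embeddings = torch.nn.functional.normalize(embeddings, dim=-1)
similarity = torch.cosine_similarity(embeddings[0], embeddings[1], dim=-1)
print(similarity)
```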