Transformers-Source-Code-Analysis-130

Transformers Source Code Analysis (130)

.\models\yolos\__init__.py

# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

# Import the lazy-import helpers and dependency checks from utils
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available

# Base import structure: the configuration module is always importable
_import_structure = {"configuration_yolos": ["YOLOS_PRETRAINED_CONFIG_ARCHIVE_MAP", "YolosConfig", "YolosOnnxConfig"]}

# Check whether the vision dependencies are available; if not, OptionalDependencyNotAvailable is raised
try:
    if not is_vision_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # Vision is available: add feature_extraction_yolos and image_processing_yolos to the import structure
    _import_structure["feature_extraction_yolos"] = ["YolosFeatureExtractor"]
    _import_structure["image_processing_yolos"] = ["YolosImageProcessor"]

# Check whether torch is available; if not, OptionalDependencyNotAvailable is raised
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # Torch is available: add modeling_yolos to the import structure
    _import_structure["modeling_yolos"] = [
        "YOLOS_PRETRAINED_MODEL_ARCHIVE_LIST",
        "YolosForObjectDetection",
        "YolosModel",
        "YolosPreTrainedModel",
    ]

# Under TYPE_CHECKING, perform the real imports so type checkers and IDEs see the symbols
if TYPE_CHECKING:
    # Import the configuration classes and constants
    from .configuration_yolos import YOLOS_PRETRAINED_CONFIG_ARCHIVE_MAP, YolosConfig, YolosOnnxConfig

    # Re-check the vision dependencies
    try:
        if not is_vision_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # Vision is available: import the feature extractor and image processor classes
        from .feature_extraction_yolos import YolosFeatureExtractor
        from .image_processing_yolos import YolosImageProcessor

    # Re-check the torch dependency
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # Torch is available: import the modeling classes and constants
        # (note: the archive list is YOLOS_..., not YOSO_..., matching the structure declared above)
        from .modeling_yolos import (
            YOLOS_PRETRAINED_MODEL_ARCHIVE_LIST,
            YolosForObjectDetection,
            YolosModel,
            YolosPreTrainedModel,
        )

# At runtime, replace this module with a lazy module that defers the actual imports
else:
    import sys

    # _LazyModule performs the import of each submodule on first attribute access
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
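
A quick sketch of what this lazy-module pattern buys the caller (an illustration, not part of the source; it assumes `transformers` is installed): importing the package is cheap, and the heavy submodules are only imported on first attribute access.

```python
# Importing the package does not import modeling_yolos (and hence the torch models) yet;
# _LazyModule resolves each attribute on first access.
from transformers.models import yolos

config = yolos.YolosConfig()  # first access triggers the import of configuration_yolos
print(type(config).__name__)  # "YolosConfig"
```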

.\models\yoso\configuration_yoso.py

# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" YOSO model configuration"""

# Import the configuration base class and the logging utilities
from ...configuration_utils import PretrainedConfig
from ...utils import logging

# Module-level logger for this file
logger = logging.get_logger(__name__)

# Map from model name to the URL of its configuration file
YOSO_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "uw-madison/yoso-4096": "https://huggingface.co/uw-madison/yoso-4096/resolve/main/config.json",
    # See all YOSO models at https://huggingface.co/models?filter=yoso
}


# Configuration class for the YOSO model
class YosoConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`YosoModel`]. It is used to instantiate a YOSO
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the YOSO
    [uw-madison/yoso-4096](https://huggingface.co/uw-madison/yoso-4096) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Example:

    ```
    >>> from transformers import YosoConfig, YosoModel

    >>> # Initializing a YOSO uw-madison/yoso-4096 style configuration
    >>> configuration = YosoConfig()

    >>> # Initializing a model (with random weights) from the uw-madison/yoso-4096 style configuration
    >>> model = YosoModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    # Model type identifier used by the auto classes
    model_type = "yoso"

    # Constructor; the defaults match the uw-madison/yoso-4096 checkpoint
    def __init__(
        self,
        vocab_size=50265,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=4096,
        type_vocab_size=1,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        position_embedding_type="absolute",
        use_expectation=True,
        hash_code_len=9,
        num_hash=64,
        conv_window=None,
        use_fast_hash=True,
        lsh_backward=True,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        **kwargs,
    ):
        # Call the parent constructor with the special token IDs and any remaining kwargs
        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

        # Vocabulary size
        self.vocab_size = vocab_size
        # Maximum sequence length the position embeddings can handle
        self.max_position_embeddings = max_position_embeddings
        # Hidden layer size
        self.hidden_size = hidden_size
        # Number of hidden layers
        self.num_hidden_layers = num_hidden_layers
        # Number of attention heads
        self.num_attention_heads = num_attention_heads
        # Intermediate (feed-forward) layer size
        self.intermediate_size = intermediate_size
        # Activation function of the hidden layers
        self.hidden_act = hidden_act
        # Dropout probability for the hidden layers
        self.hidden_dropout_prob = hidden_dropout_prob
        # Dropout probability for the attention probabilities
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        # Standard deviation used to initialize the weights
        self.initializer_range = initializer_range
        # Size of the token type vocabulary
        self.type_vocab_size = type_vocab_size
        # Epsilon used by the layer normalization layers
        self.layer_norm_eps = layer_norm_eps
        # Type of position embedding ("absolute" by default)
        self.position_embedding_type = position_embedding_type
        # Whether to use the closed-form expectation instead of LSH sampling
        self.use_expectation = use_expectation
        # Length of the hash codes, in bits
        self.hash_code_len = hash_code_len
        # Number of hash functions
        self.num_hash = num_hash
        # Convolution window size (None disables the convolution)
        self.conv_window = conv_window
        # Whether to use the fast CUDA hashing kernel
        self.use_fast_hash = use_fast_hash
        # Whether to use LSH in the backward pass
        self.lsh_backward = lsh_backward
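
A short illustration (not from the source) of overriding the YOSO-specific knobs when building a configuration:

```python
from transformers import YosoConfig

# Defaults match uw-madison/yoso-4096; only the LSH-related fields are changed here
config = YosoConfig(hash_code_len=8, num_hash=32, use_expectation=False)
print(config.hash_code_len, config.num_hash, config.use_expectation)  # 8 32 False
```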

.\models\yoso\convert_yoso_pytorch_to_pytorch.py

import argparse

import torch

from transformers import YosoConfig, YosoForMaskedLM


# Map an original checkpoint key name to the corresponding key name in the converted model
def rename_key(orig_key):
    # Strip the "model." prefix
    if "model" in orig_key:
        orig_key = orig_key.replace("model.", "")
    # Replace "norm1" with "attention.output.LayerNorm"
    if "norm1" in orig_key:
        orig_key = orig_key.replace("norm1", "attention.output.LayerNorm")
    # Replace "norm2" with "output.LayerNorm"
    if "norm2" in orig_key:
        orig_key = orig_key.replace("norm2", "output.LayerNorm")
    # Replace a bare "norm" with "LayerNorm"
    if "norm" in orig_key:
        orig_key = orig_key.replace("norm", "LayerNorm")
    # Replace "transformer_<n>" with "encoder.layer.<n>"
    if "transformer" in orig_key:
        layer_num = orig_key.split(".")[0].split("_")[-1]
        orig_key = orig_key.replace(f"transformer_{layer_num}", f"encoder.layer.{layer_num}")
    # Replace "mha.attn" with "attention.self"
    if "mha.attn" in orig_key:
        orig_key = orig_key.replace("mha.attn", "attention.self")
    # Replace "mha" with "attention"
    if "mha" in orig_key:
        orig_key = orig_key.replace("mha", "attention")
    # Replace "W_q" with "self.query"
    if "W_q" in orig_key:
        orig_key = orig_key.replace("W_q", "self.query")
    # Replace "W_k" with "self.key"
    if "W_k" in orig_key:
        orig_key = orig_key.replace("W_k", "self.key")
    # Replace "W_v" with "self.value"
    if "W_v" in orig_key:
        orig_key = orig_key.replace("W_v", "self.value")
    # Replace "ff1" with "intermediate.dense"
    if "ff1" in orig_key:
        orig_key = orig_key.replace("ff1", "intermediate.dense")
    # Replace "ff2" with "output.dense"
    if "ff2" in orig_key:
        orig_key = orig_key.replace("ff2", "output.dense")
    # Replace a bare "ff" with "output.dense"
    if "ff" in orig_key:
        orig_key = orig_key.replace("ff", "output.dense")
    # Replace "mlm.mlm_class" with "cls.predictions.decoder"
    if "mlm_class" in orig_key:
        orig_key = orig_key.replace("mlm.mlm_class", "cls.predictions.decoder")
    # Replace "mlm" with "cls.predictions.transform"
    if "mlm" in orig_key:
        orig_key = orig_key.replace("mlm", "cls.predictions.transform")
    # Prefix everything outside the MLM head with "yoso."
    if "cls" not in orig_key:
        orig_key = "yoso." + orig_key

    return orig_key
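
To make the rule ordering concrete, here is an illustrative trace through `rename_key` (the original key name is hypothetical but follows the checkpoint's naming scheme):

```python
# "model." is stripped, "transformer_0" -> "encoder.layer.0", "mha" -> "attention",
# "W_q" -> "self.query", and the "yoso." prefix is added because "cls" is absent.
print(rename_key("model.transformer_0.mha.W_q.weight"))
# yoso.encoder.layer.0.attention.self.query.weight
```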


# Convert the original state dict so its keys and buffers match the YOSO model
def convert_checkpoint_helper(max_position_embeddings, orig_state_dict):
    # Iterate over a copy of the keys so the dict can be mutated while looping
    for key in orig_state_dict.copy().keys():
        # Pop the value for this key
        val = orig_state_dict.pop(key)

        # Skip keys containing "pooler" or "sen_class"; they have no counterpart in the converted model
        if ("pooler" in key) or ("sen_class" in key):
            continue
        else:
            # Re-insert the value under the renamed key
            orig_state_dict[rename_key(key)] = val

    # The decoder bias doubles as "cls.predictions.bias"
    orig_state_dict["cls.predictions.bias"] = orig_state_dict["cls.predictions.decoder.bias"]
    # Position IDs start at 2, matching the offset used by the embedding layer
    orig_state_dict["yoso.embeddings.position_ids"] = torch.arange(max_position_embeddings).expand((1, -1)) + 2

    return orig_state_dict


def convert_yoso_checkpoint(checkpoint_path, yoso_config_file, pytorch_dump_path):
    # Load the checkpoint on CPU and keep only its "model_state_dict" entry
    orig_state_dict = torch.load(checkpoint_path, map_location="cpu")["model_state_dict"]

    # Load the YOSO model configuration from the JSON file
    config = YosoConfig.from_json_file(yoso_config_file)

    # Instantiate a YosoForMaskedLM model from the configuration
    model = YosoForMaskedLM(config)

    # Convert the original state dict with the helper defined above
    new_state_dict = convert_checkpoint_helper(config.max_position_embeddings, orig_state_dict)

    # Load the converted state dict and print the report of missing/unexpected keys
    print(model.load_state_dict(new_state_dict))

    # Put the model in evaluation mode (disables dropout)
    model.eval()

    # Save the converted model to the output path
    model.save_pretrained(pytorch_dump_path)

    # Report success and the save location
    print(f"Checkpoint successfully converted. Model saved at {pytorch_dump_path}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--pytorch_model_path", default=None, type=str, required=True, help="Path to YOSO pytorch checkpoint."
    )
    parser.add_argument(
        "--config_file",
        default=None,
        type=str,
        required=True,
        help="The json file for YOSO model config.",
    )
    parser.add_argument(
        "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
    )

    # Parse the command-line arguments
    args = parser.parse_args()

    # Run the conversion
    convert_yoso_checkpoint(args.pytorch_model_path, args.config_file, args.pytorch_dump_path)

.\models\yoso\modeling_yoso.py

# coding=utf-8
# Copyright 2022 University of Wisconsin-Madison and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch YOSO model."""

import math
from pathlib import Path
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import (
    BaseModelOutputWithCrossAttentions,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_ninja_available,
    is_torch_cuda_available,
    logging,
)
from .configuration_yoso import YosoConfig

# Module-level logger for this file
logger = logging.get_logger(__name__)

# Checkpoint and configuration referenced in the generated docstrings
_CHECKPOINT_FOR_DOC = "uw-madison/yoso-4096"
_CONFIG_FOR_DOC = "YosoConfig"

# List of pretrained YOSO checkpoints
YOSO_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "uw-madison/yoso-4096",
    # See all YOSO models at https://huggingface.co/models?filter=yoso
]

# Global that holds the lazily loaded CUDA kernel module
lsh_cumulation = None


def load_cuda_kernels():
    """
    Load the custom CUDA kernels.
    """
    global lsh_cumulation
    from torch.utils.cpp_extension import load

    def append_root(files):
        # Resolve the path to the kernel source folder
        src_folder = Path(__file__).resolve().parent.parent.parent / "kernels" / "yoso"
        return [src_folder / file for file in files]

    # CUDA kernel source files to compile
    src_files = append_root(["fast_lsh_cumulation_torch.cpp", "fast_lsh_cumulation.cu", "fast_lsh_cumulation_cuda.cu"])

    # Compile and load the extension under the name "fast_lsh_cumulation"
    load("fast_lsh_cumulation", src_files, verbose=True)

    # Import the compiled extension and store it in the global lsh_cumulation
    import fast_lsh_cumulation as lsh_cumulation


def to_contiguous(input_tensors):
    """
    Ensure the input tensor(s) are contiguous in memory.
    """
    if isinstance(input_tensors, list):
        # For a list, make each tensor contiguous
        out = []
        for tensor in input_tensors:
            if not tensor.is_contiguous():
                tensor = tensor.contiguous()
            out.append(tensor)
        return out
    else:
        # For a single tensor, make it contiguous
        if not input_tensors.is_contiguous():
            input_tensors = input_tensors.contiguous()
        return input_tensors


def normalize(input_tensors):
    """
    L2-normalize the input tensor(s) along the last dimension.
    """
    if isinstance(input_tensors, list):
        # For a list, normalize each tensor
        out = []
        for tensor in input_tensors:
            out.append(nn.functional.normalize(tensor, p=2, dim=-1))
        return out
    else:
        # For a single tensor, apply L2 normalization along the last dimension (dim=-1)
        return nn.functional.normalize(input_tensors, p=2, dim=-1)


# Compute integer hash codes for queries and keys via random hyperplane projections
def hashing(query, key, num_hash, hash_len):
    # The query tensor must be 3-dimensional
    if len(query.size()) != 3:
        raise ValueError("Query has incorrect size.")
    # The key tensor must be 3-dimensional
    if len(key.size()) != 3:
        raise ValueError("Key has incorrect size.")

    # Random projection matrix of shape (batch, head_dim, num_hash * hash_len)
    rmat = torch.randn(query.size(0), query.size(2), num_hash * hash_len, device=query.device)
    # Powers of two used to pack the sign bits into integer hash codes
    raise_pow = 2 ** torch.arange(hash_len, device=query.device)

    # Project the queries and reshape to (batch, seq_len, num_hash, hash_len)
    query_projection = torch.matmul(query, rmat).reshape(query.size(0), query.size(1), num_hash, hash_len)
    # Project the keys and reshape to (batch, seq_len, num_hash, hash_len)
    key_projection = torch.matmul(key, rmat).reshape(key.size(0), key.size(1), num_hash, hash_len)
    # Binarize the query projections by sign
    query_binary = (query_projection > 0).int()
    # Binarize the key projections by sign
    key_binary = (key_projection > 0).int()
    # Pack the query bits into integer hash codes
    query_hash = torch.sum(query_binary * raise_pow, dim=-1)
    # Pack the key bits into integer hash codes
    key_hash = torch.sum(key_binary * raise_pow, dim=-1)

    return query_hash.int(), key_hash.int()
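
A minimal shape check for the `hashing` helper above (random inputs; unit-normalized vectors, as the model uses):

```python
import torch

batch, seq, dim, num_hash, hash_len = 2, 16, 64, 4, 9
q = torch.nn.functional.normalize(torch.randn(batch, seq, dim), dim=-1)
k = torch.nn.functional.normalize(torch.randn(batch, seq, dim), dim=-1)
q_hash, k_hash = hashing(q, k, num_hash, hash_len)
print(q_hash.shape)  # torch.Size([2, 16, 4]); each entry is an integer in [0, 2**9 - 1]
```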


class YosoCumulation(torch.autograd.Function):
    @staticmethod
    def forward(ctx, query_mask, key_mask, query, key, value, config):
        # Forward pass: compute the expected attention weights and accumulate the values
        hash_code_len = config["hash_code_len"]

        # The expectation is the hash-collision probability, a function of the
        # query/key cosine similarity: (1 - arccos(q.k) / pi) ** hash_code_len
        expectation = (1 - torch.acos(torch.matmul(query, key.transpose(-1, -2))) / math.pi) ** hash_code_len
        # Zero out masked query and key positions
        expectation = expectation * query_mask[:, :, None] * key_mask[:, None, :]
        # Accumulate the values weighted by the expectation
        cumulation_value = torch.matmul(expectation, value)

        # Save the tensors needed for the backward pass
        ctx.save_for_backward(query_mask, key_mask, expectation, query, key, value)
        ctx.config = config

        return cumulation_value

    @staticmethod
    def backward(ctx, grad):
        # Backward pass: compute gradients with respect to query, key and value
        grad = to_contiguous(grad)

        # Restore the saved tensors
        query_mask, key_mask, expectation, query, key, value = ctx.saved_tensors
        config = ctx.config

        # Hash code length from the config
        hash_code_len = config["hash_code_len"]

        # Weight the incoming gradient by the expectation
        weighted_exp = torch.matmul(grad, value.transpose(-1, -2)) * expectation
        # Gradient with respect to the queries
        grad_query = torch.matmul(weighted_exp, (hash_code_len / 2) * key)
        # Gradient with respect to the keys
        grad_key = torch.matmul(weighted_exp.transpose(-1, -2), (hash_code_len / 2) * query)
        # Gradient with respect to the values
        grad_value = torch.matmul(expectation.transpose(-1, -2), grad)

        # The first two Nones correspond to query_mask and key_mask; the last one to config
        return None, None, grad_query, grad_key, grad_value, None
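
The expectation used in `YosoCumulation.forward` is the probability that two sign hashes of length `hash_code_len` collide. A small Monte-Carlo check of that identity (a standalone sketch, independent of the classes above):

```python
import math

import torch

torch.manual_seed(0)
dim, hash_code_len, num_hash = 64, 9, 20_000
q = torch.nn.functional.normalize(torch.randn(dim), dim=-1)
k = torch.nn.functional.normalize(q + 0.2 * torch.randn(dim), dim=-1)  # correlated with q

# Closed form used in forward(): (1 - arccos(q.k) / pi) ** hash_code_len
closed_form = (1 - torch.acos(q @ k) / math.pi) ** hash_code_len

# Empirical collision rate over many independent random-hyperplane hashes
rmat = torch.randn(dim, num_hash * hash_code_len)
q_bits = (q @ rmat).reshape(num_hash, hash_code_len) > 0
k_bits = (k @ rmat).reshape(num_hash, hash_code_len) > 0
empirical = (q_bits == k_bits).all(dim=-1).float().mean()
print(f"closed form: {closed_form:.4f}  empirical: {empirical:.4f}")  # the two agree closely
```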


class YosoLSHCumulation(torch.autograd.Function):
    @staticmethod
    def forward(ctx, query_mask, key_mask, query, key, value, config):
        # Forward pass: approximate attention between queries and keys via LSH cumulation
        # Validate that the input tensor dimensions match
        if query_mask.size(0) != key_mask.size(0):
            raise ValueError("Query mask and Key mask differ in sizes in dimension 0")
        if query_mask.size(0) != query.size(0):
            raise ValueError("Query mask and Query differ in sizes in dimension 0")
        if query_mask.size(0) != key.size(0):
            raise ValueError("Query mask and Key differ in sizes in dimension 0")
        if query_mask.size(0) != value.size(0):
            raise ValueError("Query mask and Value mask differ in sizes in dimension 0")
        if key.size(1) != value.size(1):
            raise ValueError("Key and Value differ in sizes in dimension 1")
        if query.size(2) != key.size(2):
            raise ValueError("Query and Key differ in sizes in dimension 2")

        # Make all inputs contiguous in memory
        query_mask, key_mask, query, key, value = to_contiguous([query_mask, key_mask, query, key, value])

        # Whether to run on CUDA
        use_cuda = query_mask.is_cuda
        # Number of hash functions and hash code length from the config
        num_hash = config["num_hash"]
        hash_code_len = config["hash_code_len"]
        # Capacity of the hash table
        hashtable_capacity = int(2**hash_code_len)

        # Choose between the fast CUDA hash and the plain PyTorch hash
        if config["use_fast_hash"]:
            # Fast path: compute hash codes with the compiled kernel
            query_hash_code, key_hash_code = lsh_cumulation.fast_hash(
                query_mask, query, key_mask, key, num_hash, hash_code_len, use_cuda, 1
            )
        else:
            # Slow path: compute hash codes with the pure PyTorch hashing helper
            query_hash_code, key_hash_code = hashing(query, key, num_hash, hash_code_len)

        # Accumulate the values through the LSH hash tables
        cumulation_value = lsh_cumulation.lsh_cumulation(
            query_mask, query_hash_code, key_mask, key_hash_code, value, hashtable_capacity, use_cuda, 1
        )

        # Save the tensors and config for the backward pass
        ctx.save_for_backward(query_mask, key_mask, query_hash_code, key_hash_code, query, key, value)
        ctx.config = config

        # Return the accumulated values
        return cumulation_value

    @staticmethod
    def backward(ctx, grad):
        # Backward pass: compute gradients with respect to query, key and value
        # Make the incoming gradient contiguous
        grad = to_contiguous(grad)

        # Restore the saved tensors and config
        query_mask, key_mask, query_hash_code, key_hash_code, query, key, value = ctx.saved_tensors
        config = ctx.config

        # Whether to run on CUDA
        use_cuda = grad.is_cuda
        # Hash code length from the config
        hash_code_len = config["hash_code_len"]
        # Capacity of the hash table
        hashtable_capacity = int(2**hash_code_len)

        # LSH-based backward pass, if enabled in the config
        if config["lsh_backward"]:
            # Gradient with respect to the values, via LSH cumulation
            grad_value = lsh_cumulation.lsh_cumulation(
                key_mask, key_hash_code, query_mask, query_hash_code, grad, hashtable_capacity, use_cuda, 1
            )
            # Gradient with respect to the queries, via weighted LSH cumulation
            grad_query = lsh_cumulation.lsh_weighted_cumulation(
                query_mask,
                query_hash_code,
                grad,
                key_mask,
                key_hash_code,
                value,
                (hash_code_len / 2) * key,
                hashtable_capacity,
                use_cuda,
                4,
            )
            # Gradient with respect to the keys, via weighted LSH cumulation
            grad_key = lsh_cumulation.lsh_weighted_cumulation(
                key_mask,
                key_hash_code,
                value,
                query_mask,
                query_hash_code,
                grad,
                (hash_code_len / 2) * query,
                hashtable_capacity,
                use_cuda,
                4,
            )
        else:
            # Exact expectation-based backward pass (same math as YosoCumulation)
            expectation = (1 - torch.acos(torch.matmul(query, key.transpose(-1, -2))) / math.pi) ** hash_code_len
            expectation = expectation * query_mask[:, :, None] * key_mask[:, None, :]
            weighted_exp = torch.matmul(grad, value.transpose(-1, -2)) * expectation
            grad_query = torch.matmul(weighted_exp, (hash_code_len / 2) * key)
            grad_key = torch.matmul(weighted_exp.transpose(-1, -2), (hash_code_len / 2) * query)
            grad_value = torch.matmul(expectation.transpose(-1, -2), grad)

        # The Nones correspond to inputs that need no gradient (the masks and the config)
        return None, None, grad_query, grad_key, grad_value, None


# Copied from transformers.models.nystromformer.modeling_nystromformer.NystromformerEmbeddings
class YosoEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config):
        super().__init__()

        # Word embeddings over the vocabulary, with the pad token as padding index
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)

        # Position embeddings; the table is two entries larger to leave room for the offset
        self.position_embeddings = nn.Embedding(config.max_position_embeddings + 2, config.hidden_size)

        # Token type (segment) embeddings
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable names
        # and be able to load any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        # Dropout on the embedding output
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # Non-persistent buffer of position IDs, offset by 2
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)) + 2, persistent=False
        )

        # Position embedding type, "absolute" by default
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

        # Non-persistent buffer of all-zero token type IDs, same shape and device as position_ids
        self.register_buffer(
            "token_type_ids",
            torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device),
            persistent=False,
        )
    def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
        # Derive the input shape from input_ids if given, otherwise from inputs_embeds
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        # Sequence length
        seq_length = input_shape[1]

        # Default position_ids: a slice of the registered buffer
        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        # Default token_type_ids: slice the registered buffer if present, else zeros
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        # Look up word embeddings if inputs_embeds was not provided
        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        # Look up the token type embeddings
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        # Sum the word and token type embeddings
        embeddings = inputs_embeds + token_type_embeddings

        # Add absolute position embeddings if configured
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings

        # Layer normalization
        embeddings = self.LayerNorm(embeddings)

        # Dropout
        embeddings = self.dropout(embeddings)

        return embeddings
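
A tiny sketch of the position-id offset used above (the `+ 2` appears to mirror RoBERTa-style checkpoints, where ids 0 and 1 are reserved, which is why the table is sized `max_position_embeddings + 2`):

```python
import torch

max_position_embeddings = 8
position_ids = torch.arange(max_position_embeddings).expand((1, -1)) + 2
print(position_ids)  # tensor([[2, 3, 4, 5, 6, 7, 8, 9]])
```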


class YosoSelfAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        # hidden_size must be divisible by num_attention_heads (unless the config defines embedding_size)
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )
        # Load the CUDA kernels once, if CUDA and ninja are available and they are not loaded yet
        kernel_loaded = lsh_cumulation is not None
        if is_torch_cuda_available() and is_ninja_available() and not kernel_loaded:
            try:
                load_cuda_kernels()
            except Exception as e:
                # Log a warning on failure; note the message text was copied from the
                # deformable-attention code, but these are the YOSO LSH kernels
                logger.warning(f"Could not load the custom kernel for multi-scale deformable attention: {e}")

        # Attention geometry derived from the config
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # Linear projections for query, key and value
        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        # Dropout applied to the attention probabilities
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        # Position embedding type: the argument overrides the config value if given
        self.position_embedding_type = (
            position_embedding_type if position_embedding_type is not None else config.position_embedding_type
        )

        # Whether to use the closed-form expectation instead of LSH sampling
        self.use_expectation = config.use_expectation
        # Hash code length, in bits
        self.hash_code_len = config.hash_code_len
        # Whether to apply a depthwise convolution alongside attention
        self.use_conv = config.conv_window is not None
        # Whether to use the fast CUDA hash
        self.use_fast_hash = config.use_fast_hash
        # Number of hash functions
        self.num_hash = config.num_hash
        # Whether to use LSH in the backward pass
        self.lsh_backward = config.lsh_backward

        # LSH configuration dictionary passed to the autograd functions
        self.lsh_config = {
            "hash_code_len": self.hash_code_len,
            "use_fast_hash": self.use_fast_hash,
            "num_hash": self.num_hash,
            "lsh_backward": self.lsh_backward,
        }

        # Depthwise convolution over the value sequence, if a window size is configured
        if config.conv_window is not None:
            self.conv = nn.Conv2d(
                in_channels=config.num_attention_heads,
                out_channels=config.num_attention_heads,
                kernel_size=(config.conv_window, 1),
                padding=(config.conv_window // 2, 0),
                bias=False,
                groups=config.num_attention_heads,
            )

    def transpose_for_scores(self, layer):
        # Reshape (batch, seq_len, all_head_size) into (batch, seq_len, num_heads, head_size)
        new_layer_shape = layer.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        layer = layer.view(*new_layer_shape)
        # Permute to (batch, num_heads, seq_len, head_size) for attention score computation
        return layer.permute(0, 2, 1, 3)
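
A shape walk-through of `transpose_for_scores` with illustrative numbers:

```python
import torch

batch, seq, num_heads, head_size = 2, 16, 12, 64
layer = torch.randn(batch, seq, num_heads * head_size)  # (batch, seq_len, all_head_size)
layer = layer.view(batch, seq, num_heads, head_size).permute(0, 2, 1, 3)
print(layer.shape)  # torch.Size([2, 12, 16, 64]) -- (batch, num_heads, seq_len, head_size)
```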


# Copied from transformers.models.bert.modeling_bert.BertSelfOutput
class YosoSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Dense projection back to hidden_size
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # Layer normalization applied to the residual sum
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # Dropout on the projected hidden states
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # Project the attention output
        hidden_states = self.dense(hidden_states)
        # Apply dropout
        hidden_states = self.dropout(hidden_states)
        # Residual connection followed by layer normalization
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class YosoAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        # Self-attention block
        self.self = YosoSelfAttention(config, position_embedding_type=position_embedding_type)
        # Output projection block
        self.output = YosoSelfOutput(config)
        # Indices of attention heads that have been pruned
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        # Find the prunable heads and the parameter indices to keep
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune the query, key, value and output projections
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update the hyperparameters and record the pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(self, hidden_states, attention_mask=None, output_attentions=False):
        # Run self-attention
        self_outputs = self.self(hidden_states, attention_mask, output_attentions)
        # Project the attention output and add the residual connection
        attention_output = self.output(self_outputs[0], hidden_states)
        # Append the attention weights if they were requested
        outputs = (attention_output,) + self_outputs[1:]
        return outputs


# Copied from transformers.models.bert.modeling_bert.BertIntermediate
class YosoIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Dense projection from hidden_size to intermediate_size
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # Resolve the activation function from its name if given as a string
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Project, then apply the activation
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


# Copied from transformers.models.bert.modeling_bert.BertOutput
class YosoOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Dense projection from intermediate_size back to hidden_size
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        # Layer normalization applied to the residual sum
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # Dropout on the projected hidden states
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # Project, apply dropout, then add the residual and normalize
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class YosoLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Chunk size for the chunked feed-forward pass
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        # Dimension holding the sequence length (1)
        self.seq_len_dim = 1
        # Self-attention block
        self.attention = YosoAttention(config)
        # Whether cross-attention is added (unused by YOSO)
        self.add_cross_attention = config.add_cross_attention
        # Feed-forward intermediate and output blocks
        self.intermediate = YosoIntermediate(config)
        self.output = YosoOutput(config)

    def forward(self, hidden_states, attention_mask=None, output_attentions=False):
        # Run self-attention on the hidden states
        self_attention_outputs = self.attention(hidden_states, attention_mask, output_attentions=output_attentions)
        attention_output = self_attention_outputs[0]

        # Keep the attention weights if they were requested
        outputs = self_attention_outputs[1:]

        # Run the feed-forward block in chunks along the sequence dimension
        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        outputs = (layer_output,) + outputs

        return outputs

    def feed_forward_chunk(self, attention_output):
        # Intermediate projection and activation
        intermediate_output = self.intermediate(attention_output)
        # Output projection with residual connection
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output
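
What `apply_chunking_to_forward` does, in miniature (a sketch of the idea, not the library implementation): split the input along the sequence dimension, run the feed-forward on each chunk, and concatenate, trading peak memory for extra kernel launches.

```python
import torch

def feed_forward(x):
    return x * 2  # stand-in for intermediate + output

x = torch.randn(2, 16, 8)
chunks = [feed_forward(c) for c in x.chunk(4, dim=1)]  # chunk along seq_len_dim == 1
out = torch.cat(chunks, dim=1)
assert torch.allclose(out, feed_forward(x))  # position-wise ops give identical results
```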


class YosoEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        # Stack of YosoLayer modules, one per hidden layer
        self.layer = nn.ModuleList([YosoLayer(config) for _ in range(config.num_hidden_layers)])
        # Gradient checkpointing is off by default
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        # Collect hidden states and attentions only if requested
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        # Run each layer in turn
        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # Use gradient checkpointing during training if enabled
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    attention_mask,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(hidden_states, attention_mask, output_attentions)

            # The first element of the layer output is the new hidden states
            hidden_states = layer_outputs[0]
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        # Append the final hidden states
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        # Return a plain tuple of the non-None values if a dict was not requested
        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutputWithCrossAttentions(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


# Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform
class YosoPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Dense projection, hidden_size to hidden_size
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # Resolve the activation function from its name if given as a string
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            self.transform_act_fn = config.hidden_act
        # Layer normalization over hidden_size
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Project, activate, then normalize
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


# Copied from transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert->Yoso
class YosoLMPredictionHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Transform block applied before decoding
        self.transform = YosoPredictionHeadTransform(config)

        # Decoder projecting hidden states to vocabulary-sized logits, without its own bias
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # A learnable output-only bias, one entry per token
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

        # Link the two variables so the bias is correctly resized with resize_token_embeddings
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        # Transform the hidden states
        hidden_states = self.transform(hidden_states)

        # Decode to vocabulary-sized logits
        hidden_states = self.decoder(hidden_states)

        return hidden_states


# Copied from transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert->Yoso
class YosoOnlyMLMHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Masked-LM prediction head
        self.predictions = YosoLMPredictionHead(config)

    def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
        # Compute the vocabulary logits for each position
        prediction_scores = self.predictions(sequence_output)
        return prediction_scores


class YosoPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = YosoConfig
    base_model_prefix = "yoso"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            # Slightly different from the TF version, which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


YOSO_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`YosoConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

YOSO_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


@add_start_docstrings(
    "The bare YOSO Model transformer outputting raw hidden-states without any specific head on top.",
    YOSO_START_DOCSTRING,
)
class YosoModel(YosoPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config

        self.embeddings = YosoEmbeddings(config)  # Embedding layer for the inputs
        self.encoder = YosoEncoder(config)  # Encoder stack

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings  # Return the word embedding module

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value  # Replace the word embedding module

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)  # Prune heads in the selected layer
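
Head pruning can be exercised through the public `prune_heads` API inherited from `PreTrainedModel` (a hypothetical example on a small randomly initialized model):

```python
from transformers import YosoConfig, YosoModel

model = YosoModel(YosoConfig(num_hidden_layers=2))
model.prune_heads({0: [0, 1], 1: [2]})  # drop heads 0 and 1 in layer 0, head 2 in layer 1
print(model.encoder.layer[0].attention.self.num_attention_heads)  # 10
```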

@add_start_docstrings("""YOSO Model with a `language modeling` head on top.""", YOSO_START_DOCSTRING)
class YosoForMaskedLM(YosoPreTrainedModel):
    _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]

    def __init__(self, config):
        super().__init__(config)

        self.yoso = YosoModel(config)  # Base YOSO model
        self.cls = YosoOnlyMLMHead(config)  # Masked-LM head

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.cls.predictions.decoder  # Return the output decoder

    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings  # Replace the output decoder

    @add_start_docstrings_to_model_forward(YOSO_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,  # Input token IDs
        attention_mask: Optional[torch.Tensor] = None,  # Mask marking which tokens to attend to
        token_type_ids: Optional[torch.Tensor] = None,  # Segment IDs (sentence A/B)
        position_ids: Optional[torch.Tensor] = None,  # Position indices of the tokens
        head_mask: Optional[torch.Tensor] = None,  # Mask disabling individual attention heads
        inputs_embeds: Optional[torch.Tensor] = None,  # Precomputed input embeddings
        labels: Optional[torch.Tensor] = None,  # Labels for the MLM loss, shape (batch_size, sequence_length)
        output_attentions: Optional[bool] = None,  # Whether to return attention weights
        output_hidden_states: Optional[bool] = None,  # Whether to return all hidden states
        return_dict: Optional[bool] = None,  # Whether to return a ModelOutput instead of a tuple
    ) -> Union[Tuple, MaskedLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        """
        # Decide the output format from the argument or the config default
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.yoso(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )  # Run the base model

        sequence_output = outputs[0]  # Sequence of hidden states
        prediction_scores = self.cls(sequence_output)  # Vocabulary logits from the MLM head

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # Cross-entropy loss for masked language modeling
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[1:]  # Tuple output, with the loss prepended if present
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
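
An end-to-end fill-mask sketch with the pretrained checkpoint (assumes network access to the Hugging Face Hub):

```python
import torch

from transformers import AutoTokenizer, YosoForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("uw-madison/yoso-4096")
model = YosoForMaskedLM.from_pretrained("uw-madison/yoso-4096")

inputs = tokenizer("Paris is the <mask> of France.", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# Decode the highest-scoring token at the masked position
mask_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
print(tokenizer.decode(logits[0, mask_index].argmax(dim=-1)))
```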


class YosoClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config):
        super().__init__()
        # Dense projection, hidden_size to hidden_size
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # Dropout to reduce overfitting
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Output projection to the number of labels
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

        self.config = config

    def forward(self, features, **kwargs):
        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        x = self.dropout(x)
        x = self.dense(x)
        # Apply the activation function configured for the hidden layers
        x = ACT2FN[self.config.hidden_act](x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


@add_start_docstrings(
    """YOSO Model transformer with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks.""",
    YOSO_START_DOCSTRING,
)
class YosoForSequenceClassification(YosoPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        # Base YOSO model
        self.yoso = YosoModel(config)
        # Classification head
        self.classifier = YosoClassificationHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(YOSO_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        ) -> Union[Tuple, SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        # 如果 return_dict 不为 None,则使用该值;否则使用 self.config.use_return_dict
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 调用模型的前向传播方法 yoso,传入各种输入参数
        outputs = self.yoso(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 从模型输出中获取序列输出
        sequence_output = outputs[0]
        # 将序列输出传入分类器,得到 logits
        logits = self.classifier(sequence_output)

        # 初始化损失值
        loss = None
        # 如果给定了标签 labels,则计算相应的损失
        if labels is not None:
            # 如果问题类型未定义,则根据情况进行定义
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            # 根据问题类型选择对应的损失函数
            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    # 如果只有一个标签,使用损失函数计算损失
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                # 使用交叉熵损失函数计算损失
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                # 使用带 logits 的二元交叉熵损失函数计算损失
                loss = loss_fct(logits, labels)

        # 如果 return_dict 为 False,则返回 logits 和额外的输出项
        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        # 返回带有损失、logits、隐藏状态和注意力权重的 SequenceClassifierOutput 对象
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
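

# 【补充示例,非源码】下面给出 YosoForSequenceClassification 的最小用法示意;
# 其中检查点名 "uw-madison/yoso-4096" 与 num_labels=2 均为演示性假设。
from transformers import AutoTokenizer, YosoForSequenceClassification
import torch

tokenizer = AutoTokenizer.from_pretrained("uw-madison/yoso-4096")
model = YosoForSequenceClassification.from_pretrained("uw-madison/yoso-4096", num_labels=2)

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs, labels=torch.tensor([1]))  # 提供 labels 时会同时返回损失
print(outputs.loss, outputs.logits.shape)  # 标量损失与形状为 (1, 2) 的 logits
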
# 使用装饰器为类添加文档字符串,说明这是一个带有多选分类头部的 YOSO 模型
@add_start_docstrings(
    """YOSO Model with a multiple choice classification head on top (a linear layer on top of
    the pooled output and a softmax) e.g. for RocStories/SWAG tasks.""",
    YOSO_START_DOCSTRING,
)
# 定义YosoForMultipleChoice类,继承自YosoPreTrainedModel
class YosoForMultipleChoice(YosoPreTrainedModel):
    def __init__(self, config):
        # 调用父类的初始化方法
        super().__init__(config)

        # 初始化Yoso模型
        self.yoso = YosoModel(config)
        # 初始化预分类器,使用线性层将隐藏状态大小映射到相同的隐藏状态大小
        self.pre_classifier = nn.Linear(config.hidden_size, config.hidden_size)
        # 初始化分类器,使用线性层将隐藏状态大小映射到1,用于多选分类任务
        self.classifier = nn.Linear(config.hidden_size, 1)

        # 调用后处理初始化方法
        self.post_init()

    # 使用装饰器添加文档字符串描述forward方法的输入参数
    @add_start_docstrings_to_model_forward(YOSO_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
    # 使用装饰器添加代码示例和检查点等文档字符串,指定输出类型为MultipleChoiceModelOutput
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, MultipleChoiceModelOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        """
        # 确保返回的字典对象不为空,根据配置决定是否使用返回字典
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # 获取输入的选项数量,如果是通过 input_ids 计算得到的话
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        # 重新组织输入,将各种输入类型展平为二维张量,便于模型处理
        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        # 将处理后的输入传递给模型,获取模型的输出
        outputs = self.yoso(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 从模型输出中获取隐藏状态
        hidden_state = outputs[0]  # (bs * num_choices, seq_len, dim)
        # 从隐藏状态中提取池化输出,通常是第一个位置的隐藏状态
        pooled_output = hidden_state[:, 0]  # (bs * num_choices, dim)
        # 将池化输出传递给预分类器,进行进一步处理
        pooled_output = self.pre_classifier(pooled_output)  # (bs * num_choices, dim)
        # 使用 ReLU 激活函数处理池化输出
        pooled_output = nn.ReLU()(pooled_output)  # (bs * num_choices, dim)
        # 将处理后的池化输出传递给分类器,得到最终的 logits
        logits = self.classifier(pooled_output)

        # 重新调整 logits 的形状,使其与 labels 的形状匹配
        reshaped_logits = logits.view(-1, num_choices)

        # 计算损失,如果提供了 labels 的话
        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        # 根据 return_dict 决定返回的格式
        if not return_dict:
            # 如果不要求返回字典,则返回一个元组
            output = (reshaped_logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        # 如果要求返回字典,则返回一个 MultipleChoiceModelOutput 对象
        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
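

# 【补充示例,非源码】演示上面 forward 中多选输入的展平与还原逻辑,仅依赖 torch,
# 各形状数值为任意假设。
import torch

batch_size, num_choices, seq_len = 2, 4, 16
input_ids = torch.randint(0, 100, (batch_size, num_choices, seq_len))
# forward 内部先展平为 (batch_size * num_choices, seq_len) 再送入编码器:
flat_input_ids = input_ids.view(-1, input_ids.size(-1))
assert flat_input_ids.shape == (batch_size * num_choices, seq_len)
# 分类器对每个选项输出一个分数,最后还原成 (batch_size, num_choices) 供交叉熵使用:
per_choice_logits = torch.randn(batch_size * num_choices, 1)
reshaped_logits = per_choice_logits.view(-1, num_choices)
assert reshaped_logits.shape == (batch_size, num_choices)
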
# 使用装饰器为类添加文档字符串,说明这是一个带有标记分类头部的 YOSO 模型,
# 例如用于命名实体识别(NER)任务(在隐藏状态输出之上加一个线性层)。
@add_start_docstrings(
    """YOSO Model with a token classification head on top (a linear layer on top of
    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.""",
    YOSO_START_DOCSTRING,
)
class YosoForTokenClassification(YosoPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        # 初始化 YOSO 模型
        self.yoso = YosoModel(config)
        # Dropout 层
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # 线性分类器,将隐藏状态映射到标签数量的输出
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # 初始化权重并应用最终处理
        self.post_init()

    @add_start_docstrings_to_model_forward(YOSO_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    # 前向传播函数,接受多种输入参数并返回输出,用于模型推理和训练
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        # 如果 return_dict 为 None,则根据配置决定是否使用 return_dict
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 调用模型的 forward 方法进行预测
        outputs = self.yoso(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 从模型输出中取出序列输出
        sequence_output = outputs[0]

        # 对序列输出应用 dropout
        sequence_output = self.dropout(sequence_output)
        
        # 使用分类器进行分类得到 logits
        logits = self.classifier(sequence_output)

        # 初始化损失为 None
        loss = None
        if labels is not None:
            # 使用交叉熵损失函数
            loss_fct = CrossEntropyLoss()
            
            # 只保留激活部分的损失
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        # 如果不使用 return_dict,则返回 logits 和额外的输出
        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        # 使用 TokenClassifierOutput 封装并返回结果
        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
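

# 【补充示例,非源码】演示上面基于 attention_mask 的损失掩码技巧:padding 位置的
# 标签被替换为 CrossEntropyLoss 的 ignore_index(默认 -100),从而不计入损失。
import torch
from torch.nn import CrossEntropyLoss

num_labels = 3
logits = torch.randn(2, 5, num_labels)
labels = torch.randint(0, num_labels, (2, 5))
attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 1]])

loss_fct = CrossEntropyLoss()
active_loss = attention_mask.view(-1) == 1
active_labels = torch.where(
    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
)
loss = loss_fct(logits.view(-1, num_labels), active_labels)
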
# YOSO模型,用于支持像SQuAD这样的抽取式问答任务,具有一个用于计算“起始位置logits”和“结束位置logits”的线性分类头部。
# 该模型继承自YosoPreTrainedModel。
@add_start_docstrings(
    """YOSO Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).""",
    YOSO_START_DOCSTRING,
)
class YosoForQuestionAnswering(YosoPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        # 设置模型的标签数目为2
        config.num_labels = 2
        self.num_labels = config.num_labels

        # 初始化YOSO模型和用于问答输出的线性层
        self.yoso = YosoModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # 初始化权重并应用最终处理
        self.post_init()

    @add_start_docstrings_to_model_forward(YOSO_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=QuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        start_positions: Optional[torch.Tensor] = None,
        end_positions: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        # 如果 return_dict 为 None,则使用配置中的 use_return_dict 值
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 调用模型 yoso,传入各种输入参数,并返回输出
        outputs = self.yoso(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 从模型输出中取出序列输出(通常是 BERT 输出的第一个元素)
        sequence_output = outputs[0]

        # 将序列输出传入问答模型的输出层,得到起始和结束 logits
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # 如果 start_positions 和 end_positions 不为空,则计算损失
            # 如果在多 GPU 下训练,可能需要增加维度
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # 忽略超出模型输入的 start/end positions
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            # 使用交叉熵损失函数,忽略索引为 ignored_index 的位置
            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            # 如果 return_dict 为 False,则返回元组形式的输出
            output = (start_logits, end_logits) + outputs[1:]
            return ((total_loss,) + output) if total_loss is not None else output

        # 如果 return_dict 为 True,则返回 QuestionAnsweringModelOutput 类的实例
        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
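
# 【补充示例,非源码】由 start/end logits 贪心解码答案区间的示意(实际应用中还需
# 保证 start <= end,并把 token 下标映射回原文字符位置)。
import torch

start_logits = torch.randn(1, 10)
end_logits = torch.randn(1, 10)
start_index = int(torch.argmax(start_logits, dim=-1))
end_index = int(torch.argmax(end_logits, dim=-1))
print(start_index, end_index)  # 预测的答案起止 token 下标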

.\models\yoso\__init__.py

# 版权声明及许可证信息,指明此文件的版权及使用许可
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 从 typing 模块导入 TYPE_CHECKING 类型检查工具
from typing import TYPE_CHECKING

# 从 ...utils 中导入相关模块和函数
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available

# 定义模块的导入结构字典
_import_structure = {"configuration_yoso": ["YOSO_PRETRAINED_CONFIG_ARCHIVE_MAP", "YosoConfig"]}

# 尝试检查是否有 torch 可用,如果不可用则抛出 OptionalDependencyNotAvailable 异常
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果 torch 可用,则添加 modeling_yoso 模块到导入结构字典中
    _import_structure["modeling_yoso"] = [
        "YOSO_PRETRAINED_MODEL_ARCHIVE_LIST",
        "YosoForMaskedLM",
        "YosoForMultipleChoice",
        "YosoForQuestionAnswering",
        "YosoForSequenceClassification",
        "YosoForTokenClassification",
        "YosoLayer",
        "YosoModel",
        "YosoPreTrainedModel",
    ]

# 如果在类型检查模式下
if TYPE_CHECKING:
    # 从 .configuration_yoso 中导入 YOSO_PRETRAINED_CONFIG_ARCHIVE_MAP 和 YosoConfig 类
    from .configuration_yoso import YOSO_PRETRAINED_CONFIG_ARCHIVE_MAP, YosoConfig

    # 尝试检查是否有 torch 可用,如果不可用则抛出 OptionalDependencyNotAvailable 异常
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 如果 torch 可用,则从 .modeling_yoso 中导入相关模块和类
        from .modeling_yoso import (
            YOSO_PRETRAINED_MODEL_ARCHIVE_LIST,
            YosoForMaskedLM,
            YosoForMultipleChoice,
            YosoForQuestionAnswering,
            YosoForSequenceClassification,
            YosoForTokenClassification,
            YosoLayer,
            YosoModel,
            YosoPreTrainedModel,
        )

# 如果不是类型检查模式
else:
    # 导入 sys 模块
    import sys

    # 使用 _LazyModule 定义延迟加载的模块,并将其指定给当前模块
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
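
# 【补充示例,非源码】_LazyModule 的效果是“按需导入”:只有在真正访问属性时
# 才会执行对应子模块的导入(需已安装 transformers)。
from transformers.models.yoso import YosoConfig  # 此时才实际加载 configuration_yoso

config = YosoConfig()  # 使用默认超参数构造配置
print(type(config).__name__)  # "YosoConfig"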

.\models\__init__.py

# 导入模块和库

from . import (
    albert,  # 导入albert模块
    align,  # 导入align模块
    altclip,  # 导入altclip模块
    audio_spectrogram_transformer,  # 导入audio_spectrogram_transformer模块
    auto,  # 导入auto模块
    autoformer,  # 导入autoformer模块
    bark,  # 导入bark模块
    bart,  # 导入bart模块
    barthez,  # 导入barthez模块
    bartpho,  # 导入bartpho模块
    beit,  # 导入beit模块
    bert,  # 导入bert模块
    bert_generation,  # 导入bert_generation模块
    bert_japanese,  # 导入bert_japanese模块
    bertweet,  # 导入bertweet模块
    big_bird,  # 导入big_bird模块
    bigbird_pegasus,  # 导入bigbird_pegasus模块
    biogpt,  # 导入biogpt模块
    bit,  # 导入bit模块
    blenderbot,  # 导入blenderbot模块
    blenderbot_small,  # 导入blenderbot_small模块
    blip,  # 导入blip模块
    blip_2,  # 导入blip_2模块
    bloom,  # 导入bloom模块
    bridgetower,  # 导入bridgetower模块
    bros,  # 导入bros模块
    byt5,  # 导入byt5模块
    camembert,  # 导入camembert模块
    canine,  # 导入canine模块
    chinese_clip,  # 导入chinese_clip模块
    clap,  # 导入clap模块
    clip,  # 导入clip模块
    clipseg,  # 导入clipseg模块
    clvp,  # 导入clvp模块
    code_llama,  # 导入code_llama模块
    codegen,  # 导入codegen模块
    cohere,  # 导入cohere模块
    conditional_detr,  # 导入conditional_detr模块
    convbert,  # 导入convbert模块
    convnext,  # 导入convnext模块
    convnextv2,  # 导入convnextv2模块
    cpm,  # 导入cpm模块
    cpmant,  # 导入cpmant模块
    ctrl,  # 导入ctrl模块
    cvt,  # 导入cvt模块
    data2vec,  # 导入data2vec模块
    deberta,  # 导入deberta模块
    deberta_v2,  # 导入deberta_v2模块
    decision_transformer,  # 导入decision_transformer模块
    deformable_detr,  # 导入deformable_detr模块
    deit,  # 导入deit模块
    deprecated,  # 导入deprecated模块
    depth_anything,  # 导入depth_anything模块
    deta,  # 导入deta模块
    detr,  # 导入detr模块
    dialogpt,  # 导入dialogpt模块
    dinat,  # 导入dinat模块
    dinov2,  # 导入dinov2模块
    distilbert,  # 导入distilbert模块
    dit,  # 导入dit模块
    donut,  # 导入donut模块
    dpr,  # 导入dpr模块
    dpt,  # 导入dpt模块
    efficientformer,  # 导入efficientformer模块
    efficientnet,  # 导入efficientnet模块
    electra,  # 导入electra模块
    encodec,  # 导入encodec模块
    encoder_decoder,  # 导入encoder_decoder模块
    ernie,  # 导入ernie模块
    ernie_m,  # 导入ernie_m模块
    esm,  # 导入esm模块
    falcon,  # 导入falcon模块
    fastspeech2_conformer,  # 导入fastspeech2_conformer模块
    flaubert,  # 导入flaubert模块
    flava,  # 导入flava模块
    fnet,  # 导入fnet模块
    focalnet,  # 导入focalnet模块
    fsmt,  # 导入fsmt模块
    funnel,  # 导入funnel模块
    fuyu,  # 导入fuyu模块
    gemma,  # 导入gemma模块
    git,  # 导入git模块
    glpn,  # 导入glpn模块
    gpt2,  # 导入gpt2模块
    gpt_bigcode,  # 导入gpt_bigcode模块
    gpt_neo,  # 导入gpt_neo模块
    gpt_neox,  # 导入gpt_neox模块
    gpt_neox_japanese,  # 导入gpt_neox_japanese模块
    gpt_sw3,  # 导入gpt_sw3模块
    gptj,  # 导入gptj模块
    gptsan_japanese,  # 导入gptsan_japanese模块
    graphormer,  # 导入graphormer模块
    groupvit,  # 导入groupvit模块
    herbert,  # 导入herbert模块
    hubert,  # 导入hubert模块
    ibert,  # 导入ibert模块
    idefics,  # 导入idefics模块
    imagegpt,  # 导入imagegpt模块
    informer,  # 导入informer模块
    instructblip,  # 导入instructblip模块
    jukebox,  # 导入jukebox模块
    kosmos2,  # 导入kosmos2模块
    layoutlm,  # 导入layoutlm模块
    layoutlmv2,  # 导入layoutlmv2模块
    layoutlmv3,  # 导入layoutlmv3模块
    layoutxlm,  # 导入layoutxlm模块
    led,  # 导入led模块
    levit,  # 导入levit模块
    lilt,  # 导入lilt模块
    llama,  # 导入llama模块
    llava,  # 导入llava模块
    llava_next,  # 导入llava_next模块
    longformer,  # 导入longformer模块
    longt5,  # 导入longt5模块
    luke,  # 导入luke模块
    lxmert,  # 导入lxmert模块
    m2m_100,  # 导入m2m_100模块
    mamba,  # 导入mamba模块
    marian,  # 导入marian模块
    markuplm,  # 导入markuplm模块
    mask2former,  # 导入mask2former模块
    maskformer,  # 导入maskformer模块
    mbart,  # 导入mbart模块
    mbart50,  # 导入mbart50模块
    mega,  # 导入mega模块
    megatron_bert,  # 导入megatron_bert模块
    megatron_gpt2,  # 导入megatron_gpt2模块
    # ……(中间若干模块名在原文中缺失)……
    # 导入其余所有模型模块
    rembert,
    resnet,
    roberta,
    roberta_prelayernorm,
    roc_bert,
    roformer,
    rwkv,
    sam,
    seamless_m4t,
    seamless_m4t_v2,
    segformer,
    seggpt,
    sew,
    sew_d,
    siglip,
    speech_encoder_decoder,
    speech_to_text,
    speech_to_text_2,
    speecht5,
    splinter,
    squeezebert,
    stablelm,
    starcoder2,
    superpoint,
    swiftformer,
    swin,
    swin2sr,
    swinv2,
    switch_transformers,
    t5,
    table_transformer,
    tapas,
    time_series_transformer,
    timesformer,
    timm_backbone,
    trocr,
    tvlt,
    tvp,
    udop,
    umt5,
    unispeech,
    unispeech_sat,
    univnet,
    upernet,
    videomae,
    vilt,
    vipllava,
    vision_encoder_decoder,
    vision_text_dual_encoder,
    visual_bert,
    vit,
    vit_hybrid,
    vit_mae,
    vit_msn,
    vitdet,
    vitmatte,
    vits,
    vivit,
    wav2vec2,
    wav2vec2_bert,
    wav2vec2_conformer,
    wav2vec2_phoneme,
    wav2vec2_with_lm,
    wavlm,
    whisper,
    x_clip,
    xglm,
    xlm,
    xlm_prophetnet,
    xlm_roberta,
    xlm_roberta_xl,
    xlnet,
    xmod,
    yolos,
    yoso,
)

.\onnx\config.py

# 版权声明和版权信息
# 版权所有 © 2021 HuggingFace 团队。保留所有权利。
#
# 根据 Apache 许可证 2.0 版本(“许可证”)许可;
# 除非符合许可证的条款,否则您不得使用此文件。
# 您可以在以下网址获取许可证的副本:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意,否则软件按“原样”分发,
# 不提供任何明示或暗示的担保或条件。
# 有关详细信息,请参阅许可证。

import copy  # 导入 copy 模块
import dataclasses  # 导入 dataclasses 模块
import warnings  # 导入 warnings 模块
from abc import ABC, abstractmethod  # 从 abc 模块导入 ABC 抽象基类和 abstractmethod 装饰器
from collections import OrderedDict  # 从 collections 模块导入 OrderedDict 类
from typing import (  # 导入多个类型提示,包括 TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Mapping, Optional, Tuple, Union
    TYPE_CHECKING,
    Any,
    Callable,
    Dict,
    Iterable,
    List,
    Mapping,
    Optional,
    Tuple,
    Union,
)

import numpy as np  # 导入 numpy 库,并使用 np 别名
from packaging import version  # 从 packaging 模块导入 version 模块

from ..utils import TensorType, is_torch_available, is_vision_available, logging  # 从相对路径的 ..utils 模块导入多个函数和类
from .utils import (  # 从相对路径的 .utils 模块导入 ParameterFormat, compute_effective_axis_dimension, compute_serialized_parameters_size 函数
    ParameterFormat,
    compute_effective_axis_dimension,
    compute_serialized_parameters_size,
)

if TYPE_CHECKING:
    from ..configuration_utils import PretrainedConfig  # 如果 TYPE_CHECKING 为真,导入 PretrainedConfig 类
    from ..feature_extraction_utils import FeatureExtractionMixin  # 如果 TYPE_CHECKING 为真,导入 FeatureExtractionMixin 类
    from ..image_processing_utils import ImageProcessingMixin  # 如果 TYPE_CHECKING 为真,导入 ImageProcessingMixin 类
    from ..tokenization_utils_base import PreTrainedTokenizerBase  # 如果 TYPE_CHECKING 为真,导入 PreTrainedTokenizerBase 类

if is_vision_available():
    from PIL import Image  # 如果 is_vision_available() 返回真,导入 PIL 库中的 Image 类

logger = logging.get_logger(__name__)  # 获取当前模块的 logger 对象

DEFAULT_ONNX_OPSET = 11  # 设置默认的 ONNX 操作集版本号为 11

# 外部数据格式大小限制为 2 GB
EXTERNAL_DATA_FORMAT_SIZE_LIMIT = 2 * 1024 * 1024 * 1024


@dataclasses.dataclass
class PatchingSpec:
    """
    数据类,保存补丁规范。

    Args:
        o: 包含要打补丁的操作的模块 / 对象
        name: 要打补丁的操作的名称
        custom_op: 打补丁的自定义操作
        orig_op: 正在被打补丁的原始操作
        op_wrapper: 包装器(可选),包装原始操作和自定义操作。
            对于类或静态方法很有用。

    """

    o: Any
    name: str
    custom_op: Callable
    orig_op: Optional[Callable] = None
    op_wrapper: Optional[Callable] = None


class OnnxConfig(ABC):
    """
    ONNX 可导出模型的基类,描述通过 ONNX 格式导出模型的元数据。
    """

    default_fixed_batch = 2  # 默认固定批次大小为 2
    default_fixed_sequence = 8  # 默认固定序列长度为 8
    default_fixed_num_choices = 4  # 默认固定选择数量为 4
    torch_onnx_minimum_version = version.parse("1.8")  # Torch 的最小 ONNX 版本为 1.8
    # 定义一个类变量,映射不同任务到其标准输出格式的有序字典
    _tasks_to_common_outputs = {
        "causal-lm": OrderedDict({"logits": {0: "batch", 1: "sequence"}}),
        "default": OrderedDict({"last_hidden_state": {0: "batch", 1: "sequence"}}),
        "image-classification": OrderedDict({"logits": {0: "batch", 1: "sequence"}}),
        "image-segmentation": OrderedDict(
            {
                "logits": {0: "batch", 1: "sequence"},
                "pred_boxes": {0: "batch", 1: "sequence"},
                "pred_masks": {0: "batch", 1: "sequence"},
            }
        ),
        "masked-im": OrderedDict({"logits": {0: "batch", 1: "sequence"}}),
        "masked-lm": OrderedDict({"logits": {0: "batch", 1: "sequence"}}),
        "multiple-choice": OrderedDict({"logits": {0: "batch"}}),
        "object-detection": OrderedDict(
            {
                "logits": {0: "batch", 1: "sequence"},
                "pred_boxes": {0: "batch", 1: "sequence"},
            }
        ),
        "question-answering": OrderedDict(
            {
                "start_logits": {0: "batch", 1: "sequence"},
                "end_logits": {0: "batch", 1: "sequence"},
            }
        ),
        "semantic-segmentation": OrderedDict({"logits": {0: "batch", 1: "num_labels", 2: "height", 3: "width"}}),
        "seq2seq-lm": OrderedDict({"logits": {0: "batch", 1: "decoder_sequence"}}),
        "sequence-classification": OrderedDict({"logits": {0: "batch"}}),
        "token-classification": OrderedDict({"logits": {0: "batch", 1: "sequence"}}),
        "vision2seq-lm": OrderedDict({"logits": {0: "batch", 1: "sequence"}}),
        "speech2seq-lm": OrderedDict({"logits": {0: "batch", 1: "sequence"}}),
    }

    # 类的构造函数,初始化对象时调用
    def __init__(self, config: "PretrainedConfig", task: str = "default", patching_specs: List[PatchingSpec] = None):
        self._config = config

        # 检查传入的任务是否在支持的任务列表中,如果不在则抛出异常
        if task not in self._tasks_to_common_outputs:
            raise ValueError(
                f"{task} is not a supported task, supported tasks: {self._tasks_to_common_outputs.keys()}"
            )
        self.task = task

        # 初始化对象的属性 _patching_specs,用于记录应用到对象上的补丁规格
        self._patching_specs = []
        # 遍历传入的 patching_specs 列表,如果不为空则逐个处理
        for spec in patching_specs if patching_specs is not None else []:
            final_spec = spec
            # 如果补丁规格中的原始操作为 None,则替换为 spec.o 上的 spec.name 属性的值
            if spec.orig_op is None:
                final_spec = dataclasses.replace(spec, orig_op=getattr(spec.o, spec.name))
            self._patching_specs.append(final_spec)

    # 类方法,根据模型配置生成一个 OnnxConfig 实例
    @classmethod
    def from_model_config(cls, config: "PretrainedConfig", task: str = "default") -> "OnnxConfig":
        """
        根据模型配置生成一个 OnnxConfig 实例

        Args:
            config: 导出到 ONNX 时使用的模型配置

        Returns:
            该模型的 OnnxConfig 实例
        """
        return cls(config, task=task)

    # 抽象属性,子类必须实现该属性
    @property
    @abstractmethod
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """
        Mapping containing the axis definition of the input tensors to provide to the model

        Returns:
            For each input: its name associated to the axes symbolic name and the axis position within the tensor
        """
        # 返回模型需要的输入张量的轴定义映射
        raise NotImplementedError()

    @property
    def outputs(self) -> Mapping[str, Mapping[int, str]]:
        """
        Mapping containing the axis definition of the output tensors to provide to the model

        Returns:
            For each output: its name associated to the axes symbolic name and the axis position within the tensor
        """
        # 获取当前任务对应的通用输出,并深拷贝返回
        common_outputs = self._tasks_to_common_outputs[self.task]
        return copy.deepcopy(common_outputs)

    @property
    def values_override(self) -> Optional[Mapping[str, Any]]:
        """
        Dictionary of keys to override in the model's config before exporting

        Returns:
            Dictionary with the keys (and their corresponding values) to override
        """
        # 如果模型配置对象有"use_cache"属性,则返回该属性为False的字典
        if hasattr(self._config, "use_cache"):
            return {"use_cache": False}

        # 否则返回None,表示无需覆盖任何配置项
        return None

    @property
    def default_batch_size(self) -> int:
        """
        The default batch size to use if no other indication

        Returns:
            Integer > 0
        """
        # 返回默认的批处理大小,避免ONNX对单个样本批处理的假设
        return OnnxConfig.default_fixed_batch

    @property
    def default_sequence_length(self) -> int:
        """
        The default sequence length to use if no other indication

        Returns:
            Integer > 0
        """
        # 返回默认的序列长度
        return OnnxConfig.default_fixed_sequence

    @property
    def default_num_choices(self) -> int:
        """
        The default number of choices to use if no other indication

        Returns:
            Integer > 0
        """
        # 返回默认的选择数量
        return OnnxConfig.default_fixed_num_choices

    @property
    def default_onnx_opset(self) -> int:
        """
        Which onnx opset to use when exporting the model

        Returns:
            Integer ONNX Opset version
        """
        # 返回导出模型时要使用的ONNX opset版本
        return DEFAULT_ONNX_OPSET

    @property
    def atol_for_validation(self) -> float:
        """
        What absolute tolerance value to use during model conversion validation.

        Returns:
            Float absolute tolerance value.
        """
        # 返回在模型转换验证期间使用的绝对容差值
        return 1e-5

    @property
    def is_torch_support_available(self) -> bool:
        """
        The minimum PyTorch version required to export the model.

        Returns:
            `bool`: Whether the installed version of PyTorch is compatible with the model.
        """
        # 检查是否安装了PyTorch,如果是,则检查版本是否达到要求的最小版本
        if is_torch_available():
            from transformers.utils import get_torch_version

            return version.parse(get_torch_version()) >= self.torch_onnx_minimum_version
        else:
            # 如果未安装PyTorch,则返回False
            return False
    @staticmethod
    def use_external_data_format(num_parameters: int) -> bool:
        """
        Flag indicating if the model requires using external data format

        Args:
            num_parameters: Number of parameters in the model

        Returns:
            True if the serialized parameter size in float32 >= 2Gb, False otherwise
        """

        return (
            compute_serialized_parameters_size(num_parameters, ParameterFormat.Float)
            >= EXTERNAL_DATA_FORMAT_SIZE_LIMIT
        )
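
    # 【补充说明,非源码】2GB 阈值的简单算术(假设序列化大小即“参数数 × 4 字节”):
    #   500_000_000 * 4 ≈ 2.0e9 字节 < 2 GiB,use_external_data_format 返回 False;
    #   600_000_000 * 4 ≈ 2.4e9 字节 > 2 GiB,use_external_data_format 返回 True。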

    def _generate_dummy_images(
        self, batch_size: int = 2, num_channels: int = 3, image_height: int = 40, image_width: int = 40
    ):
        """
        Generate dummy images as a list of PIL Image objects.

        Args:
            batch_size: Number of images to generate
            num_channels: Number of color channels per image
            image_height: Height of each image
            image_width: Width of each image

        Returns:
            List of PIL Image objects
        """
        images = []
        for _ in range(batch_size):
            data = np.random.rand(image_height, image_width, num_channels) * 255
            images.append(Image.fromarray(data.astype("uint8")).convert("RGB"))
        return images

    def _generate_dummy_audio(
        self, batch_size: int = 2, sampling_rate: int = 22050, time_duration: float = 5.0, frequency: int = 220
    ):
        """
        Generate dummy audio data as a list of numpy arrays representing audio samples.

        Args:
            batch_size: Number of audio samples to generate
            sampling_rate: Sampling rate of the audio samples
            time_duration: Duration of each audio sample in seconds
            frequency: Frequency of the sine wave to generate

        Returns:
            List of numpy arrays representing audio samples
        """
        audio_data = []
        for _ in range(batch_size):
            t = np.linspace(0, time_duration, int(time_duration * sampling_rate), endpoint=False)
            audio_data.append(0.5 * np.sin(2 * np.pi * frequency * t))
        return audio_data

    def generate_dummy_inputs(
        self,
        preprocessor: Union["PreTrainedTokenizerBase", "FeatureExtractionMixin", "ImageProcessingMixin"],
        batch_size: int = -1,
        seq_length: int = -1,
        num_choices: int = -1,
        is_pair: bool = False,
        framework: Optional[TensorType] = None,
        num_channels: int = 3,
        image_width: int = 40,
        image_height: int = 40,
        sampling_rate: int = 22050,
        time_duration: float = 5.0,
        frequency: int = 220,
        tokenizer: "PreTrainedTokenizerBase" = None,
    ):
        """
        Generate dummy inputs for the model, such as images, audio, or text tokens.

        Args:
            preprocessor: Preprocessor object for handling different input types
            batch_size: Number of inputs to generate
            seq_length: Length of sequence inputs
            num_choices: Number of choices (for multiple choice scenarios)
            is_pair: Whether the input is a pair
            framework: Framework type for input handling
            num_channels: Number of channels for image inputs
            image_width: Width of image inputs
            image_height: Height of image inputs
            sampling_rate: Sampling rate for audio inputs
            time_duration: Duration of audio inputs
            frequency: Frequency of audio inputs
            tokenizer: Tokenizer object for token-based inputs

        Returns:
            Dummy inputs suitable for the model
        """

    def generate_dummy_inputs_onnxruntime(self, reference_model_inputs: Mapping[str, Any]) -> Mapping[str, Any]:
        """
        Generate inputs for ONNX Runtime using the reference model inputs.

        Args:
            reference_model_inputs: Mapping of inputs for the model

        Returns:
            Mapping of inputs suitable for the model's forward function in ONNX Runtime
        """
        return reference_model_inputs

    def patch_ops(self):
        """
        Patch operations on the model instance using predefined specifications.
        """
        for spec in self._patching_specs:
            custom_op = spec.custom_op if spec.op_wrapper is None else spec.op_wrapper(spec.custom_op)
            setattr(spec.o, spec.name, custom_op)
    # 恢复操作函数原始状态的方法
    def restore_ops(self):
        # 遍历保存在 self._patching_specs 中的所有规格
        for spec in self._patching_specs:
            # 如果规格中的操作包装器为 None,则使用原始操作;否则使用操作包装器包装原始操作
            orig_op = spec.orig_op if spec.op_wrapper is None else spec.op_wrapper(spec.orig_op)
            # 将恢复后的操作设置回原始对象的对应属性上
            setattr(spec.o, spec.name, orig_op)

    @classmethod
    def flatten_output_collection_property(cls, name: str, field: Iterable[Any]) -> Dict[str, Any]:
        """
        Flatten any potential nested structure expanding the name of the field with the index of the element within the
        structure.

        Args:
            name: The name of the nested structure
            field: The structure to, potentially, be flattened

        Returns:
            (Dict[str, Any]): Outputs with flattened structure and key mapping this new structure.

        """
        # 导入 itertools 模块中的 chain 函数,用于将多个可迭代对象连接成一个迭代器
        from itertools import chain

        # 返回一个字典,其键为格式化后的字段名(包含结构的名字和元素在结构中的索引),值为从嵌套结构展开后的元素
        return {f"{name}.{idx}": item for idx, item in enumerate(chain.from_iterable(field))}
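

# 【补充示例,非源码】一个最小的 OnnxConfig 子类示意:只需实现 inputs 属性,
# 给出每个输入张量的动态轴定义(BERT 风格的输入名仅为演示性假设)。
class DemoOnnxConfig(OnnxConfig):
    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        return OrderedDict(
            [
                ("input_ids", {0: "batch", 1: "sequence"}),
                ("attention_mask", {0: "batch", 1: "sequence"}),
            ]
        )


# outputs 则由 task 自动决定,例如 task="sequence-classification" 时为
# {"logits": {0: "batch"}}(见上面的 _tasks_to_common_outputs 映射)。
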
class OnnxConfigWithPast(OnnxConfig, ABC):
    # 继承自 OnnxConfig 类,并实现 ABC 抽象类
    def __init__(
        self,
        config: "PretrainedConfig",
        task: str = "default",
        patching_specs: List[PatchingSpec] = None,
        use_past: bool = False,
    ):
        # 调用父类的构造方法初始化对象
        super().__init__(config, task=task, patching_specs=patching_specs)
        # 设置本类特有的 use_past 属性
        self.use_past = use_past

    @classmethod
    def with_past(cls, config: "PretrainedConfig", task: str = "default") -> "OnnxConfigWithPast":
        """
        实例化一个带有 `use_past` 属性设置为 True 的 OnnxConfig 对象

        Args:
            config: 导出到 ONNX 时使用的底层模型配置

        Returns:
            设置了 `.use_past = True` 的 OnnxConfig 对象
        """
        return cls(config, task=task, use_past=True)

    @property
    def outputs(self) -> Mapping[str, Mapping[int, str]]:
        # 获取父类的 outputs 属性
        common_outputs = super().outputs
        # 如果 use_past 属性为 True,则调用本类方法填充输出
        if self.use_past:
            self.fill_with_past_key_values_(common_outputs, direction="outputs")

        return common_outputs

    @property
    def values_override(self) -> Optional[Mapping[str, Any]]:
        # 如果 _config 对象有 use_cache 属性,则返回字典 {"use_cache": self.use_past}
        if hasattr(self._config, "use_cache"):
            return {"use_cache": self.use_past}

        return None

    @property
    def num_layers(self) -> int:
        """
        从模型配置中获取层数属性。对于不称为 `num_layers` 的模型配置,请覆盖此方法。
        """
        if not hasattr(self._config, "num_layers"):
            # 如果模型配置中找不到层数属性,则引发 AttributeError
            raise AttributeError(
                "could not find the number of layers attribute in the model configuration, override the num_layers"
                " property of the model OnnxConfig to solve this"
            )
        return self._config.num_layers

    @property
    def num_attention_heads(self) -> int:
        """
        从模型配置中获取注意力头数属性。对于不称为 `num_attention_heads` 的模型配置,请覆盖此方法。
        """
        if not hasattr(self._config, "num_attention_heads"):
            # 如果模型配置中找不到注意力头数属性,则引发 AttributeError
            raise AttributeError(
                "could not find the number of attention heads attribute in the model configuration, override the"
                " num_attention_heads property of the model OnnxConfig to solve this"
            )
        return self._config.num_attention_heads

    def generate_dummy_inputs(
        self,
        tokenizer: "PreTrainedTokenizerBase",
        batch_size: int = -1,
        seq_length: int = -1,
        is_pair: bool = False,
        framework: Optional[TensorType] = None,
    ) -> Mapping[str, Any]:
        # TODO: should we set seq_length = 1 when self.use_past = True?
        # 调用父类方法生成虚拟输入数据,获取通用的输入字典
        common_inputs = super().generate_dummy_inputs(
            tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
        )

        if self.use_past:
            # 如果使用过去的状态信息
            if not is_torch_available():
                # 检查是否安装了 PyTorch
                raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
            else:
                import torch

            # 获取 batch 和 seq_length
            batch, seqlen = common_inputs["input_ids"].shape
            # 计算过去键值对的长度
            past_key_values_length = seqlen + 2
            # 定义张量的形状
            shape = (
                batch,
                self.num_attention_heads,
                past_key_values_length,
                self._config.hidden_size // self.num_attention_heads,
            )

            if "attention_mask" in common_inputs:
                # 如果存在注意力掩码,扩展掩码的长度
                mask_dtype = common_inputs["attention_mask"].dtype
                common_inputs["attention_mask"] = torch.cat(
                    [common_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)],
                    dim=1,
                )

            # 初始化过去键值对列表
            common_inputs["past_key_values"] = []
            for _ in range(self.num_layers):
                # 为每一层添加零初始化的过去键值对
                common_inputs["past_key_values"].append((torch.zeros(shape), torch.zeros(shape)))

        # 返回填充后的通用输入字典
        return common_inputs

    def fill_with_past_key_values_(
        self, inputs_or_outputs: Mapping[str, Mapping[int, str]], direction: str, inverted_values_shape: bool = False
    ):
        """
        Fill the input_or_outputs mapping with past_key_values dynamic axes considering.

        Args:
            inputs_or_outputs: The mapping to fill.
            direction: either "inputs" or "outputs", it specifies whether input_or_outputs is the input mapping or the
                output mapping, this is important for axes naming.
            inverted_values_shape:
                If `True`, store values on dynamic axis 1, else on axis 2.

        """
        # 检查方向是否合法
        if direction not in ["inputs", "outputs"]:
            raise ValueError(f'direction must either be "inputs" or "outputs", but {direction} was given')

        # 根据方向设置键的名称前缀
        name = "past_key_values" if direction == "inputs" else "present"
        for i in range(self.num_layers):
            # 设置键值对的动态轴信息
            inputs_or_outputs[f"{name}.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"}
            if inverted_values_shape:
                inputs_or_outputs[f"{name}.{i}.value"] = {0: "batch", 1: "past_sequence + sequence"}
            else:
                inputs_or_outputs[f"{name}.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"}

    def _flatten_past_key_values_(self, flattened_output, name, idx, t):
        # 将过去键值对扁平化后存入输出字典
        flattened_output[f"{name}.{idx}.key"] = t[0]
        flattened_output[f"{name}.{idx}.value"] = t[1]
    # 定义一个方法用于扁平化输出集合属性
    def flatten_output_collection_property(self, name: str, field: Iterable[Any]) -> Dict[str, Any]:
        # 初始化一个空字典用于存储扁平化后的输出
        flattened_output = {}
        # 如果属性名为 "present" 或 "past_key_values"
        if name in ["present", "past_key_values"]:
            # 遍历字段中的每个元素,使用索引和元素调用 _flatten_past_key_values_ 方法
            for idx, t in enumerate(field):
                self._flatten_past_key_values_(flattened_output, name, idx, t)
        else:
            # 否则调用父类的同名方法处理字段,并将结果赋给 flattened_output
            flattened_output = super().flatten_output_collection_property(name, field)

        # 返回扁平化后的输出字典
        return flattened_output
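

# 【补充示例,非源码】past_key_values 的扁平化命名规则示意(用字符串代替真实张量):
_field = [("k0", "v0"), ("k1", "v1")]  # 两层的 (key, value) 元组
_flattened = {}
for _idx, _t in enumerate(_field):
    _flattened[f"past_key_values.{_idx}.key"] = _t[0]
    _flattened[f"past_key_values.{_idx}.value"] = _t[1]
# 结果:{"past_key_values.0.key": "k0", "past_key_values.0.value": "v0",
#        "past_key_values.1.key": "k1", "past_key_values.1.value": "v1"}
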
class OnnxSeq2SeqConfigWithPast(OnnxConfigWithPast):
    @property
    def outputs(self) -> Mapping[str, Mapping[int, str]]:
        # 调用父类方法获取通用输出
        common_outputs = super(OnnxConfigWithPast, self).outputs
        # 对输出的轴进行适当重命名
        for name, axes_names in common_outputs.items():
            # 根据名称中是否包含"encoder"决定序列名称
            sequence_name = "encoder_sequence" if "encoder" in name else "decoder_sequence"
            for axis_idx, name in axes_names.items():
                # 如果轴名称中包含"sequence",则重命名为对应的序列名称
                if "sequence" in name:
                    axes_names[axis_idx] = sequence_name
                else:
                    # 否则保持原名称不变
                    axes_names[axis_idx] = name
        # 如果使用过去状态信息,则填充通用输出中的键值对
        if self.use_past:
            self.fill_with_past_key_values_(common_outputs, direction="outputs")

        return common_outputs

    @property
    def num_layers(self) -> Tuple[int]:
        try:
            # 尝试获取父类中的层数
            num_layers = super().num_layers
            # 将层数转换为元组形式 (num_layers, num_layers)
            num_layers = (num_layers, num_layers)
        except AttributeError:
            # 如果父类中不存在 num_layers 属性,则根据配置信息获取编码器和解码器层数
            if hasattr(self._config, "encoder_layers") and hasattr(self._config, "decoder_layers"):
                num_layers = (self._config.encoder_layers, self._config.decoder_layers)
            else:
                # 抛出属性错误异常,提示在模型配置中找不到编码器和解码器层数的属性
                raise AttributeError(
                    "could not find the number of encoder and decoder layers attributes in the model configuration,"
                    " override the num_layers property of the model OnnxConfig to solve this"
                )

        return num_layers

    @property
    def num_attention_heads(self) -> Tuple[int]:
        try:
            # 尝试获取父类中的注意力头数
            num_attention_heads = super().num_attention_heads
            # 将注意力头数转换为元组形式 (num_attention_heads, num_attention_heads)
            num_attention_heads = (num_attention_heads, num_attention_heads)
        except AttributeError:
            # 如果父类中不存在 num_attention_heads 属性,则根据配置信息获取编码器和解码器注意力头数
            if hasattr(self._config, "encoder_attention_heads") and hasattr(self._config, "decoder_attention_heads"):
                num_attention_heads = (self._config.encoder_attention_heads, self._config.decoder_attention_heads)
            else:
                # 抛出属性错误异常,提示在模型配置中找不到编码器和解码器注意力头数的属性
                raise AttributeError(
                    "could not find the number of attention heads for the encoder and the decoder attributes in the"
                    " model configuration, override the num_attention_heads property of the model OnnxConfig to solve"
                    " this"
                )
        return num_attention_heads

    def generate_dummy_inputs(
        self,
        tokenizer: "PreTrainedTokenizerBase",
        batch_size: int = -1,
        seq_length: int = -1,
        is_pair: bool = False,
        framework: Optional[TensorType] = None,
    ) -> Mapping[str, Any]:
        # 生成 seq2seq 模型虚拟输入(具体实现此处从略)
        ...

    def fill_with_past_key_values_(self, inputs_or_outputs: Mapping[str, Mapping[int, str]], direction: str):
        # 如果方向不是"inputs"或"outputs",则抛出数值错误异常
        if direction not in ["inputs", "outputs"]:
            raise ValueError(f'direction must either be "inputs" or "outputs", but {direction} was given')

        # 根据方向确定名称
        name = "past_key_values" if direction == "inputs" else "present"

        # 获取编码器和解码器层数
        num_encoder_layers, num_decoder_layers = self.num_layers
        # 计算最小和最大层数差异
        min_num_layers = min(num_encoder_layers, num_decoder_layers)
        max_num_layers = max(num_encoder_layers, num_decoder_layers) - min_num_layers
        # 确定剩余方向的名称(编码器或解码器)
        remaining_side_name = "encoder" if num_encoder_layers > num_decoder_layers else "decoder"

        # 设置编码器和解码器的序列名称
        encoder_sequence = "past_encoder_sequence"
        decoder_sequence = "past_decoder_sequence" if direction == "inputs" else "past_decoder_sequence + sequence"

        # 填充每一层的键值对
        for i in range(min_num_layers):
            inputs_or_outputs[f"{name}.{i}.decoder.key"] = {0: "batch", 2: decoder_sequence}
            inputs_or_outputs[f"{name}.{i}.decoder.value"] = {0: "batch", 2: decoder_sequence}
            inputs_or_outputs[f"{name}.{i}.encoder.key"] = {0: "batch", 2: encoder_sequence}
            inputs_or_outputs[f"{name}.{i}.encoder.value"] = {0: "batch", 2: encoder_sequence}

        # 对于剩余的层,根据剩余方向名称设置相应的轴信息
        for i in range(min_num_layers, max_num_layers):
            if remaining_side_name == "encoder":
                axes_info = {0: "batch", 2: encoder_sequence}
            else:
                axes_info = {0: "batch", 2: decoder_sequence}
            inputs_or_outputs[f"{name}.{i}.{remaining_side_name}.key"] = axes_info

    def _flatten_past_key_values_(self, flattened_output, name, idx, t):
        # 将 t 中的键值展平到给定的名称和索引中
        flattened_output[f"{name}.{idx}.decoder.key"] = t[0]
        flattened_output[f"{name}.{idx}.decoder.value"] = t[1]
        flattened_output[f"{name}.{idx}.encoder.key"] = t[2]
        flattened_output[f"{name}.{idx}.encoder.value"] = t[3]

.\onnx\convert.py

# 导入警告模块,用于处理警告信息
import warnings
# 从inspect模块中导入signature函数,用于获取函数签名信息
from inspect import signature
# 从itertools模块中导入chain函数,用于扁平化多个可迭代对象
from itertools import chain
# 从pathlib模块中导入Path类,用于处理文件路径
from pathlib import Path
# 从typing模块中导入必要的类型提示
from typing import TYPE_CHECKING, Iterable, List, Tuple, Union

# 导入numpy库,通常用于数值计算
import numpy as np
# 从packaging.version模块中导入Version和parse函数,用于处理版本号信息
from packaging.version import Version, parse

# 从上级目录中导入tokenization_utils_base模块中的PreTrainedTokenizerBase类
from ..tokenization_utils_base import PreTrainedTokenizerBase
# 从上级目录中导入utils模块中的TensorType、is_tf_available、is_torch_available和logging函数
from ..utils import (
    TensorType,
    is_tf_available,
    is_torch_available,
    logging,
)
# 从当前目录中导入config模块中的OnnxConfig类
from .config import OnnxConfig

# 如果torch可用,则从..modeling_utils模块中导入PreTrainedModel类
if is_torch_available():
    from ..modeling_utils import PreTrainedModel

# 如果tensorflow可用,则从..modeling_tf_utils模块中导入TFPreTrainedModel类
if is_tf_available():
    from ..modeling_tf_utils import TFPreTrainedModel

# 如果当前是类型检查状态,则从..feature_extraction_utils和..processing_utils模块中导入相应类
if TYPE_CHECKING:
    from ..feature_extraction_utils import FeatureExtractionMixin
    from ..processing_utils import ProcessorMixin
    from ..tokenization_utils import PreTrainedTokenizer

# 从logging模块中获取名为__name__的logger对象,并赋值给logger变量
logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

# 定义最低要求的ONNX Runtime版本号
ORT_QUANTIZE_MINIMUM_VERSION = parse("1.4.0")


def check_onnxruntime_requirements(minimum_version: Version):
    """
    检查是否安装了ONNX Runtime,并且安装的版本是否足够新

    Raises:
        ImportError: 如果未安装ONNX Runtime或版本太旧
    """
    try:
        # 尝试导入onnxruntime模块
        import onnxruntime

        # 解析已安装onnxruntime的版本号
        ort_version = parse(onnxruntime.__version__)

        # 要求至少是1.4.0版本
        if ort_version < ORT_QUANTIZE_MINIMUM_VERSION:
            raise ImportError(
                f"We found an older version of onnxruntime ({onnxruntime.__version__}) "
                f"but we require onnxruntime to be >= {minimum_version} to enable all the conversions options.\n"
                "Please update onnxruntime by running `pip install --upgrade onnxruntime`"
            )

    except ImportError:
        raise ImportError(
            "onnxruntime doesn't seem to be currently installed. "
            "Please install the onnxruntime by running `pip install onnxruntime`"
            " and relaunch the conversion."
        )


def export_pytorch(
    preprocessor: Union["PreTrainedTokenizer", "FeatureExtractionMixin", "ProcessorMixin"],
    model: "PreTrainedModel",
    config: OnnxConfig,
    opset: int,
    output: Path,
    tokenizer: "PreTrainedTokenizer" = None,
    device: str = "cpu",
) -> Tuple[List[str], List[str]]:
    """
    导出PyTorch模型至ONNX格式

    Args:
        preprocessor (Union[PreTrainedTokenizer, FeatureExtractionMixin, ProcessorMixin]):
            预处理器对象,可能是PreTrainedTokenizer、FeatureExtractionMixin或ProcessorMixin的子类实例
        model (PreTrainedModel): 预训练模型对象,是PreTrainedModel的子类实例
        config (OnnxConfig): ONNX导出配置对象,是OnnxConfig类的实例
        opset (int): ONNX操作集版本号
        output (Path): 导出的ONNX模型路径
        tokenizer (PreTrainedTokenizer, optional):
            如果模型需要tokenizer,此处提供其对象,可能是PreTrainedTokenizer的子类实例. Defaults to None.
        device (str, optional): 设备类型,例如'cpu'或'cuda'. Defaults to "cpu".

    Returns:
        Tuple[List[str], List[str]]: 返回两个字符串列表,分别表示成功和失败的导出步骤

    """
    # 检查预处理器的类型是否为 `PreTrainedTokenizerBase`,并且确保没有同时提供 tokenizer 参数
    if isinstance(preprocessor, PreTrainedTokenizerBase) and tokenizer is not None:
        # 如果同时提供了 tokenizer 和 preprocessor,则抛出数值错误异常
        raise ValueError("You cannot provide both a tokenizer and a preprocessor to export the model.")
    
    # 如果提供了 tokenizer 参数,则发出警告信息,表示在未来版本中将移除 tokenizer 参数,建议使用 preprocessor 参数代替
    if tokenizer is not None:
        warnings.warn(
            "The `tokenizer` argument is deprecated and will be removed in version 5 of Transformers. Use"
            " `preprocessor` instead.",
            FutureWarning,
        )
        # 记录日志信息,指示将 preprocessor 参数重写为 tokenizer 参数,用于生成虚拟输入
        logger.info("Overwriting the `preprocessor` argument with `tokenizer` to generate dummy inputs.")
        preprocessor = tokenizer
    # 检查模型是否是 PreTrainedModel 的子类
    if issubclass(type(model), PreTrainedModel):
        import torch
        from torch.onnx import export as onnx_export

        # 输出使用的 PyTorch 框架版本信息
        logger.info(f"Using framework PyTorch: {torch.__version__}")
        
        # 禁止梯度计算,并设置模型返回字典形式的输出
        with torch.no_grad():
            model.config.return_dict = True
            # 将模型设置为评估模式
            model.eval()

            # 检查是否需要覆盖某些配置项
            if config.values_override is not None:
                logger.info(f"Overriding {len(config.values_override)} configuration item(s)")
                # 遍历并覆盖配置项
                for override_config_key, override_config_value in config.values_override.items():
                    logger.info(f"\t- {override_config_key} -> {override_config_value}")
                    setattr(model.config, override_config_key, override_config_value)

            # 确保输入数据与模型要求匹配
            # TODO: 在导出 QA 模型时,需要确认是否提供了 "is_pair=True"
            model_inputs = config.generate_dummy_inputs(preprocessor, framework=TensorType.PYTORCH)
            # 设置设备类型并将模型移动到相应设备
            device = torch.device(device)
            if device.type == "cuda" and torch.cuda.is_available():
                model.to(device)
                model_inputs_device = {}
                for k, v in model_inputs.items():
                    if isinstance(v, Tuple):
                        model_inputs_device[k] = tuple(
                            x.to(device) if isinstance(x, torch.Tensor) else None for x in v
                        )
                    elif isinstance(v, List):
                        model_inputs_device[k] = [
                            tuple(x.to(device) if isinstance(x, torch.Tensor) else None for x in t) for t in v
                        ]
                    else:
                        model_inputs_device[k] = v.to(device)

                model_inputs = model_inputs_device

            # 确保模型输入与配置输入匹配
            inputs_match, matched_inputs = ensure_model_and_config_inputs_match(model, model_inputs.keys())
            # 获取配置中的输出项列表
            onnx_outputs = list(config.outputs.keys())

            # 如果模型和配置的输入不匹配,则抛出数值错误
            if not inputs_match:
                raise ValueError("Model and config inputs doesn't match")

            # 应用配置的操作补丁
            config.patch_ops()

            # 导出模型到 ONNX 格式
            onnx_export(
                model,
                (model_inputs,),
                f=output.as_posix(),
                input_names=list(config.inputs.keys()),
                output_names=onnx_outputs,
                dynamic_axes=dict(chain(config.inputs.items(), config.outputs.items())),
                do_constant_folding=True,
                opset_version=opset,
            )

            # 恢复配置的操作
            config.restore_ops()

    # 返回匹配的输入和 ONNX 输出列表
    return matched_inputs, onnx_outputs
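

# 【补充示例,非源码】通过 transformers.onnx 的公开接口完成一次 PyTorch 导出的示意
# (检查点名 "distilbert-base-uncased" 为演示性假设;FeaturesManager 负责为模型挑选
# 合适的 OnnxConfig 子类):
from pathlib import Path
from transformers import AutoModel, AutoTokenizer
from transformers.onnx import FeaturesManager, export

ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = AutoModel.from_pretrained(ckpt)

_, config_ctor = FeaturesManager.check_supported_model_or_raise(model, feature="default")
onnx_config = config_ctor(model.config)

onnx_inputs, onnx_outputs = export(
    preprocessor=tokenizer,
    model=model,
    config=onnx_config,
    opset=onnx_config.default_onnx_opset,
    output=Path("model.onnx"),
)
print(onnx_inputs, onnx_outputs)
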
# 将 TensorFlow 模型导出为 ONNX 中间表示(IR)

def export_tensorflow(
    preprocessor: Union["PreTrainedTokenizer", "FeatureExtractionMixin"],
    model: "TFPreTrainedModel",
    config: OnnxConfig,
    opset: int,
    output: Path,
    tokenizer: "PreTrainedTokenizer" = None,
) -> Tuple[List[str], List[str]]:
    """
    Args:
        preprocessor: ([`PreTrainedTokenizer`] or [`FeatureExtractionMixin`]):
            用于对数据进行编码的预处理器。
        model ([`TFPreTrainedModel`]):
            要导出的模型。
        config ([`~onnx.config.OnnxConfig`]):
            导出模型相关的 ONNX 配置。
        opset (`int`):
            要使用的 ONNX 操作集的版本。
        output (`Path`):
            存储导出的 ONNX 模型的目录。

    Returns:
        `Tuple[List[str], List[str]]`: 包含模型输入顺序列表和来自 ONNX 配置的命名输入的元组。
    """
    import onnx
    import tensorflow as tf
    import tf2onnx

    if isinstance(preprocessor, PreTrainedTokenizerBase) and tokenizer is not None:
        raise ValueError("You cannot provide both a tokenizer and preprocessor to export the model.")
    if tokenizer is not None:
        warnings.warn(
            "The `tokenizer` argument is deprecated and will be removed in version 5 of Transformers. Use"
            " `preprocessor` instead.",
            FutureWarning,
        )
        logger.info("Overwriting the `preprocessor` argument with `tokenizer` to generate dummmy inputs.")
        preprocessor = tokenizer

    # 设置模型配置以返回字典形式的输出
    model.config.return_dict = True

    # 检查是否需要覆盖某些配置项
    if config.values_override is not None:
        logger.info(f"Overriding {len(config.values_override)} configuration item(s)")
        for override_config_key, override_config_value in config.values_override.items():
            logger.info(f"\t- {override_config_key} -> {override_config_value}")
            setattr(model.config, override_config_key, override_config_value)

    # 确保输入匹配
    model_inputs = config.generate_dummy_inputs(preprocessor, framework=TensorType.TENSORFLOW)
    inputs_match, matched_inputs = ensure_model_and_config_inputs_match(model, model_inputs.keys())
    onnx_outputs = list(config.outputs.keys())

    # 创建 TensorFlow 的输入签名
    input_signature = [
        tf.TensorSpec([None] * tensor.ndim, dtype=tensor.dtype, name=key) for key, tensor in model_inputs.items()
    ]

    # 将 Keras 模型转换为 ONNX 模型
    onnx_model, _ = tf2onnx.convert.from_keras(model, input_signature, opset=opset)
    # 将 ONNX 模型保存到文件
    onnx.save(onnx_model, output.as_posix())
    # 恢复操作
    config.restore_ops()

    # 返回匹配的输入和 ONNX 输出
    return matched_inputs, onnx_outputs
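
The `input_signature` built above marks every axis as dynamic (`None`) so tf2onnx exports fully dynamic shapes. A minimal sketch of the same construction with a hand-made dummy-input dict (assumes TensorFlow is installed; the input names are illustrative, not part of this file):

import tensorflow as tf

dummy_inputs = {
    "input_ids": tf.ones((2, 16), dtype=tf.int32),
    "attention_mask": tf.ones((2, 16), dtype=tf.int32),
}
# One TensorSpec per input, every dimension left dynamic, exactly as in export_tensorflow
input_signature = [
    tf.TensorSpec([None] * tensor.ndim, dtype=tensor.dtype, name=key) for key, tensor in dummy_inputs.items()
]
print(input_signature)
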
    device: str = "cpu",
def export_to_onnx(
    preprocessor: Union['PreTrainedTokenizer', 'FeatureExtractionMixin', 'ProcessorMixin'],
    model: Union['PreTrainedModel', 'TFPreTrainedModel'],
    config: OnnxConfig,
    opset: int,
    output: Path,
    device: str = 'cpu'
) -> Tuple[List[str], List[str]]:
    """
    Export a PyTorch or TensorFlow model to an ONNX Intermediate Representation (IR)

    Args:
        preprocessor (Union['PreTrainedTokenizer', 'FeatureExtractionMixin', 'ProcessorMixin']):
            The preprocessor used for encoding the data.
        model (Union['PreTrainedModel', 'TFPreTrainedModel']):
            The model to export.
        config (OnnxConfig):
            The ONNX configuration associated with the exported model.
        opset (int):
            The version of the ONNX operator set to use.
        output (Path):
            Directory to store the exported ONNX model.
        device (str, optional, defaults to 'cpu'):
            The device on which the ONNX model will be exported. Either 'cpu' or 'cuda'. Only PyTorch is supported for
            export on CUDA devices.

    Returns:
        Tuple[List[str], List[str]]: A tuple with an ordered list of the model's inputs, and the named inputs from
        the ONNX configuration.
    """

    # Check if either PyTorch or TensorFlow is available; raise ImportError if not
    if not (is_torch_available() or is_tf_available()):
        raise ImportError(
            "Cannot convert because neither PyTorch nor TensorFlow are installed. "
            "Please install torch or tensorflow first."
        )

    # Raise RuntimeError if trying to export a TensorFlow model on CUDA device
    if is_tf_available() and isinstance(model, TFPreTrainedModel) and device == "cuda":
        raise RuntimeError("`tf2onnx` does not support export on CUDA device.")

    # Raise ValueError if both a tokenizer and a preprocessor are provided
    if isinstance(preprocessor, PreTrainedTokenizerBase) and tokenizer is not None:
        raise ValueError("You cannot provide both a tokenizer and a preprocessor to export the model.")

    # Warn and log if 'tokenizer' argument is used; it's deprecated
    if tokenizer is not None:
        warnings.warn(
            "The `tokenizer` argument is deprecated and will be removed in version 5 of Transformers. Use"
            " `preprocessor` instead.",
            FutureWarning,
        )
        logger.info("Overwriting the `preprocessor` argument with `tokenizer` to generate dummy inputs.")
        preprocessor = tokenizer

    # Check PyTorch version compatibility
    if is_torch_available():
        from ..utils import get_torch_version

        if not config.is_torch_support_available:
            logger.warning(
                f"Unsupported PyTorch version for this model. Minimum required is {config.torch_onnx_minimum_version},"
                f" got: {get_torch_version()}"
            )

    # Export using PyTorch if available and model is a subclass of PreTrainedModel
    if is_torch_available() and issubclass(type(model), PreTrainedModel):
        return export_pytorch(preprocessor, model, config, opset, output, tokenizer=tokenizer, device=device)
    # Export using TensorFlow if available and model is a subclass of TFPreTrainedModel
    elif is_tf_available() and issubclass(type(model), TFPreTrainedModel):
        return export_tensorflow(preprocessor, model, config, opset, output, tokenizer=tokenizer)


def validate_model_outputs(
    config: OnnxConfig,
    preprocessor: Union["PreTrainedTokenizer", "FeatureExtractionMixin", "ProcessorMixin"],
    reference_model: Union["PreTrainedModel", "TFPreTrainedModel"],
    onnx_model: Path,
    onnx_named_outputs: List[str],
    atol: float,
    tokenizer: "PreTrainedTokenizer" = None,
):
    """
    Validate that the outputs of the exported ONNX model `onnx_model` match those of `reference_model`
    within an absolute tolerance of `atol`.
    """
    # Import the ONNX Runtime classes needed to run the exported model
    from onnxruntime import InferenceSession, SessionOptions

    # 输出信息,验证 ONNX 模型的有效性
    logger.info("Validating ONNX model...")

    # 如果 preprocessor 是 PreTrainedTokenizerBase 的实例且 tokenizer 不为空,则抛出异常
    if isinstance(preprocessor, PreTrainedTokenizerBase) and tokenizer is not None:
        raise ValueError("You cannot provide both a tokenizer and a preprocessor to validate the model outputs.")
    
    # 如果存在 tokenizer 参数,则发出警告,并用 tokenizer 覆盖 preprocessor 参数以生成虚拟输入
    if tokenizer is not None:
        warnings.warn(
            "The `tokenizer` argument is deprecated and will be removed in version 5 of Transformers. Use"
            " `preprocessor` instead.",
            FutureWarning,
        )
        logger.info("Overwriting the `preprocessor` argument with `tokenizer` to generate dummy inputs.")
        preprocessor = tokenizer

    # 生成具有不同 batch_size 和 seq_len 的输入,用于测试动态输入形状
    if is_torch_available() and issubclass(type(reference_model), PreTrainedModel):
        reference_model_inputs = config.generate_dummy_inputs(
            preprocessor,
            batch_size=config.default_fixed_batch + 1,
            seq_length=config.default_fixed_sequence + 1,
            framework=TensorType.PYTORCH,
        )
    else:
        reference_model_inputs = config.generate_dummy_inputs(
            preprocessor,
            batch_size=config.default_fixed_batch + 1,
            seq_length=config.default_fixed_sequence + 1,
            framework=TensorType.TENSORFLOW,
        )

    # 创建 ONNX Runtime 会话
    options = SessionOptions()
    session = InferenceSession(onnx_model.as_posix(), options, providers=["CPUExecutionProvider"])

    # 如果是 PyTorch 可用且 reference_model 是 PreTrainedModel 的子类,则将 reference_model 移到 CPU 上
    if is_torch_available() and issubclass(type(reference_model), PreTrainedModel):
        reference_model.to("cpu")

    # 使用 reference_model_inputs 计算 reference_model 的输出
    ref_outputs = reference_model(**reference_model_inputs)
    ref_outputs_dict = {}

    # 将可能的输出集合(如 past_keys)展平为一个平面结构
    for name, value in ref_outputs.items():
        # 将输出名称重写为 "present",因为这是用于 ONNX 输出的名称("past_key_values" 用于 ONNX 输入)
        if name == "past_key_values":
            name = "present"
        # 如果值是列表或元组,则通过 config.flatten_output_collection_property 展平并更新 ref_outputs_dict
        if isinstance(value, (list, tuple)):
            value = config.flatten_output_collection_property(name, value)
            ref_outputs_dict.update(value)
        else:
            ref_outputs_dict[name] = value

    # 根据 reference_model_inputs 创建 onnxruntime 输入
    reference_model_inputs_onnxruntime = config.generate_dummy_inputs_onnxruntime(reference_model_inputs)

    # 将可能的输入集合(如 past_keys)展平为一个平面结构
    onnx_inputs = {}
    # Iterate over each name-value pair in reference_model_inputs_onnxruntime
    for name, value in reference_model_inputs_onnxruntime.items():
        # Check if the value is a list or tuple
        if isinstance(value, (list, tuple)):
            # Flatten the output collection property using config.flatten_output_collection_property method
            value = config.flatten_output_collection_property(name, value)
            # Update onnx_inputs dictionary with flattened values converted to numpy arrays
            onnx_inputs.update({tensor_name: pt_tensor.numpy() for tensor_name, pt_tensor in value.items()})
        else:
            # Convert value to numpy array and assign to onnx_inputs dictionary
            onnx_inputs[name] = value.numpy()

    # Compute outputs from the ONNX model using session.run
    onnx_outputs = session.run(onnx_named_outputs, onnx_inputs)

    # Check if the set of keys in onnx_outputs is a subset of keys in ref_outputs_dict
    ref_outputs_set, onnx_outputs_set = set(ref_outputs_dict.keys()), set(onnx_named_outputs)
    if not onnx_outputs_set.issubset(ref_outputs_set):
        # Log mismatched output names if sets do not match
        logger.info(
            f"\t-[x] ONNX model output names {onnx_outputs_set} do not match reference model {ref_outputs_set}"
        )
        # Raise ValueError if output names do not match
        raise ValueError(
            "Outputs don't match between reference model and ONNX exported model: "
            f"{onnx_outputs_set.difference(ref_outputs_set)}"
        )
    else:
        # Log matching output names if sets match
        logger.info(f"\t-[✓] ONNX model output names match reference model ({onnx_outputs_set})")

    # Validate shape and values of ONNX model outputs against reference model
    for name, ort_value in zip(onnx_named_outputs, onnx_outputs):
        # Determine reference value based on framework availability and model type
        if is_torch_available() and issubclass(type(reference_model), PreTrainedModel):
            ref_value = ref_outputs_dict[name].detach().numpy()
        else:
            ref_value = ref_outputs_dict[name].numpy()
        logger.info(f'\t- Validating ONNX Model output "{name}":')

        # Check if shapes match
        if not ort_value.shape == ref_value.shape:
            logger.info(f"\t\t-[x] shape {ort_value.shape} doesn't match {ref_value.shape}")
            # Raise ValueError if shapes do not match
            raise ValueError(
                "Outputs shape doesn't match between reference model and ONNX exported model: "
                f"Got {ref_value.shape} (reference) and {ort_value.shape} (ONNX)"
            )
        else:
            logger.info(f"\t\t-[✓] {ort_value.shape} matches {ref_value.shape}")

        # Check if values are close within specified tolerance
        if not np.allclose(ref_value, ort_value, atol=atol):
            bad_indices = np.logical_not(np.isclose(ref_value, ort_value, atol=atol))
            logger.info(f"\t\t-[x] values not close enough (atol: {atol})")
            # Raise ValueError if values are not sufficiently close
            raise ValueError(
                "Outputs values don't match between reference model and ONNX exported model: "
                f"Got max absolute difference of: {np.amax(np.abs(ref_value - ort_value))} for "
                f"{ref_value[bad_indices]} vs {ort_value[bad_indices]}"
            )
        else:
            logger.info(f"\t\t-[✓] all values close (atol: {atol})")
# 确保模型输入和配置输入匹配的函数
def ensure_model_and_config_inputs_match(
    model: Union["PreTrainedModel", "TFPreTrainedModel"], model_inputs: Iterable[str]
) -> Tuple[bool, List[str]]:
    """
    确保模型输入和配置输入匹配的函数。

    :param model: 预训练模型对象,可以是 `PreTrainedModel` 或 `TFPreTrainedModel` 的子类之一
    :param model_inputs: 模型期望的输入参数的可迭代对象,通常是字符串列表
    :return: 返回一个元组,包含一个布尔值和一个字符串列表。布尔值表示模型输入是否与配置输入匹配,字符串列表表示匹配的输入参数的有序列表。
    """

    # 如果当前环境支持 PyTorch 并且 model 是 PreTrainedModel 的子类
    if is_torch_available() and issubclass(type(model), PreTrainedModel):
        # 获取模型的 forward 方法的参数签名
        forward_parameters = signature(model.forward).parameters
    else:
        # 否则获取模型的 call 方法的参数签名(通常是 TensorFlow 模型)
        forward_parameters = signature(model.call).parameters

    # 将模型期望的输入参数转换为集合
    model_inputs_set = set(model_inputs)

    # 获取模型 forward 方法的参数名称集合
    forward_inputs_set = set(forward_parameters.keys())

    # 检查模型期望的输入参数是否都在 forward 方法的参数中
    is_ok = model_inputs_set.issubset(forward_inputs_set)

    # 确保输入参数的顺序匹配(非常重要!!!)
    matching_inputs = forward_inputs_set.intersection(model_inputs_set)
    ordered_inputs = [parameter for parameter in forward_parameters.keys() if parameter in matching_inputs]

    # 返回匹配结果和有序的输入参数列表
    return is_ok, ordered_inputs
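
The ordering step relies only on `inspect.signature` preserving declaration order. A toy illustration with a made-up forward function (not part of this module):

from inspect import signature

def forward(input_ids, attention_mask=None, token_type_ids=None):
    pass

model_inputs = {"token_type_ids", "input_ids"}  # unordered set of config inputs
forward_parameters = signature(forward).parameters
matching_inputs = set(forward_parameters.keys()).intersection(model_inputs)
ordered_inputs = [p for p in forward_parameters.keys() if p in matching_inputs]
print(ordered_inputs)  # ['input_ids', 'token_type_ids'], following the forward() declaration order
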

.\onnx\features.py

import os  # 导入标准库 os,用于与操作系统交互
from functools import partial, reduce  # 从 functools 模块导入 partial 和 reduce 函数
from typing import TYPE_CHECKING, Callable, Dict, Optional, Tuple, Type, Union  # 导入类型提示相关的库

import transformers  # 导入 transformers 库,用于自然语言处理模型

from .. import PretrainedConfig, is_tf_available, is_torch_available  # 导入相对路径下的模块和函数
from ..utils import TF2_WEIGHTS_NAME, WEIGHTS_NAME, logging  # 导入相对路径下的工具函数和常量
from .config import OnnxConfig  # 导入当前目录下的 config 模块中的 OnnxConfig 类


if TYPE_CHECKING:
    from transformers import PreTrainedModel, TFPreTrainedModel  # 根据 TYPE_CHECKING 导入相关类型

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


if is_torch_available():  # 如果系统支持 torch
    from transformers.models.auto import (  # 导入 torch 下的自动模型选择
        AutoModel,
        AutoModelForCausalLM,
        AutoModelForImageClassification,
        AutoModelForImageSegmentation,
        AutoModelForMaskedImageModeling,
        AutoModelForMaskedLM,
        AutoModelForMultipleChoice,
        AutoModelForObjectDetection,
        AutoModelForQuestionAnswering,
        AutoModelForSemanticSegmentation,
        AutoModelForSeq2SeqLM,
        AutoModelForSequenceClassification,
        AutoModelForSpeechSeq2Seq,
        AutoModelForTokenClassification,
        AutoModelForVision2Seq,
    )

if is_tf_available():  # 如果系统支持 tensorflow
    from transformers.models.auto import (  # 导入 tensorflow 下的自动模型选择
        TFAutoModel,
        TFAutoModelForCausalLM,
        TFAutoModelForMaskedLM,
        TFAutoModelForMultipleChoice,
        TFAutoModelForQuestionAnswering,
        TFAutoModelForSemanticSegmentation,
        TFAutoModelForSeq2SeqLM,
        TFAutoModelForSequenceClassification,
        TFAutoModelForTokenClassification,
    )

if not is_torch_available() and not is_tf_available():  # 如果系统既不支持 torch 也不支持 tensorflow
    logger.warning(  # 记录警告信息,提醒用户无法导出模型
        "The ONNX export features are only supported for PyTorch or TensorFlow. You will not be able to export models"
        " without one of these libraries installed."
    )


def supported_features_mapping(  # 定义函数 supported_features_mapping,用于生成支持特性与其对应 OnnxConfig 的映射关系
    *supported_features: str, onnx_config_cls: str = None  # 支持的特性名称(可变参数),以及指定的 OnnxConfig 类的全名
) -> Dict[str, Callable[[PretrainedConfig], OnnxConfig]]:  # 返回字典类型,键为特性名称,值为对应的 OnnxConfig 构造函数
    """
    Generate the mapping between supported the features and their corresponding OnnxConfig for a given model.

    Args:
        *supported_features: The names of the supported features.
        onnx_config_cls: The OnnxConfig full name corresponding to the model.

    Returns:
        The dictionary mapping a feature to an OnnxConfig constructor.
    """
    if onnx_config_cls is None:  # 如果未提供 OnnxConfig 类的全名,则抛出 ValueError 异常
        raise ValueError("A OnnxConfig class must be provided")

    config_cls = transformers  # 初始化配置类为 transformers 模块
    for attr_name in onnx_config_cls.split("."):  # 根据类名字符串分割,逐层获取属性
        config_cls = getattr(config_cls, attr_name)
    mapping = {}  # 初始化空字典,用于存储特性与构造函数的映射关系
    for feature in supported_features:  # 遍历所有支持的特性名称
        if "-with-past" in feature:  # 如果特性名称包含 "-with-past"
            task = feature.replace("-with-past", "")  # 提取任务名称
            mapping[feature] = partial(config_cls.with_past, task=task)  # 使用部分函数生成配置类的构造函数
        else:
            mapping[feature] = partial(config_cls.from_model_config, task=feature)  # 使用部分函数生成配置类的构造函数

    return mapping  # 返回特性与构造函数的映射字典
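
A hedged sketch of what a `supported_features_mapping` call produces, using BERT's OnnxConfig as the target (the checkpoint name is illustrative; assumes a transformers version that still ships `transformers.onnx`):

from transformers import AutoConfig
from transformers.onnx.features import supported_features_mapping

mapping = supported_features_mapping(
    "default", "masked-lm", "sequence-classification", onnx_config_cls="models.bert.BertOnnxConfig"
)
config = AutoConfig.from_pretrained("bert-base-uncased")
# Each entry is a partial over BertOnnxConfig.from_model_config with the task pre-bound
onnx_config = mapping["masked-lm"](config)
print(type(onnx_config).__name__, onnx_config.task)  # BertOnnxConfig masked-lm
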


class FeaturesManager:  # 定义特性管理器类
    _TASKS_TO_AUTOMODELS = {}  # 空字典,用于存储任务与自动模型的映射关系
    _TASKS_TO_TF_AUTOMODELS = {}  # 空字典,用于存储任务与 TensorFlow 自动模型的映射关系
    # 如果 torch 库可用,则定义一个任务到自动模型类的映射字典
    if is_torch_available():
        _TASKS_TO_AUTOMODELS = {
            "default": AutoModel,
            "masked-lm": AutoModelForMaskedLM,
            "causal-lm": AutoModelForCausalLM,
            "seq2seq-lm": AutoModelForSeq2SeqLM,
            "sequence-classification": AutoModelForSequenceClassification,
            "token-classification": AutoModelForTokenClassification,
            "multiple-choice": AutoModelForMultipleChoice,
            "object-detection": AutoModelForObjectDetection,
            "question-answering": AutoModelForQuestionAnswering,
            "image-classification": AutoModelForImageClassification,
            "image-segmentation": AutoModelForImageSegmentation,
            "masked-im": AutoModelForMaskedImageModeling,
            "semantic-segmentation": AutoModelForSemanticSegmentation,
            "vision2seq-lm": AutoModelForVision2Seq,
            "speech2seq-lm": AutoModelForSpeechSeq2Seq,
        }
    
    # 如果 tensorflow 库可用,则定义一个任务到 TensorFlow 自动模型类的映射字典
    if is_tf_available():
        _TASKS_TO_TF_AUTOMODELS = {
            "default": TFAutoModel,
            "masked-lm": TFAutoModelForMaskedLM,
            "causal-lm": TFAutoModelForCausalLM,
            "seq2seq-lm": TFAutoModelForSeq2SeqLM,
            "sequence-classification": TFAutoModelForSequenceClassification,
            "token-classification": TFAutoModelForTokenClassification,
            "multiple-choice": TFAutoModelForMultipleChoice,
            "question-answering": TFAutoModelForQuestionAnswering,
            "semantic-segmentation": TFAutoModelForSemanticSegmentation,
        }

    # NOTE: the large `_SUPPORTED_MODEL_TYPE` dictionary, which maps each model type to its
    # feature -> OnnxConfig constructor map (built with `supported_features_mapping`), is defined
    # at this point in the source and omitted here for brevity.

    # The set of all available features: the union of the features supported by every model type
    AVAILABLE_FEATURES = sorted(reduce(lambda s1, s2: s1 | s2, (v.keys() for v in _SUPPORTED_MODEL_TYPE.values())))

    # 静态方法:根据模型类型获取支持的特性列表
    @staticmethod
    def get_supported_features_for_model_type(
        model_type: str, model_name: Optional[str] = None
    ) -> Dict[str, Callable[[PretrainedConfig], OnnxConfig]]:
        """
        Tries to retrieve the feature -> OnnxConfig constructor map from the model type.

        Args:
            model_type (`str`):
                The model type to retrieve the supported features for.
            model_name (`str`, *optional*):
                The name attribute of the model object, only used for the exception message.

        Returns:
            The dictionary mapping each feature to a corresponding OnnxConfig constructor.
        """
        # 将 model_type 转换为小写
        model_type = model_type.lower()
        # 检查 model_type 是否在支持的模型类型中
        if model_type not in FeaturesManager._SUPPORTED_MODEL_TYPE:
            # 准备错误信息,如果提供了 model_name,则将其包含在错误信息中
            model_type_and_model_name = f"{model_type} ({model_name})" if model_name else model_type
            # 抛出 KeyError 异常,说明给定的模型类型不被支持
            raise KeyError(
                f"{model_type_and_model_name} is not supported yet. "
                f"Only {list(FeaturesManager._SUPPORTED_MODEL_TYPE.keys())} are supported. "
                f"If you want to support {model_type} please propose a PR or open up an issue."
            )
        # 返回 model_type 对应的 OnnxConfig 构造函数字典
        return FeaturesManager._SUPPORTED_MODEL_TYPE[model_type]

    @staticmethod
    def feature_to_task(feature: str) -> str:
        """
        Converts a feature string by removing the '-with-past' suffix.

        Args:
            feature (`str`):
                The feature string to be converted.

        Returns:
            The feature string with '-with-past' suffix removed.
        """
        return feature.replace("-with-past", "")

    @staticmethod
    def _validate_framework_choice(framework: str):
        """
        Validates if the framework requested for the export is both correct and available, otherwise throws an
        exception.

        Args:
            framework (`str`):
                The framework requested for ONNX export.

        Raises:
            ValueError: If the provided framework is not 'pt' or 'tf'.
            RuntimeError: If the requested framework is 'pt' but PyTorch is not available,
                          or if the requested framework is 'tf' but TensorFlow is not available.
        """
        # 检查 framework 是否在支持的框架列表中
        if framework not in ["pt", "tf"]:
            # 抛出 ValueError 异常,说明只支持 'pt' 或 'tf' 两种框架
            raise ValueError(
                f"Only two frameworks are supported for ONNX export: pt or tf, but {framework} was provided."
            )
        # 如果 framework 是 'pt',检查是否可以导出到 ONNX
        elif framework == "pt" and not is_torch_available():
            # 抛出 RuntimeError 异常,说明无法使用 PyTorch 导出模型到 ONNX
            raise RuntimeError("Cannot export model to ONNX using PyTorch because no PyTorch package was found.")
        # 如果 framework 是 'tf',检查是否可以导出到 ONNX
        elif framework == "tf" and not is_tf_available():
            # 抛出 RuntimeError 异常,说明无法使用 TensorFlow 导出模型到 ONNX
            raise RuntimeError("Cannot export model to ONNX using TensorFlow because no TensorFlow package was found.")

    @staticmethod
    def get_model_class_for_feature(feature: str, framework: str = "pt") -> Type:
        """
        Attempts to retrieve an AutoModel class from a feature name.

        Args:
            feature (`str`):
                The feature required.
            framework (`str`, *optional*, defaults to `"pt"`):
                The framework to use for the export.

        Returns:
            The AutoModel class corresponding to the feature.
        """
        # 根据特征名称获取对应的任务
        task = FeaturesManager.feature_to_task(feature)
        # 验证选择的框架是否有效
        FeaturesManager._validate_framework_choice(framework)
        # 根据选择的框架确定任务到AutoModel类的映射
        if framework == "pt":
            task_to_automodel = FeaturesManager._TASKS_TO_AUTOMODELS
        else:
            task_to_automodel = FeaturesManager._TASKS_TO_TF_AUTOMODELS
        # 如果任务不在映射中,则抛出KeyError异常
        if task not in task_to_automodel:
            raise KeyError(
                f"Unknown task: {feature}. Possible values are {list(FeaturesManager._TASKS_TO_AUTOMODELS.values())}"
            )

        return task_to_automodel[task]
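
For instance, under PyTorch the "sequence-classification" feature resolves to `AutoModelForSequenceClassification` (a minimal check, assuming torch is installed):

from transformers.onnx.features import FeaturesManager

model_class = FeaturesManager.get_model_class_for_feature("sequence-classification", framework="pt")
print(model_class.__name__)  # AutoModelForSequenceClassification
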

    @staticmethod
    def determine_framework(model: str, framework: str = None) -> str:
        """
        Determines the framework to use for the export.

        The priority is in the following order:
            1. User input via `framework`.
            2. If local checkpoint is provided, use the same framework as the checkpoint.
            3. Available framework in environment, with priority given to PyTorch

        Args:
            model (`str`):
                The name of the model to export.
            framework (`str`, *optional*, defaults to `None`):
                The framework to use for the export. See above for priority if none provided.

        Returns:
            The framework to use for the export.
        """
        # 如果用户指定了框架,则直接返回该框架
        if framework is not None:
            return framework

        # 框架映射关系
        framework_map = {"pt": "PyTorch", "tf": "TensorFlow"}
        # 导出器映射关系
        exporter_map = {"pt": "torch", "tf": "tf2onnx"}

        # 如果模型路径是一个目录
        if os.path.isdir(model):
            # 检查是否存在PyTorch的权重文件
            if os.path.isfile(os.path.join(model, WEIGHTS_NAME)):
                framework = "pt"
            # 检查是否存在TensorFlow的权重文件
            elif os.path.isfile(os.path.join(model, TF2_WEIGHTS_NAME)):
                framework = "tf"
            else:
                # 如果无法确定框架,则抛出FileNotFoundError异常
                raise FileNotFoundError(
                    "Cannot determine framework from given checkpoint location."
                    f" There should be a {WEIGHTS_NAME} for PyTorch"
                    f" or {TF2_WEIGHTS_NAME} for TensorFlow."
                )
            # 记录日志,表示找到本地模型
            logger.info(f"Local {framework_map[framework]} model found.")
        else:
            # 如果PyTorch可用,则选择PyTorch框架
            if is_torch_available():
                framework = "pt"
            # 如果TensorFlow可用,则选择TensorFlow框架
            elif is_tf_available():
                framework = "tf"
            else:
                # 如果环境中既没有PyTorch也没有TensorFlow,则抛出EnvironmentError异常
                raise EnvironmentError("Neither PyTorch nor TensorFlow found in environment. Cannot export to ONNX.")

        # 记录日志,表示使用导出器将模型导出为ONNX格式
        logger.info(f"Framework not requested. Using {exporter_map[framework]} to export to ONNX.")

        return framework
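
A quick illustration of the priority order (no checkpoint is downloaded; the names are illustrative):

from transformers.onnx.features import FeaturesManager

# 1. An explicit user choice always wins
print(FeaturesManager.determine_framework("any-model", framework="tf"))  # "tf"
# 2./3. For a hub name (not a local directory), falls back to the installed framework, PyTorch first
print(FeaturesManager.determine_framework("bert-base-uncased"))  # "pt" when torch is installed
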

    @staticmethod
    def get_model_from_feature(
        feature: str, model: str, framework: str = None, cache_dir: str = None
    ) -> Union["PreTrainedModel", "TFPreTrainedModel"]:
        """
        Attempts to retrieve a model instance based on the given feature and model name.

        Args:
            feature (`str`):
                The specific feature required by the model.
            model (`str`):
                The name of the model to retrieve.
            framework (`str`, *optional*, defaults to `None`):
                The framework to use for model instantiation. If not provided, it is determined by
                `FeaturesManager.determine_framework`.
            cache_dir (`str`, *optional*, defaults to `None`):
                Path to a directory in which the downloaded pretrained model weights should be cached.

        Returns:
            Union["PreTrainedModel", "TFPreTrainedModel"]: The instantiated model object.
        """
        framework = FeaturesManager.determine_framework(model, framework)
        # 获取特定 feature 对应的模型类
        model_class = FeaturesManager.get_model_class_for_feature(feature, framework)
        try:
            # 尝试从预训练模型加载指定模型
            model = model_class.from_pretrained(model, cache_dir=cache_dir)
        except OSError:
            if framework == "pt":
                # 若出错且框架为 PyTorch,尝试加载 TensorFlow 模型并转换为 PyTorch 格式
                logger.info("Loading TensorFlow model in PyTorch before exporting to ONNX.")
                model = model_class.from_pretrained(model, from_tf=True, cache_dir=cache_dir)
            else:
                # 若出错且框架不是 PyTorch,尝试加载 PyTorch 模型并转换为 TensorFlow 格式
                logger.info("Loading PyTorch model in TensorFlow before exporting to ONNX.")
                model = model_class.from_pretrained(model, from_pt=True, cache_dir=cache_dir)
        return model

    @staticmethod
    def check_supported_model_or_raise(
        model: Union["PreTrainedModel", "TFPreTrainedModel"], feature: str = "default"
    ) -> Tuple[str, Callable]:
        """
        Checks if a given model supports a specified feature.

        Args:
            model (Union["PreTrainedModel", "TFPreTrainedModel"]):
                The model instance to check.
            feature (`str`, *optional*, defaults to `"default"`):
                The feature name to verify if supported.

        Returns:
            Tuple[str, Callable]:
                - The type of the model (`str`).
                - Callable function from `FeaturesManager._SUPPORTED_MODEL_TYPE` corresponding to the feature.
        """
        # 获取模型类型并替换下划线为破折号
        model_type = model.config.model_type.replace("_", "-")
        # 获取模型名称(如果有)
        model_name = getattr(model, "name", "")
        # 获取模型支持的特性列表
        model_features = FeaturesManager.get_supported_features_for_model_type(model_type, model_name=model_name)
        # 检查指定特性是否在支持的特性列表中
        if feature not in model_features:
            raise ValueError(
                f"{model.config.model_type} doesn't support feature {feature}. Supported values are: {model_features}"
            )

        return model.config.model_type, FeaturesManager._SUPPORTED_MODEL_TYPE[model_type][feature]

    @staticmethod
    def get_config(model_type: str, feature: str) -> OnnxConfig:
        """
        Retrieves the configuration for a specified model type and feature combination.

        Args:
            model_type (`str`):
                The type of model to fetch the configuration for.
            feature (`str`):
                The feature to retrieve the configuration for.

        Returns:
            `OnnxConfig`: Configuration object for the specified model type and feature.
        """
        return FeaturesManager._SUPPORTED_MODEL_TYPE[model_type][feature]
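
Putting the pieces of this module together, a hedged end-to-end sketch of the programmatic export path that `__main__.py` below automates (the checkpoint name and output path are illustrative):

from pathlib import Path

from transformers.onnx import export, validate_model_outputs
from transformers.onnx.features import FeaturesManager
from transformers.onnx.utils import get_preprocessor

model = FeaturesManager.get_model_from_feature("default", "distilbert-base-uncased")
_, model_onnx_config = FeaturesManager.check_supported_model_or_raise(model, feature="default")
onnx_config = model_onnx_config(model.config)

preprocessor = get_preprocessor("distilbert-base-uncased")
output = Path("model.onnx")
onnx_inputs, onnx_outputs = export(preprocessor, model, onnx_config, onnx_config.default_onnx_opset, output)
validate_model_outputs(onnx_config, preprocessor, model, output, onnx_outputs, onnx_config.atol_for_validation)
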

.\onnx\utils.py

# 导入所需的模块和类型声明
from ctypes import c_float, sizeof
from enum import Enum
from typing import TYPE_CHECKING, Optional, Union

# 如果是类型检查,导入相关的预处理模块
if TYPE_CHECKING:
    from .. import AutoFeatureExtractor, AutoProcessor, AutoTokenizer  # tests_ignore

# 枚举类型,定义参数的数据格式
class ParameterFormat(Enum):
    Float = c_float

    @property
    def size(self) -> int:
        """
        返回该数据类型所需的字节数

        Returns:
            Integer > 0
        """
        return sizeof(self.value)

# 计算有效轴维度的函数
def compute_effective_axis_dimension(dimension: int, fixed_dimension: int, num_token_to_add: int = 0) -> int:
    """

    Args:
        dimension: 当前轴的维度
        fixed_dimension: 固定的轴维度
        num_token_to_add: 需要添加的标记数量

    Returns:
        计算后的有效轴维度
    """
    # 如果维度 <= 0,使用固定维度
    if dimension <= 0:
        dimension = fixed_dimension

    dimension -= num_token_to_add
    return dimension
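
For example, with the function above in scope: a dynamic axis (`dimension <= 0`) falls back to the fixed dimension before subtracting the tokens a tokenizer will add, while a static axis is used as-is:

assert compute_effective_axis_dimension(-1, fixed_dimension=8, num_token_to_add=2) == 6
assert compute_effective_axis_dimension(16, fixed_dimension=8, num_token_to_add=2) == 14
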

# 计算序列化参数大小的函数
def compute_serialized_parameters_size(num_parameters: int, dtype: ParameterFormat) -> int:
    """
    计算在给定存储格式中序列化模型时所有参数占用的大小

    Args:
        num_parameters: 需要保存的参数数量
        dtype: 每个参数保存的数据格式

    Returns:
        所有参数保存时占用的字节数
    """
    return num_parameters * dtype.size
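
As a worked example, a model with 110 million float32 parameters serializes to 110_000_000 * 4 = 440,000,000 bytes (about 420 MiB):

print(compute_serialized_parameters_size(110_000_000, ParameterFormat.Float))  # 440000000
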

# 获取预处理器的函数
def get_preprocessor(model_name: str) -> Optional[Union["AutoTokenizer", "AutoFeatureExtractor", "AutoProcessor"]]:
    """
    获取适用于 `model_name` 的预处理器(分词器、特征提取器或处理器)。

    Args:
        model_name (`str`): 模型名称,用于加载预处理器。

    Returns:
        `Optional[Union[AutoTokenizer, AutoFeatureExtractor, AutoProcessor]]`:
            如果找到处理器,则返回处理器。如果存在分词器或特征提取器,则返回分词器或特征提取器。如果同时存在分词器和特征提取器,则会引发错误。如果找不到预处理器,则返回 `None`。
    """
    # 避免循环导入问题,仅在此处导入
    from .. import AutoFeatureExtractor, AutoProcessor, AutoTokenizer  # tests_ignore

    try:
        return AutoProcessor.from_pretrained(model_name)
    # 处理可能发生的异常:ValueError, OSError, KeyError
    except (ValueError, OSError, KeyError):
        # 初始化 tokenizer 和 feature_extractor 变量为 None
        tokenizer, feature_extractor = None, None
        
        # 尝试根据模型名称加载 AutoTokenizer,可能会抛出 OSError 或 KeyError 异常
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
        except (OSError, KeyError):
            pass
        
        # 尝试根据模型名称加载 AutoFeatureExtractor,可能会抛出 OSError 或 KeyError 异常
        try:
            feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
        except (OSError, KeyError):
            pass

        # 检查 tokenizer 和 feature_extractor 是否都不为 None
        if tokenizer is not None and feature_extractor is not None:
            # 如果两者都不为 None,则抛出 ValueError 异常,指示找到了同时存在的 tokenizer 和 feature extractor
            raise ValueError(
                f"Couldn't auto-detect preprocessor for {model_name}. Found both a tokenizer and a feature extractor."
            )
        elif tokenizer is None and feature_extractor is None:
            # 如果两者都为 None,则返回 None,表示未能找到有效的预处理器
            return None
        elif tokenizer is not None:
            # 如果只有 tokenizer 不为 None,则返回 tokenizer
            return tokenizer
        else:
            # 如果只有 feature_extractor 不为 None,则返回 feature_extractor
            return feature_extractor
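
Typical usage, with the function above in scope (checkpoint names are illustrative, and what comes back depends on what the checkpoint ships): a text checkpoint typically resolves to a tokenizer, a vision checkpoint to an image feature extractor or processor:

tokenizer = get_preprocessor("bert-base-uncased")
feature_extractor = get_preprocessor("google/vit-base-patch16-224")
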

.\onnx\__init__.py

# 导入类型检查工具,用于检查类型注解的有效性
from typing import TYPE_CHECKING

# 导入延迟加载模块的工具函数
from ..utils import _LazyModule

# 定义模块的导入结构,包括各个子模块及其成员
_import_structure = {
    "config": [
        "EXTERNAL_DATA_FORMAT_SIZE_LIMIT",  # 外部数据格式大小限制
        "OnnxConfig",  # OnnxConfig 类型
        "OnnxConfigWithPast",  # 带有历史的 OnnxConfig 类型
        "OnnxSeq2SeqConfigWithPast",  # 带有历史的 OnnxSeq2SeqConfig 类型
        "PatchingSpec",  # 补丁规范类
    ],
    "convert": ["export", "validate_model_outputs"],  # 转换相关函数
    "features": ["FeaturesManager"],  # 特征管理器类
    "utils": ["ParameterFormat", "compute_serialized_parameters_size"],  # 参数格式及计算序列化参数大小函数
}

# 如果处于类型检查模式,则从各子模块导入特定类型
if TYPE_CHECKING:
    from .config import (
        EXTERNAL_DATA_FORMAT_SIZE_LIMIT,  # 外部数据格式大小限制
        OnnxConfig,  # OnnxConfig 类型
        OnnxConfigWithPast,  # 带有历史的 OnnxConfig 类型
        OnnxSeq2SeqConfigWithPast,  # 带有历史的 OnnxSeq2SeqConfig 类型
        PatchingSpec,  # 补丁规范类
    )
    from .convert import export, validate_model_outputs  # 导出和验证模型输出函数
    from .features import FeaturesManager  # 特征管理器类
    from .utils import ParameterFormat, compute_serialized_parameters_size  # 参数格式及计算序列化参数大小函数

# 如果不处于类型检查模式,则进行延迟加载模块设置
else:
    import sys
    
    # 将当前模块替换为延迟加载模块对象,使用给定的导入结构和模块规范
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

.\onnx\__main__.py

# 版权声明和许可信息
#
# 版权所有 2021 年 HuggingFace 团队。保留所有权利。
#
# 根据 Apache 许可证 2.0 版本(“许可证”)许可;
# 除非符合许可证的条款,否则不得使用此文件。
# 您可以在以下网址获取许可证的副本:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意,软件是基于“原样”分发的,
# 没有任何明示或暗示的担保或条件。
# 有关详细信息,请参阅许可证。
import subprocess  # 导入 subprocess 模块,用于执行外部命令和进程管理
import sys  # 导入 sys 模块,提供对 Python 运行时系统的访问
import warnings  # 导入 warnings 模块,用于管理警告信息
from argparse import ArgumentParser  # 从 argparse 模块导入 ArgumentParser 类,用于解析命令行参数
from pathlib import Path  # 导入 Path 类,用于操作路径

from packaging import version  # 导入 version 模块,用于处理版本信息

from .. import AutoFeatureExtractor, AutoImageProcessor, AutoProcessor, AutoTokenizer  # 导入自定义模块
from ..utils import logging  # 导入自定义模块中的 logging 工具
from ..utils.import_utils import is_optimum_available  # 导入自定义模块中的 is_optimum_available 函数
from .convert import export, validate_model_outputs  # 从当前目录下的 convert 模块导入 export 和 validate_model_outputs 函数
from .features import FeaturesManager  # 从当前目录下的 features 模块导入 FeaturesManager 类
from .utils import get_preprocessor  # 从当前目录下的 utils 模块导入 get_preprocessor 函数

MIN_OPTIMUM_VERSION = "1.5.0"  # 定义最小的 optimum 版本号

ENCODER_DECODER_MODELS = ["vision-encoder-decoder"]  # 定义编码-解码模型列表

# 使用 optimum 导出模型
def export_with_optimum(args):
    if is_optimum_available():  # 如果 optimum 可用
        from optimum.version import __version__ as optimum_version  # 导入 optimum 的版本信息

        parsed_optimum_version = version.parse(optimum_version)  # 解析 optimum 的版本号
        if parsed_optimum_version < version.parse(MIN_OPTIMUM_VERSION):  # 如果 optimum 的版本低于要求的最小版本
            raise RuntimeError(  # 抛出运行时异常
                f"transformers.onnx requires optimum >= {MIN_OPTIMUM_VERSION} but {optimum_version} is installed. You "
                "can upgrade optimum by running: pip install -U optimum[exporters]"
            )
    else:  # 如果 optimum 不可用
        raise RuntimeError(  # 抛出运行时异常
            "transformers.onnx requires optimum to run, you can install the library by running: pip install "
            "optimum[exporters]"
        )
    # 构建命令行参数列表
    cmd_line = [
        sys.executable,
        "-m",
        "optimum.exporters.onnx",
        f"--model {args.model}",
        f"--task {args.feature}",
        f"--framework {args.framework}" if args.framework is not None else "",
        f"{args.output}",
    ]
    proc = subprocess.Popen(cmd_line, stdout=subprocess.PIPE)  # 执行命令行,并获取子进程对象
    proc.wait()  # 等待子进程执行完毕

    logger.info(  # 使用 logger 输出信息
        "The export was done by optimum.exporters.onnx. We recommend using to use this package directly in future, as "
        "transformers.onnx is deprecated, and will be removed in v5. You can find more information here: "
        "https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model."
    )


# 使用 transformers 导出模型
def export_with_transformers(args):
    args.output = args.output if args.output.is_file() else args.output.joinpath("model.onnx")  # 如果输出路径不是文件,则拼接文件名
    if not args.output.parent.exists():  # 如果输出路径的父目录不存在
        args.output.parent.mkdir(parents=True)  # 创建父目录及其所有必需的上级目录

    # 分配模型
    model = FeaturesManager.get_model_from_feature(
        args.feature, args.model, framework=args.framework, cache_dir=args.cache_dir
    )
    # 检查给定模型是否被支持,并返回模型类型和对应的配置对象
    model_kind, model_onnx_config = FeaturesManager.check_supported_model_or_raise(model, feature=args.feature)
    # 根据模型配置创建对应的 ONNX 配置对象
    onnx_config = model_onnx_config(model.config)

    # 如果模型类型是编码器-解码器模型
    if model_kind in ENCODER_DECODER_MODELS:
        # 获取编码器和解码器模型对象
        encoder_model = model.get_encoder()
        decoder_model = model.get_decoder()

        # 获取编码器和解码器模型的 ONNX 配置
        encoder_onnx_config = onnx_config.get_encoder_config(encoder_model.config)
        decoder_onnx_config = onnx_config.get_decoder_config(
            encoder_model.config, decoder_model.config, feature=args.feature
        )

        # 如果未指定操作集,则选择编码器和解码器的默认操作集中的最大值
        if args.opset is None:
            args.opset = max(encoder_onnx_config.default_onnx_opset, decoder_onnx_config.default_onnx_opset)

        # 检查指定的操作集是否满足编码器和解码器的最小要求
        if args.opset < min(encoder_onnx_config.default_onnx_opset, decoder_onnx_config.default_onnx_opset):
            raise ValueError(
                f"Opset {args.opset} is not sufficient to export {model_kind}. At least "
                f"{min(encoder_onnx_config.default_onnx_opset, decoder_onnx_config.default_onnx_opset)} is required."
            )

        # 根据模型类型创建自动特征提取器对象
        preprocessor = AutoFeatureExtractor.from_pretrained(args.model)

        # 导出编码器模型的 ONNX 文件,并获取输入和输出信息
        onnx_inputs, onnx_outputs = export(
            preprocessor,
            encoder_model,
            encoder_onnx_config,
            args.opset,
            args.output.parent.joinpath("encoder_model.onnx"),
        )

        # 验证编码器模型输出的正确性
        validate_model_outputs(
            encoder_onnx_config,
            preprocessor,
            encoder_model,
            args.output.parent.joinpath("encoder_model.onnx"),
            onnx_outputs,
            args.atol if args.atol else encoder_onnx_config.atol_for_validation,
        )

        # 根据模型类型创建自动分词器对象
        preprocessor = AutoTokenizer.from_pretrained(args.model)

        # 导出解码器模型的 ONNX 文件,并获取输入和输出信息
        onnx_inputs, onnx_outputs = export(
            preprocessor,
            decoder_model,
            decoder_onnx_config,
            args.opset,
            args.output.parent.joinpath("decoder_model.onnx"),
        )

        # 验证解码器模型输出的正确性
        validate_model_outputs(
            decoder_onnx_config,
            preprocessor,
            decoder_model,
            args.output.parent.joinpath("decoder_model.onnx"),
            onnx_outputs,
            args.atol if args.atol else decoder_onnx_config.atol_for_validation,
        )
        # 记录信息,显示模型保存的路径
        logger.info(
            f"All good, model saved at: {args.output.parent.joinpath('encoder_model.onnx').as_posix()},"
            f" {args.output.parent.joinpath('decoder_model.onnx').as_posix()}"
        )
    else:
        # 如果不是第一个分支,则实例化适当的预处理器

        if args.preprocessor == "auto":
            # 如果预处理器类型是 "auto",则根据模型获取适当的预处理器对象
            preprocessor = get_preprocessor(args.model)
        elif args.preprocessor == "tokenizer":
            # 如果预处理器类型是 "tokenizer",则使用预训练的 AutoTokenizer 创建预处理器对象
            preprocessor = AutoTokenizer.from_pretrained(args.model)
        elif args.preprocessor == "image_processor":
            # 如果预处理器类型是 "image_processor",则使用预训练的 AutoImageProcessor 创建预处理器对象
            preprocessor = AutoImageProcessor.from_pretrained(args.model)
        elif args.preprocessor == "feature_extractor":
            # 如果预处理器类型是 "feature_extractor",则使用预训练的 AutoFeatureExtractor 创建预处理器对象
            preprocessor = AutoFeatureExtractor.from_pretrained(args.model)
        elif args.preprocessor == "processor":
            # 如果预处理器类型是 "processor",则使用预训练的 AutoProcessor 创建预处理器对象
            preprocessor = AutoProcessor.from_pretrained(args.model)
        else:
            # 如果预处理器类型未知,则抛出 ValueError 异常
            raise ValueError(f"Unknown preprocessor type '{args.preprocessor}'")

        # 确保请求的 opset 足够
        if args.opset is None:
            args.opset = onnx_config.default_onnx_opset

        if args.opset < onnx_config.default_onnx_opset:
            # 如果请求的 opset 小于默认的 opset,抛出 ValueError 异常
            raise ValueError(
                f"Opset {args.opset} is not sufficient to export {model_kind}. "
                f"At least  {onnx_config.default_onnx_opset} is required."
            )

        # 导出模型到 ONNX 格式,获取输入和输出
        onnx_inputs, onnx_outputs = export(
            preprocessor,
            model,
            onnx_config,
            args.opset,
            args.output,
        )

        if args.atol is None:
            # 如果未指定 atol,则使用默认的验证容差值
            args.atol = onnx_config.atol_for_validation

        # 验证导出的模型输出是否符合预期
        validate_model_outputs(onnx_config, preprocessor, model, args.output, onnx_outputs, args.atol)
        
        # 记录信息,指示模型已成功保存
        logger.info(f"All good, model saved at: {args.output.as_posix()}")
        
        # 发出警告,提示使用过时的 ONNX 导出工具,建议将来使用新的导出器
        warnings.warn(
            "The export was done by transformers.onnx which is deprecated and will be removed in v5. We recommend"
            " using optimum.exporters.onnx in future. You can find more information here:"
            " https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model.",
            FutureWarning,
        )
# 主程序入口函数
def main():
    # 创建参数解析器实例,用于解析命令行参数
    parser = ArgumentParser("Hugging Face Transformers ONNX exporter")
    
    # 添加必需参数:模型 ID 或者磁盘上的模型路径
    parser.add_argument(
        "-m", "--model", type=str, required=True, help="Model ID on huggingface.co or path on disk to load model from."
    )
    
    # 添加可选参数:导出模型时使用的特性类型,默认为 "default"
    parser.add_argument(
        "--feature",
        default="default",
        help="The type of features to export the model with.",
    )
    
    # 添加可选参数:导出模型时使用的 ONNX opset 版本
    parser.add_argument("--opset", type=int, default=None, help="ONNX opset version to export the model with.")
    
    # 添加可选参数:验证模型时的绝对差值容忍度
    parser.add_argument(
        "--atol", type=float, default=None, help="Absolute difference tolerance when validating the model."
    )
    
    # 添加可选参数:指定导出模型时使用的框架,可选项为 "pt" 或 "tf"
    parser.add_argument(
        "--framework",
        type=str,
        choices=["pt", "tf"],
        default=None,
        help=(
            "The framework to use for the ONNX export."
            " If not provided, will attempt to use the local checkpoint's original framework"
            " or what is available in the environment."
        ),
    )
    
    # 添加位置参数:指定生成的 ONNX 模型存储路径
    parser.add_argument("output", type=Path, help="Path indicating where to store generated ONNX model.")
    
    # 添加可选参数:指定缓存目录的路径
    parser.add_argument("--cache_dir", type=str, default=None, help="Path indicating where to store cache.")
    
    # 添加可选参数:指定使用的预处理器类型,可选项有多种,如 "auto"、"tokenizer" 等
    parser.add_argument(
        "--preprocessor",
        type=str,
        choices=["auto", "tokenizer", "feature_extractor", "image_processor", "processor"],
        default="auto",
        help="Which type of preprocessor to use. 'auto' tries to automatically detect it.",
    )
    
    # 添加可选参数:是否使用 transformers.onnx 而非 optimum.exporters.onnx 来执行 ONNX 导出
    parser.add_argument(
        "--export_with_transformers",
        action="store_true",
        help=(
            "Whether to use transformers.onnx instead of optimum.exporters.onnx to perform the ONNX export. It can be "
            "useful when exporting a model supported in transformers but not in optimum, otherwise it is not "
            "recommended."
        ),
    )

    # 解析命令行参数
    args = parser.parse_args()
    
    # 如果指定了 --export_with_transformers 或者 optimum 模块不可用,使用 transformers.onnx 导出模型
    if args.export_with_transformers or not is_optimum_available():
        export_with_transformers(args)
    else:
        # 否则,使用 optimum.exporters.onnx 导出模型
        export_with_optimum(args)


# 如果当前脚本作为主程序运行,则执行以下代码
if __name__ == "__main__":
    # 获取日志记录器实例,并设置日志级别为 INFO
    logger = logging.get_logger("transformers.onnx")  # pylint: disable=invalid-name
    logger.setLevel(logging.INFO)
    
    # 调用主程序入口函数 main()
    main()
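
In practice this module is invoked as a CLI, for example `python -m transformers.onnx --model=distilbert-base-uncased onnx/`; with optimum installed the call is delegated to `optimum.exporters.onnx` unless `--export_with_transformers` is passed.
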

.\optimization.py

# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch optimization for BERT model."""

import math
import warnings
from functools import partial
from typing import Callable, Iterable, Optional, Tuple, Union

import torch
from torch import nn
from torch.optim import Optimizer
from torch.optim.lr_scheduler import LambdaLR, ReduceLROnPlateau

from .trainer_pt_utils import LayerWiseDummyOptimizer, LayerWiseDummyScheduler
from .trainer_utils import SchedulerType
from .utils import logging
from .utils.versions import require_version

# 获取logger对象,用于日志记录
logger = logging.get_logger(__name__)


def _get_constant_lambda(_=None):
    # 返回常数学习率调度函数
    return 1


def get_constant_schedule(optimizer: Optimizer, last_epoch: int = -1):
    """
    Create a schedule with a constant learning rate, using the learning rate set in optimizer.

    Args:
        optimizer ([`~torch.optim.Optimizer`]):
            The optimizer for which to schedule the learning rate.
        last_epoch (`int`, *optional*, defaults to -1):
            The index of the last epoch when resuming training.

    Return:
        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
    """
    # 使用LambdaLR创建常数学习率调度器
    return LambdaLR(optimizer, _get_constant_lambda, last_epoch=last_epoch)


def get_reduce_on_plateau_schedule(optimizer: Optimizer, **kwargs):
    """
    Create a schedule with a constant learning rate that decreases when a metric has stopped improving.

    Args:
        optimizer ([`~torch.optim.Optimizer`]):
            The optimizer for which to schedule the learning rate.
        kwargs (`dict`, *optional*):
            Extra parameters to be passed to the scheduler. See `torch.optim.lr_scheduler.ReduceLROnPlateau`
            for possible parameters.

    Return:
        `torch.optim.lr_scheduler.ReduceLROnPlateau` with the appropriate schedule.
    """
    # 使用ReduceLROnPlateau创建学习率调度器,根据指标停止改善时降低学习率
    return ReduceLROnPlateau(optimizer, **kwargs)


def _get_constant_schedule_with_warmup_lr_lambda(current_step: int, *, num_warmup_steps: int):
    if current_step < num_warmup_steps:
        # 在预热阶段线性增加学习率
        return float(current_step) / float(max(1.0, num_warmup_steps))
    # 达到预热步数后保持恒定学习率
    return 1.0


def get_constant_schedule_with_warmup(optimizer: Optimizer, num_warmup_steps: int, last_epoch: int = -1):
    """
    Create a schedule with a constant learning rate preceded by a warmup period during which the learning rate
    increases linearly between 0 and the initial lr set in the optimizer.

    Args:
        optimizer ([`~torch.optim.Optimizer`]):
            The optimizer for which to schedule the learning rate.
        num_warmup_steps (`int`):
            The number of steps for the warmup phase.
        last_epoch (`int`, *optional*, defaults to -1):
            The index of the last epoch when resuming training.

    Return:
        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
    """
    # Build the warmup lr lambda as a partial function
    lr_lambda = partial(_get_constant_schedule_with_warmup_lr_lambda, num_warmup_steps=num_warmup_steps)
    # Wrap the lambda in a LambdaLR scheduler for the given optimizer
    return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)


def _get_linear_schedule_with_warmup_lr_lambda(current_step: int, *, num_warmup_steps: int, num_training_steps: int):
    # 如果当前步数小于预热步数,则返回一个线性增长的学习率比例因子
    if current_step < num_warmup_steps:
        return float(current_step) / float(max(1, num_warmup_steps))
    # 否则返回一个线性衰减的学习率比例因子
    return max(0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps)))


def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
    """
    Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after
    a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.

    Args:
        optimizer ([`~torch.optim.Optimizer`]):
            The optimizer for which to schedule the learning rate.
        num_warmup_steps (`int`):
            The number of steps for the warmup phase.
        num_training_steps (`int`):
            The total number of training steps.
        last_epoch (`int`, *optional*, defaults to -1):
            The index of the last epoch when resuming training.

    Return:
        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
    """
    # 使用 partial 函数固定部分参数,创建一个 lambda 函数作为学习率调度器的输入
    lr_lambda = partial(
        _get_linear_schedule_with_warmup_lr_lambda,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps,
    )
    # 返回一个 LambdaLR 类型的学习率调度器对象
    return LambdaLR(optimizer, lr_lambda, last_epoch)
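
A minimal usage sketch of the warmup schedules defined in this file (toy model and step counts; the backward pass is omitted):

import torch

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=10, num_training_steps=100)

for step in range(100):
    optimizer.step()   # gradient step (loss.backward() omitted in this sketch)
    scheduler.step()   # lr rises linearly for 10 steps, then decays linearly to 0
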


def _get_cosine_schedule_with_warmup_lr_lambda(
    current_step: int, *, num_warmup_steps: int, num_training_steps: int, num_cycles: float
):
    # 如果当前步数小于预热步数,则返回一个线性增长的学习率比例因子
    if current_step < num_warmup_steps:
        return float(current_step) / float(max(1, num_warmup_steps))
    # 否则计算余弦衰减的学习率比例因子
    progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
    return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))


def get_cosine_schedule_with_warmup(
    optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, num_cycles: float = 0.5, last_epoch: int = -1
):
    """
    Create a schedule with a learning rate that decreases following the values of the cosine function between the
    initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the
    initial lr set in the optimizer.

    Args:
        optimizer ([`~torch.optim.Optimizer`]):
            The optimizer for which to schedule the learning rate.
        num_warmup_steps (`int`):
            The number of steps for the warmup phase.
        num_training_steps (`int`):
            The total number of training steps.
        num_cycles (`float`, *optional*, defaults to 0.5):
            The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0
            following a half-cosine).
        last_epoch (`int`, *optional*, defaults to -1):
            The index of the last epoch when resuming training.
    """
    # 使用 partial 函数固定部分参数,创建一个 lambda 函数作为学习率调度器的输入
    lr_lambda = partial(
        _get_cosine_schedule_with_warmup_lr_lambda,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps,
        num_cycles=num_cycles,
    )
    # 返回一个 LambdaLR 类型的学习率调度器对象
    return LambdaLR(optimizer, lr_lambda, last_epoch)


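The multiplier produced by the cosine lambda above can be inspected directly; with 10 warmup steps, 100 training steps and the default half cosine it ramps to 1.0 and decays back to 0:

for step in (0, 5, 10, 55, 100):
    m = _get_cosine_schedule_with_warmup_lr_lambda(step, num_warmup_steps=10, num_training_steps=100, num_cycles=0.5)
    print(step, round(m, 3))  # 0.0, 0.5, 1.0, 0.5, 0.0
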
# 定义一个函数,根据余弦函数值生成学习率调度表,带有硬重启,并在预热期间逐步增加学习率
def _get_cosine_with_hard_restarts_schedule_with_warmup_lr_lambda(
    current_step: int, *, num_warmup_steps: int, num_training_steps: int, num_cycles: int
):
    # 如果当前步数小于预热步数,返回线性增长的学习率比例
    if current_step < num_warmup_steps:
        return float(current_step) / float(max(1, num_warmup_steps))
    # 计算当前进度,基于训练步数和预热步数,生成一个 0 到 1 的进度值
    progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
    # 如果进度超过或等于 1.0,返回学习率为 0.0
    if progress >= 1.0:
        return 0.0
    # 否则,根据余弦函数生成学习率,带有硬重启的周期性
    return max(0.0, 0.5 * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0))))


# 创建一个学习率调度对象,基于给定的优化器和参数,返回 LambdaLR 调度器对象
def get_cosine_with_hard_restarts_schedule_with_warmup(
    optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, num_cycles: int = 1, last_epoch: int = -1
):
    """
    Create a schedule with a learning rate that decreases following the values of the cosine function between the
    initial lr set in the optimizer to 0, with several hard restarts, after a warmup period during which it increases
    linearly between 0 and the initial lr set in the optimizer.

    Args:
        optimizer ([`~torch.optim.Optimizer`]):
            The optimizer for which to schedule the learning rate.
        num_warmup_steps (`int`):
            The number of steps for the warmup phase.
        num_training_steps (`int`):
            The total number of training steps.
        num_cycles (`int`, *optional*, defaults to 1):
            The number of hard restarts to use.
        last_epoch (`int`, *optional*, defaults to -1):
            The index of the last epoch when resuming training.

    Return:
        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
    """
    # 使用偏函数创建 LambdaLR 调度器,使用 _get_cosine_with_hard_restarts_schedule_with_warmup_lr_lambda 函数
    lr_lambda = partial(
        _get_cosine_with_hard_restarts_schedule_with_warmup_lr_lambda,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps,
        num_cycles=num_cycles,
    )
    # 返回 LambdaLR 调度器对象
    return LambdaLR(optimizer, lr_lambda, last_epoch)


# 定义一个函数,根据多项式衰减生成学习率调度表,带有预热期间的逐步增加学习率
def _get_polynomial_decay_schedule_with_warmup_lr_lambda(
    current_step: int,
    *,
    num_warmup_steps: int,
    num_training_steps: int,
    lr_end: float,
    power: float,
    lr_init: int,
):
    # 如果当前步数小于预热步数,返回线性增长的学习率比例
    if current_step < num_warmup_steps:
        return float(current_step) / float(max(1, num_warmup_steps))
    # 如果当前步数大于训练步数,返回最终学习率与初始学习率的比值
    elif current_step > num_training_steps:
        return lr_end / lr_init  # as LambdaLR multiplies by lr_init
    else:
        # 计算多项式衰减的学习率
        lr_range = lr_init - lr_end
        decay_steps = num_training_steps - num_warmup_steps
        pct_remaining = 1 - (current_step - num_warmup_steps) / decay_steps
        decay = lr_range * pct_remaining**power + lr_end
        return decay / lr_init  # as LambdaLR multiplies by lr_init


# 创建一个学习率调度对象,基于给定的优化器和参数,返回 LambdaLR 调度器对象
def get_polynomial_decay_schedule_with_warmup(
    optimizer, num_warmup_steps, num_training_steps, lr_end=1e-7, power=1.0, last_epoch=-1
):
    """
    Create a schedule with a learning rate that decreases as a polynomial decay from the initial lr set in the
    optimizer to `lr_end` after `num_training_steps`, with a linear warmup over `num_warmup_steps` steps.

    Args:
        optimizer (`torch.optim.Optimizer`):
            The optimizer for which to schedule the learning rate.
        num_warmup_steps (`int`):
            The number of steps for the warmup phase.
        num_training_steps (`int`):
            The total number of training steps.
        lr_end (`float`, *optional*, defaults to 1e-7):
            The final learning rate after the decay.
        power (`float`, *optional*, defaults to 1.0):
            Power factor for polynomial decay.
        last_epoch (`int`, *optional*, defaults to -1):
            The index of the last epoch when resuming training.

    Return:
        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
    """
    # Extract the initial learning rate from the optimizer defaults
    lr_init = optimizer.defaults["lr"]
    # Sanity-check that the schedule actually decays
    if not (lr_init > lr_end):
        raise ValueError(f"lr_end ({lr_end}) must be smaller than initial lr ({lr_init})")

    # Bind the schedule parameters to _get_polynomial_decay_schedule_with_warmup_lr_lambda via partial
    lr_lambda = partial(
        _get_polynomial_decay_schedule_with_warmup_lr_lambda,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps,
        lr_end=lr_end,
        power=power,
        lr_init=lr_init,
    )
    # Return the LambdaLR scheduler
    return LambdaLR(optimizer, lr_lambda, last_epoch)
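
A quick sanity check (a sketch, assuming `torch` is available): with `power=1.0` the schedule reduces to a plain linear decay from the optimizer's initial lr down to `lr_end`:

```
import torch
from transformers import get_polynomial_decay_schedule_with_warmup

params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = torch.optim.AdamW(params, lr=1e-3)
scheduler = get_polynomial_decay_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=10, lr_end=1e-7, power=1.0
)
for _ in range(10):
    optimizer.step()
    scheduler.step()
print(scheduler.get_last_lr()[0])  # ~1e-7: fully decayed to lr_end
```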
# Compute the LR multiplier for an inverse square-root decay with a linear warmup phase
def _get_inverse_sqrt_schedule_lr_lambda(current_step: int, *, num_warmup_steps: int, timescale: int = None):
    # During warmup, scale the learning rate up linearly
    if current_step < num_warmup_steps:
        return float(current_step) / float(max(1, num_warmup_steps))
    # Shift so that the decay starts right at the end of warmup
    shift = timescale - num_warmup_steps
    # Inverse square-root decay relative to the timescale
    decay = 1.0 / math.sqrt((current_step + shift) / timescale)
    return decay


def get_inverse_sqrt_schedule(
    optimizer: Optimizer, num_warmup_steps: int, timescale: int = None, last_epoch: int = -1
):
    """
    创建一个逆平方根学习率调度,从优化器中设置的初始学习率开始,在一个预热期间内线性增加学习率,从0增加到初始学习率。

    Args:
        optimizer ([`~torch.optim.Optimizer`]):
            要调度学习率的优化器。
        num_warmup_steps (`int`):
            预热阶段的步数。
        timescale (`int`, *可选*, 默认为 `num_warmup_steps`):
            时间尺度。
        last_epoch (`int`, *可选*, 默认为 -1):
            恢复训练时的最后一个周期索引。

    Returns:
        `torch.optim.lr_scheduler.LambdaLR`:带有适当调度的对象。
    """
    # 注意:此实现修改自
    # https://github.com/google-research/big_vision/blob/f071ce68852d56099437004fd70057597a95f6ef/big_vision/utils.py#L930

    # 如果未指定时间尺度,则使用默认值(预热步数或默认值10000)
    if timescale is None:
        timescale = num_warmup_steps or 10_000

    # 创建一个局部函数,用于 LambdaLR 对象的 lr_lambda 参数
    lr_lambda = partial(_get_inverse_sqrt_schedule_lr_lambda, num_warmup_steps=num_warmup_steps, timescale=timescale)
    return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)
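
A short sketch of the decay shape (assuming a recent `transformers` that exports this function at the top level; otherwise import it from `transformers.optimization`):

```
import torch
from transformers import get_inverse_sqrt_schedule

params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = torch.optim.AdamW(params, lr=4e-4)
scheduler = get_inverse_sqrt_schedule(optimizer, num_warmup_steps=100)
for _ in range(400):
    optimizer.step()
    scheduler.step()
# At step 400 with the default timescale of 100: decay = 1/sqrt(400/100) = 0.5, so lr = 2e-4
print(scheduler.get_last_lr()[0])
```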


TYPE_TO_SCHEDULER_FUNCTION = {
    SchedulerType.LINEAR: get_linear_schedule_with_warmup,
    SchedulerType.COSINE: get_cosine_schedule_with_warmup,
    SchedulerType.COSINE_WITH_RESTARTS: get_cosine_with_hard_restarts_schedule_with_warmup,
    SchedulerType.POLYNOMIAL: get_polynomial_decay_schedule_with_warmup,
    SchedulerType.CONSTANT: get_constant_schedule,
    SchedulerType.CONSTANT_WITH_WARMUP: get_constant_schedule_with_warmup,
    SchedulerType.INVERSE_SQRT: get_inverse_sqrt_schedule,  # maps to the inverse square-root schedule
    SchedulerType.REDUCE_ON_PLATEAU: get_reduce_on_plateau_schedule,
}


def get_scheduler(
    name: Union[str, SchedulerType],
    optimizer: Optimizer,
    num_warmup_steps: Optional[int] = None,
    num_training_steps: Optional[int] = None,
    scheduler_specific_kwargs: Optional[dict] = None,
):
    """
    统一的 API 通过名称获取任何调度器。

    Args:
        name (Union[str, SchedulerType]):
            调度器的名称。
        optimizer (Optimizer):
            要调度学习率的优化器。
        num_warmup_steps (Optional[int], 可选):
            预热阶段的步数。
        num_training_steps (Optional[int], 可选):
            训练总步数。
        scheduler_specific_kwargs (Optional[dict], 可选):
            特定于调度器的其他参数。

    """
    Args:
        name (`str` or `SchedulerType`):
            The name of the scheduler to use.
        optimizer (`torch.optim.Optimizer`):
            The optimizer that will be used during training.
        num_warmup_steps (`int`, *optional*):
            The number of warmup steps to do. This is not required by all schedulers (hence the argument being
            optional), the function will raise an error if it's unset and the scheduler type requires it.
        num_training_steps (`int`, *optional*):
            The number of training steps to do. This is not required by all schedulers (hence the argument being
            optional), the function will raise an error if it's unset and the scheduler type requires it.
        scheduler_specific_kwargs (`dict`, *optional*):
            Extra parameters for schedulers such as cosine with restarts. Mismatched scheduler types and scheduler
            parameters will cause the scheduler function to raise a TypeError.
    """
    # Convert `name` to SchedulerType enum
    name = SchedulerType(name)
    # Retrieve the scheduler function corresponding to `name`
    schedule_func = TYPE_TO_SCHEDULER_FUNCTION[name]

    # If `optimizer` is a LayerWiseDummyOptimizer, recursively fetch schedulers for each parameter
    if optimizer is not None and isinstance(optimizer, LayerWiseDummyOptimizer):
        optimizer_dict = optimizer.optimizer_dict
        scheduler_dict = {}

        # Iterate over optimizer parameters and fetch corresponding schedulers
        for param in optimizer_dict.keys():
            scheduler_dict[param] = get_scheduler(
                name,
                optimizer=optimizer_dict[param],
                num_warmup_steps=num_warmup_steps,
                num_training_steps=num_training_steps,
            )

        # Define a scheduler hook for each parameter to apply the respective scheduler step
        def scheduler_hook(param):
            if param.grad is not None:
                scheduler_dict[param].step()

        # Register the scheduler hook for each parameter that requires gradients
        for param in optimizer_dict.keys():
            if param.requires_grad:
                param.register_post_accumulate_grad_hook(scheduler_hook)

        # Return a LayerWiseDummyScheduler instance
        return LayerWiseDummyScheduler()

    # For constant scheduler types, directly apply the scheduler function on `optimizer`
    if name == SchedulerType.CONSTANT:
        return schedule_func(optimizer)

    # Handle REDUCE_ON_PLATEAU scheduler type with specific kwargs if provided
    if scheduler_specific_kwargs is None:
        scheduler_specific_kwargs = {}
    if name == SchedulerType.REDUCE_ON_PLATEAU:
        return schedule_func(optimizer, **scheduler_specific_kwargs)

    # Raise an error if `num_warmup_steps` is not provided for required scheduler types
    if num_warmup_steps is None:
        raise ValueError(f"{name} requires `num_warmup_steps`, please provide that argument.")

    # Apply schedulers requiring `num_warmup_steps` with the provided value
    if name == SchedulerType.CONSTANT_WITH_WARMUP:
        return schedule_func(optimizer, num_warmup_steps=num_warmup_steps)
    if name == SchedulerType.INVERSE_SQRT:
        return schedule_func(optimizer, num_warmup_steps=num_warmup_steps)

    # All other schedulers require `num_training_steps` to be provided
    # Raise a ValueError if `num_training_steps` was not given
    if num_training_steps is None:
        raise ValueError(f"{name} requires `num_training_steps`, please provide that argument.")

    # Call the selected `schedule_func`, forwarding the optimizer, the warmup and
    # training step counts, and any scheduler-specific keyword arguments
    return schedule_func(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps,
        **scheduler_specific_kwargs,
    )
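
Typical usage of the unified API looks as follows (a sketch; `model` is a stand-in placeholder):

```
import torch
from transformers import get_scheduler

model = torch.nn.Linear(4, 2)  # stand-in model for illustration
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
lr_scheduler = get_scheduler(
    "linear",  # equivalently SchedulerType.LINEAR
    optimizer=optimizer,
    num_warmup_steps=100,
    num_training_steps=1000,
)
# Inside the training loop: optimizer.step(); lr_scheduler.step(); optimizer.zero_grad()
```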
# The AdamW optimizer class, inheriting from torch.optim.Optimizer
class AdamW(Optimizer):
    """
    Implements Adam algorithm with weight decay fix as introduced in [Decoupled Weight Decay
    Regularization](https://arxiv.org/abs/1711.05101).

    Parameters:
        params (`Iterable[nn.parameter.Parameter]`):
            Iterable of parameters to optimize or dictionaries defining parameter groups.
        lr (`float`, *optional*, defaults to 0.001):
            The learning rate to use.
        betas (`Tuple[float,float]`, *optional*, defaults to `(0.9, 0.999)`):
            Adam's betas parameters (b1, b2).
        eps (`float`, *optional*, defaults to 1e-06):
            Adam's epsilon for numerical stability.
        weight_decay (`float`, *optional*, defaults to 0.0):
            Decoupled weight decay to apply.
        correct_bias (`bool`, *optional*, defaults to `True`):
            Whether or not to correct bias in Adam (for instance, in Bert TF repository they use `False`).
        no_deprecation_warning (`bool`, *optional*, defaults to `False`):
            A flag used to disable the deprecation warning (set to `True` to disable the warning).
    """

    def __init__(
        self,
        params: Iterable[nn.parameter.Parameter],
        lr: float = 1e-3,
        betas: Tuple[float, float] = (0.9, 0.999),
        eps: float = 1e-6,
        weight_decay: float = 0.0,
        correct_bias: bool = True,
        no_deprecation_warning: bool = False,
    ):
        # Unless disabled, warn that this implementation is deprecated
        if not no_deprecation_warning:
            warnings.warn(
                "This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch"
                " implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this"
                " warning",
                FutureWarning,
            )
        # Validate that the learning rate is non-negative
        if lr < 0.0:
            raise ValueError(f"Invalid learning rate: {lr} - should be >= 0.0")
        # Validate that both beta parameters lie in [0.0, 1.0)
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError(f"Invalid beta parameter: {betas[0]} - should be in [0.0, 1.0)")
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError(f"Invalid beta parameter: {betas[1]} - should be in [0.0, 1.0)")
        # Validate that epsilon is non-negative
        if not 0.0 <= eps:
            raise ValueError(f"Invalid epsilon value: {eps} - should be >= 0.0")
        # Collect the default hyperparameters
        defaults = {"lr": lr, "betas": betas, "eps": eps, "weight_decay": weight_decay, "correct_bias": correct_bias}
        # Initialize the base Optimizer
        super().__init__(params, defaults)

    @torch.no_grad()
    # Perform a single optimization step
    def step(self, closure: Callable = None):
        """
        Performs a single optimization step.

        Arguments:
            closure (`Callable`, *optional*): A closure that reevaluates the model and returns the loss.
        """
        loss = None
        # If a closure was given, re-evaluate the model and return the loss
        if closure is not None:
            loss = closure()

        # Iterate over the parameter groups
        for group in self.param_groups:
            # Iterate over the parameters in the current group
            for p in group["params"]:
                # Skip parameters without gradients
                if p.grad is None:
                    continue
                grad = p.grad
                # Adam does not support sparse gradients; SparseAdam should be used instead
                if grad.is_sparse:
                    raise RuntimeError("Adam does not support sparse gradients, please consider SparseAdam instead")

                # Fetch (or lazily initialize) the state for this parameter
                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state["step"] = 0
                    # Exponential moving average of gradient values
                    state["exp_avg"] = torch.zeros_like(p)
                    # Exponential moving average of squared gradient values
                    state["exp_avg_sq"] = torch.zeros_like(p)

                # Read the running averages for this parameter
                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
                beta1, beta2 = group["betas"]

                # Advance the step counter
                state["step"] += 1

                # Decay the first and second moment running averages
                # (in-place operations to update the averages at the same time)
                exp_avg.mul_(beta1).add_(grad, alpha=(1.0 - beta1))
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2)
                denom = exp_avg_sq.sqrt().add_(group["eps"])

                step_size = group["lr"]
                # Apply bias correction when enabled; note that e.g. the BERT TF
                # repository skips this correction (correct_bias=False)
                if group["correct_bias"]:
                    bias_correction1 = 1.0 - beta1 ** state["step"]
                    bias_correction2 = 1.0 - beta2 ** state["step"]
                    step_size = step_size * math.sqrt(bias_correction2) / bias_correction1

                # Update the parameter
                p.addcdiv_(exp_avg, denom, value=-step_size)

                # Decoupled weight decay: applied directly to the weights,
                # independent of the gradient moving averages
                if group["weight_decay"] > 0.0:
                    p.add_(p, alpha=(-group["lr"] * group["weight_decay"]))

        # Return the loss, if any
        return loss
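
A common setup pattern with this class (a sketch mirroring how Trainer builds its parameter groups; the `no_decay` name list is illustrative, not prescriptive):

```
import torch
from transformers.optimization import AdamW

model = torch.nn.Linear(4, 2)  # stand-in model
no_decay = ["bias", "LayerNorm.weight"]
grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.01,
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(grouped_parameters, lr=5e-5, no_deprecation_warning=True)
```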
class Adafactor(Optimizer):
    """
    AdaFactor pytorch implementation can be used as a drop in replacement for Adam original fairseq code:
    https://github.com/pytorch/fairseq/blob/master/fairseq/optim/adafactor.py

    Paper: *Adafactor: Adaptive Learning Rates with Sublinear Memory Cost* https://arxiv.org/abs/1804.04235 Note that
    this optimizer internally adjusts the learning rate depending on the `scale_parameter`, `relative_step` and
    `warmup_init` options. To use a manual (external) learning rate schedule you should set `scale_parameter=False` and
    `relative_step=False`.

    Arguments:
        params (`Iterable[nn.parameter.Parameter]`):
            Iterable of parameters to optimize or dictionaries defining parameter groups.
        lr (`float`, *optional*):
            The external learning rate.
        eps (`Tuple[float, float]`, *optional*, defaults to `(1e-30, 0.001)`):
            Regularization constants for square gradient and parameter scale respectively
        clip_threshold (`float`, *optional*, defaults to 1.0):
            Threshold of root mean square of final gradient update
        decay_rate (`float`, *optional*, defaults to -0.8):
            Coefficient used to compute running averages of square
        beta1 (`float`, *optional*):
            Coefficient used for computing running averages of gradient
        weight_decay (`float`, *optional*, defaults to 0.0):
            Weight decay (L2 penalty)
        scale_parameter (`bool`, *optional*, defaults to `True`):
            If True, learning rate is scaled by root mean square
        relative_step (`bool`, *optional*, defaults to `True`):
            If True, time-dependent learning rate is computed instead of external learning rate
        warmup_init (`bool`, *optional*, defaults to `False`):
            Time-dependent learning rate computation depends on whether warm-up initialization is being used

    This implementation handles low-precision (FP16, bfloat) values, but it has not been thoroughly tested.

    Recommended T5 finetuning settings (https://discuss.huggingface.co/t/t5-finetuning-tips/684/3):

        - Training without LR warmup or clip_threshold is not recommended.

           - use scheduled LR warm-up to fixed LR
           - use clip_threshold=1.0 (https://arxiv.org/abs/1804.04235)
        - Disable relative updates
        - Use scale_parameter=False
        - Additional optimizer operations like gradient clipping should not be used alongside Adafactor

    Example:

    ```
    Adafactor(model.parameters(), scale_parameter=False, relative_step=False, warmup_init=False, lr=1e-3)
    ```

    Others reported the following combination to work well:

    ```
    Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)
    ```

    When using `lr=None` with [`Trainer`] you will most likely need to use [`~optimization.AdafactorSchedule`]
    scheduler as follows:

    ```
    from transformers.optimization import Adafactor, AdafactorSchedule

    optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)
    lr_scheduler = AdafactorSchedule(optimizer)
    trainer = Trainer(..., optimizers=(optimizer, lr_scheduler))
    ```
    """

    def __init__(
        self,
        params,
        lr=None,
        eps=(1e-30, 1e-3),
        clip_threshold=1.0,
        decay_rate=-0.8,
        beta1=None,
        weight_decay=0.0,
        scale_parameter=True,
        relative_step=True,
        warmup_init=False,
    ):
        """Initialize the Adafactor optimizer; see the class docstring for the meaning of each argument."""
        # An external lr and relative_step are mutually exclusive
        if lr is not None and relative_step:
            raise ValueError("Cannot combine manual `lr` and `relative_step=True` options")
        # warmup_init only makes sense together with relative_step
        if warmup_init and not relative_step:
            raise ValueError("`warmup_init=True` requires `relative_step=True`")

        # Keep every hyperparameter in the per-group defaults so that `step`
        # reads them from `group` rather than from instance attributes
        defaults = {
            "lr": lr,
            "eps": eps,
            "clip_threshold": clip_threshold,
            "decay_rate": decay_rate,
            "beta1": beta1,
            "weight_decay": weight_decay,
            "scale_parameter": scale_parameter,
            "relative_step": relative_step,
            "warmup_init": warmup_init,
        }
        super().__init__(params, defaults)

    @staticmethod
    def _get_lr(param_group, param_state):
        # Step size: either the external lr or a relative, time-dependent one
        if param_group["lr"] is not None:
            return param_group["lr"]
        min_step = 1e-6 * param_state["step"] if param_group["warmup_init"] else 1e-2
        rel_step_sz = min(min_step, 1.0 / math.sqrt(param_state["step"]))
        param_scale = 1.0
        if param_group["scale_parameter"]:
            # Scale by the parameter's root mean square, floored at eps[1]
            param_scale = max(param_group["eps"][1], param_state["RMS"])
        return param_scale * rel_step_sz

    @torch.no_grad()
    def step(self, closure=None):
        """
        Performs a single optimization step.

        NOTE: a simplified, non-factored variant is shown here. The actual implementation additionally factors the
        second-moment estimate into row/column statistics for matrix-shaped parameters to save memory, and keeps an
        optional first-moment accumulator when `beta1` is set; both are omitted in this sketch.

        Args:
            closure (callable, optional): A closure that reevaluates the model and returns the loss.
        """
        loss = None
        # If a closure was given, re-evaluate the model and return the loss
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None:
                    continue
                grad = p.grad
                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state["step"] = 0
                    state["exp_avg_sq"] = torch.zeros_like(p)

                state["step"] += 1
                # Root mean square of the parameter, used for relative step sizing
                state["RMS"] = p.norm().item() / math.sqrt(p.numel())
                lr = self._get_lr(group, state)

                # Time-dependent decay coefficient (decay_rate < 0, so beta2t -> 1 over time)
                beta2t = 1.0 - state["step"] ** group["decay_rate"]

                # Update the running average of squared gradients
                exp_avg_sq = state["exp_avg_sq"]
                exp_avg_sq.mul_(beta2t).add_(grad.pow(2) + group["eps"][0], alpha=1.0 - beta2t)

                # Preconditioned update, clipped by its own root mean square
                update = grad * exp_avg_sq.rsqrt()
                rms = update.pow(2).mean().sqrt().item()
                update.div_(max(1.0, rms / group["clip_threshold"]))

                # Decoupled weight decay, then the parameter update itself
                if group["weight_decay"] != 0.0:
                    p.add_(p, alpha=-group["weight_decay"] * lr)
                p.add_(update, alpha=-lr)

        return loss
class AdafactorSchedule(LambdaLR):
    """
    Since [`~optimization.Adafactor`] performs its own scheduling, if the training loop relies on a scheduler (e.g.,
    for logging), this class creates a proxy object that retrieves the current lr values from the optimizer.

    It returns `initial_lr` during startup and the actual `lr` during stepping.
    """

    def __init__(self, optimizer, initial_lr=0.0):
        # lr lambda that simply returns the initial lr
        def lr_lambda(_):
            return initial_lr

        # Temporarily set an "initial_lr" entry on every param group
        for group in optimizer.param_groups:
            group["initial_lr"] = initial_lr
        # Initialize the parent LambdaLR
        super().__init__(optimizer, lr_lambda)
        # Remove the temporary "initial_lr" entries again
        for group in optimizer.param_groups:
            del group["initial_lr"]

    def get_lr(self):
        # The wrapped optimizer
        opt = self.optimizer
        # Collect the current lr of every param group that has gradients
        lrs = [
            opt._get_lr(group, opt.state[group["params"][0]])
            for group in opt.param_groups
            if group["params"][0].grad is not None
        ]
        # Fall back to the base lrs if called before any step was taken
        if len(lrs) == 0:
            lrs = self.base_lrs  # if called before stepping
        return lrs


def get_adafactor_schedule(optimizer, initial_lr=0.0):
    """
    Get a proxy schedule for [`~optimization.Adafactor`]

    Args:
        optimizer ([`~torch.optim.Optimizer`]):
            The optimizer for which to schedule the learning rate.
        initial_lr (`float`, *optional*, defaults to 0.0):
            Initial lr

    Return:
        [`~optimization.Adafactor`] proxy schedule object.
    """
    return AdafactorSchedule(optimizer, initial_lr)

.\optimization_tf.py

# Copyright notice and license information: Apache License, Version 2.0
#
# Import the regular expression module and typing helpers
import re
from typing import Callable, List, Optional, Union

# Import TensorFlow
import tensorflow as tf

# Try importing the legacy Adam optimizer from tf_keras, falling back to tensorflow.keras
try:
    from tf_keras.optimizers.legacy import Adam
except (ImportError, ModuleNotFoundError):
    from tensorflow.keras.optimizers.legacy import Adam

# Import the keras object from the modeling_tf_utils module
from .modeling_tf_utils import keras

# Keras has moved the schedule module around between versions; pick the right location
if hasattr(keras.optimizers.schedules, "learning_rate_schedule"):
    # If learning_rate_schedule exists, use it as the schedules module
    schedules = keras.optimizers.schedules.learning_rate_schedule
else:
    # Otherwise use keras.optimizers.schedules directly
    schedules = keras.optimizers.schedules

# Define a WarmUp class that inherits from LearningRateSchedule
class WarmUp(schedules.LearningRateSchedule):
    """
    Applies a warmup schedule on a given learning rate decay schedule.

    Args:
        initial_learning_rate (`float`):
            The initial learning rate for the schedule after the warmup (so this will be the learning rate at the end
            of the warmup).
        decay_schedule_fn (`Callable`):
            The schedule function to apply after the warmup for the rest of training.
        warmup_steps (`int`):
            The number of steps for the warmup part of training.
        power (`float`, *optional*, defaults to 1.0):
            The power to use for the polynomial warmup (default is a linear warmup).
        name (`str`, *optional*):
            Optional name prefix for the returned tensors during the schedule.
    """

    def __init__(
        self,
        initial_learning_rate: float,
        decay_schedule_fn: Callable,
        warmup_steps: int,
        power: float = 1.0,
        name: str = None,
    ):
        super().__init__()
        self.initial_learning_rate = initial_learning_rate
        self.warmup_steps = warmup_steps
        self.power = power
        self.decay_schedule_fn = decay_schedule_fn
        self.name = name
    # Implement the warmup policy when the schedule object is called
    def __call__(self, step):
        # Use the given name scope, defaulting to "WarmUp"
        with tf.name_scope(self.name or "WarmUp") as name:
            # Cast the global step to float
            global_step_float = tf.cast(step, tf.float32)
            # Cast the number of warmup steps to float
            warmup_steps_float = tf.cast(self.warmup_steps, tf.float32)
            # Fraction of the warmup phase completed so far
            warmup_percent_done = global_step_float / warmup_steps_float
            # Polynomial (by default linear) warmup learning rate
            warmup_learning_rate = self.initial_learning_rate * tf.math.pow(warmup_percent_done, self.power)
            # Return the warmup LR while still warming up, otherwise defer to the decay schedule
            return tf.cond(
                global_step_float < warmup_steps_float,
                lambda: warmup_learning_rate,
                lambda: self.decay_schedule_fn(step - self.warmup_steps),
                name=name,
            )

    # Return this schedule's configuration as a dict
    def get_config(self):
        return {
            "initial_learning_rate": self.initial_learning_rate,
            "decay_schedule_fn": self.decay_schedule_fn,
            "warmup_steps": self.warmup_steps,
            "power": self.power,
            "name": self.name,
        }
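
A minimal sketch of the schedule in isolation, using the WarmUp class defined above (TensorFlow assumed available; the numbers are chosen only for illustration):

```
import tensorflow as tf

decay = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1e-3, decay_steps=900, end_learning_rate=0.0
)
warmup = WarmUp(initial_learning_rate=1e-3, decay_schedule_fn=decay, warmup_steps=100)

print(float(warmup(50)))   # mid-warmup: 5e-4 (linear warmup, power=1.0)
print(float(warmup(100)))  # warmup boundary: hands off to decay(0) = 1e-3
```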
# Create an optimizer with a learning rate schedule that uses a warmup phase followed by a linear decay.

def create_optimizer(
    init_lr: float,
    num_train_steps: int,
    num_warmup_steps: int,
    min_lr_ratio: float = 0.0,
    adam_beta1: float = 0.9,
    adam_beta2: float = 0.999,
    adam_epsilon: float = 1e-8,
    adam_clipnorm: Optional[float] = None,
    adam_global_clipnorm: Optional[float] = None,
    weight_decay_rate: float = 0.0,
    power: float = 1.0,
    include_in_weight_decay: Optional[List[str]] = None,
):
    """
    创建一个优化器,并使用热身阶段后的线性衰减学习率计划。

    Args:
        init_lr (`float`):
            热身阶段结束时的初始学习率。
        num_train_steps (`int`):
            总训练步数。
        num_warmup_steps (`int`):
            热身步数。
        min_lr_ratio (`float`, *optional*, defaults to 0):
            线性衰减结束时的最终学习率将为 `init_lr * min_lr_ratio`。
        adam_beta1 (`float`, *optional*, defaults to 0.9):
            Adam优化器中的beta1参数。
        adam_beta2 (`float`, *optional*, defaults to 0.999):
            Adam优化器中的beta2参数。
        adam_epsilon (`float`, *optional*, defaults to 1e-8):
            Adam优化器中的epsilon参数。
        adam_clipnorm (`float`, *optional*, defaults to `None`):
            如果不为`None`,则对每个权重张量的梯度范数进行裁剪。
        adam_global_clipnorm (`float`, *optional*, defaults to `None`):
            如果不为`None`,则将梯度范数裁剪到此值。使用此参数时,梯度范数计算为所有权重张量的向量化结果。
        weight_decay_rate (`float`, *optional*, defaults to 0):
            使用的权重衰减率。
        power (`float`, *optional*, defaults to 1.0):
            PolynomialDecay中使用的幂次数。
        include_in_weight_decay (`List[str]`, *optional*):
            要应用权重衰减的参数名称列表(或正则表达式模式)。如果未传入,则权重衰减将应用于除偏置和层归一化参数之外的所有参数。
    """
    # Implement the linear decay of the learning rate.
    lr_schedule = schedules.PolynomialDecay(
        initial_learning_rate=init_lr,
        decay_steps=num_train_steps - num_warmup_steps,
        end_learning_rate=init_lr * min_lr_ratio,
        power=power,
    )
    # If warmup steps were requested, wrap the schedule in a WarmUp object.
    if num_warmup_steps:
        lr_schedule = WarmUp(
            initial_learning_rate=init_lr,
            decay_schedule_fn=lr_schedule,
            warmup_steps=num_warmup_steps,
        )
    # With a positive weight decay rate, use the Adam variant with decoupled weight decay
    if weight_decay_rate > 0.0:
        optimizer = AdamWeightDecay(
            learning_rate=lr_schedule,                 # learning rate schedule
            weight_decay_rate=weight_decay_rate,       # weight decay rate
            beta_1=adam_beta1,                         # Adam beta_1
            beta_2=adam_beta2,                         # Adam beta_2
            epsilon=adam_epsilon,                      # Adam epsilon
            clipnorm=adam_clipnorm,                    # per-tensor gradient norm clipping
            global_clipnorm=adam_global_clipnorm,      # global gradient norm clipping
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],  # parameters excluded from weight decay
            include_in_weight_decay=include_in_weight_decay,  # parameters explicitly included in weight decay
        )
    else:
        # Otherwise fall back to plain Adam
        optimizer = keras.optimizers.Adam(
            learning_rate=lr_schedule,                 # learning rate schedule
            beta_1=adam_beta1,                         # Adam beta_1
            beta_2=adam_beta2,                         # Adam beta_2
            epsilon=adam_epsilon,                      # Adam epsilon
            clipnorm=adam_clipnorm,                    # per-tensor gradient norm clipping
            global_clipnorm=adam_global_clipnorm,      # global gradient norm clipping
        )

    # Return both the optimizer and the schedule so the LR can be tracked independently
    return optimizer, lr_schedule
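
A hedged usage sketch: build the optimizer/schedule pair for a 10,000-step run with 1,000 warmup steps and decoupled weight decay (the function is also exported at the top level of `transformers`):

```
from transformers import create_optimizer

optimizer, lr_schedule = create_optimizer(
    init_lr=5e-5,
    num_train_steps=10_000,
    num_warmup_steps=1_000,
    weight_decay_rate=0.01,
)
# model.compile(optimizer=optimizer, ...) then uses the pair as usual
```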
class AdamWeightDecay(Adam):
    """
    Adam enables L2 weight decay and clip_by_global_norm on gradients. Just adding the square of the weights to the
    loss function is *not* the correct way of using L2 regularization/weight decay with Adam, since that will interact
    with the m and v parameters in strange ways as shown in [Decoupled Weight Decay
    Regularization](https://arxiv.org/abs/1711.05101).

    Instead we want to decay the weights in a manner that doesn't interact with the m/v parameters. This is equivalent
    to adding the square of the weights to the loss with plain (non-momentum) SGD.

    Args:
        learning_rate (`Union[float, LearningRateSchedule]`, *optional*, defaults to 0.001):
            The learning rate to use or a schedule.
        beta_1 (`float`, *optional*, defaults to 0.9):
            The beta1 parameter in Adam, which is the exponential decay rate for the 1st momentum estimates.
        beta_2 (`float`, *optional*, defaults to 0.999):
            The beta2 parameter in Adam, which is the exponential decay rate for the 2nd momentum estimates.
        epsilon (`float`, *optional*, defaults to 1e-07):
            The epsilon parameter in Adam, which is a small constant for numerical stability.
        amsgrad (`bool`, *optional*, defaults to `False`):
            Whether to apply AMSGrad variant of this algorithm or not, see [On the Convergence of Adam and
            Beyond](https://arxiv.org/abs/1904.09237).
        weight_decay_rate (`float`, *optional*, defaults to 0.0):
            The weight decay to apply.
        include_in_weight_decay (`List[str]`, *optional*):
            List of the parameter names (or re patterns) to apply weight decay to. If none is passed, weight decay is
            applied to all parameters by default (unless they are in `exclude_from_weight_decay`).
        exclude_from_weight_decay (`List[str]`, *optional*):
            List of the parameter names (or re patterns) to exclude from applying weight decay to. If a
            `include_in_weight_decay` is passed, the names in it will supersede this list.
        name (`str`, *optional*, defaults to `"AdamWeightDecay"`):
            Optional name for the operations created when applying gradients.
        kwargs (`Dict[str, Any]`, *optional*):
            Keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`, `decay`}. `clipnorm` is clip gradients by
            norm; `clipvalue` is clip gradients by value, `decay` is included for backward compatibility to allow time
            inverse decay of learning rate. `lr` is included for backward compatibility, recommended to use
            `learning_rate` instead.
    """

    # Extension of Adam that applies decoupled L2 weight decay to the variables; see the methods below
    def __init__(
        self,
        learning_rate: Union[float, schedules.LearningRateSchedule] = 0.001,
        beta_1: float = 0.9,
        beta_2: float = 0.999,
        epsilon: float = 1e-7,
        amsgrad: bool = False,
        weight_decay_rate: float = 0.0,
        include_in_weight_decay: Optional[List[str]] = None,
        exclude_from_weight_decay: Optional[List[str]] = None,
        name: str = "AdamWeightDecay",
        **kwargs,
    ):
        # Call the parent constructor to initialize the base optimizer parameters
        super().__init__(learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs)
        # The weight decay rate to apply
        self.weight_decay_rate = weight_decay_rate
        # Parameter name patterns that should receive weight decay
        self._include_in_weight_decay = include_in_weight_decay
        # Parameter name patterns that should be excluded from weight decay
        self._exclude_from_weight_decay = exclude_from_weight_decay

    @classmethod
    def from_config(cls, config):
        """从配置中创建优化器,并添加WarmUp自定义对象。"""
        custom_objects = {"WarmUp": WarmUp}
        return super(AdamWeightDecay, cls).from_config(config, custom_objects=custom_objects)

    def _prepare_local(self, var_device, var_dtype, apply_state):
        # Let the parent class prepare its local values first
        super(AdamWeightDecay, self)._prepare_local(var_device, var_dtype, apply_state)
        # Store the weight decay rate as a constant in the apply state
        apply_state[(var_device, var_dtype)]["weight_decay_rate"] = tf.constant(
            self.weight_decay_rate, name="adam_weight_decay_rate"
        )

    def _decay_weights_op(self, var, learning_rate, apply_state):
        # Check whether this variable should receive weight decay
        do_decay = self._do_use_weight_decay(var.name)
        if do_decay:
            # If so, subtract the decay term from the variable in place
            return var.assign_sub(
                learning_rate * var * apply_state[(var.device, var.dtype.base_dtype)]["weight_decay_rate"],
                use_locking=self._use_locking,
            )
        # Otherwise return a no-op
        return tf.no_op()

    def apply_gradients(self, grads_and_vars, name=None, **kwargs):
        # Unzip the gradients and variables, then delegate to the parent implementation
        grads, tvars = list(zip(*grads_and_vars))
        return super(AdamWeightDecay, self).apply_gradients(zip(grads, tvars), name=name, **kwargs)

    def _get_lr(self, var_device, var_dtype, apply_state):
        """从状态中获取给定变量的学习率。"""
        if apply_state is None:
            return self._decayed_lr_t[var_dtype], {}

        apply_state = apply_state or {}
        coefficients = apply_state.get((var_device, var_dtype))
        if coefficients is None:
            coefficients = self._fallback_apply_state(var_device, var_dtype)
            apply_state[(var_device, var_dtype)] = coefficients

        return coefficients["lr_t"], {"apply_state": apply_state}

    def _resource_apply_dense(self, grad, var, apply_state=None):
        # Fetch the learning rate, run the weight decay op, then the dense Adam update
        lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
        decay = self._decay_weights_op(var, lr_t, apply_state)
        with tf.control_dependencies([decay]):
            return super(AdamWeightDecay, self)._resource_apply_dense(grad, var, **kwargs)
    # Apply the optimizer update for sparse gradients
    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        # Fetch the learning rate and extra kwargs
        lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
        # Compute the weight decay operation
        decay = self._decay_weights_op(var, lr_t, apply_state)
        # Make sure weight decay has run before the sparse Adam update
        with tf.control_dependencies([decay]):
            # Delegate the sparse update to the parent class
            return super(AdamWeightDecay, self)._resource_apply_sparse(grad, var, indices, **kwargs)

    # Return this optimizer's configuration
    def get_config(self):
        # Start from the parent configuration
        config = super().get_config()
        # Add the weight decay rate
        config.update({"weight_decay_rate": self.weight_decay_rate})
        return config

    # Decide whether L2 weight decay applies to a given parameter name
    def _do_use_weight_decay(self, param_name):
        """Whether to use L2 weight decay for `param_name`."""
        # No weight decay at all if the rate is zero
        if self.weight_decay_rate == 0:
            return False

        # Names matching an include pattern always receive weight decay
        if self._include_in_weight_decay:
            for r in self._include_in_weight_decay:
                if re.search(r, param_name) is not None:
                    return True

        # Names matching an exclude pattern are skipped
        if self._exclude_from_weight_decay:
            for r in self._exclude_from_weight_decay:
                if re.search(r, param_name) is not None:
                    return False
        # By default, weight decay applies
        return True
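
A small sketch of the include/exclude precedence, calling the private helper directly purely for illustration (the variable names are hypothetical):

```
opt = AdamWeightDecay(
    weight_decay_rate=0.01,
    include_in_weight_decay=[r"embeddings"],
    exclude_from_weight_decay=[r"LayerNorm", r"bias"],
)
print(opt._do_use_weight_decay("bert/embeddings/word_embeddings"))  # True: include pattern wins
print(opt._do_use_weight_decay("bert/encoder/LayerNorm/gamma"))     # False: excluded
print(opt._do_use_weight_decay("bert/encoder/dense/kernel"))        # True: the default
```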
# Extracted from https://github.com/OpenNMT/OpenNMT-tf/blob/master/opennmt/optimizers/utils.py

class GradientAccumulator:
    """
    Gradient accumulation utility. When used with a distribution strategy, the accumulator should be called in a
    replica context. Gradients will be accumulated locally on each replica and without synchronization. Users should
    then call `.gradients`, scale the gradients if required, and pass the result to `apply_gradients`.
    """

    # We use the ON_READ synchronization policy so that no synchronization is performed on assignment. To get the
    # value, we call .value() which returns the value on the current replica without synchronization.

    def __init__(self):
        """Initializes the accumulator."""
        self._gradients = []  # list of accumulated gradient variables
        self._accum_steps = None  # number of accumulation steps

    @property
    def step(self):
        """Number of accumulated steps."""
        if self._accum_steps is None:
            self._accum_steps = tf.Variable(
                tf.constant(0, dtype=tf.int64),
                trainable=False,
                synchronization=tf.VariableSynchronization.ON_READ,
                aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
            )
        return self._accum_steps.value()

    @property
    def gradients(self):
        """The accumulated gradients on the current replica."""
        if not self._gradients:
            raise ValueError("The accumulator should be called first to initialize the gradients")
        return [gradient.value() if gradient is not None else gradient for gradient in self._gradients]

    def __call__(self, gradients):
        """Accumulates `gradients` on the current replica."""
        if not self._gradients:
            _ = self.step  # Create the step variable.
            self._gradients.extend(
                [
                    tf.Variable(
                        tf.zeros_like(gradient),
                        trainable=False,
                        synchronization=tf.VariableSynchronization.ON_READ,
                        aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
                    )
                    if gradient is not None
                    else gradient
                    for gradient in gradients
                ]
            )
        if len(gradients) != len(self._gradients):
            raise ValueError(f"Expected {len(self._gradients)} gradients, but got {len(gradients)}")

        for accum_gradient, gradient in zip(self._gradients, gradients):
            if accum_gradient is not None and gradient is not None:
                accum_gradient.assign_add(gradient)

        self._accum_steps.assign_add(1)

    def reset(self):
        """Resets the accumulated gradients on the current replica."""
        if not self._gradients:
            return
        self._accum_steps.assign(0)
        for gradient in self._gradients:
            if gradient is not None:
                gradient.assign(tf.zeros_like(gradient))
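
A minimal accumulation loop using the class above (an eager-mode sketch; real training code would wrap the inner step in `tf.function` and handle distribution strategies; `tf.keras.optimizers.legacy.Adam` assumes TF >= 2.11, use plain `tf.keras.optimizers.Adam` on older versions):

```
import tensorflow as tf

model = tf.keras.layers.Dense(1)
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=1e-3)
accumulator = GradientAccumulator()
accum_steps = 4

x = tf.random.normal((8, 3))
y = tf.random.normal((8, 1))

for i in range(8):
    with tf.GradientTape() as tape:
        loss = tf.reduce_mean((model(x[i : i + 1]) - y[i : i + 1]) ** 2)
    accumulator(tape.gradient(loss, model.trainable_variables))
    if int(accumulator.step) % accum_steps == 0:
        # Average the accumulated gradients before applying them, then reset
        grads = [g / accum_steps for g in accumulator.gradients]
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        accumulator.reset()
```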