Transformers-源码解析-四十五-

Transformers 源码解析（四十五）

`.\models\encodec\feature_extraction_encodec.py`

# 指定代码文件的编码格式为UTF-8

# 版权声明，声明此代码版权归HuggingFace Inc.团队所有，保留所有权利

# 根据Apache License, Version 2.0许可证使用本文件。您除非遵守许可证，否则不得使用本文件。
# 您可以在以下网址获取许可证副本：http://www.apache.org/licenses/LICENSE-2.0

# 如果适用法律要求或书面同意，软件按"原样"分发，不附带任何明示或暗示的担保或条件。

# 导入必要的库
"""Feature extractor class for EnCodec."""

from typing import List, Optional, Union

import numpy as np  # 导入NumPy库

# 导入相关的特征提取工具和实用函数
from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
from ...feature_extraction_utils import BatchFeature
from ...utils import PaddingStrategy, TensorType, logging

# 获取日志记录器对象
logger = logging.get_logger(__name__)

# 定义 EncodecFeatureExtractor 类，继承自 SequenceFeatureExtractor 类
class EncodecFeatureExtractor(SequenceFeatureExtractor):
    r"""
    Constructs an EnCodec feature extractor.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    Instantiating a feature extractor with the defaults will yield a similar configuration to that of the
    [facebook/encodec_24khz](https://huggingface.co/facebook/encodec_24khz) architecture.

    Args:
        feature_size (`int`, *optional*, defaults to 1):
            The feature dimension of the extracted features. Use 1 for mono, 2 for stereo.
        sampling_rate (`int`, *optional*, defaults to 24000):
            The sampling rate at which the audio waveform should be digitalized expressed in hertz (Hz).
        padding_value (`float`, *optional*, defaults to 0.0):
            The value that is used to fill the padding values.
        chunk_length_s (`float`, *optional*):
            If defined the audio is pre-processed into chunks of lengths `chunk_length_s` and then encoded.
        overlap (`float`, *optional*):
            Defines the overlap between each chunk. It is used to compute the `chunk_stride` using the following
            formulae : `int((1.0 - self.overlap) * self.chunk_length)`.

    """

    # 模型输入的名称列表
    model_input_names = ["input_values", "padding_mask"]

    # 构造函数，初始化EnCodecFeatureExtractor对象
    def __init__(
        self,
        feature_size: int = 1,  # feature dimension, defaults to 1
        sampling_rate: int = 24000,  # sampling rate, defaults to 24000
        padding_value: float = 0.0,  # padding value, defaults to 0.0
        chunk_length_s: Optional[float] = None,  # chunk length in seconds, optional
        overlap: Optional[float] = None,  # overlap between chunks, optional
        **kwargs,  # any other keyword arguments
    ):
        """Build the feature extractor.

        `feature_size`, `sampling_rate`, `padding_value` and any extra keyword arguments are forwarded to the
        parent constructor; `chunk_length_s` and `overlap` are stored on the instance as given.
        """
        # Let the parent class handle the common feature-extractor arguments.
        super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
        # Chunk length in seconds (`None` leaves `chunk_length` undefined).
        self.chunk_length_s = chunk_length_s
        # Overlap between chunks (`None` leaves `chunk_stride` undefined).
        self.overlap = overlap

    # chunk_length 属性：每次访问时根据 chunk_length_s 与 sampling_rate 计算得到，因此动态修改 chunk_length_s 后其值会随之变化
    @property
    def chunk_length(self) -> Optional[int]:
        """Chunk length as `int(chunk_length_s * sampling_rate)`, or `None` when `chunk_length_s` is unset.

        The value is recomputed on every access, so changing `chunk_length_s` is reflected immediately.
        """
        seconds = self.chunk_length_s
        if seconds is None:
            # No chunk duration configured: the length is undefined.
            return None
        return int(seconds * self.sampling_rate)

    # 这是一个属性，因为你可能想动态更改 chunk_length_s
    @property
    def chunk_stride(self) -> Optional[int]:
        """Stride between consecutive chunks, or `None` when chunking is not fully configured.

        Computed as `max(1, int((1.0 - overlap) * chunk_length))`, so the stride is never smaller than 1.
        """
        chunking_configured = self.chunk_length_s is not None and self.overlap is not None
        if not chunking_configured:
            # Either the chunk duration or the overlap is missing: the stride is undefined.
            return None
        stride = int((1.0 - self.overlap) * self.chunk_length)
        return max(1, stride)

    # 函数调用运算符重载，用于将音频数据处理成模型所需格式
    def __call__(
        self,
        raw_audio: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
        padding: Optional[Union[bool, str, PaddingStrategy]] = None,
        truncation: Optional[bool] = False,
        max_length: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        sampling_rate: Optional[int] = None,

`.\models\encodec\modeling_encodec.py`

# coding=utf-8
# 版权 2023 Meta Platforms, Inc. 及其关联公司以及 HuggingFace Inc. 团队保留所有权利。
#
# 根据 Apache 许可证版本 2.0 （“许可证”）获得许可；
# 除非符合许可证，否则不得使用此文件。
# 您可以在以下网址获取许可证的副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则依据许可证分发的软件是按“原样”分发的，
# 没有任何明示或暗示的保证或条件。
# 请参阅许可证了解具体语言的权限和限制。
""" PyTorch EnCodec model."""

import math
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...modeling_utils import PreTrainedModel
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_encodec import EncodecConfig

# Module-level logger, named after this module.
logger = logging.get_logger(__name__)

# General docstring
# Name of the configuration class as a string (presumably consumed by the docstring decorators imported above —
# confirm against their use further down the file).
_CONFIG_FOR_DOC = "EncodecConfig"

# Hub identifiers of the EnCodec checkpoints listed for this model.
ENCODEC_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "facebook/encodec_24khz",
    "facebook/encodec_48khz",
    # See all EnCodec models at https://huggingface.co/models?filter=encodec
]

@dataclass
class EncodecOutput(ModelOutput):
    """
    Args:
        audio_codes (`torch.LongTensor`  of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
            Discrete code embeddings computed using `model.encode`.
        audio_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Decoded audio values, obtained using the decoder part of Encodec.
    """
    audio_codes: Optional[torch.LongTensor] = None
    audio_values: Optional[torch.FloatTensor] = None


@dataclass
class EncodecEncoderOutput(ModelOutput):
    """
    Args:
        audio_codes (`torch.LongTensor`  of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
            Discrete code embeddings computed using `model.encode`.
        audio_scales (`torch.Tensor` of shape `(batch_size, nb_chunks)`, *optional*):
            Scaling factor for each `audio_codes` input. This is used to unscale each chunk of audio when decoding.
    """
    audio_codes: Optional[torch.LongTensor] = None
    audio_scales: Optional[torch.FloatTensor] = None


@dataclass
class EncodecDecoderOutput(ModelOutput):
    """
    Args:
        audio_values (`torch.FloatTensor`  of shape `(batch_size, segment_length)`, *optional*):
            Decoded audio values, obtained using the decoder part of Encodec.
    """
    audio_values: Optional[torch.FloatTensor] = None


class EncodecConv1d(nn.Module):
    """Conv1d with asymmetric or causal padding and normalization."""

    def __init__(
        self, config, in_channels: int, out_channels: int, kernel_size: int, stride: int = 1, dilation: int = 1
    ):
        """Build the convolution and, depending on `config.norm_type`, its normalization.

        Args:
            config: Configuration object; `use_causal_conv`, `pad_mode` and `norm_type` are read from it.
            in_channels: Number of input channels of the convolution.
            out_channels: Number of output channels of the convolution.
            kernel_size: Kernel size of the convolution.
            stride: Stride of the convolution.
            dilation: Dilation of the convolution.

        Raises:
            ValueError: If `config.norm_type` is neither `"weight_norm"` nor `"time_group_norm"`.
        """
        # `nn.Module` must be initialized exactly once, before any attribute is assigned.
        super().__init__()
        # Keep the construction arguments available on the module.
        self.config = config
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.dilation = dilation

        self.causal = config.use_causal_conv  # whether to pad on the left only
        self.pad_mode = config.pad_mode  # padding mode forwarded to `_pad1d`
        self.norm_type = config.norm_type  # which normalization to apply

        if self.norm_type not in ["weight_norm", "time_group_norm"]:
            raise ValueError(
                f'self.norm_type must be one of `"weight_norm"`, `"time_group_norm"`), got {self.norm_type}'
            )

        # warn user on unusual setup between dilation and stride
        if stride > 1 and dilation > 1:
            logger.warning(
                "EncodecConv1d has been initialized with stride > 1 and dilation > 1"
                f" (kernel_size={kernel_size} stride={stride}, dilation={dilation})."
            )

        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride, dilation=dilation)
        if self.norm_type == "weight_norm":
            # Weight normalization wraps the convolution itself.
            self.conv = nn.utils.weight_norm(self.conv)
        elif self.norm_type == "time_group_norm":
            # A single-group GroupNorm, applied to the convolution output in `forward`.
            self.norm = nn.GroupNorm(1, out_channels)

    @staticmethod
    def _get_extra_padding_for_conv1d(
        hidden_states: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0
    ) -> int:
        """See `pad_for_conv1d`.

        Returns the number of extra samples needed so that the (possibly fractional) frame count
        `(length - kernel_size + padding_total) / stride + 1` is rounded up to a whole number of frames.
        """
        length = hidden_states.shape[-1]
        n_frames = (length - kernel_size + padding_total) / stride + 1
        ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total)
        return ideal_length - length

    @staticmethod
    def _pad1d(hidden_states: torch.Tensor, paddings: Tuple[int, int], mode: str = "zero", value: float = 0.0):
        """Tiny wrapper around torch.nn.functional.pad, just to allow for reflect padding on small input.
        If this is the case, we insert extra 0 padding to the right before the reflection happens.
        """
        # NOTE(review): for any mode other than "reflect" the mode string is forwarded unchanged to
        # `nn.functional.pad`; confirm that the default "zero" is a mode it accepts.
        length = hidden_states.shape[-1]
        padding_left, padding_right = paddings
        if not mode == "reflect":
            return nn.functional.pad(hidden_states, paddings, mode, value)

        max_pad = max(padding_left, padding_right)
        extra_pad = 0
        if length <= max_pad:
            # Make the input one sample longer than the largest pad, using zeros on the right ...
            extra_pad = max_pad - length + 1
            hidden_states = nn.functional.pad(hidden_states, (0, extra_pad))
        padded = nn.functional.pad(hidden_states, paddings, mode, value)
        # ... then drop the same number of trailing samples from the padded result.
        end = padded.shape[-1] - extra_pad
        return padded[..., :end]

    def forward(self, hidden_states):
        """Pad `hidden_states` along the last dimension, apply the convolution, then the optional norm."""
        kernel_size = self.conv.kernel_size[0]
        stride = self.conv.stride[0]
        dilation = self.conv.dilation[0]
        kernel_size = (kernel_size - 1) * dilation + 1  # effective kernel size with dilations
        padding_total = kernel_size - stride
        extra_padding = self._get_extra_padding_for_conv1d(hidden_states, kernel_size, stride, padding_total)

        if self.causal:
            # Left padding for causal
            hidden_states = self._pad1d(hidden_states, (padding_total, extra_padding), mode=self.pad_mode)
        else:
            # Split the fixed padding between both sides; when it is odd the left side gets the extra sample.
            padding_right = padding_total // 2
            padding_left = padding_total - padding_right
            hidden_states = self._pad1d(
                hidden_states, (padding_left, padding_right + extra_padding), mode=self.pad_mode
            )

        hidden_states = self.conv(hidden_states)

        if self.norm_type == "time_group_norm":
            hidden_states = self.norm(hidden_states)

        return hidden_states
class EncodecConvTranspose1d(nn.Module):
    """ConvTranspose1d with asymmetric or causal padding and normalization."""

    def __init__(self, config, in_channels: int, out_channels: int, kernel_size: int, stride: int = 1):
        """Build the transposed convolution and, depending on `config.norm_type`, its normalization.

        Raises:
            ValueError: If `config.norm_type` is not supported, or if `config.trim_right_ratio` differs from
                1.0 while `config.use_causal_conv` is false.
        """
        super().__init__()
        self.causal = config.use_causal_conv
        self.trim_right_ratio = config.trim_right_ratio
        self.norm_type = config.norm_type

        supported_norms = ["weight_norm", "time_group_norm"]
        if self.norm_type not in supported_norms:
            raise ValueError(
                f'self.norm_type must be one of `"weight_norm"`, `"time_group_norm"`), got {self.norm_type}'
            )

        transposed_conv = nn.ConvTranspose1d(in_channels, out_channels, kernel_size, stride)
        if config.norm_type == "weight_norm":
            # Weight normalization wraps the transposed convolution itself.
            transposed_conv = nn.utils.weight_norm(transposed_conv)
        self.conv = transposed_conv
        if config.norm_type == "time_group_norm":
            # A single-group GroupNorm, applied to the convolution output in `forward`.
            self.norm = nn.GroupNorm(1, out_channels)

        if not self.causal and self.trim_right_ratio != 1.0:
            raise ValueError("`trim_right_ratio` != 1.0 only makes sense for causal convolutions")

    def forward(self, hidden_states):
        """Apply the transposed convolution (and optional norm), then trim the fixed padding from both ends."""
        padding_total = self.conv.kernel_size[0] - self.conv.stride[0]

        upsampled = self.conv(hidden_states)
        if self.norm_type == "time_group_norm":
            upsampled = self.norm(upsampled)

        # We will only trim fixed padding. Extra padding from `pad_for_conv1d` would be
        # removed at the very end, when keeping only the right length for the output,
        # as removing it here would require also passing the length at the matching layer
        # in the encoder.
        if self.causal:
            # Trim the padding on the right according to the specified ratio
            # if trim_right_ratio = 1.0, trim everything from right
            trim_right = math.ceil(padding_total * self.trim_right_ratio)
        else:
            # Split the trimming between both sides; when the total is odd the left side gets the extra sample.
            trim_right = padding_total // 2
        trim_left = padding_total - trim_right

        # unpad
        stop = upsampled.shape[-1] - trim_right
        return upsampled[..., trim_left:stop]


class EncodecLSTM(nn.Module):
    """
    LSTM without worrying about the hidden state, nor the layout of the data. Expects input as convolutional layout.
    """

    def __init__(self, config, dimension):
        """Create an LSTM with `config.num_lstm_layers` layers whose input and hidden sizes are both `dimension`."""
        super().__init__()
        self.lstm = nn.LSTM(dimension, dimension, config.num_lstm_layers)

    def forward(self, hidden_states):
        """Run the LSTM with a skip connection and return a tensor in the input's dimension order."""
        # Move the last dimension to the front for the LSTM: permute(2, 0, 1).
        sequence_first = hidden_states.permute(2, 0, 1)
        lstm_output = self.lstm(sequence_first)[0]
        # Skip connection around the LSTM, then restore the original dimension order.
        return (lstm_output + sequence_first).permute(1, 2, 0)


class EncodecResnetBlock(nn.Module):
    """
    Residual block from SEANet model as used by EnCodec.
    """

    def __init__(self, config: EncodecConfig, dim: int, dilations: List[int]):
        """Build the ELU/convolution stack and the shortcut branch.

        Args:
            config: Model configuration; `residual_kernel_size`, `compress` and `use_conv_shortcut` are read here.
            dim: Number of channels entering and leaving the block.
            dilations: One dilation per convolution; must contain exactly two entries.

        Raises:
            ValueError: If the number of dilations does not match the number of kernel sizes.
        """
        super().__init__()

        # The block always holds two convolutions: the configured residual kernel, then a kernel of size 1.
        kernel_sizes = (config.residual_kernel_size, 1)
        if len(kernel_sizes) != len(dilations):
            raise ValueError("Number of kernel sizes should match number of dilations")

        # Channel count used between the convolutions of the block.
        hidden = dim // config.compress
        last_index = len(kernel_sizes) - 1

        layers = []
        for index, (kernel_size, dilation) in enumerate(zip(kernel_sizes, dilations)):
            # Only the first convolution reads `dim` channels and only the last one writes `dim` channels.
            in_channels = hidden if index != 0 else dim
            out_channels = dim if index == last_index else hidden
            layers.append(nn.ELU())
            layers.append(EncodecConv1d(config, in_channels, out_channels, kernel_size, dilation=dilation))
        self.block = nn.ModuleList(layers)

        # Shortcut branch: a kernel-size-1 convolution when configured, otherwise the identity.
        if not config.use_conv_shortcut:
            self.shortcut = nn.Identity()
        else:
            self.shortcut = EncodecConv1d(config, dim, dim, kernel_size=1)

    def forward(self, hidden_states):
        """Return `shortcut(input) + block(input)`."""
        residual = hidden_states
        for sub_layer in self.block:
            hidden_states = sub_layer(hidden_states)

        return self.shortcut(residual) + hidden_states
class EncodecEncoder(nn.Module):
    """SEANet encoder as used by EnCodec."""

    def __init__(self, config: EncodecConfig):
        """Assemble the encoder stack: input conv, downsampling stages, LSTM, ELU and output conv."""
        super().__init__()
        layers = [EncodecConv1d(config, config.audio_channels, config.num_filters, config.kernel_size)]
        scaling = 1

        # Downsample to raw audio scale
        for ratio in reversed(config.upsampling_ratios):
            channels = scaling * config.num_filters
            # Add residual layers
            layers.extend(
                EncodecResnetBlock(config, channels, [config.dilation_growth_rate**j, 1])
                for j in range(config.num_residual_layers)
            )
            # Add downsampling layers
            layers.append(nn.ELU())
            layers.append(EncodecConv1d(config, channels, channels * 2, kernel_size=ratio * 2, stride=ratio))
            # The channel multiplier doubles after every stage.
            scaling *= 2

        layers.append(EncodecLSTM(config, scaling * config.num_filters))
        layers.append(nn.ELU())
        layers.append(EncodecConv1d(config, scaling * config.num_filters, config.hidden_size, config.last_kernel_size))

        self.layers = nn.ModuleList(layers)

    def forward(self, hidden_states):
        """Apply every layer of the stack in order."""
        output = hidden_states
        for sub_layer in self.layers:
            output = sub_layer(output)
        return output


class EncodecDecoder(nn.Module):
    """SEANet decoder as used by EnCodec."""

    def __init__(self, config: EncodecConfig):
        """Assemble the decoder stack: input conv, LSTM, upsampling stages, ELU and output conv."""
        super().__init__()
        # Start from the widest channel multiplier; it is halved after every upsampling stage.
        scaling = int(2 ** len(config.upsampling_ratios))
        layers = [
            EncodecConv1d(config, config.hidden_size, scaling * config.num_filters, config.kernel_size),
            EncodecLSTM(config, scaling * config.num_filters),
        ]

        # Upsample to raw audio scale
        for ratio in config.upsampling_ratios:
            channels = scaling * config.num_filters
            half_channels = channels // 2
            # Add upsampling layers
            layers.append(nn.ELU())
            layers.append(
                EncodecConvTranspose1d(config, channels, half_channels, kernel_size=ratio * 2, stride=ratio)
            )
            # Add residual layers
            layers.extend(
                EncodecResnetBlock(config, half_channels, (config.dilation_growth_rate**j, 1))
                for j in range(config.num_residual_layers)
            )
            scaling //= 2

        # Add final layers
        layers.append(nn.ELU())
        layers.append(EncodecConv1d(config, config.num_filters, config.audio_channels, config.last_kernel_size))
        self.layers = nn.ModuleList(layers)

    def forward(self, hidden_states):
        """Apply every layer of the stack in order."""
        output = hidden_states
        for sub_layer in self.layers:
            output = sub_layer(output)
        return output


class EncodecEuclideanCodebook(nn.Module):
    """Codebook with Euclidean distance."""

    def __init__(self, config: EncodecConfig):
        """Register the codebook buffers, all sized from `config.codebook_size` / `config.codebook_dim`."""
        super().__init__()
        self.codebook_size = config.codebook_size

        # Zero-initialized codebook of shape (codebook_size, codebook_dim).
        initial_embed = torch.zeros(config.codebook_size, config.codebook_dim)

        # Buffers are registered in this order: inited, cluster_size, embed, embed_avg.
        self.register_buffer("inited", torch.Tensor([True]))
        self.register_buffer("cluster_size", torch.zeros(config.codebook_size))
        self.register_buffer("embed", initial_embed)
        self.register_buffer("embed_avg", initial_embed.clone())

    def quantize(self, hidden_states):
        """Return, for each row of `hidden_states`, the index of the codebook entry with the largest score.

        The score is the negated squared Euclidean distance, expanded as `-(|x|^2 - 2 x.e + |e|^2)`.
        """
        codebook_t = self.embed.t()
        states_sq_norm = hidden_states.pow(2).sum(1, keepdim=True)
        codes_sq_norm = codebook_t.pow(2).sum(0, keepdim=True)
        dist = -(states_sq_norm - 2 * hidden_states @ codebook_t + codes_sq_norm)
        return dist.max(dim=-1).indices

    def encode(self, hidden_states):
        """Quantize over the last dimension and return indices shaped like the input minus that dimension."""
        original_shape = hidden_states.shape
        # pre-process: flatten everything but the last dimension
        flattened = hidden_states.reshape((-1, original_shape[-1]))
        # quantize
        flat_indices = self.quantize(flattened)
        # post-process: restore the leading dimensions
        return flat_indices.view(*original_shape[:-1])

    def decode(self, embed_ind):
        """Look up the codebook vectors for the given indices."""
        return nn.functional.embedding(embed_ind, self.embed)
class EncodecVectorQuantization(nn.Module):
    """
    Vector quantization implementation. Currently supports only euclidean distance.
    """

    def __init__(self, config: EncodecConfig):
        """Create the underlying `EncodecEuclideanCodebook`."""
        super().__init__()
        self.codebook = EncodecEuclideanCodebook(config)

    def encode(self, hidden_states):
        """Swap the last two dimensions, then return the codebook indices."""
        transposed = hidden_states.permute(0, 2, 1)
        return self.codebook.encode(transposed)

    def decode(self, embed_ind):
        """Look up the codebook vectors, then swap the last two dimensions of the result."""
        looked_up = self.codebook.decode(embed_ind)
        return looked_up.permute(0, 2, 1)


class EncodecResidualVectorQuantizer(nn.Module):
    """Residual Vector Quantizer."""

    def __init__(self, config: EncodecConfig):
        """Create `config.num_quantizers` quantization layers and keep the sizes needed for bandwidth math."""
        super().__init__()
        self.codebook_size = config.codebook_size
        self.frame_rate = config.frame_rate
        self.num_quantizers = config.num_quantizers
        quantizers = [EncodecVectorQuantization(config) for _ in range(config.num_quantizers)]
        self.layers = nn.ModuleList(quantizers)

    def get_num_quantizers_for_bandwidth(self, bandwidth: Optional[float] = None) -> int:
        """Return num_quantizers based on specified target bandwidth."""
        # Each quantizer costs log2(codebook_size) * frame_rate.
        bw_per_q = math.log2(self.codebook_size) * self.frame_rate
        if bandwidth is None or not bandwidth > 0.0:
            # No usable target: keep every quantizer.
            return self.num_quantizers
        # `bandwidth` is multiplied by 1000 before dividing by the per-quantizer cost; at least one
        # quantizer is always kept.
        return int(max(1, math.floor(bandwidth * 1000 / bw_per_q)))

    def encode(self, embeddings: torch.Tensor, bandwidth: Optional[float] = None) -> torch.Tensor:
        """
        Encode a given input tensor with the specified frame rate at the given bandwidth. The RVQ encode method sets
        the appropriate number of quantizers to use and returns indices for each quantizer.
        """
        active_layers = self.layers[: self.get_num_quantizers_for_bandwidth(bandwidth)]
        remainder = embeddings
        per_layer_indices = []
        for quantizer in active_layers:
            # Each layer quantizes what the previous layers left unexplained.
            layer_indices = quantizer.encode(remainder)
            remainder = remainder - quantizer.decode(layer_indices)
            per_layer_indices.append(layer_indices)
        # Stack along a new leading dimension, one entry per quantizer used.
        return torch.stack(per_layer_indices)

    def decode(self, codes: torch.Tensor) -> torch.Tensor:
        """Decode the given codes to the quantized representation."""
        total = torch.tensor(0.0, device=codes.device)
        # Entry `i` along the first dimension of `codes` is decoded by layer `i`; the results are summed.
        for layer_index, layer_codes in enumerate(codes):
            total = total + self.layers[layer_index].decode(layer_codes)
        return total


class EncodecPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # Configuration class associated with this model family.
    config_class = EncodecConfig
    # Prefix used for the base model.
    base_model_prefix = "encodec"
    # Name of the main input.
    main_input_name = "input_values"

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            # Normal init with the configured standard deviation; zero bias when present.
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
            # Zero bias and unit weight.
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            return

        if isinstance(module, nn.Conv1d):
            # Kaiming-normal weights; bias drawn uniformly from [-k, k] with
            # k = sqrt(groups / (in_channels * kernel_size)).
            nn.init.kaiming_normal_(module.weight)
            if module.bias is not None:
                bound = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
                nn.init.uniform_(module.bias, a=-bound, b=bound)
            return

        if isinstance(module, nn.Embedding):
            # Normal init with the configured standard deviation; the padding row, if any, is zeroed.
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return

        if isinstance(module, nn.LSTM):
            # Xavier-uniform for parameters whose name contains "weight", zeros for those containing "bias".
            for param_name, parameter in module.named_parameters():
                if "weight" in param_name:
                    nn.init.xavier_uniform_(parameter)
                elif "bias" in param_name:
                    nn.init.constant_(parameter, 0.0)
# Introductory documentation text for the model; it is passed to the `add_start_docstrings` decorator
# later in this file.
ENCODEC_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`EncodecConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Documentation text describing the model's input arguments (presumably attached to the forward method via
# `add_start_docstrings_to_model_forward`, which is imported above — confirm where it is used).
ENCODEC_INPUTS_DOCSTRING = r"""
    Args:
        input_values (`torch.FloatTensor` of shape `(batch_size, channels, sequence_length)`, *optional*):
            Raw audio input converted to Float and padded to the approriate length in order to be encoded using chunks
            of length self.chunk_length and a stride of `config.chunk_stride`.
        padding_mask (`torch.BoolTensor` of shape `(batch_size, channels, sequence_length)`, *optional*):
            Mask to avoid computing scaling factors on padding token indices (can we avoid computing conv on these+).
            Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            <Tip warning={true}>

             `padding_mask` should always be passed, unless the input was truncated or not padded. This is because in
             order to process tensors effectively, the input audio should be padded so that `input_length % stride =
             step` with `step = chunk_length-stride`. This ensures that all chunks are of the same shape

            </Tip>

        bandwidth (`float`, *optional*):
            The target bandwidth. Must be one of `config.target_bandwidths`. If `None`, uses the smallest possible
            bandwidth. bandwidth is represented as a thousandth of what it is, e.g. 6kbps bandwidth is represented as
            `bandwidth == 6.0`
        audio_codes (`torch.LongTensor`  of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
            Discret code embeddings computed using `model.encode`.
        audio_scales (`torch.Tensor` of shape `(batch_size, nb_chunks)`, *optional*):
            Scaling factor for each `audio_codes` input.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

# 应用装饰器函数 add_start_docstrings，添加了关于 EnCodec neural audio codec 模型的描述和 ENCODEC_START_DOCSTRING 的详细文档说明
@add_start_docstrings(
    "The EnCodec neural audio codec model.",
    ENCODEC_START_DOCSTRING,
)
    def __init__(self, config: EncodecConfig):
        """
        Build the encoder, decoder and residual vector quantizer from `config`.

        Args:
            config (`EncodecConfig`): Configuration shared by every sub-module.

        Raises:
            ValueError: If `config.codebook_size` is not a power of 2.
        """
        super().__init__(config)
        self.config = config

        # Sub-modules, all built from the same configuration object.
        self.encoder = EncodecEncoder(config)
        self.decoder = EncodecDecoder(config)
        self.quantizer = EncodecResidualVectorQuantizer(config)

        # Number of bits per codebook; the codebook size must be an exact power of 2.
        codebook_size = self.config.codebook_size
        bits = int(math.log2(codebook_size))
        self.bits_per_codebook = bits
        if 2**bits != codebook_size:
            raise ValueError("The codebook_size must be a power of 2.")

        # Run the remaining initialization steps.
        self.post_init()

    def get_encoder(self):
        # Return the encoder held by this instance.
        return self.encoder

    def get_decoder(self):
        # Return the decoder held by this instance.
        return self.decoder

    def _encode_frame(
        self, input_values: torch.Tensor, bandwidth: float, padding_mask: torch.Tensor
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """
        Encodes the given input using the underlying VQVAE. If `config.normalize` is set to `True` the input is first
        normalized. The padding mask is required to compute the correct scale.

        Args:
            input_values (`torch.Tensor`): Audio frame; its last dimension is the sample axis.
            bandwidth (`float`): Target bandwidth forwarded to the quantizer.
            padding_mask (`torch.Tensor`): Mask multiplied element-wise with `input_values` before the scale is
                computed (only used when `config.normalize` is `True`).

        Returns:
            `(codes, scale)`: the quantizer codes with their first two dimensions swapped, and the normalization
            scale (`None` when `config.normalize` is `False`).

        Raises:
            RuntimeError: If `config.chunk_length_s` is set and the frame lasts longer than that.
        """
        # Frame duration in seconds, derived from the sample count and the sampling rate.
        length = input_values.shape[-1]
        duration = length / self.config.sampling_rate

        # A small tolerance (1e-5) absorbs floating point error in the duration.
        if self.config.chunk_length_s is not None and duration > 1e-5 + self.config.chunk_length_s:
            raise RuntimeError(f"Duration of frame ({duration}) is longer than chunk {self.config.chunk_length_s}")

        scale = None
        if self.config.normalize:
            # Zero out the padded positions in case the padding is non-zero.
            input_values = input_values * padding_mask
            # Average over dimension 1 to obtain a mono signal.
            mono = torch.sum(input_values, 1, keepdim=True) / input_values.shape[1]
            # Root-mean-square over the last dimension; the epsilon avoids a division by zero.
            scale = mono.pow(2).mean(dim=-1, keepdim=True).sqrt() + 1e-8
            input_values = input_values / scale

        # Encode the (possibly normalized) input, then quantize the embeddings into codes.
        embeddings = self.encoder(input_values)
        codes = self.quantizer.encode(embeddings, bandwidth)
        # Swap the first two dimensions of the codes.
        codes = codes.transpose(0, 1)
        return codes, scale

    def encode(
        self,
        input_values: torch.Tensor,
        padding_mask: torch.Tensor = None,
        bandwidth: Optional[float] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor, Optional[torch.Tensor]], EncodecEncoderOutput]:
        """
        Encodes the input audio waveform into discrete codes, chunk by chunk.

        Args:
            input_values (`torch.Tensor` of shape `(batch_size, channels, sequence_length)`):
                Float values of the input audio waveform.
            padding_mask (`torch.Tensor` of shape `(batch_size, channels, sequence_length)`):
                Padding mask used to pad the `input_values`. When `None`, an all-`True` mask is used.
            bandwidth (`float`, *optional*):
                The target bandwidth. Must be one of `config.target_bandwidths`. If `None`, the first entry of
                `config.target_bandwidths` is used. bandwidth is represented as a thousandth of what it is, e.g.
                6kbps bandwidth is represented as bandwidth == 6.0
            return_dict (`bool`, *optional*):
                Whether to return an `EncodecEncoderOutput` instead of a plain tuple. Defaults to
                `config.return_dict`.

        Returns:
            The stacked encoded frames together with the list of per-chunk scales (entries are `None` when the
            input is not normalized), either as a tuple or wrapped in an `EncodecEncoderOutput`.

        Raises:
            ValueError: If the bandwidth is unsupported, the channel count is not 1 or 2, or the input length is not
                padded to fit the chunking scheme.
        """
        if return_dict is None:
            return_dict = self.config.return_dict

        # Fall back to the first configured bandwidth, then validate the choice.
        if bandwidth is None:
            bandwidth = self.config.target_bandwidths[0]
        if bandwidth not in self.config.target_bandwidths:
            raise ValueError(
                f"This model doesn't support the bandwidth {bandwidth}. "
                f"Select one of {self.config.target_bandwidths}."
            )

        _, channels, input_length = input_values.shape
        if not 1 <= channels <= 2:
            raise ValueError(f"Number of audio channels must be 1 or 2, but got {channels}")

        # Without a configured chunk length the whole input is a single, non-overlapping chunk.
        chunk_length = self.config.chunk_length
        if chunk_length is not None:
            stride = self.config.chunk_stride
        else:
            chunk_length = stride = input_length

        # No mask given: treat every position as valid.
        if padding_mask is None:
            padding_mask = torch.ones_like(input_values).bool()

        # The input must satisfy `input_length % stride == chunk_length - stride`.
        step = chunk_length - stride
        if input_length % stride != step:
            raise ValueError(
                "The input length is not properly padded for batched chunked decoding. Make sure to pad the input correctly."
            )

        encoded_frames = []
        scales = []
        for offset in range(0, input_length - step, stride):
            window = slice(offset, offset + chunk_length)
            chunk_mask = padding_mask[..., window].bool()
            chunk = input_values[:, :, window]
            chunk_codes, chunk_scale = self._encode_frame(chunk, bandwidth, chunk_mask)
            encoded_frames.append(chunk_codes)
            scales.append(chunk_scale)

        encoded_frames = torch.stack(encoded_frames)

        if return_dict:
            return EncodecEncoderOutput(encoded_frames, scales)
        return (encoded_frames, scales)
    @staticmethod
    def _linear_overlap_add(frames: List[torch.Tensor], stride: int):
        """
        Combine overlapping frames into one tensor using a triangular (linear fade-in/fade-out) weighting.

        Declared as a static method: the function takes no `self` and is invoked as
        `self._linear_overlap_add(frames, stride)`.

        Args:
            frames (`List[torch.Tensor]`): Frames to combine; they are laid out along the last dimension.
            stride (`int`): Offset between the starts of two consecutive frames.

        Returns:
            `torch.Tensor`: The weighted sum of the frames divided by the summed weights, with a last dimension of
            size `stride * (len(frames) - 1) + frames[-1].shape[-1]`.

        Raises:
            ValueError: If `frames` is empty or some output position receives a total weight of zero.
        """
        # Generic overlap add, with linear fade-in/fade-out, supporting complex scenario
        # e.g., more than 2 frames per position.
        # The core idea is to use a weight function that is a triangle,
        # with a maximum value at the middle of the chunk.
        # We use this weighting when summing the frames, and divide by the sum of weights
        # for each position at the end. Thus:
        #   - if a frame is the only one to cover a position, the weighting is a no-op.
        #   - if 2 frames cover a position:
        #          ...  ...
        #         /   \/   \
        #        /    /\    \
        #            S  T       , i.e. S offset of second frame starts, T end of first frame.
        # Then the weight function for each one is: (t - S), (T - t), with `t` a given offset.
        # After the final normalization, the weight of the second frame at position `t` is
        # (t - S) / (t - S + (T - t)) = (t - S) / (T - S), which is exactly what we want.
        #
        #   - if more than 2 frames overlap at a given point, we hope that by induction
        #      something sensible happens.

        if len(frames) == 0:
            raise ValueError("`frames` cannot be an empty list.")

        # Device, dtype and leading shape (everything but the last dimension) come from the first frame.
        device = frames[0].device
        dtype = frames[0].dtype
        shape = frames[0].shape[:-1]

        # Total output length once every frame is placed `stride` apart.
        total_size = stride * (len(frames) - 1) + frames[-1].shape[-1]

        frame_length = frames[0].shape[-1]

        # Triangular weights: the end points of the linspace are dropped so no weight is exactly zero.
        time_vec = torch.linspace(0, 1, frame_length + 2, device=device, dtype=dtype)[1:-1]
        weight = 0.5 - (time_vec - 0.5).abs()

        sum_weight = torch.zeros(total_size, device=device, dtype=dtype)
        out = torch.zeros(*shape, total_size, device=device, dtype=dtype)
        offset: int = 0

        # Accumulate each weighted frame and its weights at the frame's offset.
        for frame in frames:
            frame_length = frame.shape[-1]
            out[..., offset : offset + frame_length] += weight[:frame_length] * frame
            sum_weight[offset : offset + frame_length] += weight[:frame_length]
            offset += stride

        # Guard against a division by zero in the normalization below.
        if sum_weight.min() == 0:
            raise ValueError(f"`sum_weight` minimum element must be bigger than zero: {sum_weight}`")

        return out / sum_weight
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], EncodecDecoderOutput]:
        """
        Decodes the given frames into an output audio waveform.

        Note that the output might be a bit bigger than the input. In that case, any extra steps at the end can be
        trimmed.

        Args:
            audio_codes (`torch.LongTensor`  of shape `(batch_size, nb_chunks, chunk_length)`, *optional*):
                Discrete code embeddings computed using `model.encode`.
            audio_scales (`torch.Tensor` of shape `(batch_size, nb_chunks)`, *optional*):
                Scaling factor for each `audio_codes` input.
            padding_mask (`torch.Tensor` of shape `(batch_size, channels, sequence_length)`):
                Padding mask used to pad the `input_values`.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

        """
        # Determine whether to return a dictionary output based on provided argument or default configuration
        return_dict = return_dict or self.config.return_dict

        # Retrieve the chunk length from configuration
        chunk_length = self.config.chunk_length

        # If chunk_length is not specified, decode a single frame
        if chunk_length is None:
            if len(audio_codes) != 1:
                raise ValueError(f"Expected one frame, got {len(audio_codes)}")
            # Decode the single frame using the provided audio codes and scales
            audio_values = self._decode_frame(audio_codes[0], audio_scales[0])
        else:
            decoded_frames = []

            # Decode each frame using corresponding codes and scales
            for frame, scale in zip(audio_codes, audio_scales):
                frames = self._decode_frame(frame, scale)
                decoded_frames.append(frames)

            # Combine decoded frames using linear overlap-add method
            audio_values = self._linear_overlap_add(decoded_frames, self.config.chunk_stride or 1)

        # Trim the audio waveform based on the provided padding mask
        if padding_mask is not None and padding_mask.shape[-1] < audio_values.shape[-1]:
            audio_values = audio_values[..., : padding_mask.shape[-1]]

        # Return either a tuple or EncodecDecoderOutput based on return_dict flag
        if not return_dict:
            return (audio_values,)
        return EncodecDecoderOutput(audio_values)
        return_dict = return_dict or self.config.return_dict
        # 如果 return_dict 为 None，则使用 self.config.return_dict 的值作为默认值

        if padding_mask is None:
            padding_mask = torch.ones_like(input_values).bool()
        # 如果 padding_mask 为 None，则创建一个与 input_values 维度相同的全为 True 的布尔张量作为 padding_mask

        if audio_codes is not None and audio_scales is None:
            raise ValueError("You specified `audio_codes` but did not specify the `audio_scales`")
        # 如果指定了 audio_codes 但未指定 audio_scales，则抛出 ValueError 异常

        if audio_scales is not None and audio_codes is None:
            raise ValueError("You specified `audio_scales` but did not specify the `audio_codes`")
        # 如果指定了 audio_scales 但未指定 audio_codes，则抛出 ValueError 异常

        if audio_scales is None and audio_codes is None:
            audio_codes, audio_scales = self.encode(input_values, padding_mask, bandwidth, False)
        # 如果未指定 audio_scales 和 audio_codes，则调用 self.encode 方法生成它们

        audio_values = self.decode(audio_codes, audio_scales, padding_mask, return_dict=return_dict)[0]
        # 使用 self.decode 方法解码得到 audio_values

        if not return_dict:
            return (audio_codes, audio_values)
        # 如果 return_dict 为 False，则返回 audio_codes 和 audio_values 的元组

        return EncodecOutput(audio_codes=audio_codes, audio_values=audio_values)
        # 否则，返回一个 EncodecOutput 对象，包含 audio_codes 和 audio_values

`.\models\encodec\init.py`

# 版权声明及使用许可信息，声明该代码版权归HuggingFace团队所有
#
# 在Apache许可版本2.0下授权使用本文件；除非符合许可条款，否则不得使用本文件
# 您可以在以下网址获取许可证的副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则本软件基于“现状”提供，不附带任何明示或暗示的担保
# 查看许可证以获取特定语言的权限和限制
from typing import TYPE_CHECKING

# 从utils模块导入所需的内容
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_torch_available,
)

# Import structure of the module: submodule name -> names it provides
_import_structure = {
    "configuration_encodec": [
        "ENCODEC_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "EncodecConfig",
    ],
    "feature_extraction_encodec": ["EncodecFeatureExtractor"],
}

# Check whether torch is available; if it is not, OptionalDependencyNotAvailable is raised and ignored
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # torch is available: add the modeling names to the import structure
    _import_structure["modeling_encodec"] = [
        "ENCODEC_PRETRAINED_MODEL_ARCHIVE_LIST",
        "EncodecModel",
        "EncodecPreTrainedModel",
    ]

# Under static type checking, import the names eagerly
if TYPE_CHECKING:
    from .configuration_encodec import (
        ENCODEC_PRETRAINED_CONFIG_ARCHIVE_MAP,
        EncodecConfig,
    )
    from .feature_extraction_encodec import EncodecFeatureExtractor

    # Check again whether torch is available; skip the modeling imports if it is not
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # torch is available: import the modeling names
        from .modeling_encodec import (
            ENCODEC_PRETRAINED_MODEL_ARCHIVE_LIST,
            EncodecModel,
            EncodecPreTrainedModel,
        )

else:
    # Outside type checking, replace this module in sys.modules with a _LazyModule instance
    # built from the import structure
    import sys

    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\encoder_decoder\configuration_encoder_decoder.py`

# 设置文件编码为UTF-8
# 版权声明：2020年由HuggingFace Inc.团队版权所有。
# 版权声明：2018年，NVIDIA CORPORATION版权所有。
#
# 根据Apache许可证2.0版（“许可证”）授权，除非符合许可证规定，否则不得使用此文件。
# 您可以在以下网址获取许可证的副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则本软件按“原样”分发，不提供任何明示或暗示的担保或条件。
# 有关详细信息，请参阅许可证。

# 导入必要的模块
from ...configuration_utils import PretrainedConfig
from ...utils import logging

# 获取名为__name__的当前日志记录器
logger = logging.get_logger(__name__)

# EncoderDecoderConfig class, derived from PretrainedConfig
class EncoderDecoderConfig(PretrainedConfig):
    r"""
    [`EncoderDecoderConfig`] is the configuration class to store the configuration of an [`EncoderDecoderModel`]. It
    is used to instantiate the encoder and decoder configurations from the specified arguments.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation of [`PretrainedConfig`] for more information.

    Args:
        kwargs (*optional*):
            Dictionary of keyword arguments. Notably:

                - **encoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that defines
                  the encoder config.
                - **decoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that defines
                  the decoder config.

    Examples:

    ```
    >>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel

    >>> # Initializing a BERT google-bert/bert-base-uncased style configuration
    >>> config_encoder = BertConfig()
    >>> config_decoder = BertConfig()

    >>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)

    >>> # Initializing a Bert2Bert model (with random weights) from the google-bert/bert-base-uncased style configurations
    >>> model = EncoderDecoderModel(config=config)

    >>> # Accessing the model configuration
    >>> config_encoder = model.config.encoder
    >>> config_decoder = model.config.decoder
    >>> # Set the decoder config to causal LM
    >>> config_decoder.is_decoder = True
    >>> config_decoder.add_cross_attention = True

    >>> # Saving the model, including its configuration
    >>> model.save_pretrained("my-model")

    >>> # Loading the model and config from the pretrained folder
    >>> encoder_decoder_config = EncoderDecoderConfig.from_pretrained("my-model")
    >>> model = EncoderDecoderModel.from_pretrained("my-model", config=encoder_decoder_config)
    ```"""

    model_type = "encoder-decoder"
    # This configuration is composed of other configurations.
    is_composition = True

    def __init__(self, **kwargs):
        """Create the configuration from `encoder` and `decoder` keyword arguments (both are required)."""
        super().__init__(**kwargs)

        has_both_configs = "encoder" in kwargs and "decoder" in kwargs
        assert has_both_configs, "Config has to be initialized with encoder and decoder config"

        # Each sub-configuration carries its own "model_type", which is removed from it here.
        encoder_kwargs = kwargs.pop("encoder")
        encoder_type = encoder_kwargs.pop("model_type")
        decoder_kwargs = kwargs.pop("decoder")
        decoder_type = decoder_kwargs.pop("model_type")

        # Imported inside the method rather than at module level.
        from ..auto.configuration_auto import AutoConfig

        # Build the concrete sub-configurations from their model type and remaining settings.
        self.encoder = AutoConfig.for_model(encoder_type, **encoder_kwargs)
        self.decoder = AutoConfig.for_model(decoder_type, **decoder_kwargs)
        self.is_encoder_decoder = True

    @classmethod
    def from_encoder_decoder_configs(
        cls, encoder_config: PretrainedConfig, decoder_config: PretrainedConfig, **kwargs
    ) -> PretrainedConfig:
        r"""
        Instantiate a [`EncoderDecoderConfig`] (or a derived class) from a pre-trained encoder model configuration and
        decoder model configuration.

        Note that `decoder_config` is modified in place: `is_decoder` and `add_cross_attention` are set to `True`.

        Returns:
            [`EncoderDecoderConfig`]: An instance of a configuration object
        """
        logger.info("Set `config.is_decoder=True` and `config.add_cross_attention=True` for decoder_config")
        decoder_config.is_decoder = True
        decoder_config.add_cross_attention = True

        # Serialize both configurations and hand them to the constructor.
        encoder_dict = encoder_config.to_dict()
        decoder_dict = decoder_config.to_dict()
        return cls(encoder=encoder_dict, decoder=decoder_dict, **kwargs)

`.\models\encoder_decoder\modeling_encoder_decoder.py`

# 设置编码方式为 UTF-8，确保脚本可以正确处理各种字符
# 版权声明和许可证信息，表明此代码遵循 Apache License, Version 2.0
# 详细许可证信息可以通过指定的 URL 获取
# 除非符合许可证中的规定，否则不得使用此文件
# 引入必要的模块和库
import gc  # Python 的垃圾回收模块，用于手动控制内存的释放
import inspect  # 用于获取对象信息的模块，如获取函数或类的源代码
import os  # 提供了许多与操作系统交互的函数
import tempfile  # 用于创建临时文件和目录的模块
import warnings  # 用于处理警告的模块
from typing import Optional, Tuple, Union  # Python 的类型提示模块，指定函数参数和返回值的类型

import torch  # PyTorch 深度学习库
from torch import nn  # PyTorch 中的神经网络模块
from torch.nn import CrossEntropyLoss  # 交叉熵损失函数

# 从 Hugging Face 的 Transformers 库中导入相关的模块和函数
from ...configuration_utils import PretrainedConfig  # 预训练配置文件类
from ...modeling_outputs import BaseModelOutput, Seq2SeqLMOutput  # 模型输出类
from ...modeling_utils import PreTrainedModel  # 预训练模型基类
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings  # 辅助函数和日志模块
from ..auto.configuration_auto import AutoConfig  # 自动配置模块
from ..auto.modeling_auto import AutoModel, AutoModelForCausalLM  # 自动模型和自动语言模型模块
from .configuration_encoder_decoder import EncoderDecoderConfig  # 编码器-解码器配置类

# Module-level logger
logger = logging.get_logger(__name__)

# Name of the configuration class used in documentation
_CONFIG_FOR_DOC = "EncoderDecoderConfig"

# Deprecation warning text describing the change introduced in v4.12.0 and how to adapt
DEPRECATION_WARNING = (
    "Version v4.12.0 introduces a better way to train encoder-decoder models by computing the loss inside the"
    " encoder-decoder framework rather than in the decoder itself. You may observe training discrepancies if"
    " fine-tuning a model trained with versions anterior to 4.12.0. The decoder_input_ids are now created based on the"
    " labels, no need to pass them yourself anymore."
)

# Opening part of the Encoder-Decoder model docstring, written as a raw string
ENCODER_DECODER_START_DOCSTRING = r"""
    This class can be used to initialize a sequence-to-sequence model with any pretrained autoencoding model as the
    encoder and any pretrained autoregressive model as the decoder. The encoder is loaded via
    [`~AutoModel.from_pretrained`] function and the decoder is loaded via [`~AutoModelForCausalLM.from_pretrained`]
    function. Cross-attention layers are automatically added to the decoder and should be fine-tuned on a downstream
    generative task, like summarization.

    The effectiveness of initializing sequence-to-sequence models with pretrained checkpoints for sequence generation
    tasks was shown in [Leveraging Pre-trained Checkpoints for Sequence Generation
    Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. Michael Matena, Yanqi
    Zhou, Wei Li, Peter J. Liu.

    After such an Encoder Decoder model has been trained/fine-tuned, it can be saved/loaded just like any other models
    (see the examples for more information).

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)



    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.



    Parameters:
        config ([`EncoderDecoderConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Inputs docstring (it holds no text in this file)
ENCODER_DECODER_INPUTS_DOCSTRING = r"""
"""

# Shift a batch of token ids one position to the right
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Shift input ids one token to the right.

    The first column of the result is `decoder_start_token_id`, the remaining columns are `input_ids` without its
    last column, and every `-100` left in the result is replaced by `pad_token_id`. `input_ids` itself is not
    modified.

    Raises:
        ValueError: If `decoder_start_token_id` or `pad_token_id` is `None`.
    """
    # Fresh zero tensor with the shape of the input; columns 1.. receive the input minus its last column.
    shifted = input_ids.new_zeros(input_ids.shape)
    shifted[:, 1:].copy_(input_ids[:, :-1])

    if decoder_start_token_id is None:
        raise ValueError("Make sure to set the decoder_start_token_id attribute of the model's configuration.")
    shifted[:, 0] = decoder_start_token_id

    if pad_token_id is None:
        raise ValueError("Make sure to set the pad_token_id attribute of the model's configuration.")
    # Replace possible -100 values with `pad_token_id`.
    ignored_positions = shifted == -100
    shifted.masked_fill_(ignored_positions, pad_token_id)

    return shifted


@add_start_docstrings(ENCODER_DECODER_START_DOCSTRING)
class EncoderDecoderModel(PreTrainedModel):
    r"""
    [`EncoderDecoderModel`] is a generic model class that will be instantiated as a transformer architecture with one
    of the base model classes of the library as encoder and another one as decoder when created with the
    [`~transformers.AutoModel.from_pretrained`] class method for the encoder and
    [`~transformers.AutoModelForCausalLM.from_pretrained`] class method for the decoder.
    """

    # Configuration class used by this model.
    config_class = EncoderDecoderConfig
    # Prefix of the base model.
    base_model_prefix = "encoder_decoder"
    # Name of the main input.
    main_input_name = "input_ids"
    # Gradient checkpointing is supported.
    supports_gradient_checkpointing = True

    def __init__(
        self,
        config: Optional[PretrainedConfig] = None,
        encoder: Optional[PreTrainedModel] = None,
        decoder: Optional[PreTrainedModel] = None,
    ):
        """
        Initialize the EncoderDecoderModel.

        NOTE(review): this excerpt contains no initialization logic, so the arguments are currently unused.
        """

    def tie_weights(self):
        """
        Tie encoder & decoder if needed.

        Does nothing unless `config.tie_encoder_decoder` is truthy.
        """
        if self.config.tie_encoder_decoder:
            # The decoder's base model is looked up in its sub-modules by its base model prefix.
            decoder_base_model_prefix = self.decoder.base_model_prefix
            self._tie_encoder_decoder_weights(
                self.encoder, self.decoder._modules[decoder_base_model_prefix], decoder_base_model_prefix
            )

    def get_encoder(self):
        """Get the encoder model."""
        return self.encoder

    def get_decoder(self):
        """Get the decoder model."""
        return self.decoder

    def get_input_embeddings(self):
        """Get the input embeddings (delegates to the encoder)."""
        return self.encoder.get_input_embeddings()

    def get_output_embeddings(self):
        """Get the output embeddings (delegates to the decoder)."""
        return self.decoder.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        """Set the output embeddings (delegates to the decoder)."""
        return self.decoder.set_output_embeddings(new_embeddings)

    @classmethod
    def from_encoder_decoder_pretrained(
        cls,
        encoder_pretrained_model_name_or_path: str = None,
        decoder_pretrained_model_name_or_path: str = None,
        *model_args,
        **kwargs,
    ):
        """
        Instantiate an encoder-decoder model from pretrained model configurations.

        NOTE(review): placeholder only; no implementation is present in this excerpt.
        """
        pass

    @add_start_docstrings_to_model_forward(ENCODER_DECODER_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        encoder_outputs: Optional[Tuple[torch.FloatTensor]] = None,
        past_key_values: Tuple[Tuple[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ):
        r"""
        Forward pass of the model: receives the inputs (token ids, attention masks, ...) and produces the model
        outputs.

        Returns:

        """
        # NOTE(review): the method body is not present in this excerpt; `pass` keeps the class syntactically valid.
        pass

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        """Build the decoder input ids by shifting `labels` one token to the right."""
        return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs
    ):
        """Assemble the dictionary of inputs used for a generation step."""
        # Let the decoder prepare its own inputs first.
        decoder_inputs = self.decoder.prepare_inputs_for_generation(input_ids, past_key_values=past_key_values)
        # The decoder may or may not provide an attention mask.
        decoder_attention_mask = decoder_inputs.get("attention_mask")
        input_dict = {
            "attention_mask": attention_mask,
            "decoder_attention_mask": decoder_attention_mask,
            "decoder_input_ids": decoder_inputs["input_ids"],
            "encoder_outputs": encoder_outputs,
            "past_key_values": decoder_inputs["past_key_values"],
            "use_cache": use_cache,
        }
        return input_dict

    def resize_token_embeddings(self, *args, **kwargs):
        """Not supported on the wrapper; always raises `NotImplementedError`."""
        raise NotImplementedError(
            "Resizing the embedding layers via the EncoderDecoderModel directly is not supported. Please use the"
            " respective methods of the wrapped objects (model.encoder.resize_token_embeddings(...) or"
            " model.decoder.resize_token_embeddings(...))"
        )

    def _reorder_cache(self, past_key_values, beam_idx):
        """Reorder the cached key/values for `beam_idx` by delegating to the decoder."""
        return self.decoder._reorder_cache(past_key_values, beam_idx)

`.\models\encoder_decoder\modeling_flax_encoder_decoder.py`

# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Classes to support Flax Encoder-Decoder architectures"""


import os
from typing import Optional, Tuple, Union

import flax.linen as nn  # 导入 Flax 的 Linen 模块，用于定义神经网络结构
import jax  # 导入 JAX，用于自动求导和加速数值计算
import jax.numpy as jnp  # 导入 JAX 的 NumPy 接口，用于操作多维数组
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze  # 导入 Flax 的冻结字典相关函数
from flax.traverse_util import flatten_dict, unflatten_dict  # 导入 Flax 的字典扁平化和反扁平化工具函数
from jax import lax  # 导入 JAX 的 lax 模块，提供了一些数值计算的基本操作
from jax.random import PRNGKey  # 导入 JAX 的随机数生成模块

from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxCausalLMOutputWithCrossAttentions, FlaxSeq2SeqLMOutput  # 导入输出相关类
from ...modeling_flax_utils import FlaxPreTrainedModel  # 导入 Flax 预训练模型基类
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings  # 导入工具函数和日志记录器
from ..auto.configuration_auto import AutoConfig  # 导入自动配置类
from ..auto.modeling_flax_auto import FlaxAutoModel, FlaxAutoModelForCausalLM  # 导入自动模型加载类
from .configuration_encoder_decoder import EncoderDecoderConfig  # 导入编码解码器配置类


logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器实例

_CONFIG_FOR_DOC = "EncoderDecoderConfig"

ENCODER_DECODER_START_DOCSTRING = r"""
    This class can be used to initialize a sequence-to-sequence model with any pretrained autoencoding model as the
    encoder and any pretrained autoregressive model as the decoder. The encoder is loaded via
    [`~AutoModel.from_pretrained`] function and the decoder is loaded via [`~AutoModelForCausalLM.from_pretrained`]
    function. Cross-attention layers are automatically added to the decoder and should be fine-tuned on a downstream
    generative task, like summarization.

    The effectiveness of initializing sequence-to-sequence models with pretrained checkpoints for sequence generation
    tasks was shown in [Leveraging Pre-trained Checkpoints for Sequence Generation
    Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. Michael Matena, Yanqi
    Zhou, Wei Li, Peter J. Liu.

    After such an Encoder Decoder model has been trained/fine-tuned, it can be saved/loaded just like any other models
    (see the examples for more information).

    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a Flax Linen
    [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
    regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior.

    Parameters:
        config ([`EncoderDecoderConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
            `jax.numpy.bfloat16` (on TPUs).

            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
            specified all the computation will be performed with the given `dtype`.

            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
            parameters.**

            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
            [`~FlaxPreTrainedModel.to_bf16`].
"""

ENCODER_DECODER_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            For sequence to sequence training, `decoder_input_ids` should be provided. `decoder_input_ids` should be
            created outside of the model by shifting the `labels` to the right, replacing -100 by the `pad_token_id`
            and prepending them with the `decoder_start_token_id`.
        decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.encoder.max_position_embeddings - 1]`.
        decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
            range `[0, config.decoder.max_position_embeddings - 1]`.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            If set to `True`, the model will return a [`~utils.FlaxSeq2SeqLMOutput`] instead of a plain tuple.
"""

ENCODER_DECODER_ENCODE_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.encoder.max_position_embeddings - 1]`.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            If set to `True`, the model will return a [`~utils.FlaxBaseModelOutput`] instead of a plain tuple.
"""
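
ENCODER_DECODER_DECODE_INPUTS_DOCSTRING = r"""
    Args:
        decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            For sequence to sequence training, `decoder_input_ids` should be provided. `decoder_input_ids` should be
            created outside of the model by shifting the `labels` to the right, replacing -100 by the `pad_token_id`
            and prepending them with the `decoder_start_token_id`.
        encoder_outputs (`tuple(tuple(jnp.ndarray)`):
            Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`).
            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` is the sequence of hidden-states
            at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
        encoder_attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
            range `[0, config.decoder.max_position_embeddings - 1]`.
        past_key_values (`Dict[str, jnp.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
            Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for
            fast auto-regressive decoding.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            If set to `True`, the model will return a [`~utils.FlaxCausalLMOutputWithCrossAttentions`] instead of a
            plain tuple.
"""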

# 定义一个 Flax 编码器解码器模块的类
class FlaxEncoderDecoderModule(nn.Module):
    """Flax Linen module that runs an encoder followed by a causal-LM decoder.

    The encoder and decoder module classes are looked up in the auto-model mappings using the classes of
    `config.encoder` and `config.decoder`. When the two hidden sizes differ and the decoder config leaves
    `cross_attention_hidden_size` unset, a dense projection maps the encoder output to the decoder hidden
    size before it is handed to the decoder.
    """

    # Combined configuration; `config.encoder` / `config.decoder` hold the sub-model configs.
    config: EncoderDecoderConfig
    # dtype forwarded to the encoder, the decoder and the optional projection layer.
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        """Instantiate the encoder, the decoder and, if required, the encoder-to-decoder projection."""
        encoder_config = self.config.encoder
        decoder_config = self.config.decoder

        # The mappings are not imported at module level in this file, so bring them into scope here.
        # NOTE(review): presumably kept local to avoid an import cycle with the auto classes - confirm.
        from ..auto.modeling_flax_auto import FLAX_MODEL_FOR_CAUSAL_LM_MAPPING, FLAX_MODEL_MAPPING

        encoder_module = FLAX_MODEL_MAPPING[encoder_config.__class__].module_class
        decoder_module = FLAX_MODEL_FOR_CAUSAL_LM_MAPPING[decoder_config.__class__].module_class

        self.encoder = encoder_module(encoder_config, dtype=self.dtype)
        self.decoder = decoder_module(decoder_config, dtype=self.dtype)

        # A projection is only needed when the hidden sizes differ and the decoder does not already
        # declare the hidden size it expects for cross-attention.
        if (
            self.encoder.config.hidden_size != self.decoder.config.hidden_size
            and self.decoder.config.cross_attention_hidden_size is None
        ):
            self.enc_to_dec_proj = nn.Dense(
                self.decoder.config.hidden_size,
                kernel_init=jax.nn.initializers.normal(self.decoder.config.initializer_range),
                dtype=self.dtype,
            )
        else:
            self.enc_to_dec_proj = None

    def _get_encoder_module(self):
        """Return the encoder sub-module."""
        return self.encoder

    def _get_projection_module(self):
        """Return the encoder-to-decoder projection layer, or `None` when no projection is used."""
        return self.enc_to_dec_proj

    def _get_decoder_module(self):
        """Return the decoder sub-module."""
        return self.decoder

    def __call__(
        self,
        input_ids,
        attention_mask,
        decoder_input_ids,
        decoder_attention_mask,
        position_ids,
        decoder_position_ids,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
        deterministic: bool = True,
    ):
        """Run the encoder, optionally project its output, then run the decoder on top of it.

        Returns:
            With `return_dict=False`, the decoder outputs concatenated with the encoder outputs; otherwise a
            `FlaxSeq2SeqLMOutput` built from both sets of outputs.
        """
        encoder_outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            deterministic=deterministic,
        )

        # The first element of the encoder outputs is what the decoder cross-attends to.
        encoder_hidden_states = encoder_outputs[0]

        # Optionally project the encoder hidden states to the decoder hidden size.
        if self.enc_to_dec_proj is not None:
            encoder_hidden_states = self.enc_to_dec_proj(encoder_hidden_states)

        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            position_ids=decoder_position_ids,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            deterministic=deterministic,
        )

        if not return_dict:
            return decoder_outputs + encoder_outputs

        return FlaxSeq2SeqLMOutput(
            logits=decoder_outputs.logits,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )
# 使用装饰器向FlaxEncoderDecoderModel类添加文档字符串
@add_start_docstrings(ENCODER_DECODER_START_DOCSTRING)
# 定义FlaxEncoderDecoderModel类，继承自FlaxPreTrainedModel
class FlaxEncoderDecoderModel(FlaxPreTrainedModel):
    """
    [`FlaxEncoderDecoderModel`] is a generic model class that will be instantiated as a transformer architecture with
    the module (flax.nn.Module) of one of the base model classes of the library as encoder module and another one as
    decoder module when created with the :meth*~transformers.FlaxAutoModel.from_pretrained* class method for the
    encoder and :meth*~transformers.FlaxAutoModelForCausalLM.from_pretrained* class method for the decoder.
    """

    # 指定配置类为EncoderDecoderConfig
    config_class = EncoderDecoderConfig
    # 指定基础模型的前缀
    base_model_prefix = "encoder_decoder"
    # 指定模块类为FlaxEncoderDecoderModule
    module_class = FlaxEncoderDecoderModule

    # 初始化方法
    def __init__(
        self,
        config: EncoderDecoderConfig,
        input_shape: Optional[Tuple] = None,
        seed: int = 0,
        dtype: jnp.dtype = jnp.float32,
        _do_init: bool = True,
        **kwargs,
    ):
        """Build the wrapped module and hand it to the base-class initializer.

        Args:
            config: Encoder-decoder configuration.
            input_shape: Pair of (encoder shape, decoder shape); defaults to `((1, 1), (1, 1))`.
            seed: Seed forwarded to the base-class initializer.
            dtype: dtype forwarded to the module and to the base-class initializer.
            _do_init: Must be `True`; this model cannot be created uninitialized.
            **kwargs: Extra keyword arguments forwarded to `module_class`.

        Raises:
            ValueError: If `_do_init` is false, or if the decoder sets `cross_attention_hidden_size` to a
                value different from the encoder's `hidden_size`.
        """
        if not _do_init:
            raise ValueError(
                "`FlaxEncoderDecoderModel` cannot be created without initializing, `_do_init` must be `True`."
            )

        # An explicit cross-attention size on the decoder has to agree with the encoder hidden size.
        cross_size_is_set = config.decoder.cross_attention_hidden_size is not None
        if cross_size_is_set and config.decoder.cross_attention_hidden_size != config.encoder.hidden_size:
            raise ValueError(
                "If `cross_attention_hidden_size` is specified in the decoder's configuration, it has to be equal"
                f" to the encoder's `hidden_size`. Got {config.decoder.cross_attention_hidden_size} for"
                f" `config.decoder.cross_attention_hidden_size` and {config.encoder.hidden_size} for"
                " `config.encoder.hidden_size`."
            )

        resolved_shape = ((1, 1), (1, 1)) if input_shape is None else input_shape

        module = self.module_class(config=config, dtype=dtype, **kwargs)
        super().__init__(config, module, input_shape=resolved_shape, seed=seed, dtype=dtype, _do_init=_do_init)
    def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
        """Initialize module parameters from dummy inputs.

        Args:
            rng: PRNG key, split into a `params` key and a `dropout` key.
            input_shape: Pair of (encoder input shape, decoder input shape).
            params: Optional existing parameters; when given, only the entries listed in
                `self._missing_keys` are filled in from the freshly initialized ones.

        Returns:
            The randomly initialized parameters when `params` is `None`, otherwise the frozen, completed
            `params`.

        Raises:
            ValueError: If the encoder and decoder shapes disagree on the batch size.
        """
        enc_shape, dec_shape = input_shape

        # Dummy encoder inputs.
        input_ids = jnp.zeros(enc_shape, dtype="i4")
        attention_mask = jnp.ones_like(input_ids)

        # Dummy decoder inputs.
        decoder_input_ids = jnp.zeros(dec_shape, dtype="i4")
        decoder_attention_mask = jnp.ones_like(decoder_input_ids)

        batch_size, sequence_length = input_ids.shape
        decoder_batch_size, decoder_sequence_length = decoder_input_ids.shape
        if decoder_batch_size != batch_size:
            raise ValueError(
                f"The inputs of encoder and decoder should have the same batch size, but got {batch_size} for encoder"
                f" and {decoder_batch_size} for decoder."
            )

        position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
        decoder_position_ids = jnp.broadcast_to(
            jnp.arange(decoder_sequence_length)[None, :], (decoder_batch_size, decoder_sequence_length)
        )

        params_rng, dropout_rng = jax.random.split(rng)
        variables = self.module.init(
            {"params": params_rng, "dropout": dropout_rng},
            input_ids,
            attention_mask,
            decoder_input_ids,
            decoder_attention_mask,
            position_ids,
            decoder_position_ids,
        )
        random_params = variables["params"]

        if params is None:
            return random_params

        # Complete the supplied parameters with freshly initialized values for the missing keys.
        flat_random = flatten_dict(unfreeze(random_params))
        flat_params = flatten_dict(unfreeze(params))
        flat_params.update((key, flat_random[key]) for key in self._missing_keys)
        self._missing_keys = set()
        return freeze(unflatten_dict(flat_params))
    # 初始化缓存函数，用于自动回归解码
    def init_cache(self, batch_size, max_length, encoder_outputs):
        r"""
        Args:
            batch_size (`int`):
                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
            max_length (`int`):
                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
                cache.
            encoder_outputs (`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray)]`):
                `encoder_outputs` consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
                `attentions`). `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*)
                is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
                cross-attention of the decoder.
        """
        # Dummy decoder inputs, only used to trace the decoder and obtain the cache variables.
        dummy_ids = jnp.ones((batch_size, max_length), dtype="i4")
        dummy_inputs = {
            "decoder_input_ids": dummy_ids,
            "decoder_attention_mask": jnp.ones_like(dummy_ids),
            "decoder_position_ids": jnp.broadcast_to(
                jnp.arange(jnp.atleast_2d(dummy_ids).shape[-1]), dummy_ids.shape
            ),
        }

        def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs):
            # Only the decoder is run; the encoder output is passed in through kwargs.
            return module._get_decoder_module()(
                input_ids=decoder_input_ids,
                attention_mask=decoder_attention_mask,
                position_ids=decoder_position_ids,
                **kwargs,
            )

        variables = self.module.init(
            jax.random.PRNGKey(0),
            encoder_hidden_states=encoder_outputs[0],
            init_cache=True,
            method=_decoder_forward,
            **dummy_inputs,
        )
        return unfreeze(variables["cache"])

    @add_start_docstrings(ENCODER_DECODER_ENCODE_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=FlaxBaseModelOutput, config_class=_CONFIG_FOR_DOC)
    # 编码函数，用于对输入进行编码
    def encode(
        self,
        input_ids: jnp.ndarray,
        attention_mask: Optional[jnp.ndarray] = None,
        position_ids: Optional[jnp.ndarray] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        train: bool = False,
        params: dict = None,
        dropout_rng: PRNGKey = None,
    ):
        r"""
        Returns:

        Example:

        ```
        >>> from transformers import FlaxEncoderDecoderModel, BertTokenizer

        >>> # initialize a bert2gpt2 from pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized
        >>> model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-cased", "openai-community/gpt2")

        >>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")

        >>> text = "My friends are cool but they eat too many carbs."
        >>> input_ids = tokenizer.encode(text, return_tensors="np")
        >>> encoder_outputs = model.encode(input_ids)
        ```"""
        # Unset output flags fall back to the model configuration.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.return_dict

        # Default mask attends everywhere; default positions are 0..sequence_length-1 for every row.
        if attention_mask is None:
            attention_mask = jnp.ones_like(input_ids)
        if position_ids is None:
            batch_size, sequence_length = input_ids.shape
            position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))

        # A dropout PRNG is only passed along when the caller supplied one.
        rngs = {} if dropout_rng is None else {"dropout": dropout_rng}

        def _encoder_forward(module, input_ids, attention_mask, position_ids, **kwargs):
            # Run only the encoder sub-module of the wrapped module.
            return module._get_encoder_module()(input_ids, attention_mask, position_ids, **kwargs)

        encoder_result = self.module.apply(
            {"params": params or self.params},
            input_ids=jnp.array(input_ids, dtype="i4"),
            attention_mask=jnp.array(attention_mask, dtype="i4"),
            position_ids=jnp.array(position_ids, dtype="i4"),
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            deterministic=not train,
            rngs=rngs,
            method=_encoder_forward,
        )

        if not return_dict:
            return encoder_result

        # Re-wrap so that callers always receive a `FlaxBaseModelOutput` in dict mode.
        return FlaxBaseModelOutput(
            last_hidden_state=encoder_result.last_hidden_state,
            hidden_states=encoder_result.hidden_states,
            attentions=encoder_result.attentions,
        )

    # 添加开始的文档字符串注释，指定输入的解码器解码文档字符串
    @add_start_docstrings(ENCODER_DECODER_DECODE_INPUTS_DOCSTRING)
    # 替换返回文档字符串，指定输出类型为带交叉注意力的FlaxCausalLMOutputWithCrossAttentions，配置类为_CONFIG_FOR_DOC
    @replace_return_docstrings(output_type=FlaxCausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
    # 定义一个解码方法，用于将编码器和解码器的输入转换为模型的输出
    def decode(
        self,
        decoder_input_ids,
        encoder_outputs,
        encoder_attention_mask: Optional[jnp.ndarray] = None,
        decoder_attention_mask: Optional[jnp.ndarray] = None,
        decoder_position_ids: Optional[jnp.ndarray] = None,
        past_key_values: dict = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        train: bool = False,
        params: dict = None,
        dropout_rng: PRNGKey = None,
    ):
        r"""
        Returns:

        Example:

        ```python
        >>> from transformers import FlaxEncoderDecoderModel, BertTokenizer
        >>> import jax.numpy as jnp

        >>> # initialize a bert2gpt2 from pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized
        >>> model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-cased", "openai-community/gpt2")

        >>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")

        >>> text = "My friends are cool but they eat too many carbs."
        >>> input_ids = tokenizer.encode(text, return_tensors="np")
        >>> encoder_outputs = model.encode(input_ids)

        >>> decoder_start_token_id = model.config.decoder.bos_token_id
        >>> decoder_input_ids = jnp.ones((input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id

        >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
        >>> logits = outputs.logits
        ```"""
        # Unset output flags fall back to the model configuration.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        encoder_hidden_states = encoder_outputs[0]
        if encoder_attention_mask is None:
            batch_size, sequence_length = encoder_hidden_states.shape[:2]
            encoder_attention_mask = jnp.ones((batch_size, sequence_length))

        batch_size, sequence_length = decoder_input_ids.shape
        if decoder_attention_mask is None:
            decoder_attention_mask = jnp.ones((batch_size, sequence_length))

        if decoder_position_ids is None:
            # With a cache, positions cannot be inferred from the (possibly one-token) input alone.
            if past_key_values is not None:
                raise ValueError("Make sure to provide `decoder_position_ids` when passing `past_key_values`.")

            decoder_position_ids = jnp.broadcast_to(
                jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
            )

        # A dropout PRNG is only passed along when the caller supplied one.
        rngs = {}
        if dropout_rng is not None:
            rngs["dropout"] = dropout_rng

        inputs = {"params": params or self.params}

        # When a cache is supplied it is passed in as the "cache" collection and marked mutable so the
        # decoder can update it; the updated cache is returned alongside the outputs.
        if past_key_values:
            inputs["cache"] = past_key_values
            mutable = ["cache"]
        else:
            mutable = False

        def _decoder_forward(
            module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, encoder_hidden_states, **kwargs
        ):
            projection_module = module._get_projection_module()
            decoder_module = module._get_decoder_module()

            # Optionally project the encoder hidden states to the decoder hidden size.
            if projection_module is not None:
                encoder_hidden_states = projection_module(encoder_hidden_states)

            return decoder_module(
                decoder_input_ids,
                decoder_attention_mask,
                decoder_position_ids,
                encoder_hidden_states=encoder_hidden_states,
                **kwargs,
            )

        outputs = self.module.apply(
            inputs,
            decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
            decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
            decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=jnp.array(encoder_attention_mask, dtype="i4"),
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            deterministic=not train,
            rngs=rngs,
            mutable=mutable,
            method=_decoder_forward,
        )

        # Attach the updated cache to the model output.
        if past_key_values is not None and return_dict:
            outputs, past = outputs
            outputs["past_key_values"] = unfreeze(past["cache"])
            return outputs
        elif past_key_values is not None and not return_dict:
            outputs, past = outputs
            outputs = outputs[:1] + (unfreeze(past["cache"]),) + outputs[1:]

        return outputs

    @add_start_docstrings_to_model_forward(ENCODER_DECODER_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=FlaxSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
    def __call__(
        self,
        input_ids: jnp.ndarray,
        attention_mask: Optional[jnp.ndarray] = None,
        decoder_input_ids: Optional[jnp.ndarray] = None,
        decoder_attention_mask: Optional[jnp.ndarray] = None,
        position_ids: Optional[jnp.ndarray] = None,
        decoder_position_ids: Optional[jnp.ndarray] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        train: bool = False,
        params: dict = None,
        dropout_rng: PRNGKey = None,
    ):
        r"""
        Returns:

        Example:

        ```python
        >>> from transformers import FlaxEncoderDecoderModel, BertTokenizer
        >>> import jax.numpy as jnp

        >>> model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-cased", "openai-community/gpt2")
        >>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")

        >>> text = "My friends are cool but they eat too many carbs."
        >>> input_ids = tokenizer.encode(text, return_tensors="np")

        >>> decoder_start_token_id = model.config.decoder.bos_token_id
        >>> decoder_input_ids = jnp.ones((input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id

        >>> outputs = model(input_ids, decoder_input_ids=decoder_input_ids)
        >>> logits = outputs.logits
        ```"""
        # Unset output flags fall back to the model configuration.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        # Encoder inputs: default mask attends everywhere, default positions are 0..sequence_length-1.
        if attention_mask is None:
            attention_mask = jnp.ones_like(input_ids)
        if position_ids is None:
            batch_size, sequence_length = input_ids.shape
            position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))

        # Decoder inputs: the ids are mandatory, mask and positions get the same defaults as above.
        if decoder_input_ids is None:
            raise ValueError(
                "`decoder_input_ids` cannot be `None`. For sequence to sequence training, `decoder_input_ids` must"
                " be specified as an input argument."
            )
        if decoder_attention_mask is None:
            decoder_attention_mask = jnp.ones_like(decoder_input_ids)
        if decoder_position_ids is None:
            batch_size, sequence_length = decoder_input_ids.shape
            decoder_position_ids = jnp.broadcast_to(
                jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
            )

        # A dropout PRNG is only passed along when the caller supplied one.
        rngs = {"dropout": dropout_rng} if dropout_rng is not None else {}

        return self.module.apply(
            {"params": params or self.params},
            input_ids=jnp.array(input_ids, dtype="i4"),
            attention_mask=jnp.array(attention_mask, dtype="i4"),
            decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
            decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
            position_ids=jnp.array(position_ids, dtype="i4"),
            decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            deterministic=not train,
            rngs=rngs,
        )

    # 准备生成时的输入，初始化缓存和注意力掩码等
    def prepare_inputs_for_generation(
        self,
        decoder_input_ids,
        max_length,
        attention_mask: Optional[jax.Array] = None,
        decoder_attention_mask: Optional[jax.Array] = None,
        encoder_outputs=None,
        **kwargs,
    ):
        """Build the keyword arguments used for generation.

        Args:
            decoder_input_ids: Two-dimensional array; its shape gives the batch size and prompt length.
            max_length: Length of the initialized cache and of the returned decoder attention mask.
            attention_mask: Encoder attention mask, returned as `encoder_attention_mask`.
            decoder_attention_mask: Optional mask for the prompt; written into the start of the full mask.
            encoder_outputs: Encoder outputs, passed to `init_cache` and returned unchanged.
            **kwargs: Accepted and ignored.

        Returns:
            Dict with `past_key_values`, `encoder_outputs`, `encoder_attention_mask`,
            `decoder_attention_mask` and `decoder_position_ids`.
        """
        batch_size, seq_length = decoder_input_ids.shape
        cache = self.init_cache(batch_size, max_length, encoder_outputs)

        # Start from an all-ones mask over the full cache length; a supplied prompt mask overwrites its
        # leading columns.
        full_mask = jnp.ones((batch_size, max_length), dtype="i4")
        if decoder_attention_mask is None:
            positions = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
        else:
            # Positions count the attended tokens seen so far, starting from zero.
            positions = decoder_attention_mask.cumsum(axis=-1) - 1
            full_mask = lax.dynamic_update_slice(full_mask, decoder_attention_mask, (0, 0))

        return {
            "past_key_values": cache,
            "encoder_outputs": encoder_outputs,
            "encoder_attention_mask": attention_mask,
            "decoder_attention_mask": full_mask,
            "decoder_position_ids": positions,
        }
    # 更新生成过程中的模型参数
    def update_inputs_for_generation(self, model_outputs, model_kwargs):
        """Advance the generation kwargs by one step, in place.

        Stores `model_outputs.past_key_values` under `"past_key_values"` and replaces
        `"decoder_position_ids"` with its last column incremented by one.

        Returns:
            The same `model_kwargs` dict that was passed in.
        """
        updated_cache = model_outputs.past_key_values
        model_kwargs["past_key_values"] = updated_cache

        # Only the most recent position is kept; the next token sits one step further.
        next_positions = model_kwargs["decoder_position_ids"][:, -1:] + 1
        model_kwargs["decoder_position_ids"] = next_positions
        return model_kwargs

    # 从预训练的编码器-解码器模型中创建实例
    @classmethod
    def from_encoder_decoder_pretrained(
        cls,
        encoder_pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None,
        decoder_pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None,
        *model_args,
        **kwargs,

`.\models\encoder_decoder\modeling_tf_encoder_decoder.py`

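# coding=utf-8
# Copyright The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.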
""" Classes to support TF Encoder-Decoder architectures"""

from __future__ import annotations  # 支持在注解中使用自身类名

import inspect  # 导入用于获取对象信息的模块
import re  # 导入正则表达式模块
import warnings  # 导入警告处理模块
from typing import Optional, Tuple, Union  # 导入类型提示

import numpy as np  # 导入 NumPy 数学库
import tensorflow as tf  # 导入 TensorFlow 深度学习库

from ...configuration_utils import PretrainedConfig  # 导入预训练配置类
from ...modeling_tf_outputs import TFBaseModelOutput, TFSeq2SeqLMOutput  # 导入 TensorFlow 模型输出类
from ...modeling_tf_utils import (  # 导入 TensorFlow 模型工具函数
    TFCausalLanguageModelingLoss,
    TFModelInputType,
    TFPreTrainedModel,
    get_initializer,
    keras,
    unpack_inputs,
)
from ...tf_utils import shape_list  # 导入获取张量形状的函数
from ...utils import (  # 导入实用函数
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from ..auto.configuration_auto import AutoConfig  # 导入自动配置类
from ..auto.modeling_tf_auto import TFAutoModel, TFAutoModelForCausalLM  # 导入自动 TensorFlow 模型类
from .configuration_encoder_decoder import EncoderDecoderConfig  # 导入编码解码器配置类

logger = logging.get_logger(__name__)  # module-level logger

_CONFIG_FOR_DOC = "EncoderDecoderConfig"  # config class name handed to the docstring decorators

# Warning text about the v4.17.0 change: the loss is computed inside the encoder-decoder
# framework and decoder_input_ids are derived from the labels.
DEPRECATION_WARNING = (
    "Version v4.17.0 introduces a better way to train encoder-decoder models by computing the loss inside the"
    " encoder-decoder framework rather than in the decoder itself. You may observe training discrepancies if"
    " fine-tuning a model trained with versions anterior to 4.17.0. The decoder_input_ids are now created based on the"
    " labels, no need to pass them yourself anymore."
)  # deprecation warning message

# Class-level documentation text; it is passed to `add_start_docstrings` where the model class is declared.
ENCODER_DECODER_START_DOCSTRING = r"""
    This class can be used to initialize a sequence-to-sequence model with any pretrained autoencoding model as the
    encoder and any pretrained autoregressive model as the decoder. The encoder is loaded via
    [`~TFAutoModel.from_pretrained`] function and the decoder is loaded via [`~TFAutoModelForCausalLM.from_pretrained`]
    function. Cross-attention layers are automatically added to the decoder and should be fine-tuned on a downstream
    generative task, like summarization.

    The effectiveness of initializing sequence-to-sequence models with pretrained checkpoints for sequence generation
    tasks was shown in [Leveraging Pre-trained Checkpoints for Sequence Generation
    Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. Michael Matena, Yanqi
    Zhou, Wei Li, Peter J. Liu.

    After such an Encoder Decoder model has been trained/fine-tuned, it can be saved/loaded just like any other models

(see the examples for more information).



This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)



This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.



Parameters:
    config ([`EncoderDecoderConfig`]): Model configuration class with all the parameters of the model.
        Initializing with a config file does not load the weights associated with the model, only the
        configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.

"""

# Documentation text for the inputs of `call`; it is `.format(...)`-ed with the input shape description
# before being attached to the method. NOTE(review): the text is empty in this copy.
ENCODER_DECODER_INPUTS_DOCSTRING = r"""
"""

定义一个函数，用于将输入的token_ids向右移动，模拟decoder端的输入

def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """Build decoder inputs by shifting ``input_ids`` one position to the right.

    The first column becomes ``decoder_start_token_id``, the last column of ``input_ids`` is dropped and
    every ``-100`` entry is replaced by ``pad_token_id``.

    Args:
        input_ids: Token ids to shift; indexed as ``[:, :-1]``, so at least two axes are expected.
        pad_token_id: Id written wherever the shifted tensor holds ``-100``.
        decoder_start_token_id: Id placed in the first column of every row.

    Returns:
        The shifted tensor, wrapped in an identity op that depends on a ``>= 0`` assertion.

    Raises:
        ValueError: If ``pad_token_id`` or ``decoder_start_token_id`` is ``None``.
    """
    if pad_token_id is None:
        raise ValueError("Make sure to set the pad_token_id attribute of the model's configuration.")
    # Bring the id to the dtype of `input_ids` so it can be mixed with it below.
    pad_token_id = tf.cast(pad_token_id, input_ids.dtype)

    if decoder_start_token_id is None:
        raise ValueError("Make sure to set the decoder_start_token_id attribute of the model's configuration.")
    decoder_start_token_id = tf.cast(decoder_start_token_id, input_ids.dtype)

    # One start token per row, followed by everything but the last column of the input.
    batch_size = shape_list(input_ids)[0]
    start_column = tf.fill((batch_size, 1), decoder_start_token_id)
    shifted = tf.concat([start_column, input_ids[:, :-1]], -1)

    # Replace possible -100 values (label padding) with `pad_token_id`.
    shifted = tf.where(shifted == -100, tf.fill(shape_list(shifted), pad_token_id), shifted)

    # "Verify that `labels` has only positive values and -100"
    zero = tf.constant(0, dtype=input_ids.dtype)
    non_negative_check = tf.debugging.assert_greater_equal(shifted, zero)

    # Make sure the assertion op is called by wrapping the result in an identity no-op.
    with tf.control_dependencies([non_negative_check]):
        shifted = tf.identity(shifted)

    return shifted

@add_start_docstrings(ENCODER_DECODER_START_DOCSTRING)

TFEncoderDecoderModel类，继承自TFPreTrainedModel和TFCausalLanguageModelingLoss

class TFEncoderDecoderModel(TFPreTrainedModel, TFCausalLanguageModelingLoss):
r"""
[`TFEncoderDecoderModel`] is a generic model class that is instantiated with one of the base model classes of the
library as encoder and another one as decoder, when created with the [`~TFAutoModel.from_pretrained`] class method
for the encoder and the [`~TFAutoModelForCausalLM.from_pretrained`] class method for the decoder.
"""

# Class attribute: the configuration class is EncoderDecoderConfig
config_class = EncoderDecoderConfig
# Model prefix is "encoder_decoder"
base_model_prefix = "encoder_decoder"
# Prefix used when loading weights is "tf_encoder_decoder_model"
load_weight_prefix = "tf_encoder_decoder_model"

# 初始化方法
def __init__(
    self,
    config: Optional[PretrainedConfig] = None,
    encoder: Optional[TFPreTrainedModel] = None,
    decoder: Optional[TFPreTrainedModel] = None,
):
    """Initialise the parent class with ``config`` and keep references to the two sub-models.

    Args:
        config: Configuration forwarded unchanged to the parent initialiser.
        encoder: Stored as ``self.encoder``.
        decoder: Stored as ``self.decoder``.
    """
    super().__init__(config)
    # Assigned left to right: encoder first, then decoder.
    self.encoder, self.decoder = encoder, decoder

# 获取encoder方法
def get_encoder(self):
    """Return the object stored in ``self.encoder``."""
    encoder = self.encoder
    return encoder

# 获取decoder方法
def get_decoder(self):
    """Return the object stored in ``self.decoder``."""
    decoder = self.decoder
    return decoder

# 获取输入嵌入方法，委托给encoder的get_input_embeddings方法
def get_input_embeddings(self):
    """Return the input embeddings, as reported by the encoder's ``get_input_embeddings``."""
    encoder = self.encoder
    return encoder.get_input_embeddings()

# 获取输出嵌入方法，委托给decoder的get_output_embeddings方法
def get_output_embeddings(self):
    """Return the output embeddings, as reported by the decoder's ``get_output_embeddings``."""
    decoder = self.decoder
    return decoder.get_output_embeddings()

# 设置输出嵌入方法，委托给decoder的set_output_embeddings方法
def set_output_embeddings(self, new_embeddings):
    """Hand ``new_embeddings`` to the decoder's ``set_output_embeddings`` and return its result."""
    decoder = self.decoder
    return decoder.set_output_embeddings(new_embeddings)
# Matt: The TF and PT weights don't align because our TF base classes have an extra layer compared to PT models
# (the main model stem is in the MainLayer class). If we remove that layer, then weight names sync up as normal.
# However, the name of that extra layer is the name of the MainLayer in the base model. We make the assumption
# here that the config model_type is the same as the name of the MainLayer. I don't know of anywhere that's
# not the case, and I wasn't sure how else to go from the config to the correct MainLayer name!
def tf_to_pt_weight_rename(self, tf_weight):
    """Translate a TF weight name into the name(s) to look up when cross-loading from PyTorch.

    For names that contain ``"encoder"`` but not ``"decoder"``, every ``encoder.<model_type>.`` segment is
    collapsed to ``encoder.``, where ``<model_type>`` is ``self.config.encoder.model_type``. All other
    names are returned unchanged.

    Args:
        tf_weight: Name of the TF weight.

    Returns:
        A 1-tuple holding the (possibly rewritten) weight name.
    """
    # This override is only needed in the case where we're crossloading weights from PT. However, since weights are
    # often safetensors now, we don't know if we're going to be crossloading until we sniff the weights file.
    # Therefore, we specify tf_to_pt_weight_rename anyway, and let the super method figure out if it needs it
    # or not.
    encoder_model_type = self.config.encoder.model_type
    if "encoder" in tf_weight and "decoder" not in tf_weight:
        # The model type is literal text, not a pattern: escape it so regex metacharacters in it
        # (e.g. ".") cannot match unrelated weight names.
        pattern = rf"encoder\.{re.escape(encoder_model_type)}\."
        return (re.sub(pattern, "encoder.", tf_weight),)
    else:
        return (tf_weight,)

@classmethod
def from_encoder_decoder_pretrained(
    cls,
    encoder_pretrained_model_name_or_path: str = None,
    decoder_pretrained_model_name_or_path: str = None,
    *model_args,
    **kwargs,
# 函数装饰器，用于从预训练的编码器-解码器模型名称或路径创建模型的类方法。
@unpack_inputs
@add_start_docstrings_to_model_forward(ENCODER_DECODER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
def call(
    self,
    input_ids: TFModelInputType | None = None,
    attention_mask: np.ndarray | tf.Tensor | None = None,
    decoder_input_ids: np.ndarray | tf.Tensor | None = None,
    decoder_attention_mask: np.ndarray | tf.Tensor | None = None,
    encoder_outputs: np.ndarray | tf.Tensor | None = None,
    past_key_values: Tuple[Tuple[tf.Tensor]] | None = None,
    inputs_embeds: np.ndarray | tf.Tensor | None = None,
    decoder_inputs_embeds: np.ndarray | tf.Tensor | None = None,
    labels: np.ndarray | tf.Tensor | None = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    training: bool = False,
    **kwargs,
# 函数装饰器，添加到 call 方法，用于指定输入、输出和返回文档字符串的生成和替换规则。
def prepare_inputs_for_generation(
    self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs
):
    """Assemble the keyword arguments for one generation step.

    The decoder's own ``prepare_inputs_for_generation`` is asked to prepare ``input_ids`` and the cache; its
    result is then merged with the encoder-side arguments.

    Args:
        input_ids: Decoder token ids generated so far.
        past_key_values: Cache forwarded to the decoder's preparation method.
        attention_mask: Passed through under the ``"attention_mask"`` key.
        use_cache: Passed through under the ``"use_cache"`` key.
        encoder_outputs: Indexable object; its first element is re-wrapped as ``last_hidden_state``.
        **kwargs: Accepted and ignored.

    Returns:
        A dict of model inputs for the next step.
    """
    decoder_inputs = self.decoder.prepare_inputs_for_generation(input_ids, past_key_values=past_key_values)
    # The decoder may or may not report an attention mask.
    decoder_attention_mask = decoder_inputs.get("attention_mask")
    # The cache is looked up under "past_key_values" first and under "past" as a fallback.
    cache = decoder_inputs.get("past_key_values")
    if cache is None:
        cache = decoder_inputs.get("past")  # e.g. on TF GPT2
    return {
        "input_ids": None,  # needs to be passed to make Keras.layer.__call__ happy
        "attention_mask": attention_mask,
        "decoder_attention_mask": decoder_attention_mask,
        "decoder_input_ids": decoder_inputs["input_ids"],
        # TODO (joao): the `TFBaseModelOutput` wrapper should no longer be needed after the generate refactor is complete
        "encoder_outputs": TFBaseModelOutput(last_hidden_state=encoder_outputs[0]),
        "past_key_values": cache,
        "use_cache": use_cache,
    }

def prepare_decoder_input_ids_from_labels(self, labels: tf.Tensor):
    """Derive decoder input ids from ``labels`` by shifting them right.

    Uses ``pad_token_id`` and ``decoder_start_token_id`` from ``self.config``.
    """
    config = self.config
    return shift_tokens_right(labels, config.pad_token_id, config.decoder_start_token_id)

def resize_token_embeddings(self, *args, **kwargs):
    """Reject the call: resizing has to be done on the wrapped encoder or decoder instead.

    Raises:
        NotImplementedError: Always.
    """
    unsupported_message = (
        "Resizing the embedding layers via the TFEncoderDecoderModel directly is not supported. Please use the"
        " respective methods of the wrapped objects (model.encoder.resize_token_embeddings(...) or"
        " model.decoder.resize_token_embeddings(...))"
    )
    raise NotImplementedError(unsupported_message)

def _reorder_cache(self, past, beam_idx):
    """Reorder the cache by deferring to the decoder's ``_reorder_cache``."""
    # apply decoder cache reordering here
    decoder = self.decoder
    return decoder._reorder_cache(past, beam_idx)

def build(self, input_shape=None):
    """Build the sub-layers under their own name scopes; does nothing once ``self.built`` is set.

    Args:
        input_shape: Accepted but not used by this method.
    """
    if self.built:
        return
    self.built = True

    # Optional encoder-to-decoder projection: built for inputs whose last axis is the encoder hidden size.
    projection = getattr(self, "enc_to_dec_proj", None)
    if projection is not None:
        with tf.name_scope(projection.name):
            projection.build([None, None, self.encoder.config.hidden_size])

    # Encoder first, then decoder; each one is skipped when the attribute is missing or None.
    for attribute in ("encoder", "decoder"):
        sub_model = getattr(self, attribute, None)
        if sub_model is not None:
            with tf.name_scope(sub_model.name):
                sub_model.build(None)


# `.\models\encoder_decoder\__init__.py`

# Copyright notice and license information

# Flag that is only true while a static type checker analyses the module
from typing import TYPE_CHECKING

# Exception and helpers needed for the optional-dependency / lazy-import machinery
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_flax_available,
    is_tf_available,
    is_torch_available,
)

# Maps each submodule name to the public names it provides; the configuration is always listed
_import_structure = {"configuration_encoder_decoder": ["EncoderDecoderConfig"]}

# PyTorch: the exception is raised and swallowed when torch is missing, so the entry is simply not added
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # torch is available: register EncoderDecoderModel
    _import_structure["modeling_encoder_decoder"] = ["EncoderDecoderModel"]

# TensorFlow: same pattern as above
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # TensorFlow is available: register TFEncoderDecoderModel
    _import_structure["modeling_tf_encoder_decoder"] = ["TFEncoderDecoderModel"]

# Flax: same pattern as above
try:
    if not is_flax_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # Flax is available: register FlaxEncoderDecoderModel
    _import_structure["modeling_flax_encoder_decoder"] = ["FlaxEncoderDecoderModel"]

# Under type checking the names are imported eagerly
if TYPE_CHECKING:
    # The configuration has no optional dependency
    from .configuration_encoder_decoder import EncoderDecoderConfig

    # Import EncoderDecoderModel only when torch is available
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_encoder_decoder import EncoderDecoderModel

    # Import TFEncoderDecoderModel only when TensorFlow is available
    try:
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_tf_encoder_decoder import TFEncoderDecoderModel

    # Import FlaxEncoderDecoderModel only when Flax is available
    try:
        if not is_flax_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_flax_encoder_decoder import FlaxEncoderDecoderModel

# At runtime the module object is swapped for a lazy module built from `_import_structure`
else:
    import sys

    # Replace this module's entry in sys.modules with a _LazyModule instance
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\ernie\configuration_ernie.py`

# 导入必要的模块和类
from collections import OrderedDict  # 导入OrderedDict类，用于有序字典
from typing import Mapping  # 导入Mapping类，用于类型提示

# 从transformers库中导入所需的配置类和模块
from ...configuration_utils import PretrainedConfig  # 导入预训练配置类
from ...onnx import OnnxConfig  # 导入ONNX配置类
from ...utils import logging  # 导入日志模块

# Logger for this module
logger = logging.get_logger(__name__)

# Maps each pretrained ERNIE checkpoint name to the URL of its config.json
ERNIE_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "nghuyong/ernie-1.0-base-zh": "https://huggingface.co/nghuyong/ernie-1.0-base-zh/resolve/main/config.json",
    "nghuyong/ernie-2.0-base-en": "https://huggingface.co/nghuyong/ernie-2.0-base-en/resolve/main/config.json",
    "nghuyong/ernie-2.0-large-en": "https://huggingface.co/nghuyong/ernie-2.0-large-en/resolve/main/config.json",
    "nghuyong/ernie-3.0-base-zh": "https://huggingface.co/nghuyong/ernie-3.0-base-zh/resolve/main/config.json",
    "nghuyong/ernie-3.0-medium-zh": "https://huggingface.co/nghuyong/ernie-3.0-medium-zh/resolve/main/config.json",
    "nghuyong/ernie-3.0-mini-zh": "https://huggingface.co/nghuyong/ernie-3.0-mini-zh/resolve/main/config.json",
    "nghuyong/ernie-3.0-micro-zh": "https://huggingface.co/nghuyong/ernie-3.0-micro-zh/resolve/main/config.json",
    "nghuyong/ernie-3.0-nano-zh": "https://huggingface.co/nghuyong/ernie-3.0-nano-zh/resolve/main/config.json",
    "nghuyong/ernie-gram-zh": "https://huggingface.co/nghuyong/ernie-gram-zh/resolve/main/config.json",
    "nghuyong/ernie-health-zh": "https://huggingface.co/nghuyong/ernie-health-zh/resolve/main/config.json",
}

# ERNIE configuration class, derived from PretrainedConfig
class ErnieConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`ErnieModel`] or a [`TFErnieModel`]. It is used to
    instantiate a ERNIE model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the ERNIE
    [nghuyong/ernie-3.0-base-zh](https://huggingface.co/nghuyong/ernie-3.0-base-zh) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    """

    # Example usage
    # Instantiating with the defaults yields a configuration similar to that of the
    # [nghuyong/ernie-3.0-base-zh](https://huggingface.co/nghuyong/ernie-3.0-base-zh) architecture.

    # Example code
    # ```
    # >>> from transformers import ErnieConfig, ErnieModel
    #
    # >>> # Initializing a ERNIE nghuyong/ernie-3.0-base-zh style configuration
    # >>> configuration = ErnieConfig()
    # ```
    # Model type identifier of this configuration
    model_type = "ernie"
    
    # Constructor: stores every hyper-parameter on the configuration object
    def __init__(
        self,
        vocab_size=30522,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        task_type_vocab_size=3,
        use_task_id=False,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        pad_token_id=0,
        position_embedding_type="absolute",
        use_cache=True,
        classifier_dropout=None,
        **kwargs,
    ):
        """Store the given hyper-parameters as attributes of the same name.

        `pad_token_id` and any extra keyword arguments are forwarded to the `PretrainedConfig` initialiser;
        every other argument is kept verbatim on the instance.
        """
        # `pad_token_id` and unknown keyword arguments are handled by the parent class
        super().__init__(pad_token_id=pad_token_id, **kwargs)
    
        # Keep every remaining argument as an instance attribute
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.task_type_vocab_size = task_type_vocab_size
        self.use_task_id = use_task_id
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.position_embedding_type = position_embedding_type
        self.use_cache = use_cache
        self.classifier_dropout = classifier_dropout
class ErnieOnnxConfig(OnnxConfig):
    """ONNX configuration for ERNIE: declares the model inputs and their dynamic axes."""

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """Ordered mapping from input name to its dynamic axes (axis index -> axis name).

        The "multiple-choice" task gets an extra "choice" axis between "batch" and "sequence";
        all four inputs share the same axes mapping.
        """
        multiple_choice_axes = {0: "batch", 1: "choice", 2: "sequence"}
        default_axes = {0: "batch", 1: "sequence"}
        axes = multiple_choice_axes if self.task == "multiple-choice" else default_axes

        input_names = ("input_ids", "attention_mask", "token_type_ids", "task_type_ids")
        return OrderedDict([(input_name, axes) for input_name in input_names])

`.\models\ernie\modeling_ernie.py`

# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch ERNIE model."""


import math
import warnings
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

# 导入来自HuggingFace库的模块和类
from ...activations import ACT2FN
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    NextSentencePredictorOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_ernie import ErnieConfig

# Logger for this module
logger = logging.get_logger(__name__)

# Checkpoint and configuration names referenced by the documentation helpers
_CHECKPOINT_FOR_DOC = "nghuyong/ernie-1.0-base-zh"
_CONFIG_FOR_DOC = "ErnieConfig"

# Names of the pretrained ERNIE checkpoints
ERNIE_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "nghuyong/ernie-1.0-base-zh",
    "nghuyong/ernie-2.0-base-en",
    "nghuyong/ernie-2.0-large-en",
    "nghuyong/ernie-3.0-base-zh",
    "nghuyong/ernie-3.0-medium-zh",
    "nghuyong/ernie-3.0-mini-zh",
    "nghuyong/ernie-3.0-micro-zh",
    "nghuyong/ernie-3.0-nano-zh",
    "nghuyong/ernie-gram-zh",
    "nghuyong/ernie-health-zh",
    # See all ERNIE models at https://huggingface.co/models?filter=ernie
]

# ErnieEmbeddings 类的定义，用于构建来自词嵌入、位置嵌入和标记类型嵌入的嵌入层
class ErnieEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings.

    When ``config.use_task_id`` is true, a task-type embedding is added as well. The summed
    embeddings go through LayerNorm and dropout.
    """

    def __init__(self, config):
        """Create the embedding tables, normalisation, dropout and the default-id buffers from ``config``."""
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
        # `forward` branches on this flag, so it has to be stored on the module itself.
        self.use_task_id = config.use_task_id
        if config.use_task_id:
            self.task_type_embeddings = nn.Embedding(config.task_type_vocab_size, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Falls back to "absolute" when the config does not define a position embedding type.
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        # Both buffers are registered with persistent=False, i.e. they are not part of the state dict.
        # position_ids: shape (1, max_position_embeddings), holding 0..max_position_embeddings-1.
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        # token_type_ids: all-zero long tensor with the same shape as position_ids.
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        task_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values_length: int = 0,
    ) -> torch.Tensor:
        """Sum the enabled embeddings, then apply LayerNorm and dropout.

        Args:
            input_ids: Token ids; looked up in the word embedding table when ``inputs_embeds`` is None.
            token_type_ids: Segment ids; default to zeros.
            task_type_ids: Task ids, only used when ``use_task_id`` is set; default to zeros.
            position_ids: Position ids; default to a slice of the registered ``position_ids`` buffer
                starting at ``past_key_values_length``.
            inputs_embeds: Pre-computed word embeddings used instead of ``input_ids``.
            past_key_values_length: Offset applied to the default position ids.

        Returns:
            The embeddings after LayerNorm and dropout.
        """
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            # Drop the last (embedding) dimension of inputs_embeds.
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]

        # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually
        # occurs when its auto-generated, registered buffer helps users when tracing the model without passing
        # token_type_ids, solves issue #5664
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings

        # Position embeddings are only added here for the "absolute" scheme.
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings

        # Add the task-type embeddings when enabled; missing task ids default to zeros.
        if self.use_task_id:
            if task_type_ids is None:
                task_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
            task_type_embeddings = self.task_type_embeddings(task_type_ids)
            embeddings += task_type_embeddings

        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->Ernie
class ErnieSelfAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        # 检查隐藏大小是否能被注意力头数整除，否则抛出数值错误
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # 创建查询、键、值的线性层
        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        # dropout 层
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = position_embedding_type or getattr(
            config, "position_embedding_type", "absolute"
        )
        # 如果使用相对位置编码，创建距离嵌入层
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

        # 是否作为解码器使用
        self.is_decoder = config.is_decoder

    # 调整形状以便进行注意力计算
    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        """Split the last dimension of ``x`` into heads and move the head axis to position 1.

        The last dimension is viewed as ``(num_attention_heads, attention_head_size)`` and the result
        is permuted with ``(0, 2, 1, 3)``, which implies a 3-D input.
        """
        leading_dims = x.size()[:-1]
        per_head = x.view(leading_dims + (self.num_attention_heads, self.attention_head_size))
        return per_head.permute(0, 2, 1, 3)

    # 前向传播函数定义
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->Ernie
class ErnieSelfOutput(nn.Module):
    """Output projection of the self-attention block: dense -> dropout -> residual add -> LayerNorm."""

    def __init__(self, config):
        """Create a square ``hidden_size`` projection, a LayerNorm and a dropout layer from ``config``."""
        super().__init__()
        hidden_size = config.hidden_size
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.LayerNorm = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        """Project ``hidden_states``, apply dropout, add ``input_tensor`` and normalise the sum."""
        projected = self.dropout(self.dense(hidden_states))
        # The residual is added before normalisation.
        return self.LayerNorm(projected + input_tensor)


# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Ernie
# 定义一个名为 ErnieAttention 的自定义神经网络模块，继承自 nn.Module 类
class ErnieAttention(nn.Module):
    """Attention block: an `ErnieSelfAttention` followed by an `ErnieSelfOutput`, with head pruning."""

    def __init__(self, config, position_embedding_type=None):
        """Create the attention and output sub-modules and start with no pruned heads."""
        super().__init__()
        self.self = ErnieSelfAttention(config, position_embedding_type=position_embedding_type)
        self.output = ErnieSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        """Prune the given attention heads from the projections; does nothing when ``heads`` is empty."""
        if len(heads) == 0:
            return
        attention = self.self
        heads, index = find_pruneable_heads_and_indices(
            heads, attention.num_attention_heads, attention.attention_head_size, self.pruned_heads
        )

        # Prune linear layers: query, key and value, then the output projection along dim 1.
        for projection_name in ("query", "key", "value"):
            pruned = prune_linear_layer(getattr(attention, projection_name), index)
            setattr(attention, projection_name, pruned)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads.
        attention.num_attention_heads = attention.num_attention_heads - len(heads)
        attention.all_head_size = attention.attention_head_size * attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        """Run self-attention, then the output layer with ``hidden_states`` as the residual input.

        Returns:
            A tuple whose first element is the attention output, followed by any further elements
            returned by the self-attention module.
        """
        attention_results = self.self(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        projected = self.output(attention_results[0], hidden_states)
        # add attentions if we output them
        return (projected,) + attention_results[1:]


# 从 transformers.models.bert.modeling_bert.BertIntermediate 复制并改为 Ernie
# 定义一个名为 ErnieIntermediate 的神经网络模块，继承自 nn.Module 类
class ErnieIntermediate(nn.Module):
    """Linear projection from ``hidden_size`` to ``intermediate_size`` followed by an activation."""

    def __init__(self, config):
        """Create the projection and resolve the activation function from ``config.hidden_act``."""
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        activation = config.hidden_act
        # A string is looked up in ACT2FN; anything else is used as the activation directly.
        self.intermediate_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply the projection and then the activation to ``hidden_states``."""
        return self.intermediate_act_fn(self.dense(hidden_states))


# 从 transformers.models.bert.modeling_bert.BertOutput 复制并改为 Ernie
# 定义一个名为 ErnieOutput 的神经网络模块，继承自 nn.Module 类
class ErnieOutput(nn.Module):
    """Project back from `intermediate_size` to `hidden_size`, then dropout, residual add and LayerNorm.

    Args:
        config: Configuration object; `intermediate_size`, `hidden_size`, `layer_norm_eps` and
            `hidden_dropout_prob` are read from it.
    """

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(config.intermediate_size, width)
        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        """Return `LayerNorm(dropout(dense(hidden_states)) + input_tensor)`.

        Args:
            hidden_states: Tensor fed to the linear projection.
            input_tensor: Residual tensor added to the projected (and dropped-out) result before normalization.
        """
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)
# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->Ernie
class ErnieLayer(nn.Module):
    """A single Ernie transformer block.

    The block runs self-attention, optionally cross-attention over encoder states (decoder configurations only),
    and finally the feed-forward network, which is invoked through `apply_chunking_to_forward`.

    Args:
        config: Configuration object. `chunk_size_feed_forward`, `is_decoder` and `add_cross_attention` are read
            here; the same object is handed on to the sub-modules.

    Raises:
        ValueError: If `config.add_cross_attention` is set while `config.is_decoder` is not.
    """

    def __init__(self, config):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        # Dimension passed to `apply_chunking_to_forward` as the chunking axis.
        self.seq_len_dim = 1
        self.attention = ErnieAttention(config)
        self.is_decoder = config.is_decoder
        self.add_cross_attention = config.add_cross_attention

        # Cross-attention is only allowed on a decoder, so reject the inconsistent combination up front.
        if self.add_cross_attention and not self.is_decoder:
            raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
        if self.add_cross_attention:
            self.crossattention = ErnieAttention(config, position_embedding_type="absolute")

        self.intermediate = ErnieIntermediate(config)
        self.output = ErnieOutput(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        """Run the block on `hidden_states`.

        Returns:
            A tuple whose first element is the feed-forward output. It is followed by whatever extra elements
            the attention modules returned (presumably attention weights when `output_attentions` is set), and,
            for decoders, the present key/value cache as the last element.

        Raises:
            ValueError: If this is a decoder, `encoder_hidden_states` is given, and the layer was built without
                a `crossattention` module.
        """
        has_cache = past_key_value is not None

        # The first two cached entries belong to self-attention.
        self_attn_past_key_value = past_key_value[:2] if has_cache else None
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            past_key_value=self_attn_past_key_value,
        )
        attention_output = self_attention_outputs[0]

        if self.is_decoder:
            # For a decoder the last element is the self-attention cache; keep it apart from the extras.
            present_key_value = self_attention_outputs[-1]
            outputs = self_attention_outputs[1:-1]
        else:
            outputs = self_attention_outputs[1:]

        if self.is_decoder and encoder_hidden_states is not None:
            if not hasattr(self, "crossattention"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
                    " by setting `config.add_cross_attention=True`"
                )

            # The last two cached entries belong to cross-attention.
            cross_attn_past_key_value = past_key_value[-2:] if has_cache else None
            cross_attention_outputs = self.crossattention(
                attention_output,
                attention_mask,
                head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                cross_attn_past_key_value,
                output_attentions,
            )
            attention_output = cross_attention_outputs[0]
            outputs = outputs + cross_attention_outputs[1:-1]

            # Append the cross-attention cache after the self-attention cache.
            present_key_value = present_key_value + cross_attention_outputs[-1]

        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        outputs = (layer_output,) + outputs

        # Decoders return the key/value cache as the final element.
        return (outputs + (present_key_value,)) if self.is_decoder else outputs

    def feed_forward_chunk(self, attention_output):
        """Feed-forward network for one chunk: `intermediate`, then `output` with `attention_output` as residual."""
        return self.output(self.intermediate(attention_output), attention_output)
# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->Ernie
class ErnieEncoder(nn.Module):
    """Stack of `config.num_hidden_layers` `ErnieLayer` modules applied one after another.

    Args:
        config: Configuration object; `num_hidden_layers` and `add_cross_attention` are read here and the
            object is passed on to every layer.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([ErnieLayer(config) for _ in range(config.num_hidden_layers)])
        # Off by default; when switched on, training-mode forward passes go through
        # `self._gradient_checkpointing_func`.
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
        """Run every layer in order and gather the requested side outputs.

        Returns:
            A `BaseModelOutputWithPastAndCrossAttentions` when `return_dict` is truthy. Otherwise a tuple of
            the non-`None` values among: final hidden states, cache, all hidden states, self-attentions and
            cross-attentions (in that order).
        """
        collect_cross = output_attentions and self.config.add_cross_attention
        # Accumulators start as empty tuples only when the corresponding output was requested.
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if collect_cross else None

        checkpointing = self.gradient_checkpointing and self.training
        if checkpointing and use_cache:
            # Caching cannot be combined with gradient checkpointing, so it is switched off with a warning.
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        next_decoder_cache = () if use_cache else None
        for i, layer_module in enumerate(self.layer):
            # Record the input of each layer (the final output is appended after the loop).
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None
            past_key_value = past_key_values[i] if past_key_values is not None else None

            layer_args = (
                hidden_states,
                attention_mask,
                layer_head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                past_key_value,
                output_attentions,
            )
            if checkpointing:
                layer_outputs = self._gradient_checkpointing_func(layer_module.__call__, *layer_args)
            else:
                layer_outputs = layer_module(*layer_args)

            hidden_states = layer_outputs[0]
            if use_cache:
                # The cache is taken from the last element of the layer output.
                next_decoder_cache += (layer_outputs[-1],)
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if collect_cross:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if return_dict:
            return BaseModelOutputWithPastAndCrossAttentions(
                last_hidden_state=hidden_states,
                past_key_values=next_decoder_cache,
                hidden_states=all_hidden_states,
                attentions=all_self_attentions,
                cross_attentions=all_cross_attentions,
            )

        # Tuple form: drop every slot that was not requested.
        candidates = (
            hidden_states,
            next_decoder_cache,
            all_hidden_states,
            all_self_attentions,
            all_cross_attentions,
        )
        return tuple(v for v in candidates if v is not None)
# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->Ernie
class ErniePooler(nn.Module):
    """Pool a sequence by passing the hidden state at position 0 through a linear layer and tanh.

    Args:
        config: Configuration object; only `hidden_size` is read.
    """

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Return `tanh(dense(hidden_states[:, 0]))`."""
        # Index 0 along dimension 1 selects the first token's hidden state.
        return self.activation(self.dense(hidden_states[:, 0]))


# Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform with Bert->Ernie
class ErniePredictionHeadTransform(nn.Module):
    """Linear layer, activation and LayerNorm applied before the vocabulary projection.

    Args:
        config: Configuration object; `hidden_size`, `hidden_act` and `layer_norm_eps` are read from it.
            `hidden_act` may be a string key into `ACT2FN` or an already-callable activation.
    """

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        # Resolve string names through the ACT2FN table; anything else is used as the activation directly.
        activation = config.hidden_act
        self.transform_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation
        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Return `LayerNorm(act(dense(hidden_states)))`."""
        activated = self.transform_act_fn(self.dense(hidden_states))
        return self.LayerNorm(activated)


# Copied from transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert->Ernie
class ErnieLMPredictionHead(nn.Module):
    """Language-modeling head: a transform followed by a projection onto the vocabulary.

    Args:
        config: Configuration object; `hidden_size` and `vocab_size` are read here and the object is passed on
            to `ErniePredictionHeadTransform`.
    """

    def __init__(self, config):
        super().__init__()
        vocab_size = config.vocab_size
        self.transform = ErniePredictionHeadTransform(config)

        # The projection is created without its own bias; a separate per-token bias parameter is attached below.
        self.decoder = nn.Linear(config.hidden_size, vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(vocab_size))

        # Share one parameter object between `self.bias` and `decoder.bias` so both names stay in sync
        # (presumably needed when token embeddings are resized — confirm against the base class).
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        """Return vocabulary scores for `hidden_states`."""
        return self.decoder(self.transform(hidden_states))


# Copied from transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert->Ernie
class ErnieOnlyMLMHead(nn.Module):
    """Wrapper exposing a single `ErnieLMPredictionHead` under the attribute name `predictions`.

    Args:
        config: Configuration object forwarded to `ErnieLMPredictionHead`.
    """

    def __init__(self, config):
        super().__init__()
        self.predictions = ErnieLMPredictionHead(config)

    def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
        """Return the prediction scores computed by `self.predictions` for `sequence_output`."""
        return self.predictions(sequence_output)


# Copied from transformers.models.bert.modeling_bert.BertOnlyNSPHead with Bert->Ernie
class ErnieOnlyNSPHead(nn.Module):
    """Next-sentence-prediction head: one linear layer mapping `hidden_size` features to 2 scores.

    Args:
        config: Configuration object; only `hidden_size` is read.
    """

    def __init__(self, config):
        super().__init__()
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, pooled_output):
        """Return the two sequence-relationship scores for `pooled_output`."""
        return self.seq_relationship(pooled_output)
# Copied from transformers.models.bert.modeling_bert.BertPreTrainingHeads with Bert->Ernie
class ErniePreTrainingHeads(nn.Module):
    """The two pre-training heads: a language-modeling head and a 2-way sequence-relationship classifier.

    Args:
        config: Configuration object; `hidden_size` is read here and the object is passed on to
            `ErnieLMPredictionHead`.
    """

    def __init__(self, config):
        super().__init__()
        self.predictions = ErnieLMPredictionHead(config)
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, sequence_output, pooled_output):
        """Return `(prediction_scores, seq_relationship_score)`.

        Args:
            sequence_output: Input to the language-modeling head.
            pooled_output: Input to the sequence-relationship classifier.
        """
        return self.predictions(sequence_output), self.seq_relationship(pooled_output)


class ErniePreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading
    pretrained models.
    """

    config_class = ErnieConfig
    base_model_prefix = "ernie"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights of a single sub-module in place.

        `nn.LayerNorm` gets zero bias and unit weight. `nn.Linear` and `nn.Embedding` weights are drawn from a
        normal distribution with standard deviation `config.initializer_range`; a linear bias and an embedding's
        padding row are zeroed when present. Any other module type is left untouched.
        """
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            return
        if not isinstance(module, (nn.Linear, nn.Embedding)):
            return

        # Linear and embedding weights share the same normal initialization.
        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if isinstance(module, nn.Linear):
            if module.bias is not None:
                module.bias.data.zero_()
        elif module.padding_idx is not None:
            # Keep the padding row at exactly zero.
            module.weight.data[module.padding_idx].zero_()


@dataclass
# Copied from transformers.models.bert.modeling_bert.BertForPreTrainingOutput with Bert->Ernie
class ErnieForPreTrainingOutput(ModelOutput):
    """
    Output type of [`ErnieForPreTraining`].

    Args:
        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            Total loss as the sum of the masked language modeling loss and the next sequence prediction
            (classification) loss.
        prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False
            continuation before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` of shape `(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    """

    # Every field defaults to None so the output can be built with only the values that were computed.
    loss: Optional[torch.FloatTensor] = None
    prediction_logits: torch.FloatTensor = None
    seq_relationship_logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
# Shared class-level docstring fragment; it is passed to the `add_start_docstrings` decorators on the model
# classes in this file, so it has to be bound to a module-level name rather than left as a bare string.
ERNIE_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`ErnieConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""



"""
"""
"""

@add_start_docstrings(
    "The bare Ernie Model transformer outputting raw hidden-states without any specific head on top.",
    ERNIE_START_DOCSTRING,
)
"""
# 定义 ErnieModel 类，继承自 ErniePreTrainedModel
class ErnieModel(ErniePreTrainedModel):
    """
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
    """



    # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Ernie
    def __init__(self, config, add_pooling_layer=True):
        """Build the embeddings, the encoder and, optionally, the pooler.

        Args:
            config: Model configuration, forwarded to the base class and to every sub-module.
            add_pooling_layer: When falsy, `self.pooler` is set to `None` instead of an `ErniePooler`.
        """
        super().__init__(config)
        self.config = config

        self.embeddings = ErnieEmbeddings(config)
        self.encoder = ErnieEncoder(config)

        if add_pooling_layer:
            self.pooler = ErniePooler(config)
        else:
            self.pooler = None

        # Initialize weights and apply final processing
        self.post_init()



    # Copied from transformers.models.bert.modeling_bert.BertModel.get_input_embeddings
    def get_input_embeddings(self):
        """Return the `word_embeddings` attribute of `self.embeddings`."""
        embedding_block = self.embeddings
        return embedding_block.word_embeddings



    # Copied from transformers.models.bert.modeling_bert.BertModel.set_input_embeddings
    def set_input_embeddings(self, value):
        """Replace the `word_embeddings` attribute of `self.embeddings` with `value`."""
        embedding_block = self.embeddings
        embedding_block.word_embeddings = value



    # Copied from transformers.models.bert.modeling_bert.BertModel._prune_heads
    # 定义一个方法 `_prune_heads`，用于修剪模型中的注意力头部
    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        # 遍历 heads_to_prune 字典中的每个层及对应要修剪的注意力头部列表
        for layer, heads in heads_to_prune.items():
            # 在模型的编码器（encoder）中定位到指定层的注意力（attention）对象，并执行修剪操作
            self.encoder.layer[layer].attention.prune_heads(heads)

    # 声明一个前向传播方法 `forward`，并应用装饰器添加文档字符串和代码示例文档字符串
    @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPoolingAndCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        task_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
@add_start_docstrings(
    """
    Ernie Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
    sentence prediction (classification)` head.
    """,
    ERNIE_START_DOCSTRING,
)
class ErnieForPreTraining(ErniePreTrainedModel):
    # Names of the output-projection parameters.
    # NOTE(review): presumably consumed by the base class's weight-tying logic — confirm.
    _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]

    # Copied from transformers.models.bert.modeling_bert.BertForPreTraining.__init__ with Bert->Ernie,bert->ernie
    def __init__(self, config):
        """Build the base `ErnieModel` and the pre-training heads from `config`."""
        super().__init__(config)

        self.ernie = ErnieModel(config)
        self.cls = ErniePreTrainingHeads(config)

        # Initialize weights and apply final processing
        self.post_init()

    # Copied from transformers.models.bert.modeling_bert.BertForPreTraining.get_output_embeddings
    def get_output_embeddings(self):
        """Return the vocabulary projection layer of the language-modeling head."""
        return self.cls.predictions.decoder

    # Copied from transformers.models.bert.modeling_bert.BertForPreTraining.set_output_embeddings
    def set_output_embeddings(self, new_embeddings):
        """Replace the vocabulary projection layer of the language-modeling head with `new_embeddings`."""
        self.cls.predictions.decoder = new_embeddings

    @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=ErnieForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        task_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        next_sentence_label: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """
        Forward pass of the Ernie pre-training model; accepts the inputs listed below.

        Args:
            input_ids (Optional[torch.Tensor], optional): Input token IDs. Defaults to None.
            attention_mask (Optional[torch.Tensor], optional): Attention mask marking which elements are padding. Defaults to None.
            token_type_ids (Optional[torch.Tensor], optional): Token type IDs distinguishing sentence A from sentence B. Defaults to None.
            task_type_ids (Optional[torch.Tensor], optional): Task type IDs used to tell tasks apart. Defaults to None.
            position_ids (Optional[torch.Tensor], optional): Position IDs giving the position of each token. Defaults to None.
            head_mask (Optional[torch.Tensor], optional): Head mask selecting which attention heads are masked out. Defaults to None.
            inputs_embeds (Optional[torch.Tensor], optional): Embedded representation passed in directly. Defaults to None.
            labels (Optional[torch.Tensor], optional): Labels used for the MLM loss. Defaults to None.
            next_sentence_label (Optional[torch.Tensor], optional): Labels for next sentence prediction. Defaults to None.
            output_attentions (Optional[bool], optional): Whether to output attention weights. Defaults to None.
            output_hidden_states (Optional[bool], optional): Whether to output hidden states. Defaults to None.
            return_dict (Optional[bool], optional): Whether to return a dictionary-style output. Defaults to None.

        Returns:
            ErnieForPreTrainingOutput or torch.Tensor: Depending on return_dict, an ErnieForPreTrainingOutput object or the plain tensor output.
        """
        # NOTE(review): the body is a placeholder in this listing — the input handling, model call and output
        # assembly are not present, so the method as written returns None.
        pass


@add_start_docstrings(
    """Ernie Model with a `language modeling` head on top for CLM fine-tuning.""", ERNIE_START_DOCSTRING
)
class ErnieForCausalLM(ErniePreTrainedModel):
    _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]

    # 从 transformers.models.bert.modeling_bert.BertLMHeadModel.__init__ 复制而来，将 BertLMHeadModel->ErnieForCausalLM, Bert->Ernie, bert->ernie
    def __init__(self, config):
        """Build the base model (without pooler) and the language-modeling head.

        A warning is logged when `config.is_decoder` is falsy.
        """
        super().__init__(config)

        configured_as_decoder = config.is_decoder
        if not configured_as_decoder:
            logger.warning("If you want to use `ErnieForCausalLM` as a standalone, add `is_decoder=True.`")

        # No pooling layer: only the per-token sequence output feeds the MLM-style head.
        self.ernie = ErnieModel(config, add_pooling_layer=False)
        self.cls = ErnieOnlyMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    # 从 transformers.models.bert.modeling_bert.BertLMHeadModel.get_output_embeddings 复制而来
    def get_output_embeddings(self):
        """Return the `decoder` layer of the prediction head (`self.cls.predictions`)."""
        prediction_head = self.cls.predictions
        return prediction_head.decoder
    # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.set_output_embeddings
    def set_output_embeddings(self, new_embeddings):
        """Replace the `decoder` layer of the prediction head with `new_embeddings`."""
        prediction_head = self.cls.predictions
        prediction_head.decoder = new_embeddings
    
    @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=CausalLMOutputWithCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.forward
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        task_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.Tensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
    # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.prepare_inputs_for_generation
    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, use_cache=True, **model_kwargs
    ):
        """Assemble the keyword arguments for one generation step.

        Args:
            input_ids: Token ids; dimension 1 is treated as the sequence dimension.
            past_key_values: Optional cache. The cached length is read from
                `past_key_values[0][0].shape[2]`.
            attention_mask: Optional mask; when `None`, a mask of ones with the shape of the untrimmed
                `input_ids` is created.
            use_cache: Passed through unchanged.
            **model_kwargs: Accepted and ignored.

        Returns:
            Dict with the keys `input_ids`, `attention_mask`, `past_key_values` and `use_cache`.
        """
        # The default mask is built before any trimming, so it covers the full input.
        if attention_mask is None:
            attention_mask = input_ids.new_ones(input_ids.shape)

        if past_key_values is not None:
            cached_length = past_key_values[0][0].shape[2]
            current_length = input_ids.shape[1]
            # Drop the already-cached prefix when the input is longer than the cache; otherwise the caller
            # probably passed only new ids already, so keep just the last one.
            drop = cached_length if current_length > cached_length else current_length - 1
            input_ids = input_ids[:, drop:]

        return dict(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
        )
    
    # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel._reorder_cache
    def _reorder_cache(self, past_key_values, beam_idx):
        reordered_past = ()
        # 重新排序过去的键值对，以匹配新的beam索引
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past
@add_start_docstrings("""Ernie Model with a `language modeling` head on top.""", ERNIE_START_DOCSTRING)
class ErnieForMaskedLM(ErniePreTrainedModel):
    _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]

    # Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.__init__ with Bert->Ernie,bert->ernie
    def __init__(self, config):
        super().__init__(config)

        # Masked-LM relies on bi-directional self-attention, so a decoder config only gets a warning.
        if config.is_decoder:
            logger.warning(
                "If you want to use `ErnieForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )

        # Encoder without the pooling layer
        self.ernie = ErnieModel(config, add_pooling_layer=False)
        # MLM-only prediction head
        self.cls = ErnieOnlyMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    # Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.get_output_embeddings
    def get_output_embeddings(self):
        # The output embeddings are the decoder of the MLM prediction head
        return self.cls.predictions.decoder

    # Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.set_output_embeddings
    def set_output_embeddings(self, new_embeddings):
        # Swap the decoder of the MLM prediction head for the new embeddings
        self.cls.predictions.decoder = new_embeddings

    # NOTE(review): the surrounding comments mention docstring decorators
    # (add_start_docstrings_to_model_forward / add_code_sample_docstrings) but none is applied
    # here - confirm against upstream.
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        task_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """

        # An explicit return_dict wins; otherwise fall back to the config default
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Run the Ernie encoder
        outputs = self.ernie(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            task_type_ids=task_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # First output element is the per-token sequence output
        sequence_output = outputs[0]
        # Project it to vocabulary scores with the MLM head
        prediction_scores = self.cls(sequence_output)

        masked_lm_loss = None
        # The loss is computed only when labels are supplied
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        # Tuple output: (loss?, prediction_scores, *outputs[2:])
        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        # Structured output
        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    # Adapted from transformers.models.bert.modeling_bert.BertForMaskedLM.prepare_inputs_for_generation
    def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
        """
        Append one PAD token (with attention 0) to the inputs for generation.

        Raises:
            ValueError: if `config.pad_token_id` is not defined.

        Returns:
            `dict` with the extended `input_ids` and `attention_mask`.
        """
        input_shape = input_ids.shape
        effective_batch_size = input_shape[0]

        # A dummy token is appended below, which requires a PAD id
        if self.config.pad_token_id is None:
            raise ValueError("The PAD token should be defined for generation")

        # `attention_mask` defaults to None in the signature; without this fallback the default
        # would fail on `attention_mask.new_zeros` below. An all-ones mask attends to every id.
        if attention_mask is None:
            attention_mask = input_ids.new_ones(input_shape)

        # Extend the mask with one zero column so the dummy token is not attended to
        attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
        # One PAD token per batch row
        dummy_token = torch.full(
            (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
        )
        # Append the dummy token to the ids
        input_ids = torch.cat([input_ids, dummy_token], dim=1)

        return {"input_ids": input_ids, "attention_mask": attention_mask}
# 定义 ErnieForNextSentencePrediction 类，它在 ERNIE 模型的基础上添加了一个“下一个句子预测（分类）”的头部。
@add_start_docstrings(
    """Ernie Model with a `next sentence prediction (classification)` head on top.""",
    ERNIE_START_DOCSTRING,
)
class ErnieForNextSentencePrediction(ErniePreTrainedModel):
    # Copied from transformers.models.bert.modeling_bert.BertForNextSentencePrediction.__init__ with Bert->Ernie,bert->ernie
    def __init__(self, config):
        super().__init__(config)

        # Ernie encoder (default construction, unlike the heads that pass add_pooling_layer=False)
        self.ernie = ErnieModel(config)
        # NSP-only classification head
        self.cls = ErnieOnlyNSPHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        task_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple[torch.Tensor], NextSentencePredictorOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring). Indices should be in `[0, 1]`:

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

        Returns:
            Tuple containing either logits or a full NextSentencePredictorOutput if configured.

        Example:

        ```
        >>> from transformers import AutoTokenizer, ErnieForNextSentencePrediction
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-1.0-base-zh")
        >>> model = ErnieForNextSentencePrediction.from_pretrained("nghuyong/ernie-1.0-base-zh")

        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
        >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")

        >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
        >>> logits = outputs.logits
        >>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
        ```
        """

        if "next_sentence_label" in kwargs:
            # Deprecated alias: warn, then let it override `labels`
            warnings.warn(
                "The `next_sentence_label` argument is deprecated and will be removed in a future version, use"
                " `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("next_sentence_label")

        # An explicit return_dict wins; otherwise fall back to the config default
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Run the Ernie encoder
        outputs = self.ernie(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            task_type_ids=task_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Second output element is the pooled output
        pooled_output = outputs[1]

        # Score the sentence-pair relationship from the pooled output
        seq_relationship_scores = self.cls(pooled_output)

        next_sentence_loss = None
        if labels is not None:
            # Two-way cross-entropy over the relationship scores
            loss_fct = CrossEntropyLoss()
            next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1))

        if not return_dict:
            # Tuple output: (loss?, scores, *outputs[2:])
            output = (seq_relationship_scores,) + outputs[2:]
            return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output

        # Structured output
        return NextSentencePredictorOutput(
            loss=next_sentence_loss,
            logits=seq_relationship_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@add_start_docstrings(
    """
    Ernie Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    """,
    ERNIE_START_DOCSTRING,
)
class ErnieForSequenceClassification(ErniePreTrainedModel):
    # Copied from transformers.models.bert.modeling_bert.BertForSequenceClassification.__init__ with Bert->Ernie,bert->ernie
    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.num_labels = config.num_labels

        # Sub-modules are created in the order encoder -> dropout -> classifier
        self.ernie = ErnieModel(config)

        # The classifier dropout falls back to the hidden dropout when it is not configured
        dropout_prob = config.classifier_dropout
        if dropout_prob is None:
            dropout_prob = config.hidden_dropout_prob
        self.dropout = nn.Dropout(dropout_prob)

        # Linear head: hidden_size -> num_labels
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        task_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        # An explicit return_dict wins; otherwise fall back to the config default
        if return_dict is None:
            return_dict = self.config.use_return_dict

        encoder_outputs = self.ernie(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            task_type_ids=task_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Pooled output (element 1) -> dropout -> linear head
        logits = self.classifier(self.dropout(encoder_outputs[1]))

        loss = None
        if labels is not None:
            # When unset, the problem type is inferred once and stored back on the config
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    inferred_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
                    inferred_type = "single_label_classification"
                else:
                    inferred_type = "multi_label_classification"
                self.config.problem_type = inferred_type

            problem_type = self.config.problem_type
            if problem_type == "regression":
                criterion = MSELoss()
                if self.num_labels == 1:
                    # Single target: compare squeezed logits and labels
                    loss = criterion(logits.squeeze(), labels.squeeze())
                else:
                    loss = criterion(logits, labels)
            elif problem_type == "single_label_classification":
                criterion = CrossEntropyLoss()
                loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))
            elif problem_type == "multi_label_classification":
                criterion = BCEWithLogitsLoss()
                loss = criterion(logits, labels)

        if not return_dict:
            # Tuple output: (loss?, logits, *encoder_outputs[2:])
            tuple_output = (logits,) + encoder_outputs[2:]
            if loss is None:
                return tuple_output
            return (loss,) + tuple_output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
"""
Ernie Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
"""

# 继承自 ErniePreTrainedModel 的 ErnieForMultipleChoice 类，用于多项选择任务的 Ernie 模型
class ErnieForMultipleChoice(ErniePreTrainedModel):

    # Copied from transformers.models.bert.modeling_bert.BertForMultipleChoice.__init__ with Bert->Ernie,bert->ernie
    def __init__(self, config):
        super().__init__(config)

        # Ernie encoder
        self.ernie = ErnieModel(config)

        # The classifier dropout falls back to config.hidden_dropout_prob when it is not configured
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)

        # One score per (example, choice): hidden_size -> 1
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        task_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        """
        # An explicit return_dict wins; otherwise fall back to the config default
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # num_choices is dim 1 of input_ids, or of inputs_embeds when no ids are given
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        # Fold the choice dimension into the batch dimension for every per-token input
        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        # task_type_ids is flattened like its sibling inputs so that it lines up with the flattened
        # input_ids (assumes it arrives per choice like the others - TODO confirm). A tensor that is
        # already 2-D is left unchanged by this view.
        task_type_ids = task_type_ids.view(-1, task_type_ids.size(-1)) if task_type_ids is not None else None
        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        # Run the Ernie encoder on the flattened inputs
        outputs = self.ernie(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            task_type_ids=task_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Second output element is the pooled output
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        # Unfold the scores back to one row per example: (-1, num_choices)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        # Cross-entropy over the choices when labels are supplied
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        # Tuple output: (loss?, reshaped_logits, *outputs[2:])
        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        # Structured output
        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
"""
Ernie Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
"""
# Decorator below supplies the class-level docstring for the token-classification model
@add_start_docstrings(
    """
    添加一个头部的令牌分类器（在隐藏状态输出的顶部添加一个线性层），例如用于命名实体识别（NER）任务。
    """,
    ERNIE_START_DOCSTRING,
)
# 定义 ErnieForTokenClassification 类，继承自 ErniePreTrainedModel
class ErnieForTokenClassification(ErniePreTrainedModel):
    # Copied from transformers.models.bert.modeling_bert.BertForTokenClassification.__init__ with Bert->Ernie,bert->ernie
    def __init__(self, config):
        super().__init__(config)
        # Number of target labels
        self.num_labels = config.num_labels

        # Encoder without the pooling layer
        self.ernie = ErnieModel(config, add_pooling_layer=False)

        # The classifier dropout falls back to the hidden dropout when it is not configured
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        # Linear head: hidden_size -> num_labels
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        task_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        # An explicit return_dict wins; otherwise fall back to the config default
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Run the Ernie encoder
        outputs = self.ernie(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            task_type_ids=task_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # First output element is the per-token sequence output
        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        # Per-token label scores
        logits = self.classifier(sequence_output)

        loss = None
        # Cross-entropy over flattened tokens when labels are supplied
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        # Tuple output: (loss?, logits, *outputs[2:])
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        # Structured output
        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@add_start_docstrings(
    """
    Ernie Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    ERNIE_START_DOCSTRING,
)
class ErnieForQuestionAnswering(ErniePreTrainedModel):
    # Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering.__init__ with Bert->Ernie,bert->ernie
    def __init__(self, config):
        super().__init__(config)
        # Number of output logits per token (split into start/end in forward)
        self.num_labels = config.num_labels

        # Encoder without the pooling layer
        self.ernie = ErnieModel(config, add_pooling_layer=False)
        # Linear head producing the span start/end logits
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        task_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        start_positions: Optional[torch.Tensor] = None,
        end_positions: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        # An explicit return_dict wins; otherwise fall back to the config default
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Run the Ernie encoder
        outputs = self.ernie(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            task_type_ids=task_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # First output element is the per-token sequence output
        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)

        # Split the last dimension into size-1 chunks and unpack them as start/end
        start_logits, end_logits = logits.split(1, dim=-1)

        # Drop the trailing size-1 dimension and make the tensors contiguous
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        # The loss needs both the start and the end positions
        if start_positions is not None and end_positions is not None:
            # Positions that carry an extra trailing dimension are squeezed
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)

            # Out-of-range positions are clamped to `ignored_index` (= size of dim 1 of the logits),
            # which the loss is then told to ignore
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)

            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)

            # Total loss is the mean of the start and end losses
            total_loss = (start_loss + end_loss) / 2

        # Tuple output: (loss?, start_logits, end_logits, *outputs[2:])
        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        # Structured output
        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

`.\models\ernie\__init__.py`

# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import TYPE_CHECKING

# 从相对路径导入所需模块和函数
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tensorflow_text_available, is_torch_available

# Import structure handed to _LazyModule below: submodule name -> public names it provides
_import_structure = {
    "configuration_ernie": ["ERNIE_PRETRAINED_CONFIG_ARCHIVE_MAP", "ErnieConfig", "ErnieOnnxConfig"],
}

# The modeling names are registered only when torch is available; the
# raise/except pair turns "torch missing" into a silent skip.
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # torch is available: add the modeling names to the import structure
    _import_structure["modeling_ernie"] = [
        "ERNIE_PRETRAINED_MODEL_ARCHIVE_LIST",
        "ErnieForCausalLM",
        "ErnieForMaskedLM",
        "ErnieForMultipleChoice",
        "ErnieForNextSentencePrediction",
        "ErnieForPreTraining",
        "ErnieForQuestionAnswering",
        "ErnieForSequenceClassification",
        "ErnieForTokenClassification",
        "ErnieModel",
        "ErniePreTrainedModel",
    ]

# Under static type checking, import the real names eagerly so checkers can see them
if TYPE_CHECKING:
    from .configuration_ernie import ERNIE_PRETRAINED_CONFIG_ARCHIVE_MAP, ErnieConfig, ErnieOnnxConfig

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_ernie import (
            ERNIE_PRETRAINED_MODEL_ARCHIVE_LIST,
            ErnieForCausalLM,
            ErnieForMaskedLM,
            ErnieForMultipleChoice,
            ErnieForNextSentencePrediction,
            ErnieForPreTraining,
            ErnieForQuestionAnswering,
            ErnieForSequenceClassification,
            ErnieForTokenClassification,
            ErnieModel,
            ErniePreTrainedModel,
        )

# At runtime, swap this module for a _LazyModule built from _import_structure
else:
    import sys

    # Replace the entry for this module in sys.modules with the _LazyModule instance
    # (presumably it resolves the listed names on first access - see _LazyModule)
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\ernie_m\configuration_ernie_m.py`

# coding=utf-8
# 上面是声明文件编码格式为 UTF-8，确保支持中文等特殊字符的正确显示
# Copyright 2023 Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang and The HuggingFace Inc. team. All rights reserved.
# 版权声明，保留所有权利给 Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang 和 HuggingFace Inc. 团队
#
# Licensed under the Apache License, Version 2.0 (the "License");
# 根据 Apache License, Version 2.0 许可证授权，除非遵守许可证，否则不得使用此文件
# You may obtain a copy of the License at
# 可以在上述网址获取许可证的副本
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# 除非适用法律要求或书面同意，否则根据许可证分发的软件基于“按原样”分发，没有任何明示或暗示的担保或条件
""" ErnieM model configuration"""
# ErnieM 模型配置信息
# Adapted from original paddlenlp repository.(https://github.com/PaddlePaddle/PaddleNLP/blob/develop/paddlenlp/transformers/ernie_m/configuration.py)
# 改编自原始 PaddleNLP 仓库中的代码，此处为 ErnieM 模型的配置文件位置

from __future__ import annotations

from typing import Dict

from ...configuration_utils import PretrainedConfig

# Maps each pretrained ErnieM checkpoint name to the URL of its hosted `config.json`.
ERNIE_M_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "susnato/ernie-m-base_pytorch": "https://huggingface.co/susnato/ernie-m-base_pytorch/blob/main/config.json",
    "susnato/ernie-m-large_pytorch": "https://huggingface.co/susnato/ernie-m-large_pytorch/blob/main/config.json",
}


class ErnieMConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`ErnieMModel`]. It is used to instantiate a
    Ernie-M model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the `Ernie-M`
    [susnato/ernie-m-base_pytorch](https://huggingface.co/susnato/ernie-m-base_pytorch) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 250002):
            Vocabulary size of `inputs_ids`. Also the vocabulary size of the token embedding matrix. Defines the
            number of different tokens that can be represented by the `inputs_ids` passed when calling
            [`ErnieMModel`].
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the embedding layer, encoder layers and pooler layer.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors are first projected from
            hidden_size to intermediate_size, and then projected back to hidden_size. Typically intermediate_size
            is larger than hidden_size.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the feed-forward layer. `"gelu"`, `"relu"` and any other torch
            supported activation functions are supported.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings and encoder.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout probability used in `MultiHeadAttention` in all encoder layers to drop some attention
            target.
        max_position_embeddings (`int`, *optional*, defaults to 514):
            The maximum value of the dimensionality of position encoding, which dictates the maximum supported
            length of an input sequence.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the normal distribution used to initialize all weight matrices.
        pad_token_id (`int`, *optional*, defaults to 1):
            Padding token id, i.e. the index of the padding token in the token vocabulary.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        classifier_dropout (`float`, *optional*):
            The dropout ratio for the classification head.
        act_dropout (`float`, *optional*, defaults to 0.0):
            This dropout probability is used in `ErnieMEncoderLayer` after the activation.
    """

    model_type = "ernie_m"
    # Aliases: reading/writing `dropout` or `num_classes` is redirected to the mapped attribute name.
    attribute_map: Dict[str, str] = {"dropout": "classifier_dropout", "num_classes": "num_labels"}

    def __init__(
        self,
        vocab_size: int = 250002,
        hidden_size: int = 768,
        num_hidden_layers: int = 12,
        num_attention_heads: int = 12,
        intermediate_size: int = 3072,
        hidden_act: str = "gelu",
        hidden_dropout_prob: float = 0.1,
        attention_probs_dropout_prob: float = 0.1,
        max_position_embeddings: int = 514,
        initializer_range: float = 0.02,
        pad_token_id: int = 1,
        layer_norm_eps: float = 1e-05,
        classifier_dropout=None,
        act_dropout=0.0,
        **kwargs,
    ):
        # `pad_token_id` and any remaining keyword arguments are handled by the parent config class.
        super().__init__(pad_token_id=pad_token_id, **kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.classifier_dropout = classifier_dropout
        self.act_dropout = act_dropout

`.\models\ernie_m\modeling_ernie_m.py`

# coding=utf-8
# 版权 2023 年 Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang The HuggingFace Inc. team. 保留所有权利。
#
# 根据 Apache 许可证 2.0 版本许可；
# 除非符合许可证的规定，否则不得使用此文件。
# 您可以在以下网址获取许可证的副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则按“原样”分发的软件
# 没有任何形式的保证或条件，包括但不限于
# 特定用途的隐含保证或条件。
# 有关详细信息，请参阅许可证。

""" PyTorch ErnieM 模型。"""


import math
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn, tensor
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_ernie_m import ErnieMConfig

# Module-level logger for this file.
logger = logging.get_logger(__name__)

# Checkpoint, config-class and tokenizer names passed to `add_code_sample_docstrings` further down in this file.
_CHECKPOINT_FOR_DOC = "susnato/ernie-m-base_pytorch"
_CONFIG_FOR_DOC = "ErnieMConfig"
_TOKENIZER_FOR_DOC = "ErnieMTokenizer"

# Names of the pretrained ErnieM checkpoints.
ERNIE_M_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "susnato/ernie-m-base_pytorch",
    "susnato/ernie-m-large_pytorch",
    # See all ErnieM models at https://huggingface.co/models?filter=ernie_m
]

# Adapted from paddlenlp.transformers.ernie_m.modeling.ErnieEmbeddings
class ErnieMEmbeddings(nn.Module):
    """Construct the embeddings from word and position embeddings."""

    def __init__(self, config):
        super().__init__()
        self.hidden_size = config.hidden_size
        # Token lookup table: vocab_size -> hidden_size.
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        # Position lookup table: max_position_embeddings -> hidden_size.
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=config.pad_token_id
        )
        self.layer_norm = nn.LayerNorm(normalized_shape=config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(p=config.hidden_dropout_prob)
        self.padding_idx = config.pad_token_id

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.LongTensor] = None,
        past_key_values_length: int = 0,
    ) -> torch.Tensor:
        """Return LayerNorm + dropout applied to the sum of token and position embeddings.

        Args:
            input_ids: Token ids; only used when `inputs_embeds` is None.
            position_ids: Optional position ids. When None they are generated as 0, 1, 2, ... along
                dim 1 and shifted by `past_key_values_length`. The tensor passed in is never modified.
            inputs_embeds: Optional pre-computed token embeddings used instead of looking up `input_ids`.
            past_key_values_length: Offset added to generated position ids when it is greater than 0.

        Returns:
            The embedding tensor, with the same shape as `inputs_embeds`.
        """
        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        if position_ids is None:
            # Every dimension except the last one (the embedding dimension).
            input_shape = inputs_embeds.size()[:-1]
            ones = torch.ones(input_shape, dtype=torch.int64, device=inputs_embeds.device)
            # The cumulative sum along dim 1 gives 1, 2, 3, ...; subtracting `ones` makes it 0-based.
            seq_length = torch.cumsum(ones, dim=1)
            position_ids = seq_length - ones

            if past_key_values_length > 0:
                position_ids = position_ids + past_key_values_length

        # To mimic the paddlenlp implementation, positions are offset by 2. This is done out of place
        # so that a `position_ids` tensor supplied by the caller is not mutated as a side effect.
        position_ids = position_ids + 2

        position_embeddings = self.position_embeddings(position_ids)
        embeddings = inputs_embeds + position_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)

        return embeddings
# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->ErnieM,self.value->self.v_proj,self.key->self.k_proj,self.query->self.q_proj
class ErnieMSelfAttention(nn.Module):
    """Self-attention module: sets up the query/key/value projections and the attention dropout.

    NOTE(review): in this listing the body of `forward` is a bare `pass`, so the attention computation
    itself is not visible here.
    """

    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        # Reject a hidden size that is not a multiple of the head count, unless the config carries an
        # `embedding_size` attribute.
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # Linear projections from hidden_size to all_head_size for queries, keys and values.
        self.q_proj = nn.Linear(config.hidden_size, self.all_head_size)
        self.k_proj = nn.Linear(config.hidden_size, self.all_head_size)
        self.v_proj = nn.Linear(config.hidden_size, self.all_head_size)

        # Dropout layer configured with the attention-probabilities dropout rate.
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        # An explicit argument wins; otherwise fall back to the config value, then to "absolute".
        self.position_embedding_type = position_embedding_type or getattr(
            config, "position_embedding_type", "absolute"
        )
        # The relative position variants need a distance embedding with 2 * max_position_embeddings - 1 entries.
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

        self.is_decoder = config.is_decoder

    # Split the last dimension into (num_attention_heads, attention_head_size) and swap dims 1 and 2.
    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    # Forward pass of the self-attention module.
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
        ):
        # NOTE(review): implementation not shown in this listing; as written this returns None.
        pass


class ErnieMAttention(nn.Module):
    """Wraps `ErnieMSelfAttention` with an output projection and support for pruning heads."""

    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        self.self_attn = ErnieMSelfAttention(config, position_embedding_type=position_embedding_type)
        # Square projection applied to the self-attention output: hidden_size -> hidden_size.
        self.out_proj = nn.Linear(config.hidden_size, config.hidden_size)
        # Heads that have already been pruned from this module.
        self.pruned_heads = set()

    def prune_heads(self, heads):
        """Remove the given attention heads from the q/k/v projections and the output projection."""
        if len(heads) == 0:
            return

        attn = self.self_attn
        heads, index = find_pruneable_heads_and_indices(
            heads, attn.num_attention_heads, attn.attention_head_size, self.pruned_heads
        )

        # Prune the three input projections in q, k, v order, then the output projection along dim 1.
        for proj_name in ("q_proj", "k_proj", "v_proj"):
            setattr(attn, proj_name, prune_linear_layer(getattr(attn, proj_name), index))
        self.out_proj = prune_linear_layer(self.out_proj, index, dim=1)

        # Update the head bookkeeping and remember which heads are gone.
        attn.num_attention_heads = attn.num_attention_heads - len(heads)
        attn.all_head_size = attn.attention_head_size * attn.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        """Run self-attention, project its first output, and pass any extra outputs through unchanged."""
        attn_results = self.self_attn(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        projected = self.out_proj(attn_results[0])
        # Whatever follows element 0 (e.g. attention weights, if returned) is appended as is.
        return (projected,) + attn_results[1:]
class ErnieMEncoderLayer(nn.Module):
    """One encoder layer: self-attention followed by a two-layer feed-forward network.

    Each sub-block is followed by dropout, a residual connection and a LayerNorm.
    """

    def __init__(self, config):
        super().__init__()
        # To mimic the paddlenlp implementation: fall back to 0.1 when no hidden dropout is configured,
        # and fall back to the hidden dropout when no activation dropout is configured.
        dropout = 0.1 if config.hidden_dropout_prob is None else config.hidden_dropout_prob
        act_dropout = config.hidden_dropout_prob if config.act_dropout is None else config.act_dropout

        self.self_attn = ErnieMAttention(config)
        # Feed-forward network: hidden_size -> intermediate_size -> hidden_size.
        self.linear1 = nn.Linear(config.hidden_size, config.intermediate_size)
        # Dropout applied right after the activation inside the feed-forward network.
        self.dropout = nn.Dropout(act_dropout)
        self.linear2 = nn.Linear(config.intermediate_size, config.hidden_size)
        # LayerNorm after the attention residual (norm1) and after the feed-forward residual (norm2).
        self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # Dropout on the attention output (dropout1) and on the feed-forward output (dropout2).
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        # `hidden_act` may be a name looked up in ACT2FN or an activation callable.
        if isinstance(config.hidden_act, str):
            self.activation = ACT2FN[config.hidden_act]
        else:
            self.activation = config.hidden_act

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = True,
    ):
        """Apply the layer to `hidden_states`.

        Returns:
            `(hidden_states, attention_weights)` when `output_attentions` is truthy (the default),
            otherwise just the `hidden_states` tensor.
        """
        residual = hidden_states

        # The attention module always returns a tuple whose first element is the attention output.
        # Index it in both cases; previously the whole tuple was fed to dropout when
        # `output_attentions` was falsy, which failed.
        attn_outputs = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
        )
        if output_attentions:
            hidden_states, attention_opt_weights = attn_outputs
        else:
            hidden_states = attn_outputs[0]

        # Attention sub-block: dropout -> residual -> LayerNorm.
        hidden_states = residual + self.dropout1(hidden_states)
        hidden_states = self.norm1(hidden_states)
        residual = hidden_states

        # Feed-forward sub-block: linear -> activation -> dropout -> linear.
        hidden_states = self.linear1(hidden_states)
        hidden_states = self.activation(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.linear2(hidden_states)
        # dropout -> residual -> LayerNorm.
        hidden_states = residual + self.dropout2(hidden_states)
        hidden_states = self.norm2(hidden_states)

        if output_attentions:
            return hidden_states, attention_opt_weights
        else:
            return hidden_states


class ErnieMEncoder(nn.Module):
    """Stack of `config.num_hidden_layers` `ErnieMEncoderLayer` modules applied in sequence."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        stack = [ErnieMEncoderLayer(config) for _ in range(config.num_hidden_layers)]
        self.layers = nn.ModuleList(stack)

    def forward(
        self,
        input_embeds: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
        """Feed `input_embeds` through every layer.

        When requested, the per-layer hidden states (starting with `input_embeds` itself) and the
        per-layer attention weights are collected into tuples. The result is a
        `BaseModelOutputWithPastAndCrossAttentions` if `return_dict` is truthy, otherwise a tuple of
        the non-None values among (last hidden state, hidden states, attentions).
        """
        # The collectors stay None when the corresponding output was not requested.
        collected_states = (input_embeds,) if output_hidden_states else None
        collected_attn = () if output_attentions else None

        current = input_embeds
        for idx, block in enumerate(self.layers):
            # Layers are called without `output_attentions`, so each returns (output, weights).
            current, attn_weights = block(
                hidden_states=current,
                attention_mask=attention_mask,
                head_mask=None if head_mask is None else head_mask[idx],
                past_key_value=None if past_key_values is None else past_key_values[idx],
            )

            if output_hidden_states:
                collected_states = collected_states + (current,)
            if output_attentions:
                collected_attn = collected_attn + (attn_weights,)

        if return_dict:
            return BaseModelOutputWithPastAndCrossAttentions(
                last_hidden_state=current, hidden_states=collected_states, attentions=collected_attn
            )

        return tuple(item for item in (current, collected_states, collected_attn) if item is not None)
# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->ErnieM
class ErnieMPooler(nn.Module):
    """Pools a batch of hidden states by transforming the entry at index 0 of dim 1."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)  # square projection: hidden_size -> hidden_size
        self.activation = nn.Tanh()  # Tanh activation applied after the projection

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # We "pool" the model by simply taking the hidden state corresponding to the first token.
        first_token_tensor = hidden_states[:, 0]  # select index 0 along dim 1
        pooled_output = self.dense(first_token_tensor)  # linear projection
        pooled_output = self.activation(pooled_output)  # Tanh
        return pooled_output  # pooled representation


class ErnieMPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading
    pretrained models.
    """

    config_class = ErnieMConfig
    base_model_prefix = "ernie_m"

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Both layer types draw their weights from N(0, initializer_range).
            # Slightly different from the TF version, which uses truncated_normal for initialization.
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if isinstance(module, nn.Linear):
                if module.bias is not None:
                    module.bias.data.zero_()
            elif module.padding_idx is not None:
                # The embedding row of the padding index is zeroed.
                module.weight.data[module.padding_idx].zero_()
            return

        if isinstance(module, nn.LayerNorm):
            # LayerNorm starts with zero bias and unit weight.
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


# Shared docstring text passed to `add_start_docstrings` for the ErnieM model classes in this file.
# NOTE(review): this is a runtime string that ends up in generated documentation, and its text here
# is not in English; confirm against the upstream file before changing it.
ERNIE_M_START_DOCSTRING = r"""

    此模型继承自[`PreTrainedModel`]。查看超类文档以获取库实现的所有模型的通用方法（例如下载或保存、调整输入嵌入、修剪头等）。

    此模型是PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)的子类。将其用作常规PyTorch模块，并参考PyTorch文档，了解与一般使用和行为相关的所有内容。

    参数:
        config ([`ErnieMConfig`]): 包含模型所有参数的配置类。
            使用配置文件初始化不会加载与模型关联的权重，只加载配置。请查看[`~PreTrainedModel.from_pretrained`]方法以加载模型权重。
"""

# Template for the `forward` argument documentation. `{0}` is filled via `str.format` with the input
# shape (e.g. "batch_size, sequence_length") before being passed to
# `add_start_docstrings_to_model_forward`. Only documentation text belongs inside the literal:
# anything placed in it is emitted verbatim into the generated docs.
ERNIE_M_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`ErnieMTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

@add_start_docstrings(
    "The bare ErnieM Model transformer outputting raw hidden-states without any specific head on top.",
    ERNIE_M_START_DOCSTRING,
)
class ErnieMModel(ErnieMPreTrainedModel):
    # Base ErnieM model: embeddings, encoder stack and an optional pooler.
    # (Kept as `#` comments: the class docstring is assembled by the decorator above.)

    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.initializer_range = config.initializer_range
        self.embeddings = ErnieMEmbeddings(config)
        self.encoder = ErnieMEncoder(config)
        # The pooler is only built on request; otherwise the attribute is None.
        if add_pooling_layer:
            self.pooler = ErnieMPooler(config)
        else:
            self.pooler = None
        self.post_init()

    def get_input_embeddings(self):
        """Return the word embedding module."""
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        """Replace the word embedding module with `value`."""
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer_idx, head_ids in heads_to_prune.items():
            target_layer = self.encoder.layers[layer_idx]
            target_layer.self_attn.prune_heads(head_ids)

    @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        processor_class=_TOKENIZER_FOR_DOC,
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPastAndCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[tensor] = None,
        position_ids: Optional[tensor] = None,
        attention_mask: Optional[tensor] = None,
        head_mask: Optional[tensor] = None,
        inputs_embeds: Optional[tensor] = None,
        past_key_values: Optional[Tuple[Tuple[tensor]]] = None,
        use_cache: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        # NOTE(review): the implementation is not shown in this listing; as written this returns None.
        pass


@add_start_docstrings(
    """ErnieM Model transformer with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks.""",
    ERNIE_M_START_DOCSTRING,
)
class ErnieMForSequenceClassification(ErnieMPreTrainedModel):
    # ErnieM backbone followed by dropout and a linear classifier over the pooled output.
    # (Kept as `#` comments: the class docstring is assembled by the decorator above.)

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.ernie_m = ErnieMModel(config)
        # Fall back to the hidden dropout rate when no classifier-specific rate is configured.
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    # Both documentation decorators must wrap the single real `forward`. Defining `forward` twice
    # would let the second definition silently replace the first and drop its decorator.
    @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        processor_class=_TOKENIZER_FOR_DOC,
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.Tensor]] = None,
        use_cache: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = True,
        labels: Optional[torch.Tensor] = None,
    ) -> Union[Tuple[torch.FloatTensor], SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        # An explicit `return_dict` wins; None falls back to the config setting.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.ernie_m(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            past_key_values=past_key_values,
            output_hidden_states=output_hidden_states,
            output_attentions=output_attentions,
            return_dict=return_dict,
        )

        # Element 1 of the backbone output is used as the pooled representation.
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            # Infer the problem type once from the label count and label dtype, and cache it on the config.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        # Tuple output: logits followed by the backbone extras, with the loss prepended when available.
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
# Class-level docstring: ErnieM with a multiple-choice head, e.g. for RocStories/SWAG tasks.
@add_start_docstrings(
    """ErnieM Model with a multiple choice classification head on top (a linear layer on top of
    the pooled output and a softmax) e.g. for RocStories/SWAG tasks.""",
    ERNIE_M_START_DOCSTRING,
)
class ErnieMForMultipleChoice(ErnieMPreTrainedModel):
    # Mirrors transformers.models.bert.modeling_bert.BertForMultipleChoice.__init__ (Bert -> ErnieM, bert -> ernie_m).
    def __init__(self, config):
        super().__init__(config)

        # Backbone encoder; forward() reads its second output as the pooled representation.
        self.ernie_m = ErnieMModel(config)

        # Head dropout: use the dedicated classifier rate when set, otherwise the hidden-layer rate.
        dropout_rate = config.classifier_dropout
        if dropout_rate is None:
            dropout_rate = config.hidden_dropout_prob
        self.dropout = nn.Dropout(dropout_rate)

        # A single score per flattened (example, choice) row; scores are regrouped per example in forward().
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing.
        self.post_init()

    @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple[torch.FloatTensor], MultipleChoiceModelOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        """
        # An explicit None defers to the model configuration.
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # The number of choices is the size of dimension 1 of whichever input was supplied.
        choice_source = input_ids if input_ids is not None else inputs_embeds
        num_choices = choice_source.shape[1]

        # Fold the choice dimension into the batch dimension; inputs that are None stay None.
        if input_ids is not None:
            input_ids = input_ids.view(-1, input_ids.size(-1))
        if attention_mask is not None:
            attention_mask = attention_mask.view(-1, attention_mask.size(-1))
        if position_ids is not None:
            position_ids = position_ids.view(-1, position_ids.size(-1))
        if inputs_embeds is not None:
            inputs_embeds = inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))

        outputs = self.ernie_m(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Pooled output -> dropout -> one score per row, then regroup the scores per example.
        pooled_output = self.dropout(outputs[1])
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        # Cross-entropy over the choices, only when labels are supplied.
        loss = None
        if labels is not None:
            loss = CrossEntropyLoss()(reshaped_logits, labels)

        if return_dict:
            return MultipleChoiceModelOutput(
                loss=loss,
                logits=reshaped_logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        # Tuple form: optional loss, the logits, then everything after the backbone's first two outputs.
        output = (reshaped_logits,) + outputs[2:]
        return output if loss is None else (loss,) + output
# Class-level docstring: ErnieM with a token-classification head, e.g. for Named-Entity-Recognition (NER).
@add_start_docstrings(
    """ErnieM Model with a token classification head on top (a linear layer on top of
    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.""",
    ERNIE_M_START_DOCSTRING,
)
class ErnieMForTokenClassification(ErnieMPreTrainedModel):
    # Mirrors transformers.models.bert.modeling_bert.BertForTokenClassification.__init__ (Bert -> ErnieM, bert -> ernie_m).
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        # Backbone without the pooling layer; this head only reads the per-token output.
        self.ernie_m = ErnieMModel(config, add_pooling_layer=False)

        # Head dropout: use the dedicated classifier rate when set, otherwise the hidden-layer rate.
        dropout_rate = config.classifier_dropout
        if dropout_rate is None:
            dropout_rate = config.hidden_dropout_prob
        self.dropout = nn.Dropout(dropout_rate)

        # Per-token projection from the hidden size to the number of labels.
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing.
        self.post_init()

    @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        processor_class=_TOKENIZER_FOR_DOC,
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.Tensor]] = None,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = True,
        labels: Optional[torch.Tensor] = None,
    ) -> Union[Tuple[torch.FloatTensor], TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        # An explicit None defers to the model configuration.
        if return_dict is None:
            return_dict = self.config.use_return_dict

        outputs = self.ernie_m(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # First backbone output -> dropout -> per-token label scores.
        sequence_output = self.dropout(outputs[0])
        logits = self.classifier(sequence_output)

        # Cross-entropy over all tokens (logits and labels both flattened), only when labels are supplied.
        loss = None
        if labels is not None:
            flat_logits = logits.view(-1, self.num_labels)
            flat_labels = labels.view(-1)
            loss = CrossEntropyLoss()(flat_logits, flat_labels)

        if return_dict:
            return TokenClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        # Tuple form: optional loss, the logits, then everything after the backbone's first two outputs.
        output = (logits,) + outputs[2:]
        return output if loss is None else (loss,) + output
# Class-level docstring: ErnieM with a span-classification head for extractive question answering (e.g. SQuAD).
@add_start_docstrings(
    """ErnieM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).""",
    ERNIE_M_START_DOCSTRING,
)
class ErnieMForQuestionAnswering(ErnieMPreTrainedModel):
    # Mirrors transformers.models.bert.modeling_bert.BertForQuestionAnswering.__init__ (Bert -> ErnieM, bert -> ernie_m).
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        # Backbone without the pooling layer; this head only reads the per-token output.
        self.ernie_m = ErnieMModel(config, add_pooling_layer=False)
        # Per-token projection; forward() unpacks its last dimension into exactly two parts (start, end).
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing.
        self.post_init()

    @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        processor_class=_TOKENIZER_FOR_DOC,
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=QuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        start_positions: Optional[torch.Tensor] = None,
        end_positions: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple[torch.FloatTensor], QuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        # An explicit None defers to the model configuration.
        if return_dict is None:
            return_dict = self.config.use_return_dict

        outputs = self.ernie_m(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Project every token, then peel the last dimension apart into start and end scores.
        span_logits = self.qa_outputs(outputs[0])
        start_logits, end_logits = span_logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        if start_positions is None or end_positions is None:
            # The loss needs both targets.
            total_loss = None
        else:
            # Targets may arrive with an extra trailing dimension (e.g. after multi-GPU gathering); drop it.
            if start_positions.dim() > 1:
                start_positions = start_positions.squeeze(-1)
            if end_positions.dim() > 1:
                end_positions = end_positions.squeeze(-1)

            # Positions outside the sequence are clamped; those landing on `ignored_index` are skipped by the loss.
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if return_dict:
            return QuestionAnsweringModelOutput(
                loss=total_loss,
                start_logits=start_logits,
                end_logits=end_logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        # Tuple form: optional loss, both logit tensors, then everything after the backbone's first two outputs.
        output = (start_logits, end_logits) + outputs[2:]
        return output if total_loss is None else (total_loss,) + output
# Class-level docstring: ErnieM with two linear heads (start / end) for Universal Information Extraction.
@add_start_docstrings(
    """ErnieMForInformationExtraction is a Ernie-M Model with two linear layer on top of the hidden-states output to
    compute `start_prob` and `end_prob`, designed for Universal Information Extraction.""",
    ERNIE_M_START_DOCSTRING,
)
class ErnieMForInformationExtraction(ErnieMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        # Backbone encoder.
        self.ernie_m = ErnieMModel(config)
        # Two independent per-token heads, each emitting one score per token.
        self.linear_start = nn.Linear(config.hidden_size, 1)
        self.linear_end = nn.Linear(config.hidden_size, 1)
        # Not applied in forward() (which returns raw logits); kept so the module's attributes stay unchanged.
        self.sigmoid = nn.Sigmoid()
        # Initialize weights and apply final processing.
        self.post_init()

    @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        start_positions: Optional[torch.Tensor] = None,
        end_positions: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple[torch.FloatTensor], QuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for position (index) for computing the start_positions loss. Position outside of the sequence are
            not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for position (index) for computing the end_positions loss. Position outside of the sequence are not
            taken into account for computing the loss.
        """
        # An explicit None defers to the model configuration, as in the other ErnieM task heads. Without this,
        # None was forwarded to the backbone but treated as "tuple output" below.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        result = self.ernie_m(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # Index 0 is the last hidden state for both the tuple and the ModelOutput form of `result`.
        sequence_output = result[0]

        # One score per token from each head; drop the trailing size-1 dimension.
        start_logits = self.linear_start(sequence_output).squeeze(-1)
        end_logits = self.linear_end(sequence_output).squeeze(-1)

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # Drop a trailing size-1 dimension on the targets, if present.
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # Clamp target values into [0, sequence_length].
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            # NOTE(review): BCEWithLogitsLoss expects targets with the same shape as the logits, and
            # presumably floating-point per-token 0/1 labels — confirm what callers actually pass.
            loss_fct = BCEWithLogitsLoss()
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            # `result` is a plain tuple here, so it has no `.hidden_states` / `.attentions` attributes.
            # Forward whatever follows the backbone's first two outputs, as the other task heads do.
            output = (start_logits, end_logits) + result[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=result.hidden_states,
            attentions=result.attentions,
        )

posted @ 2024-06-30 15:37 绝不原创的飞龙阅读(11) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

龙哥盟

掠夺·扩张·投机·博弈

Transformers-源码解析-四十五-

Transformers 源码解析（四十五）

`.\models\encodec\feature_extraction_encodec.py`

`.\models\encodec\modeling_encodec.py`

`.\models\encodec\__init__.py`

`.\models\encoder_decoder\configuration_encoder_decoder.py`

`.\models\encoder_decoder\modeling_encoder_decoder.py`

`.\models\encoder_decoder\modeling_flax_encoder_decoder.py`

`.\models\encoder_decoder\modeling_tf_encoder_decoder.py`

定义一个函数，用于将输入的token_ids向右移动，模拟decoder端的输入

TFEncoderDecoderModel类，继承自TFPreTrainedModel和TFCausalLanguageModelingLoss

`.\models\ernie\configuration_ernie.py`

`.\models\ernie\modeling_ernie.py`

`.\models\ernie\__init__.py`

`.\models\ernie_m\configuration_ernie_m.py`

`.\models\ernie_m\modeling_ernie_m.py`

公告

龙哥盟

掠夺·扩张·投机·博弈

Transformers-源码解析-四十五-

Transformers 源码解析（四十五）

.\models\encodec\feature_extraction_encodec.py

.\models\encodec\modeling_encodec.py

.\models\encodec\__init__.py

.\models\encoder_decoder\configuration_encoder_decoder.py

.\models\encoder_decoder\modeling_encoder_decoder.py

.\models\encoder_decoder\modeling_flax_encoder_decoder.py

.\models\encoder_decoder\modeling_tf_encoder_decoder.py

定义一个函数，用于将输入的token_ids向右移动，模拟decoder端的输入

TFEncoderDecoderModel类，继承自TFPreTrainedModel和TFCausalLanguageModelingLoss

.\models\ernie\configuration_ernie.py

.\models\ernie\modeling_ernie.py

.\models\ernie\__init__.py

.\models\ernie_m\configuration_ernie_m.py

.\models\ernie_m\modeling_ernie_m.py

公告

`.\models\encodec\feature_extraction_encodec.py`

`.\models\encodec\modeling_encodec.py`

`.\models\encodec\__init__.py`

`.\models\encoder_decoder\configuration_encoder_decoder.py`

`.\models\encoder_decoder\modeling_encoder_decoder.py`

`.\models\encoder_decoder\modeling_flax_encoder_decoder.py`

`.\models\encoder_decoder\modeling_tf_encoder_decoder.py`

`.\models\ernie\configuration_ernie.py`

`.\models\ernie\modeling_ernie.py`

`.\models\ernie\__init__.py`

`.\models\ernie_m\configuration_ernie_m.py`

`.\models\ernie_m\modeling_ernie_m.py`