Transformers 源码解析(一百二十七)

.\models\xlm_roberta\modeling_tf_xlm_roberta.py

# 编码声明,指定文件编码为UTF-8
# 版权声明,版权归Facebook AI Research和HuggingFace Inc.团队所有
# 版权声明,版权归NVIDIA CORPORATION所有,保留所有权利
#
# 根据Apache许可证2.0版授权,除非符合许可证要求,否则不得使用此文件
# 您可以在以下链接获取许可证的副本:http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意,否则依据“原样”分发本软件,
# 不提供任何形式的明示或暗示担保或条件。
# 请查阅许可证了解详细信息
""" TF 2.0 XLM-RoBERTa 模型。"""

from __future__ import annotations  # 用于支持后续版本的类型注释

import math  # 导入数学模块
import warnings  # 导入警告模块
from typing import Optional, Tuple, Union  # 导入类型注释相关模块

import numpy as np  # 导入NumPy库
import tensorflow as tf  # 导入TensorFlow库

from ...activations_tf import get_tf_activation  # 导入自定义TensorFlow激活函数
from ...modeling_tf_outputs import (  # 导入TensorFlow模型输出相关模块
    TFBaseModelOutputWithPastAndCrossAttentions,
    TFBaseModelOutputWithPoolingAndCrossAttentions,
    TFCausalLMOutputWithCrossAttentions,
    TFMaskedLMOutput,
    TFMultipleChoiceModelOutput,
    TFQuestionAnsweringModelOutput,
    TFSequenceClassifierOutput,
    TFTokenClassifierOutput,
)
from ...modeling_tf_utils import (  # 导入TensorFlow模型工具函数
    TFCausalLanguageModelingLoss,
    TFMaskedLanguageModelingLoss,
    TFModelInputType,
    TFMultipleChoiceLoss,
    TFPreTrainedModel,
    TFQuestionAnsweringLoss,
    TFSequenceClassificationLoss,
    TFTokenClassificationLoss,
    get_initializer,
    keras,
    keras_serializable,
    unpack_inputs,
)
from ...tf_utils import (  # 导入TensorFlow工具函数
    check_embeddings_within_bounds,
    shape_list,
    stable_softmax,
)
from ...utils import (  # 导入通用工具函数
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
)
from .configuration_xlm_roberta import XLMRobertaConfig  # 导入XLM-RoBERTa配置

logger = logging.get_logger(__name__)  # 获取模块专用的日志记录器

_CHECKPOINT_FOR_DOC = "FacebookAI/xlm-roberta-base"  # 预训练模型的检查点名称,用于文档
_CONFIG_FOR_DOC = "XLMRobertaConfig"  # XLM-RoBERTa配置的名称,用于文档

# XLM-RoBERTa预训练模型的存档列表
TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "FacebookAI/xlm-roberta-base",
    "FacebookAI/xlm-roberta-large",
    "joeddav/xlm-roberta-large-xnli",
    "cardiffnlp/twitter-xlm-roberta-base-sentiment",
    # 更多的模型存档可以在https://huggingface.co/models?filter=xlm-roberta查看
]

# XLM-RoBERTa模型文档的起始描述
XLM_ROBERTA_START_DOCSTRING = r"""

    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.

    <Tip>

    TensorFlow models and layers in `transformers` accept two formats as input:
"""
    # having all inputs as keyword arguments (like PyTorch models), or
    # having all inputs as a list, tuple or dict in the first positional argument.
    
    # The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
    # and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
    # pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
    # format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
    # the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
    # positional argument:
    
    # a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
    # a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
    # `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
    # a dictionary with one or several input Tensors associated to the input names given in the docstring:
    # `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
    
    # Note that when creating models and layers with
    # [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
    # about any of this, as you can just pass inputs like you would to any other Python function!
"""

XLM_ROBERTA_INPUTS_DOCSTRING = r"""
"""


# 从 transformers.models.roberta.modeling_tf_roberta.TFRobertaEmbeddings 复制并修改为 XLMRobertaEmbeddings
class TFXLMRobertaEmbeddings(keras.layers.Layer):
    """
    和 BertEmbeddings 相同,但稍作调整以适应位置嵌入的索引。
    """

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)

        # 设定填充符索引为1
        self.padding_idx = 1
        self.config = config
        self.hidden_size = config.hidden_size
        self.max_position_embeddings = config.max_position_embeddings
        self.initializer_range = config.initializer_range
        # 使用配置的 epsilon 值创建 LayerNormalization 层
        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        # 使用配置的 dropout 概率创建 Dropout 层
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)

    def build(self, input_shape=None):
        with tf.name_scope("word_embeddings"):
            # 创建词嵌入权重矩阵,形状为 [vocab_size, hidden_size]
            self.weight = self.add_weight(
                name="weight",
                shape=[self.config.vocab_size, self.hidden_size],
                initializer=get_initializer(self.initializer_range),
            )

        with tf.name_scope("token_type_embeddings"):
            # 创建类型嵌入权重矩阵,形状为 [type_vocab_size, hidden_size]
            self.token_type_embeddings = self.add_weight(
                name="embeddings",
                shape=[self.config.type_vocab_size, self.hidden_size],
                initializer=get_initializer(self.initializer_range),
            )

        with tf.name_scope("position_embeddings"):
            # 创建位置嵌入权重矩阵,形状为 [max_position_embeddings, hidden_size]
            self.position_embeddings = self.add_weight(
                name="embeddings",
                shape=[self.max_position_embeddings, self.hidden_size],
                initializer=get_initializer(self.initializer_range),
            )

        if self.built:
            return
        self.built = True
        if getattr(self, "LayerNorm", None) is not None:
            with tf.name_scope(self.LayerNorm.name):
                # 构建 LayerNormalization 层,输入形状为 [None, None, hidden_size]
                self.LayerNorm.build([None, None, self.config.hidden_size])

    def create_position_ids_from_input_ids(self, input_ids, past_key_values_length=0):
        """
        将非填充符号替换为它们的位置编号。位置编号从 padding_idx+1 开始,忽略填充符号。
        这是从 fairseq 的 `utils.make_positions` 修改而来。

        Args:
            input_ids: tf.Tensor
        Returns: tf.Tensor
        """
        # 创建一个掩码,将非填充符号转换为 1,其余为 0
        mask = tf.cast(tf.math.not_equal(input_ids, self.padding_idx), dtype=input_ids.dtype)
        # 计算累积位置索引,跳过填充符号
        incremental_indices = (tf.math.cumsum(mask, axis=1) + past_key_values_length) * mask

        return incremental_indices + self.padding_idx

    def call(
        self,
        input_ids=None,
        position_ids=None,
        token_type_ids=None,
        inputs_embeds=None,
        past_key_values_length=0,
        training=False,
    ):
        """
        Applies embedding based on inputs tensor.

        Returns:
            final_embeddings (`tf.Tensor`): output embedding tensor.
        """
        assert not (input_ids is None and inputs_embeds is None)  # 断言确保 input_ids 和 inputs_embeds 至少有一个不为空

        if input_ids is not None:
            check_embeddings_within_bounds(input_ids, self.config.vocab_size)  # 检查 input_ids 是否在有效范围内
            inputs_embeds = tf.gather(params=self.weight, indices=input_ids)  # 根据 input_ids 从权重矩阵中获取对应的嵌入向量

        input_shape = shape_list(inputs_embeds)[:-1]  # 获取输入嵌入张量的形状,去掉最后一个维度(通常是嵌入维度)

        if token_type_ids is None:
            token_type_ids = tf.fill(dims=input_shape, value=0)  # 如果未提供 token_type_ids,则创建一个全零的张量,形状与输入嵌入张量一致

        if position_ids is None:
            if input_ids is not None:
                # 如果存在 input_ids,则基于它创建 position_ids,确保任何填充的标记仍然是填充的
                position_ids = self.create_position_ids_from_input_ids(
                    input_ids=input_ids, past_key_values_length=past_key_values_length
                )
            else:
                # 否则,创建默认的 position_ids,从 self.padding_idx 开始,长度为 input_shape[-1]
                position_ids = tf.expand_dims(
                    tf.range(start=self.padding_idx + 1, limit=input_shape[-1] + self.padding_idx + 1), axis=0
                )

        position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)  # 根据 position_ids 从位置嵌入矩阵中获取位置嵌入向量
        token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)  # 根据 token_type_ids 从 token type 嵌入矩阵中获取 token type 嵌入向量
        final_embeddings = inputs_embeds + position_embeds + token_type_embeds  # 将输入嵌入、位置嵌入和 token type 嵌入相加,形成最终的嵌入向量
        final_embeddings = self.LayerNorm(inputs=final_embeddings)  # 对最终的嵌入向量进行 Layer Normalization 处理
        final_embeddings = self.dropout(inputs=final_embeddings, training=training)  # 应用 dropout 处理最终的嵌入向量,用于训练中的随机失活

        return final_embeddings  # 返回处理后的最终嵌入向量作为输出
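
# A standalone sketch (not library code) of how create_position_ids_from_input_ids
# behaves for padding_idx = 1 and past_key_values_length = 0: real tokens are
# numbered from padding_idx + 1 onwards, padding positions stay at padding_idx.
import tensorflow as tf

padding_idx = 1
input_ids = tf.constant([[0, 5, 8, 1, 1]])  # token id 1 is the padding symbol

mask = tf.cast(tf.math.not_equal(input_ids, padding_idx), dtype=input_ids.dtype)
incremental_indices = tf.math.cumsum(mask, axis=1) * mask
position_ids = incremental_indices + padding_idx

print(position_ids.numpy())  # [[2 3 4 1 1]]
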
# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->XLMRoberta
class TFXLMRobertaPooler(keras.layers.Layer):
    def __init__(self, config: XLMRobertaConfig, **kwargs):
        super().__init__(**kwargs)

        # 初始化一个全连接层,用于池化隐藏状态
        self.dense = keras.layers.Dense(
            units=config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            activation="tanh",
            name="dense",
        )
        self.config = config

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        # 池化模型的输出,通过取第一个标记对应的隐藏状态
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(inputs=first_token_tensor)

        return pooled_output

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "dense", None) is not None:
            # 构建密集层
            with tf.name_scope(self.dense.name):
                self.dense.build([None, None, self.config.hidden_size])


# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->XLMRoberta
class TFXLMRobertaSelfAttention(keras.layers.Layer):
    def __init__(self, config: XLMRobertaConfig, **kwargs):
        super().__init__(**kwargs)

        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number "
                f"of attention heads ({config.num_attention_heads})"
            )

        # 定义自注意力层,确保隐藏大小是注意力头数的倍数
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.sqrt_att_head_size = math.sqrt(self.attention_head_size)

        # 定义查询、键、值的全连接层
        self.query = keras.layers.Dense(
            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
        )
        self.key = keras.layers.Dense(
            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
        )
        self.value = keras.layers.Dense(
            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
        )
        # 定义dropout层,用于注意力概率的随机丢弃
        self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)

        self.is_decoder = config.is_decoder
        self.config = config
    def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
        # 将输入的张量重塑为 [batch_size, seq_length, num_attention_heads, attention_head_size] 的形状
        tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))

        # 将张量转置为 [batch_size, num_attention_heads, seq_length, attention_head_size]
        return tf.transpose(tensor, perm=[0, 2, 1, 3])

    def build(self, input_shape=None):
        # 检查是否已经构建过网络层,如果是则直接返回
        if self.built:
            return
        # 标记网络层已构建
        self.built = True
        # 如果存在查询(query)全连接层,则按指定形状构建
        if getattr(self, "query", None) is not None:
            with tf.name_scope(self.query.name):
                self.query.build([None, None, self.config.hidden_size])
        # 如果存在键(key)全连接层,则按指定形状构建
        if getattr(self, "key", None) is not None:
            with tf.name_scope(self.key.name):
                self.key.build([None, None, self.config.hidden_size])
        # 如果存在值(value)全连接层,则按指定形状构建
        if getattr(self, "value", None) is not None:
            with tf.name_scope(self.value.name):
                self.value.build([None, None, self.config.hidden_size])
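
# A standalone sketch (made-up sizes) of the reshape + transpose performed by
# transpose_for_scores above: [batch, seq_len, all_head_size] is split into heads
# and rearranged to [batch, num_heads, seq_len, head_size] so attention scores
# can be computed per head with a single batched matmul.
import tensorflow as tf

batch_size, seq_len = 2, 7
num_attention_heads, attention_head_size = 12, 64
all_head_size = num_attention_heads * attention_head_size  # 768

tensor = tf.random.normal((batch_size, seq_len, all_head_size))
tensor = tf.reshape(tensor, shape=(batch_size, -1, num_attention_heads, attention_head_size))
tensor = tf.transpose(tensor, perm=[0, 2, 1, 3])
print(tensor.shape)  # (2, 12, 7, 64)
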
# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->XLMRoberta
class TFXLMRobertaSelfOutput(keras.layers.Layer):
    def __init__(self, config: XLMRobertaConfig, **kwargs):
        super().__init__(**kwargs)

        # 初始化一个全连接层,用于变换隐藏状态的维度
        self.dense = keras.layers.Dense(
            units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
        )
        # 初始化 LayerNormalization 层,用于归一化隐藏状态
        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        # 初始化 Dropout 层,用于在训练时随机失活部分神经元,防止过拟合
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
        self.config = config

    def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
        # 通过全连接层变换隐藏状态的维度
        hidden_states = self.dense(inputs=hidden_states)
        # 在训练时应用 Dropout 操作,随机失活部分神经元
        hidden_states = self.dropout(inputs=hidden_states, training=training)
        # 对变换后的隐藏状态进行 LayerNormalization,加上输入张量,构成残差连接
        hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)

        return hidden_states

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        # 如果已经构建,直接返回;否则按照指定的维度构建全连接层和 LayerNormalization 层
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                self.dense.build([None, None, self.config.hidden_size])
        if getattr(self, "LayerNorm", None) is not None:
            with tf.name_scope(self.LayerNorm.name):
                self.LayerNorm.build([None, None, self.config.hidden_size])


# Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->XLMRoberta
class TFXLMRobertaAttention(keras.layers.Layer):
    def __init__(self, config: XLMRobertaConfig, **kwargs):
        super().__init__(**kwargs)

        # 初始化 XLMRoberta 的自注意力层和输出层
        self.self_attention = TFXLMRobertaSelfAttention(config, name="self")
        self.dense_output = TFXLMRobertaSelfOutput(config, name="output")

    def prune_heads(self, heads):
        raise NotImplementedError

    def call(
        self,
        input_tensor: tf.Tensor,
        attention_mask: tf.Tensor,
        head_mask: tf.Tensor,
        encoder_hidden_states: tf.Tensor,
        encoder_attention_mask: tf.Tensor,
        past_key_value: Tuple[tf.Tensor],
        output_attentions: bool,
        training: bool = False,
    ) -> Tuple[tf.Tensor]:
        # 使用自注意力层处理输入张量,得到自注意力层的输出
        self_outputs = self.self_attention(
            hidden_states=input_tensor,
            attention_mask=attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            training=training,
        )
        # 将自注意力层的输出传入输出层,得到注意力输出
        attention_output = self.dense_output(
            hidden_states=self_outputs[0], input_tensor=input_tensor, training=training
        )
        # 如果需要输出注意力信息,将其添加到输出元组中
        outputs = (attention_output,) + self_outputs[1:]

        return outputs
    # 定义 build 方法,用于构建子层(input_shape 默认为 None)
    def build(self, input_shape=None):
        # 如果模型已经建立过,则直接返回,不再重复建立
        if self.built:
            return
        # 设置模型已经建立的标志为True
        self.built = True
        # 如果存在self_attention属性,则构建self_attention模块
        if getattr(self, "self_attention", None) is not None:
            # 在TensorFlow的命名作用域下,建立self_attention模块
            with tf.name_scope(self.self_attention.name):
                # 调用self_attention的build方法,传入None作为输入形状
                self.self_attention.build(None)
        # 如果存在dense_output属性,则构建dense_output模块
        if getattr(self, "dense_output", None) is not None:
            # 在TensorFlow的命名作用域下,建立dense_output模块
            with tf.name_scope(self.dense_output.name):
                # 调用dense_output的build方法,传入None作为输入形状
                self.dense_output.build(None)
# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->XLMRoberta
class TFXLMRobertaIntermediate(keras.layers.Layer):
    def __init__(self, config: XLMRobertaConfig, **kwargs):
        super().__init__(**kwargs)

        # 定义一个全连接层,用于中间层的转换,输出单元数为config.intermediate_size
        self.dense = keras.layers.Dense(
            units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
        )

        # 根据配置选择中间激活函数,若配置为字符串则使用相应的 TensorFlow 激活函数,否则直接使用给定的激活函数
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = get_tf_activation(config.hidden_act)
        else:
            self.intermediate_act_fn = config.hidden_act
        self.config = config

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        # 将输入的隐藏状态通过全连接层进行线性转换
        hidden_states = self.dense(inputs=hidden_states)
        # 应用中间激活函数对转换后的隐藏状态进行非线性变换
        hidden_states = self.intermediate_act_fn(hidden_states)

        return hidden_states

    def build(self, input_shape=None):
        # 如果已经构建过,直接返回
        if self.built:
            return
        self.built = True
        # 如果定义了全连接层dense,则按照给定的形状构建该层
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                self.dense.build([None, None, self.config.hidden_size])


# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->XLMRoberta
class TFXLMRobertaOutput(keras.layers.Layer):
    def __init__(self, config: XLMRobertaConfig, **kwargs):
        super().__init__(**kwargs)

        # 定义一个全连接层,用于输出层的转换,输出单元数为config.hidden_size
        self.dense = keras.layers.Dense(
            units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
        )
        # LayerNormalization层,用于归一化输出
        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
        # Dropout层,用于随机失活以防止过拟合
        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
        self.config = config

    def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
        # 将输入的隐藏状态通过全连接层进行线性转换
        hidden_states = self.dense(inputs=hidden_states)
        # 应用Dropout层进行随机失活
        hidden_states = self.dropout(inputs=hidden_states, training=training)
        # 将输出与输入张量进行残差连接,并通过LayerNormalization层进行归一化
        hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)

        return hidden_states

    def build(self, input_shape=None):
        # 如果已经构建过,直接返回
        if self.built:
            return
        self.built = True
        # 如果定义了全连接层dense,则按照给定的形状构建该层
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                self.dense.build([None, None, self.config.intermediate_size])
        # 如果定义了LayerNorm层,则按照给定的形状构建该层
        if getattr(self, "LayerNorm", None) is not None:
            with tf.name_scope(self.LayerNorm.name):
                self.LayerNorm.build([None, None, self.config.hidden_size])


# Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->XLMRoberta
class TFXLMRobertaLayer(keras.layers.Layer):
    # 初始化函数,用于初始化一个 TFXLMRobertaLayer 层
    def __init__(self, config: XLMRobertaConfig, **kwargs):
        # 调用父类的初始化函数
        super().__init__(**kwargs)

        # 创建自注意力层对象,使用给定的配置和名称
        self.attention = TFXLMRobertaAttention(config, name="attention")
        
        # 设置是否作为解码器模型的标志
        self.is_decoder = config.is_decoder
        
        # 设置是否添加交叉注意力的标志,并进行相应的检查
        self.add_cross_attention = config.add_cross_attention
        if self.add_cross_attention:
            if not self.is_decoder:
                # 如果未设置为解码器模型但添加了交叉注意力,则抛出异常
                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
            
            # 创建交叉注意力层对象,使用给定的配置和名称
            self.crossattention = TFXLMRobertaAttention(config, name="crossattention")
        
        # 创建中间层对象,使用给定的配置和名称
        self.intermediate = TFXLMRobertaIntermediate(config, name="intermediate")
        
        # 创建BERT输出层对象,使用给定的配置和名称
        self.bert_output = TFXLMRobertaOutput(config, name="output")
    
    # 调用函数,用于执行模型的前向传播
    def call(
        self,
        hidden_states: tf.Tensor,  # 输入的隐藏状态张量
        attention_mask: tf.Tensor,  # 注意力掩码张量
        head_mask: tf.Tensor,  # 头部掩码张量
        encoder_hidden_states: tf.Tensor | None,  # 编码器隐藏状态张量或None
        encoder_attention_mask: tf.Tensor | None,  # 编码器注意力掩码张量或None
        past_key_value: Tuple[tf.Tensor] | None,  # 过去键值元组或None
        output_attentions: bool,  # 是否输出注意力权重
        training: bool = False,  # 是否在训练模式下
    ) -> Tuple[tf.Tensor]:
        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        # Perform self-attention on the input hidden states
        self_attention_outputs = self.attention(
            input_tensor=hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=None,
            encoder_attention_mask=None,
            past_key_value=self_attn_past_key_value,
            output_attentions=output_attentions,
            training=training,
        )
        attention_output = self_attention_outputs[0]

        # if decoder, the last output is tuple of self-attn cache
        if self.is_decoder:
            outputs = self_attention_outputs[1:-1]  # slice to exclude self-attention cache tuple
            present_key_value = self_attention_outputs[-1]  # last element is present key/value
        else:
            outputs = self_attention_outputs[1:]  # include self attentions if outputting attention weights

        cross_attn_present_key_value = None
        if self.is_decoder and encoder_hidden_states is not None:
            if not hasattr(self, "crossattention"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
                    " by setting `config.add_cross_attention=True`"
                )

            # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
            # Perform cross-attention using the self-attention output and encoder hidden states
            cross_attention_outputs = self.crossattention(
                input_tensor=attention_output,
                attention_mask=attention_mask,
                head_mask=head_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                past_key_value=cross_attn_past_key_value,
                output_attentions=output_attentions,
                training=training,
            )
            attention_output = cross_attention_outputs[0]
            # append cross-attention outputs to existing outputs
            outputs = outputs + cross_attention_outputs[1:-1]

            # add cross-attn cache to positions 3,4 of present_key_value tuple
            cross_attn_present_key_value = cross_attention_outputs[-1]
            present_key_value = present_key_value + cross_attn_present_key_value

        # Compute intermediate output using the attention output
        intermediate_output = self.intermediate(hidden_states=attention_output)
        # Compute final layer output using intermediate output and attention output
        layer_output = self.bert_output(
            hidden_states=intermediate_output, input_tensor=attention_output, training=training
        )
        outputs = (layer_output,) + outputs  # append attentions if outputting them

        # if decoder, return the attn key/values as the last output
        if self.is_decoder:
            outputs = outputs + (present_key_value,)

        return outputs
    # 构建方法用于构建模型层,接受输入形状作为参数,如果已经构建过则直接返回
    def build(self, input_shape=None):
        if self.built:
            return
        # 将标记置为已构建
        self.built = True
        
        # 如果存在注意力层,则按名称作用域构建并调用其 build 方法
        if getattr(self, "attention", None) is not None:
            with tf.name_scope(self.attention.name):
                self.attention.build(None)
        
        # 如果存在中间层,则按名称作用域构建并调用其 build 方法
        if getattr(self, "intermediate", None) is not None:
            with tf.name_scope(self.intermediate.name):
                self.intermediate.build(None)
        
        # 如果存在 BERT 输出层,则按名称作用域构建并调用其 build 方法
        if getattr(self, "bert_output", None) is not None:
            with tf.name_scope(self.bert_output.name):
                self.bert_output.build(None)
        
        # 如果存在交叉注意力层,则按名称作用域构建并调用其 build 方法
        if getattr(self, "crossattention", None) is not None:
            with tf.name_scope(self.crossattention.name):
                self.crossattention.build(None)
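
# A standalone sketch (zero tensors, made-up shapes) of the 4-tuple cache layout
# assumed by the slicing in TFXLMRobertaLayer.call above: entries 0-1 hold the
# self-attention key/value states, entries 2-3 hold the cross-attention ones.
import tensorflow as tf

self_k = tf.zeros((1, 12, 5, 64))   # (batch, num_heads, past_decoder_len, head_size)
self_v = tf.zeros((1, 12, 5, 64))
cross_k = tf.zeros((1, 12, 9, 64))  # cross-attention caches cover the encoder sequence
cross_v = tf.zeros((1, 12, 9, 64))
past_key_value = (self_k, self_v, cross_k, cross_v)

self_attn_past_key_value = past_key_value[:2]    # handed to the self-attention block
cross_attn_past_key_value = past_key_value[-2:]  # handed to the cross-attention block
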
# 从 transformers.models.bert.modeling_tf_bert.TFBertEncoder 复制并修改为 XLMRobertaEncoder 类
class TFXLMRobertaEncoder(keras.layers.Layer):
    def __init__(self, config: XLMRobertaConfig, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        # 初始化层列表,每一层使用 TFXLMRobertaLayer 类,命名为 layer_._{i}
        self.layer = [TFXLMRobertaLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]

    def call(
        self,
        hidden_states: tf.Tensor,                   # 输入的隐藏状态张量
        attention_mask: tf.Tensor,                  # 注意力掩码张量
        head_mask: tf.Tensor,                       # 头部掩码张量
        encoder_hidden_states: tf.Tensor | None,    # 编码器的隐藏状态张量或 None
        encoder_attention_mask: tf.Tensor | None,   # 编码器的注意力掩码张量或 None
        past_key_values: Tuple[Tuple[tf.Tensor]] | None,  # 过去的键值对或 None
        use_cache: Optional[bool],                  # 是否使用缓存的可选布尔值
        output_attentions: bool,                    # 是否输出注意力张量的布尔值
        output_hidden_states: bool,                 # 是否输出隐藏状态的布尔值
        return_dict: bool,                          # 是否返回字典格式的布尔值
        training: bool = False,                     # 是否在训练模式下的布尔值,默认为 False
    ) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]:
        # 如果需要输出隐藏状态,则初始化 all_hidden_states 为空元组,否则为 None
        all_hidden_states = () if output_hidden_states else None
        # 如果需要输出注意力,则初始化 all_attentions 为空元组,否则为 None
        all_attentions = () if output_attentions else None
        # 如果需要输出交叉注意力且配置允许,则初始化 all_cross_attentions 为空元组,否则为 None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

        # 如果需要使用缓存,则初始化 next_decoder_cache 为空元组,否则为 None
        next_decoder_cache = () if use_cache else None
        # 遍历每一层进行处理
        for i, layer_module in enumerate(self.layer):
            # 如果需要输出隐藏状态,则将当前隐藏状态添加到 all_hidden_states 中
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # 如果存在过去的键值对,则获取当前层的过去键值对,否则为 None
            past_key_value = past_key_values[i] if past_key_values is not None else None

            # 调用当前层的处理函数,获取当前层的输出
            layer_outputs = layer_module(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                head_mask=head_mask[i],
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                training=training,
            )
            # 更新隐藏状态为当前层输出的第一个元素
            hidden_states = layer_outputs[0]

            # 如果需要使用缓存,则将当前层输出的最后一个元素添加到 next_decoder_cache 中
            if use_cache:
                next_decoder_cache += (layer_outputs[-1],)

            # 如果需要输出注意力,则将当前层输出的第二个元素添加到 all_attentions 中
            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)
                # 如果配置允许且存在编码器的隐藏状态,则将当前层输出的第三个元素添加到 all_cross_attentions 中
                if self.config.add_cross_attention and encoder_hidden_states is not None:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        # 如果需要输出隐藏状态,则将最后一层的隐藏状态添加到 all_hidden_states 中
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        # 如果不需要返回字典格式,则返回非空结果的元组
        if not return_dict:
            return tuple(
                v for v in [hidden_states, all_hidden_states, all_attentions, all_cross_attentions] if v is not None
            )

        # 返回 TFBaseModelOutputWithPastAndCrossAttentions 类型的字典格式结果
        return TFBaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
            cross_attentions=all_cross_attentions,
        )
    # 定义神经网络模型的构建方法,用于建立模型的层次结构
    def build(self, input_shape=None):
        # 如果模型已经构建过,则直接返回,避免重复构建
        if self.built:
            return
        # 将模型标记为已构建状态
        self.built = True
        # 检查是否存在层属性,如果存在,则遍历每一层并构建
        if getattr(self, "layer", None) is not None:
            for layer in self.layer:
                # 使用 TensorFlow 的命名作用域,为每一层设置命名空间
                with tf.name_scope(layer.name):
                    # 调用每一层的 build 方法,传入 None 作为输入形状,具体参数由输入数据形状决定
                    layer.build(None)
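
# A toy sketch (plain Python values instead of tensors) of the `return_dict=False`
# branch in TFXLMRobertaEncoder.call above: only the collections that were actually
# populated are kept, in a fixed order.
hidden_states = "last_hidden_state"      # stands in for the final tf.Tensor
all_hidden_states = ("h0", "h1", "h2")   # filled because output_hidden_states=True
all_attentions = None                    # output_attentions=False in this toy run
all_cross_attentions = None

outputs = tuple(
    v for v in [hidden_states, all_hidden_states, all_attentions, all_cross_attentions] if v is not None
)
print(outputs)  # ('last_hidden_state', ('h0', 'h1', 'h2'))
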
# 从 transformers.models.roberta.modeling_tf_roberta.TFRobertaMainLayer 复制而来,将 Roberta 替换为 XLMRoberta
# 使用 keras_serializable 装饰器,使此类可以在 Keras 中序列化
@keras_serializable
class TFXLMRobertaMainLayer(keras.layers.Layer):
    # 使用 XLMRobertaConfig 作为配置类
    config_class = XLMRobertaConfig

    def __init__(self, config, add_pooling_layer=True, **kwargs):
        super().__init__(**kwargs)

        # 保存配置对象
        self.config = config
        # 是否为解码器
        self.is_decoder = config.is_decoder

        # 从配置中获取一些参数
        self.num_hidden_layers = config.num_hidden_layers
        self.initializer_range = config.initializer_range
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
        self.return_dict = config.use_return_dict

        # 创建编码器(TFXLMRobertaEncoder),名称为 "encoder"
        self.encoder = TFXLMRobertaEncoder(config, name="encoder")

        # 如果指定了要添加池化层,则创建池化层(TFXLMRobertaPooler),名称为 "pooler"
        self.pooler = TFXLMRobertaPooler(config, name="pooler") if add_pooling_layer else None

        # 创建嵌入层(TFXLMRobertaEmbeddings),名称为 "embeddings"
        # embeddings 必须是最后声明的以保持权重顺序
        self.embeddings = TFXLMRobertaEmbeddings(config, name="embeddings")

    # 从 transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings 复制而来
    # 返回嵌入层对象
    def get_input_embeddings(self) -> keras.layers.Layer:
        return self.embeddings

    # 从 transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings 复制而来
    # 设置嵌入层的权重和词汇表大小
    def set_input_embeddings(self, value: tf.Variable):
        self.embeddings.weight = value
        self.embeddings.vocab_size = shape_list(value)[0]

    # 从 transformers.models.bert.modeling_tf_bert.TFBertMainLayer._prune_heads 复制而来
    # 用于剪枝模型中的注意力头部
    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        raise NotImplementedError

    @unpack_inputs
    # 从 transformers.models.bert.modeling_tf_bert.TFBertMainLayer.call 复制而来
    # 模型的前向传播函数,处理输入并返回输出
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
        encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
        past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: bool = False,
    # 定义神经网络模型的 build 方法,用于构建模型的网络结构
    def build(self, input_shape=None):
        # 如果模型已经构建过,则直接返回,避免重复构建
        if self.built:
            return
        # 标记模型已构建
        self.built = True
        
        # 如果模型中存在编码器(encoder)属性,则构建编码器
        if getattr(self, "encoder", None) is not None:
            # 在 TensorFlow 中,使用 name_scope 可以为模型的不同部分指定命名空间
            with tf.name_scope(self.encoder.name):
                # 调用编码器对象的 build 方法来构建编码器的网络结构
                self.encoder.build(None)
        
        # 如果模型中存在池化器(pooler)属性,则构建池化器
        if getattr(self, "pooler", None) is not None:
            with tf.name_scope(self.pooler.name):
                # 调用池化器对象的 build 方法来构建池化器的网络结构
                self.pooler.build(None)
        
        # 如果模型中存在嵌入层(embeddings)属性,则构建嵌入层
        if getattr(self, "embeddings", None) is not None:
            with tf.name_scope(self.embeddings.name):
                # 调用嵌入层对象的 build 方法来构建嵌入层的网络结构
                self.embeddings.build(None)
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaPreTrainedModel with Roberta->XLMRoberta
class TFXLMRobertaPreTrainedModel(TFPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # 指定使用的配置类为 XLMRobertaConfig
    config_class = XLMRobertaConfig
    # 基础模型的前缀名为 "roberta"
    base_model_prefix = "roberta"


@add_start_docstrings(
    "The bare XLM RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
    XLM_ROBERTA_START_DOCSTRING,
)
# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaModel with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA
class TFXLMRobertaModel(TFXLMRobertaPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        # 初始化 XLM-RoBERTa 主层,命名为 "roberta"
        self.roberta = TFXLMRobertaMainLayer(config, name="roberta")

    @unpack_inputs
    @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFBaseModelOutputWithPoolingAndCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
        encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
        past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = False,
    ) -> Union[Tuple, TFBaseModelOutputWithPoolingAndCrossAttentions]:
        r"""
        encoder_hidden_states  (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

        past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`)
            contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*, defaults to `True`):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`). Set to `False` during training, `True` during generation
        """
        # 调用 `self.roberta` 模型的前向传播方法,传入各种参数进行计算
        outputs = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # 返回 `self.roberta` 的计算结果
        return outputs

    def build(self, input_shape=None):
        # 如果已经构建过网络结构,则直接返回,避免重复构建
        if self.built:
            return
        # 将网络标记为已构建状态
        self.built = True
        # 如果 `self.roberta` 存在,则在命名作用域内构建 `self.roberta` 模型
        if getattr(self, "roberta", None) is not None:
            with tf.name_scope(self.roberta.name):
                # 调用 `self.roberta` 模型的 build 方法,传入 None 作为输入形状
                self.roberta.build(None)
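
# A usage sketch for the bare model defined above, using the public transformers
# API and the checkpoint named in _CHECKPOINT_FOR_DOC.
from transformers import AutoTokenizer, TFXLMRobertaModel

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
model = TFXLMRobertaModel.from_pretrained("FacebookAI/xlm-roberta-base")

inputs = tokenizer(["Hello world!", "Bonjour le monde !"], padding=True, return_tensors="tf")
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (2, seq_len, hidden_size)
print(outputs.pooler_output.shape)      # (2, hidden_size)
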
# 从transformers.models.roberta.modeling_tf_roberta.TFRobertaLMHead复制而来,将Roberta替换为XLMRoberta
class TFXLMRobertaLMHead(keras.layers.Layer):
    """XLMRoberta Head for masked language modeling."""

    def __init__(self, config, input_embeddings, **kwargs):
        super().__init__(**kwargs)

        self.config = config  # 设置配置参数
        self.hidden_size = config.hidden_size  # 获取隐藏层大小
        # 创建一个全连接层,用于将隐藏状态映射到与词汇表大小相同的向量空间
        self.dense = keras.layers.Dense(
            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
        )
        # 应用LayerNormalization来规范化隐藏状态
        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
        # 使用GELU激活函数
        self.act = get_tf_activation("gelu")

        # 输出权重与输入嵌入相同,但每个token有一个输出偏置
        self.decoder = input_embeddings  # 解码器等于输入嵌入

    def build(self, input_shape=None):
        # 创建一个形状为(config.vocab_size,)的偏置向量,初始化为0,用于输出层
        self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")

        if self.built:
            return
        self.built = True
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                # 构建全连接层
                self.dense.build([None, None, self.config.hidden_size])
        if getattr(self, "layer_norm", None) is not None:
            with tf.name_scope(self.layer_norm.name):
                # 构建LayerNormalization层
                self.layer_norm.build([None, None, self.config.hidden_size])

    def get_output_embeddings(self):
        return self.decoder  # 获取输出嵌入

    def set_output_embeddings(self, value):
        self.decoder.weight = value  # 设置输出嵌入的权重
        self.decoder.vocab_size = shape_list(value)[0]  # 设置输出嵌入的词汇表大小

    def get_bias(self):
        return {"bias": self.bias}  # 获取偏置向量

    def set_bias(self, value):
        self.bias = value["bias"]  # 设置偏置向量
        self.config.vocab_size = shape_list(value["bias"])[0]  # 更新配置中的词汇表大小

    def call(self, hidden_states):
        hidden_states = self.dense(hidden_states)  # 全连接层
        hidden_states = self.act(hidden_states)  # GELU激活函数
        hidden_states = self.layer_norm(hidden_states)  # LayerNormalization规范化

        # 使用偏置向量将隐藏状态投影回词汇表大小
        seq_length = shape_list(tensor=hidden_states)[1]  # 获取序列长度
        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])  # 重塑张量形状
        hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True)  # 矩阵乘法
        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])  # 重塑张量形状
        hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)  # 添加偏置向量

        return hidden_states  # 返回隐藏状态
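
# A standalone sketch (toy dimensions) of the tied projection in call() above:
# hidden states are mapped back to the vocabulary with the *input* embedding
# matrix (transpose_b=True) plus a per-token output bias.
import tensorflow as tf

hidden_size, vocab_size, seq_length = 8, 20, 5
hidden_states = tf.random.normal((1, seq_length, hidden_size))
embedding_weight = tf.random.normal((vocab_size, hidden_size))  # stands in for self.decoder.weight
bias = tf.zeros((vocab_size,))                                  # stands in for self.bias

flat = tf.reshape(hidden_states, shape=(-1, hidden_size))
logits = tf.matmul(flat, embedding_weight, transpose_b=True)
logits = tf.reshape(logits, shape=(-1, seq_length, vocab_size))
logits = tf.nn.bias_add(logits, bias)
print(logits.shape)  # (1, 5, 20)
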


@add_start_docstrings("""XLM RoBERTa Model with a `language modeling` head on top.""", XLM_ROBERTA_START_DOCSTRING)
# 从transformers.models.roberta.modeling_tf_roberta.TFRobertaForMaskedLM复制而来,将Roberta替换为XLMRoberta,ROBERTA替换为XLM_ROBERTA
class TFXLMRobertaForMaskedLM(TFXLMRobertaPreTrainedModel, TFMaskedLanguageModelingLoss):
    # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
    # 初始化时,指定一些不希望加载的键名列表
    _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head.decoder.weight"]

    # 初始化方法,继承自父类并传入配置信息及其他可变参数
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        # 使用TFXLMRobertaMainLayer类构建self.roberta对象,关闭添加池化层选项,命名为"roberta"
        self.roberta = TFXLMRobertaMainLayer(config, add_pooling_layer=False, name="roberta")
        # 使用TFXLMRobertaLMHead类构建self.lm_head对象,传入self.roberta.embeddings作为参数,命名为"lm_head"
        self.lm_head = TFXLMRobertaLMHead(config, self.roberta.embeddings, name="lm_head")

    # 获取self.lm_head对象的方法
    def get_lm_head(self):
        return self.lm_head

    # 获取前缀偏置名称的方法,已被标记为不推荐使用,发出FutureWarning警告
    def get_prefix_bias_name(self):
        warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
        # 返回self.name与self.lm_head.name拼接而成的字符串作为结果
        return self.name + "/" + self.lm_head.name

    # 调用装饰器unpack_inputs、add_start_docstrings_to_model_forward和add_code_sample_docstrings的call方法
    @unpack_inputs
    @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,  # 模型的检查点
        output_type=TFMaskedLMOutput,  # 输出类型为TFMaskedLMOutput
        config_class=_CONFIG_FOR_DOC,  # 配置类
        mask="<mask>",  # 掩码标识
        expected_output="' Paris'",  # 预期输出
        expected_loss=0.1,  # 预期损失
    )
    # 模型的前向传播方法,接受多个输入参数,包括输入ID、注意力掩码、标记类型ID等等
    def call(
        self,
        input_ids: TFModelInputType | None = None,  # 输入ID,可能为空
        attention_mask: np.ndarray | tf.Tensor | None = None,  # 注意力掩码,可能为NumPy数组或Tensor,也可能为空
        token_type_ids: np.ndarray | tf.Tensor | None = None,  # 标记类型ID,可能为NumPy数组或Tensor,也可能为空
        position_ids: np.ndarray | tf.Tensor | None = None,  # 位置ID,可能为NumPy数组或Tensor,也可能为空
        head_mask: np.ndarray | tf.Tensor | None = None,  # 头部掩码,可能为NumPy数组或Tensor,也可能为空
        inputs_embeds: np.ndarray | tf.Tensor | None = None,  # 输入嵌入,可能为NumPy数组或Tensor,也可能为空
        output_attentions: Optional[bool] = None,  # 是否输出注意力权重,默认为None
        output_hidden_states: Optional[bool] = None,  # 是否输出隐藏状态,默认为None
        return_dict: Optional[bool] = None,  # 是否返回字典,默认为None
        labels: np.ndarray | tf.Tensor | None = None,  # 标签,可能为NumPy数组或Tensor,也可能为空
        training: Optional[bool] = False,  # 是否为训练模式,默认为False
    ) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
        r"""
        定义方法的返回类型注解,可以返回 TFMaskedLMOutput 或包含 tf.Tensor 的元组
        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            用于计算掩码语言建模损失的标签。索引应在 `[-100, 0, ..., config.vocab_size]` 范围内(参见 `input_ids` 文档)。索引设置为 `-100` 的标记将被忽略(掩码),损失仅计算具有标签 `[0, ..., config.vocab_size]` 的标记。
        """
        # 使用 Roberta 模型处理输入
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # 获取序列输出
        sequence_output = outputs[0]
        # 通过语言建模头部生成预测分数
        prediction_scores = self.lm_head(sequence_output)

        # 如果没有标签,则损失为 None;否则计算预测分数和标签之间的损失
        loss = None if labels is None else self.hf_compute_loss(labels, prediction_scores)

        # 如果不返回字典,则按顺序输出损失和其他输出
        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        # 返回 TFMaskedLMOutput 对象,包括损失、预测分数、隐藏状态和注意力权重
        return TFMaskedLMOutput(
            loss=loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        # 如果已经构建过,则直接返回
        if self.built:
            return
        # 标记已经构建
        self.built = True
        # 如果存在 self.roberta,则在相应命名空间下构建
        if getattr(self, "roberta", None) is not None:
            with tf.name_scope(self.roberta.name):
                self.roberta.build(None)
        # 如果存在 self.lm_head,则在相应命名空间下构建
        if getattr(self, "lm_head", None) is not None:
            with tf.name_scope(self.lm_head.name):
                self.lm_head.build(None)
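
# A usage sketch for the masked-LM model above; the prompt mirrors the
# expected_output (" Paris") declared in the docstring decorators.
import tensorflow as tf
from transformers import AutoTokenizer, TFXLMRobertaForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
model = TFXLMRobertaForMaskedLM.from_pretrained("FacebookAI/xlm-roberta-base")

inputs = tokenizer("The capital of France is <mask>.", return_tensors="tf")
logits = model(**inputs).logits

mask_position = int(tf.where(inputs["input_ids"][0] == tokenizer.mask_token_id)[0, 0])
predicted_id = int(tf.argmax(logits[0, mask_position]))
print(tokenizer.decode([predicted_id]))  # expected to print "Paris"
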
@add_start_docstrings(
    "XLM-RoBERTa Model with a `language modeling` head on top for CLM fine-tuning.",
    XLM_ROBERTA_START_DOCSTRING,
)
# 从 transformers.models.roberta.modeling_tf_roberta.TFRobertaForCausalLM 复制而来,将所有的 "Roberta" 替换为 "XLMRoberta","ROBERTA" 替换为 "XLM_ROBERTA"
class TFXLMRobertaForCausalLM(TFXLMRobertaPreTrainedModel, TFCausalLanguageModelingLoss):
    # 名称中带有 '.' 的层,表示从 PT 模型加载 TF 模型时允许出现的意外/缺失层
    _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head.decoder.weight"]

    def __init__(self, config: XLMRobertaConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        if not config.is_decoder:
            logger.warning("If you want to use `TFXLMRobertaLMHeadModel` as a standalone, add `is_decoder=True.`")

        # 初始化 XLM-RoBERTa 主层,不添加池化层,命名为 "roberta"
        self.roberta = TFXLMRobertaMainLayer(config, add_pooling_layer=False, name="roberta")
        # 初始化 XLM-RoBERTa 语言建模头部,使用 roberta 的嵌入层作为输入嵌入,命名为 "lm_head"
        self.lm_head = TFXLMRobertaLMHead(config, input_embeddings=self.roberta.embeddings, name="lm_head")

    def get_lm_head(self):
        return self.lm_head

    def get_prefix_bias_name(self):
        warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
        # 返回 lm_head 的完整名称,包括所属的模型名称前缀
        return self.name + "/" + self.lm_head.name

    # 从 transformers.models.bert.modeling_tf_bert.TFBertLMHeadModel.prepare_inputs_for_generation 复制而来
    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
        input_shape = input_ids.shape
        # 如果没有给定注意力掩码,则创建一个形状与 input_ids 相同的全 1 的注意力掩码
        if attention_mask is None:
            attention_mask = tf.ones(input_shape)

        # 如果使用了过去的 key values,截取最后一个输入 token
        if past_key_values is not None:
            input_ids = input_ids[:, -1:]

        # 返回用于生成的输入参数字典,包括输入 token IDs、注意力掩码和过去的 key values
        return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}

    @unpack_inputs
    @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFCausalLMOutputWithCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    # 定义一个方法 `build`,用于构建模型结构
    def build(self, input_shape=None):
        # 如果模型已经构建过,则直接返回,避免重复构建
        if self.built:
            return
        # 将标志位 `built` 设置为 True,表示模型已经构建
        self.built = True
        # 如果模型中有名为 `roberta` 的子模型,则构建 `roberta` 子模型
        if getattr(self, "roberta", None) is not None:
            # 在 TensorFlow 中使用命名作用域 `roberta.name` 来构建 `roberta`
            with tf.name_scope(self.roberta.name):
                self.roberta.build(None)
        # 如果模型中有名为 `lm_head` 的子模型,则构建 `lm_head` 子模型
        if getattr(self, "lm_head", None) is not None:
            # 在 TensorFlow 中使用命名作用域 `lm_head.name` 来构建 `lm_head`
            with tf.name_scope(self.lm_head.name):
                self.lm_head.build(None)
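
# A standalone sketch (toy ids) of what prepare_inputs_for_generation above does
# once a cache exists: only the most recent token id is fed to the next decoding
# step, the rest is read from past_key_values.
import tensorflow as tf

input_ids = tf.constant([[0, 250, 531, 77]])
attention_mask = tf.ones(input_ids.shape)

have_cache = True  # stands in for `past_key_values is not None`
if have_cache:
    input_ids = input_ids[:, -1:]

print(input_ids.numpy())     # [[77]]
print(attention_mask.shape)  # (1, 4) -- the mask still covers the full sequence
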
# 从 transformers.models.roberta.modeling_tf_roberta.TFRobertaClassificationHead 复制并修改为 XLMRoberta
class TFXLMRobertaClassificationHead(keras.layers.Layer):
    """用于句子级分类任务的头部。"""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        # 创建一个全连接层,用于分类任务
        self.dense = keras.layers.Dense(
            config.hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            activation="tanh",
            name="dense",
        )
        # 根据配置添加分类器的 dropout 层
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = keras.layers.Dropout(classifier_dropout)
        # 输出投影层,将全连接层的输出映射到类别数量
        self.out_proj = keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
        )
        self.config = config

    def call(self, features, training=False):
        # 取特征的第一个 token 作为输入(相当于 [CLS])
        x = features[:, 0, :]
        x = self.dropout(x, training=training)
        x = self.dense(x)
        x = self.dropout(x, training=training)
        x = self.out_proj(x)
        return x

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        # 如果 dense 层已定义,则建立 dense 层
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                self.dense.build([None, None, self.config.hidden_size])
        # 如果 out_proj 层已定义,则建立 out_proj 层
        if getattr(self, "out_proj", None) is not None:
            with tf.name_scope(self.out_proj.name):
                self.out_proj.build([None, None, self.config.hidden_size])
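
# A standalone sketch (toy Keras layers, made-up sizes) of the head above: the
# hidden state of the first token (<s>) goes through
# dropout -> dense(tanh) -> dropout -> out_proj to produce per-class logits.
import tensorflow as tf

hidden_size, num_labels = 16, 3
features = tf.random.normal((2, 10, hidden_size))  # (batch, seq_len, hidden_size)

dense = tf.keras.layers.Dense(hidden_size, activation="tanh")
dropout = tf.keras.layers.Dropout(0.1)
out_proj = tf.keras.layers.Dense(num_labels)

x = features[:, 0, :]  # first token is used as the sentence representation
x = dropout(x, training=False)
x = dense(x)
x = dropout(x, training=False)
logits = out_proj(x)
print(logits.shape)  # (2, 3)
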


@add_start_docstrings(
    """
    XLM RoBERTa 模型,顶部带有序列分类/回归头(基于池化输出的线性层),例如用于 GLUE 任务。
    """,
    XLM_ROBERTA_START_DOCSTRING,
)
# 从 transformers.models.roberta.modeling_tf_roberta.TFRobertaForSequenceClassification 复制并修改为 XLMRoberta,ROBERTA->XLM_ROBERTA
class TFXLMRobertaForSequenceClassification(TFXLMRobertaPreTrainedModel, TFSequenceClassificationLoss):
    # 名称中带有 '.' 的,表示从 PT 模型加载 TF 模型时允许出现的意外/缺失层
    _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"]

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        # 类别数量
        self.num_labels = config.num_labels

        # XLMRoberta 主层,不添加池化层
        self.roberta = TFXLMRobertaMainLayer(config, add_pooling_layer=False, name="roberta")
        # 分类头部
        self.classifier = TFXLMRobertaClassificationHead(config, name="classifier")

    @unpack_inputs
    @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    # 使用装饰器添加代码示例的文档字符串,说明模型预训练检查点和输出类型
    @add_code_sample_docstrings(
        checkpoint="cardiffnlp/twitter-roberta-base-emotion",
        output_type=TFSequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output="'optimism'",
        expected_loss=0.08,
    )
    # 定义模型的调用方法,接受多个输入参数和一个可选的标签参数,返回序列分类器的输出或元组
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            用于计算序列分类/回归损失的标签。索引应在 `[0, ..., config.num_labels - 1]` 范围内。
            如果 `config.num_labels == 1`,则计算回归损失(均方损失);如果 `config.num_labels > 1`,则计算分类损失(交叉熵损失)。
        """
        # 使用 Roberta 模型处理输入数据,根据参数设置返回不同的数据结构
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # 从模型输出中获取序列输出
        sequence_output = outputs[0]
        # 使用分类器对序列输出进行分类,根据训练状态进行不同的处理
        logits = self.classifier(sequence_output, training=training)
    
        # 如果未提供标签,则损失为 None;否则计算预测损失
        loss = None if labels is None else self.hf_compute_loss(labels, logits)
    
        # 如果不要求返回字典形式的输出,则返回 logits 和可能的其他输出
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output
    
        # 返回 TFSequenceClassifierOutput 类的对象,包含损失、logits、隐藏状态和注意力权重
        return TFSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
    
    # 定义模型的构建方法,用于建立模型的层次结构,确保仅在未构建时执行
    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        # 如果存在 self.roberta 属性,则在其命名范围内构建 Roberta 模型
        if getattr(self, "roberta", None) is not None:
            with tf.name_scope(self.roberta.name):
                self.roberta.build(None)
        # 如果存在 self.classifier 属性,则在其命名范围内构建分类器模型
        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                self.classifier.build(None)
@add_start_docstrings(
    """
    XLM Roberta Model with a multiple choice classification head on top (a linear layer on top of the pooled output and
    a softmax) e.g. for RocStories/SWAG tasks.
    """,
    XLM_ROBERTA_START_DOCSTRING,
)
# 从 transformers.models.roberta.modeling_tf_roberta.TFRobertaForMultipleChoice 复制过来,将 Roberta 改为 XLMRoberta,ROBERTA 改为 XLM_ROBERTA
class TFXLMRobertaForMultipleChoice(TFXLMRobertaPreTrainedModel, TFMultipleChoiceLoss):
    # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
    # 在加载 TF 模型时,忽略掉这些意外/缺失的层
    _keys_to_ignore_on_load_unexpected = [r"lm_head"]
    _keys_to_ignore_on_load_missing = [r"dropout"]

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        # 初始化 XLM-Roberta 主层
        self.roberta = TFXLMRobertaMainLayer(config, name="roberta")
        # 添加 dropout 层
        self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
        # 分类器层,用于多选分类任务
        self.classifier = keras.layers.Dense(
            1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(
        XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
    )
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TFMultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    # 模型的前向传播函数,接受多个输入和参数
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]`
            where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above)
        """

        # 如果输入了 input_ids,则获取 num_choices 和 seq_length
        if input_ids is not None:
            num_choices = shape_list(input_ids)[1]  # 获取选择个数,即第二个维度的大小
            seq_length = shape_list(input_ids)[2]   # 获取序列长度,即第三个维度的大小
        else:
            num_choices = shape_list(inputs_embeds)[1]  # 否则从 inputs_embeds 中获取选择个数
            seq_length = shape_list(inputs_embeds)[2]   # 从 inputs_embeds 中获取序列长度

        # 将输入张量展平,如果存在的话
        flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
        flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
        flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
        flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
        
        # 调用 RoBERTa 模型
        outputs = self.roberta(
            flat_input_ids,
            flat_attention_mask,
            flat_token_type_ids,
            flat_position_ids,
            head_mask,
            inputs_embeds,
            output_attentions,
            output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        
        # 获取池化输出并应用 dropout
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output, training=training)
        
        # 使用分类器进行分类
        logits = self.classifier(pooled_output)
        reshaped_logits = tf.reshape(logits, (-1, num_choices))  # 重新整形 logits
        
        # 计算损失(如果提供了 labels)
        loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits)
        
        # 如果不返回字典,则构建输出元组
        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output
        
        # 返回 TF 模型输出对象,包括损失、logits、隐藏状态和注意力权重
        return TFMultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "roberta", None) is not None:
            with tf.name_scope(self.roberta.name):
                self.roberta.build(None)  # 构建 RoBERTa 层
        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                self.classifier.build([None, None, self.config.hidden_size])  # 构建分类器层
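
The only thing that distinguishes the multiple-choice head from a plain sequence classifier is the reshaping around the encoder call: the `num_choices` dimension is folded into the batch dimension before `self.roberta` runs, and the `Dense(1)` scores are unfolded back into `(batch_size, num_choices)` at the end. A standalone shape sketch (made-up sizes, not part of the source file):

import tensorflow as tf

# Hypothetical sizes, just to show the reshapes performed in call() above.
batch_size, num_choices, seq_length = 2, 4, 8
input_ids = tf.zeros((batch_size, num_choices, seq_length), dtype=tf.int32)

flat_input_ids = tf.reshape(input_ids, (-1, seq_length))             # (8, 8): choices folded into the batch
per_choice_scores = tf.zeros((batch_size * num_choices, 1))          # stand-in for the Dense(1) classifier output
reshaped_logits = tf.reshape(per_choice_scores, (-1, num_choices))   # (2, 4): one logit per choice
print(flat_input_ids.shape, reshaped_logits.shape)
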
@add_start_docstrings(
    """
    XLM RoBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
    for Named-Entity-Recognition (NER) tasks.
    """,
    XLM_ROBERTA_START_DOCSTRING,
)
# 从 transformers.models.roberta.modeling_tf_roberta.TFRobertaForTokenClassification 复制而来,将 Roberta 改为 XLMRoberta,ROBERTA 改为 XLM_ROBERTA
class TFXLMRobertaForTokenClassification(TFXLMRobertaPreTrainedModel, TFTokenClassificationLoss):
    # 当从 PT 模型加载 TF 模型时,以下带 '.' 的名称表示在加载时可以忽略的未预期/缺失的层
    _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"]
    # 当从 PT 模型加载 TF 模型时,以下名称表示可以忽略的缺失层
    _keys_to_ignore_on_load_missing = [r"dropout"]

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        # 使用 XLMRobertaMainLayer 初始化 self.roberta,不添加 pooling 层,命名为 "roberta"
        self.roberta = TFXLMRobertaMainLayer(config, add_pooling_layer=False, name="roberta")
        # 根据 config 中的 classifier_dropout 或 hidden_dropout_prob 初始化 Dropout 层
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = keras.layers.Dropout(classifier_dropout)
        # 使用 config 中的 initializer_range 初始化 Dense 层,输出维度为 config.num_labels,命名为 "classifier"
        self.classifier = keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
        )
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="ydshieh/roberta-large-ner-english",
        output_type=TFTokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output="['O', 'ORG', 'ORG', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'LOC', 'LOC']",
        expected_loss=0.01,
    )
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        # 调用 RoBERTa 模型进行前向传播,并返回输出结果
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # 从 RoBERTa 输出的结果中取出序列输出(通常是最后一层的隐藏状态)
        sequence_output = outputs[0]

        # 对序列输出应用 dropout,用于防止过拟合
        sequence_output = self.dropout(sequence_output, training=training)
        # 将 dropout 后的输出传递给分类器,得到预测的 logits
        logits = self.classifier(sequence_output)

        # 如果提供了标签,计算损失值;否则损失值设为 None
        loss = None if labels is None else self.hf_compute_loss(labels, logits)

        # 如果 return_dict=False,则按照非字典格式组织输出
        if not return_dict:
            output = (logits,) + outputs[2:]  # 输出包括 logits 和 RoBERTa 的其他返回值
            return ((loss,) + output) if loss is not None else output

        # 如果 return_dict=True,则按 TFTokenClassifierOutput 格式组织输出
        return TFTokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        # 如果已经构建过模型,直接返回
        if self.built:
            return
        # 标记模型已经构建
        self.built = True
        # 如果模型中包含 RoBERTa,构建 RoBERTa 层
        if getattr(self, "roberta", None) is not None:
            with tf.name_scope(self.roberta.name):
                self.roberta.build(None)
        # 如果模型中包含分类器,构建分类器并指定输入形状
        if getattr(self, "classifier", None) is not None:
            with tf.name_scope(self.classifier.name):
                self.classifier.build([None, None, self.config.hidden_size])
@add_start_docstrings(
    """
    XLM RoBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
    linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    XLM_ROBERTA_START_DOCSTRING,
)
# XLM-RoBERTa model with a span classification head on top for extractive question answering (e.g. SQuAD):
# linear layers on top of the hidden-states output compute `span start logits` and `span end logits`.
class TFXLMRobertaForQuestionAnswering(TFXLMRobertaPreTrainedModel, TFQuestionAnsweringLoss):
    # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
    # 当从 PyTorch 模型加载到 TF 模型时,带有 '.' 的名称代表授权的意外/丢失层
    _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"]

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        # Initialize the XLM-RoBERTa main layer without adding a pooling layer
        # 初始化 XLM-RoBERTa 主层,不添加汇聚层
        self.roberta = TFXLMRobertaMainLayer(config, add_pooling_layer=False, name="roberta")
        
        # Dense layer for question answering output, initialized with specified initializer range
        # 用于问答输出的全连接层,使用指定的初始化范围初始化
        self.qa_outputs = keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
        )
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="ydshieh/roberta-base-squad2",
        output_type=TFQuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output="' puppet'",
        expected_loss=0.86,
    )
    # Forward method for the model, with specific docstrings added for model input details and examples
    # 模型的前向方法,添加了特定的文档字符串,描述了模型输入的细节和示例
    def call(
        self,
        input_ids: TFModelInputType | None = None,
        attention_mask: np.ndarray | tf.Tensor | None = None,
        token_type_ids: np.ndarray | tf.Tensor | None = None,
        position_ids: np.ndarray | tf.Tensor | None = None,
        head_mask: np.ndarray | tf.Tensor | None = None,
        inputs_embeds: np.ndarray | tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        start_positions: np.ndarray | tf.Tensor | None = None,
        end_positions: np.ndarray | tf.Tensor | None = None,
        training: Optional[bool] = False,
    ) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
        r"""
        start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        """
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # 从 RoBERTa 模型中获取输出的序列表示
        sequence_output = outputs[0]

        # 将序列表示传入 QA 输出层得到 logits
        logits = self.qa_outputs(sequence_output)
        
        # 将 logits 按最后一个维度分割为 start_logits 和 end_logits
        start_logits, end_logits = tf.split(logits, 2, axis=-1)
        
        # 去除多余的维度,使得 start_logits 和 end_logits 的形状变为 (batch_size, sequence_length)
        start_logits = tf.squeeze(start_logits, axis=-1)
        end_logits = tf.squeeze(end_logits, axis=-1)

        # 计算损失(如果给定了起始位置和结束位置)
        loss = None
        if start_positions is not None and end_positions is not None:
            labels = {"start_position": start_positions}
            labels["end_position"] = end_positions
            # 使用 Hugging Face 的损失计算函数计算损失
            loss = self.hf_compute_loss(labels, (start_logits, end_logits))

        # 根据 return_dict 决定返回格式
        if not return_dict:
            # 如果不返回字典,则将 loss 和 output 打包成 tuple 返回
            output = (start_logits, end_logits) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        # 如果返回字典,则构建 TFQuestionAnsweringModelOutput 对象并返回
        return TFQuestionAnsweringModelOutput(
            loss=loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        # 如果已经构建过,则直接返回

        # 如果 self.roberta 已定义,则构建它
        if getattr(self, "roberta", None) is not None:
            with tf.name_scope(self.roberta.name):
                self.roberta.build(None)

        # 如果 self.qa_outputs 已定义,则构建它
        if getattr(self, "qa_outputs", None) is not None:
            with tf.name_scope(self.qa_outputs.name):
                # 构建 QA 输出层,输入形状为 [None, None, self.config.hidden_size]
                self.qa_outputs.build([None, None, self.config.hidden_size])
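
`qa_outputs` is a single dense layer with two output channels that are split into `start_logits` and `end_logits`; at inference time an answer span is typically recovered by taking the argmax of each. A minimal sketch with made-up logits (greedy decoding, ignoring the usual `start <= end` and max-length checks):

import tensorflow as tf

start_logits = tf.constant([[0.1, 0.2, 3.0, 0.1, 0.0]])  # (batch_size, sequence_length)
end_logits = tf.constant([[0.0, 0.1, 0.2, 2.5, 0.1]])

start_index = int(tf.argmax(start_logits, axis=-1)[0])   # 2
end_index = int(tf.argmax(end_logits, axis=-1)[0])        # 3
# Decoding tokens[start_index : end_index + 1] with the tokenizer yields the answer text.
print(start_index, end_index)
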

.\models\xlm_roberta\modeling_xlm_roberta.py

# coding=utf-8
# Copyright 2019 Facebook AI Research and the HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch XLM-RoBERTa model."""

import math
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN, gelu
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_xlm_roberta import XLMRobertaConfig

# 获取 logger 对象
logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "FacebookAI/xlm-roberta-base"
_CONFIG_FOR_DOC = "XLMRobertaConfig"

# XLM-RoBERTa 模型的预训练模型存档列表
XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "FacebookAI/xlm-roberta-base",
    "FacebookAI/xlm-roberta-large",
    "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
    "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
    "FacebookAI/xlm-roberta-large-finetuned-conll03-english",
    "FacebookAI/xlm-roberta-large-finetuned-conll03-german",
    # 查看所有 XLM-RoBERTa 模型:https://huggingface.co/models?filter=xlm-roberta
]


# 从 transformers.models.roberta.modeling_roberta.RobertaEmbeddings 复制而来,用于 XLM-RoBERTa
class XLMRobertaEmbeddings(nn.Module):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    # 从 transformers.models.bert.modeling_bert.BertEmbeddings.__init__ 复制而来
    # 初始化函数,接受一个配置对象作为参数
    def __init__(self, config):
        # 调用父类的初始化方法
        super().__init__()
        # 创建词嵌入层,将词汇表大小、隐藏层大小和填充索引作为参数
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        # 创建位置嵌入层,将最大位置嵌入大小和隐藏层大小作为参数
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        # 创建token类型嵌入层,将类型词汇表大小和隐藏层大小作为参数
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # 使用给定的隐藏层大小和epsilon值创建LayerNorm层,以与TensorFlow模型变量名保持一致
        # 可以加载任何TensorFlow检查点文件
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # 创建dropout层,使用给定的隐藏层dropout概率作为参数
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # 设置位置嵌入类型,默认为"absolute",从配置中获取
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        # 注册缓冲区,创建位置ID张量,长度为最大位置嵌入大小,并在序列化时导出
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        # 注册缓冲区,创建token类型ID张量,形状与位置ID相同,数据类型为长整型
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

        # End copy
        # 设置填充索引为配置文件中的填充token ID
        self.padding_idx = config.pad_token_id
        # 重新创建位置嵌入层,使用最大位置嵌入大小、隐藏层大小和填充索引作为参数
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )
    def forward(
        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        # If no position ids are given, build them here
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids; any padded token stays padded
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
            else:
                # No input token ids: derive sequential position ids from the input embeddings instead
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        # 如果指定了输入的 token ids
        if input_ids is not None:
            # 获取输入张量的形状
            input_shape = input_ids.size()
        else:
            # 获取输入嵌入张量的形状,不包括最后一个维度(通常是 batch 维度)
            input_shape = inputs_embeds.size()[:-1]

        # 获取序列长度,即输入的 token 序列的第二个维度大小
        seq_length = input_shape[1]

        # 将 token_type_ids 设置为构造函数中注册的缓冲区,该缓冲区全部为零。这通常在自动生成时发生,
        # 注册的缓冲区有助于在不传递 token_type_ids 的情况下跟踪模型,解决问题 #5664
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                # 获取模型中已注册的 token_type_ids 缓冲区,并截取到与序列长度相匹配的部分
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                # 扩展缓冲区的 token_type_ids 到与输入形状相匹配的大小
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                # 如果模型中没有注册 token_type_ids 缓冲区,则创建一个全零的张量
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        # 如果没有提供 inputs_embeds 参数,则通过输入的 token ids 获取对应的嵌入
        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        # 根据 token_type_ids 获取对应的 token 类型嵌入
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        # 将输入嵌入和 token 类型嵌入相加作为总的嵌入
        embeddings = inputs_embeds + token_type_embeddings

        # 如果位置嵌入类型是 "absolute",则获取对应的位置嵌入并添加到总的嵌入中
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings

        # 应用 LayerNorm 对嵌入进行归一化
        embeddings = self.LayerNorm(embeddings)

        # 对归一化后的嵌入进行 dropout 处理
        embeddings = self.dropout(embeddings)

        # 返回最终的嵌入张量作为结果
        return embeddings

    # 从输入嵌入张量中创建位置编码
    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        """
        # 获取输入嵌入张量的形状,不包括最后一个维度(通常是 batch 维度)
        input_shape = inputs_embeds.size()[:-1]
        # 获取序列长度,即输入嵌入张量的第二个维度大小
        sequence_length = input_shape[1]

        # 根据输入嵌入张量的大小创建顺序的位置编码
        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        # 将位置编码张量扩展为与输入形状相匹配的大小
        return position_ids.unsqueeze(0).expand(input_shape)
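
The "tiny tweak" mentioned in the class docstring is visible here: position ids are offset by `padding_idx`, so real tokens are numbered from `padding_idx + 1` while padding positions keep `padding_idx` itself. That is what `create_position_ids_from_input_ids` (called in `forward` above) produces; a standalone re-implementation of its logic for illustration (padding_idx=1 as in XLM-R, example ids made up):

import torch

padding_idx = 1
input_ids = torch.tensor([[0, 31414, 232, 2, 1, 1]])    # last two positions are padding
mask = input_ids.ne(padding_idx).int()
position_ids = torch.cumsum(mask, dim=1) * mask + padding_idx
print(position_ids)                                      # tensor([[2, 3, 4, 5, 1, 1]])
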
# 从 transformers.models.roberta.modeling_roberta.RobertaSelfAttention 复制并将 Roberta 替换为 XLMRoberta
class XLMRobertaSelfAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        # 检查隐藏层大小是否能被注意力头数整除,如果不能且 config 没有 embedding_size 属性,则引发 ValueError
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        # 设置注意力头数和每个注意力头的大小
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # 创建线性层用于计算查询、键和值
        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        # 创建 dropout 层,用于注意力概率的 dropout
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        
        # 设置位置嵌入类型,默认为绝对位置编码
        self.position_embedding_type = position_embedding_type or getattr(
            config, "position_embedding_type", "absolute"
        )
        # 如果位置嵌入类型为相对位置编码之一,则创建距离嵌入层
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

        # 设置是否为解码器
        self.is_decoder = config.is_decoder

    # 转置输入张量以适应注意力得分计算的形状
    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    # 前向传播函数定义
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,



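The body of `forward` is not reproduced in this listing; at its core it is standard scaled dot-product attention over the `(batch, num_heads, seq_len, head_size)` tensors produced by `transpose_for_scores`, with `attention_mask`, `head_mask`, dropout and the optional relative-position terms applied on top of the raw scores. A shape-only sketch of that core step (made-up sizes, not the actual implementation):

import math
import torch

batch, num_heads, seq_len, head_size = 2, 12, 5, 64
query = torch.randn(batch, num_heads, seq_len, head_size)   # transpose_for_scores(self.query(hidden_states))
key = torch.randn(batch, num_heads, seq_len, head_size)
value = torch.randn(batch, num_heads, seq_len, head_size)

scores = torch.matmul(query, key.transpose(-1, -2)) / math.sqrt(head_size)  # (2, 12, 5, 5)
probs = torch.softmax(scores, dim=-1)                                        # attention probabilities
context = torch.matmul(probs, value)                                         # (2, 12, 5, 64), later merged back to (2, 5, hidden)
print(scores.shape, context.shape)
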
# 从 transformers.models.roberta.modeling_roberta.RobertaSelfOutput 复制并将 Roberta 替换为 XLMRoberta
class XLMRobertaSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 创建全连接层,用于对隐藏状态进行线性变换
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # 创建 LayerNorm 层,用于对变换后的状态进行归一化
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # 创建 dropout 层,用于对归一化后的状态进行随机失活
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    # 前向传播函数定义
    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # 将隐藏状态输入全连接层进行线性变换
        hidden_states = self.dense(hidden_states)
        # 对变换后的状态进行 dropout
        hidden_states = self.dropout(hidden_states)
        # 将 dropout 后的状态与输入张量进行残差连接,并经过 LayerNorm 归一化
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states
# 从 transformers.models.roberta.modeling_roberta.RobertaAttention 复制并修改为 XLMRobertaAttention
class XLMRobertaAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        # 初始化 self 注意力机制,使用给定的配置和位置嵌入类型
        self.self = XLMRobertaSelfAttention(config, position_embedding_type=position_embedding_type)
        # 初始化输出层,用于处理 self 注意力机制的输出
        self.output = XLMRobertaSelfOutput(config)
        # 存储需要剪枝的注意力头的集合
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        # 找到可剪枝的注意力头及其索引
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # 剪枝线性层
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # 更新超参数并存储剪枝的注意力头
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        # 进行 self 注意力机制的前向传播
        self_outputs = self.self(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        # 使用输出层处理 self 注意力机制的输出和原始输入的隐藏状态
        attention_output = self.output(self_outputs[0], hidden_states)
        # 如果有需要,添加注意力输出到结果中
        outputs = (attention_output,) + self_outputs[1:]  # 如果需要,将注意力加入到输出中
        return outputs


# 从 transformers.models.roberta.modeling_roberta.RobertaIntermediate 复制并修改为 XLMRobertaIntermediate
class XLMRobertaIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 线性层,将隐藏状态的大小映射为中间大小
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # 中间激活函数,根据配置选择激活函数
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 将隐藏状态通过线性层映射到中间大小
        hidden_states = self.dense(hidden_states)
        # 使用中间激活函数处理映射后的隐藏状态
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


# 从 transformers.models.roberta.modeling_roberta.RobertaOutput 复制并修改为 XLMRobertaOutput
class XLMRobertaOutput(nn.Module):
    # 初始化函数,用于初始化对象的各个成员变量
    def __init__(self, config):
        # 调用父类的初始化方法
        super().__init__()
        # 创建一个线性层,输入维度为config.intermediate_size,输出维度为config.hidden_size
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        # 创建一个LayerNorm层,对隐藏状态进行归一化,归一化的特征数为config.hidden_size,eps为config.layer_norm_eps
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # 创建一个Dropout层,以config.hidden_dropout_prob的概率将输入置为0,用于防止过拟合
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    # 前向传播函数,处理输入的隐藏状态和输入张量,返回处理后的隐藏状态张量
    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # 将隐藏状态通过线性层进行变换
        hidden_states = self.dense(hidden_states)
        # 对变换后的隐藏状态进行Dropout操作,以减少过拟合风险
        hidden_states = self.dropout(hidden_states)
        # 将Dropout后的隐藏状态与输入张量相加,并通过LayerNorm进行归一化处理
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        # 返回处理后的隐藏状态张量作为输出
        return hidden_states

# Copied from transformers.models.roberta.modeling_roberta.RobertaLayer with Roberta->XLMRoberta
class XLMRobertaLayer(nn.Module):
    def __init__(self, config):
        # Call the parent class initializer
        super().__init__()
        # Chunk size for the feed-forward pass
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        # Dimension index of the sequence length, usually 1
        self.seq_len_dim = 1
        # Initialize the XLMRobertaAttention layer
        self.attention = XLMRobertaAttention(config)
        # Whether this layer is used as a decoder
        self.is_decoder = config.is_decoder
        # Whether cross-attention is added
        self.add_cross_attention = config.add_cross_attention
        # If cross-attention is added
        if self.add_cross_attention:
            # Cross-attention is only valid for decoder models
            if not self.is_decoder:
                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
            # Initialize the cross-attention layer with absolute position embeddings
            self.crossattention = XLMRobertaAttention(config, position_embedding_type="absolute")
        # Initialize the intermediate layer
        self.intermediate = XLMRobertaIntermediate(config)
        # Initialize the output layer
        self.output = XLMRobertaOutput(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        # If past key/values exist, the first two entries are the cached self-attention key/values, otherwise None
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None

        # Run self-attention
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            past_key_value=self_attn_past_key_value,
        )
        # Output of the self-attention computation
        attention_output = self_attention_outputs[0]

        # If this is a decoder, the last output is the tuple of self-attention key/value caches
        if self.is_decoder:
            outputs = self_attention_outputs[1:-1]
            present_key_value = self_attention_outputs[-1]
        else:
            outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        cross_attn_present_key_value = None
        # Decoder with encoder hidden states: run cross-attention
        if self.is_decoder and encoder_hidden_states is not None:
            # Raise an error if no cross-attention layer was instantiated
            if not hasattr(self, "crossattention"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
                    " by setting `config.add_cross_attention=True`"
                )

            # If past key/values exist, the last two entries are the cached cross-attention key/values, otherwise None
            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None

            # Run cross-attention
            cross_attention_outputs = self.crossattention(
                attention_output,
                attention_mask,
                head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                cross_attn_past_key_value,
                output_attentions,
            )
            # Output of the cross-attention computation
            attention_output = cross_attention_outputs[0]
            # Add the cross-attention weights to the outputs if attention weights are returned
            outputs = outputs + cross_attention_outputs[1:-1]

            # Append the cross-attention cache to the present key/value tuple
            cross_attn_present_key_value = cross_attention_outputs[-1]
            present_key_value = present_key_value + cross_attn_present_key_value

        # Apply the (optionally chunked) feed-forward block to attention_output
        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        outputs = (layer_output,) + outputs

        # If this is a decoder, return the attention key/values as the last output
        if self.is_decoder:
            outputs = outputs + (present_key_value,)

        return outputs

    def feed_forward_chunk(self, attention_output):
        # Run attention_output through the intermediate and output layers and return the layer output
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output
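
`apply_chunking_to_forward` (imported from `...pytorch_utils` at the top of this file) only changes how the feed-forward block is evaluated, not what it computes: when `config.chunk_size_feed_forward > 0` the input is split along the sequence dimension, `feed_forward_chunk` is applied chunk by chunk, and the results are concatenated, trading peak memory for extra calls. A small equivalence check (toy module and sizes, not part of the source):

import torch
from transformers.pytorch_utils import apply_chunking_to_forward

hidden = torch.randn(2, 6, 8)                            # (batch, seq_len, hidden)
ffn = torch.nn.Linear(8, 8)                              # stand-in for intermediate + output

full = ffn(hidden)                                       # chunk size 0: one call over the whole sequence
chunked = apply_chunking_to_forward(ffn, 3, 1, hidden)   # chunks of 3 tokens along dim 1
print(torch.allclose(full, chunked, atol=1e-6))          # True
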

# Copied from transformers.models.roberta.modeling_roberta.RobertaEncoder with Roberta->XLMRoberta
class XLMRobertaEncoder(nn.Module):
    # Initializer, takes a configuration object
    def __init__(self, config):
        super().__init__()
        # Keep a reference to the configuration
        self.config = config
        # Stack of XLMRobertaLayer modules, one per hidden layer in the configuration
        self.layer = nn.ModuleList([XLMRobertaLayer(config) for _ in range(config.num_hidden_layers)])
        # Gradient checkpointing is disabled by default
        self.gradient_checkpointing = False

    # Forward pass over the whole stack of layers
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
        # Initialize the collectors as empty tuples or None depending on output_hidden_states / output_attentions
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

        # Gradient checkpointing is incompatible with use_cache=True during training: warn once and disable the cache
        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        # Collector for the per-layer key/value caches, only used when use_cache is True
        next_decoder_cache = () if use_cache else None

        # Iterate over the layer modules
        for i, layer_module in enumerate(self.layer):
            # Record the current hidden states before the layer if hidden states are requested
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # Pick the per-layer head mask and cached key/values, if provided
            layer_head_mask = head_mask[i] if head_mask is not None else None
            past_key_value = past_key_values[i] if past_key_values is not None else None

            # Run the layer, through the gradient-checkpointing wrapper when enabled during training
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )

            # Update the hidden states and, when requested, the cache and attention collectors
            hidden_states = layer_outputs[0]
            if use_cache:
                next_decoder_cache += (layer_outputs[-1],)
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        # Add the final hidden states to the collector if hidden states are requested
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        # Without return_dict, return a tuple of all non-None outputs
        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )

        # Otherwise wrap the outputs in a BaseModelOutputWithPastAndCrossAttentions
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )

# Copied from transformers.models.roberta.modeling_roberta.RobertaPooler with Roberta->XLMRoberta
class XLMRobertaPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Dense layer with input and output size config.hidden_size
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # Tanh activation
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Pool the model by taking the hidden state of the first token
        first_token_tensor = hidden_states[:, 0]
        # Pass the pooled output through the dense layer
        pooled_output = self.dense(first_token_tensor)
        # Apply the activation function
        pooled_output = self.activation(pooled_output)
        # Return the pooled output
        return pooled_output
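
In other words, "pooling" here just means taking the hidden state of the first token (`<s>`) and passing it through a dense layer with a tanh; nothing is averaged over the sequence. Toy-sized illustration (not part of the source):

import torch

hidden_states = torch.randn(2, 5, 8)                     # (batch, seq_len, hidden)
dense = torch.nn.Linear(8, 8)
pooled_output = torch.tanh(dense(hidden_states[:, 0]))   # (batch, hidden): only position 0 is used
print(pooled_output.shape)
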

# Copied from transformers.models.roberta.modeling_roberta.RobertaPreTrainedModel with Roberta->XLMRoberta
class XLMRobertaPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and the downloading and loading of pretrained models.
    """

    # Configuration class for this model
    config_class = XLMRobertaConfig
    # Prefix of the base model
    base_model_prefix = "roberta"
    # Gradient checkpointing is supported
    supports_gradient_checkpointing = True
    # Modules that must not be split across devices
    _no_split_modules = ["XLMRobertaEmbeddings", "XLMRobertaSelfAttention"]

    # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            # Initialize weights with a normal distribution of std config.initializer_range
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                # Initialize the bias, if any, to zeros
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            # Initialize weights with a normal distribution of std config.initializer_range
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                # Zero out the embedding vector of the padding index, if any
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            # Initialize LayerNorm with zero bias and unit weight
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

# XLM_ROBERTA_START_DOCSTRING, copied unchanged from the original source file

XLM_ROBERTA_START_DOCSTRING = r"""

This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)

This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.

Parameters:
    config ([`XLMRobertaConfig`]): Model configuration class with all the parameters of the
        model. Initializing with a config file does not load the weights associated with the model, only the
        configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.

"""

# XLM_ROBERTA_INPUTS_DOCSTRING, copied unchanged from the original source file

XLM_ROBERTA_INPUTS_DOCSTRING = r"""
Args:
input_ids (torch.LongTensor of shape ({0})):
# 输入序列标记在词汇表中的索引。

        # 可以使用 `AutoTokenizer` 获取这些索引。参见 `PreTrainedTokenizer.encode` 和 `PreTrainedTokenizer.__call__` 获取详细信息。

        # [什么是输入 ID?](../glossary#input-ids)
    attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
        # 遮罩,用于避免在填充标记索引上执行注意力操作。遮罩的取值为 `[0, 1]`:

        # - 1 表示**未遮罩**的标记,
        # - 0 表示**遮罩**的标记。

        # [什么是注意力遮罩?](../glossary#attention-mask)
    token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
        # 段标记索引,指示输入的第一部分和第二部分。索引取值为 `[0, 1]`:

        # - 0 对应于 *句子 A* 的标记,
        # - 1 对应于 *句子 B* 的标记。

        # [什么是标记类型 ID?](../glossary#token-type-ids)
    position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
        # 输入序列中每个标记的位置索引,在位置嵌入中使用。索引范围为 `[0, config.max_position_embeddings - 1]`。

        # [什么是位置 ID?](../glossary#position-ids)
    head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
        # 遮罩,用于将自注意力模块的某些头部置为零。遮罩的取值为 `[0, 1]`:

        # - 1 表示**未遮罩**的头部,
        # - 0 表示**遮罩**的头部。

    inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
        # 可选参数,可以直接传递嵌入表示,而不是 `input_ids`。如果希望更好地控制如何将 `input_ids` 索引转换为相关向量,这将非常有用。

    output_attentions (`bool`, *optional*):
        # 是否返回所有注意力层的注意力张量。查看返回张量中的 `attentions` 以获取更多详细信息。

    output_hidden_states (`bool`, *optional*):
        # 是否返回所有层的隐藏状态。查看返回张量中的 `hidden_states` 以获取更多详细信息。

    return_dict (`bool`, *optional*):
        # 是否返回 [`~utils.ModelOutput`] 而不是普通元组。

"""
@add_start_docstrings(
    "The bare XLM-RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
    XLM_ROBERTA_START_DOCSTRING,
)
# Copied from transformers.models.roberta.modeling_roberta.RobertaModel with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA
class XLMRobertaModel(XLMRobertaPreTrainedModel):
    """
    The model can behave as an encoder (self-attention only) or as a decoder, in which case a cross-attention layer
    is added between the self-attention layers, following the architecture described in *Attention is all you need*
    by Ashish Vaswani et al.

    To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, both `is_decoder` and `add_cross_attention` need to be set to `True`;
    `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
    """

# 从transformers.models.bert.modeling_bert.BertModel.__init__中复制过来,将Bert替换为XLMRoberta
def __init__(self, config, add_pooling_layer=True):
    super().__init__(config)
    self.config = config

    self.embeddings = XLMRobertaEmbeddings(config)
    self.encoder = XLMRobertaEncoder(config)

    self.pooler = XLMRobertaPooler(config) if add_pooling_layer else None

    # 初始化权重并应用最终处理
    self.post_init()

def get_input_embeddings(self):
    return self.embeddings.word_embeddings

def set_input_embeddings(self, value):
    self.embeddings.word_embeddings = value

def _prune_heads(self, heads_to_prune):
    """
    对模型的注意力头进行修剪。heads_to_prune: 字典,格式为{层号: 要在该层中修剪的头列表},参见基类PreTrainedModel
    """
    for layer, heads in heads_to_prune.items():
        self.encoder.layer[layer].attention.prune_heads(heads)

@add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
    checkpoint=_CHECKPOINT_FOR_DOC,
    output_type=BaseModelOutputWithPoolingAndCrossAttentions,
    config_class=_CONFIG_FOR_DOC,
)
# 从transformers.models.bert.modeling_bert.BertModel.forward中复制过来
# 定义一个方法 forward,用于模型的前向传播
def forward(
    self,
    input_ids: Optional[torch.Tensor] = None,  # 输入的 token IDs,类型为可选的 PyTorch Tensor
    attention_mask: Optional[torch.Tensor] = None,  # 注意力掩码,用于指示哪些元素是 padding,类型为可选的 PyTorch Tensor
    token_type_ids: Optional[torch.Tensor] = None,  # token 类型 IDs,如用于区分两个句子,类型为可选的 PyTorch Tensor
    position_ids: Optional[torch.Tensor] = None,  # 位置 IDs,标识输入 tokens 的位置信息,类型为可选的 PyTorch Tensor
    head_mask: Optional[torch.Tensor] = None,  # 头部掩码,用于指定每个注意力头是否执行,类型为可选的 PyTorch Tensor
    inputs_embeds: Optional[torch.Tensor] = None,  # 输入的嵌入表示,代替 input_ids 的嵌入输入,类型为可选的 PyTorch Tensor
    encoder_hidden_states: Optional[torch.Tensor] = None,  # 编码器的隐藏状态,用于传递给注意力层,类型为可选的 PyTorch Tensor
    encoder_attention_mask: Optional[torch.Tensor] = None,  # 编码器的注意力掩码,用于指示哪些编码器隐藏状态应该被忽略,类型为可选的 PyTorch Tensor
    past_key_values: Optional[List[torch.FloatTensor]] = None,  # 过去的键值对,用于在解码器自回归生成时保存历史状态,类型为可选的列表,元素为 PyTorch Tensor
    use_cache: Optional[bool] = None,  # 是否使用缓存加速解码器自回归生成,类型为可选的布尔值
    output_attentions: Optional[bool] = None,  # 是否输出注意力权重,类型为可选的布尔值
    output_hidden_states: Optional[bool] = None,  # 是否输出所有隐藏状态,类型为可选的布尔值
    return_dict: Optional[bool] = None,  # 是否以字典形式返回输出,类型为可选的布尔值

@add_start_docstrings(
"XLM-RoBERTa Model with a language modeling head on top for CLM fine-tuning.",
XLM_ROBERTA_START_DOCSTRING,
)

# Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA
class XLMRobertaForCausalLM(XLMRobertaPreTrainedModel):
    _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

def __init__(self, config):
    super().__init__(config)

    # 如果配置不是解码器,则发出警告
    if not config.is_decoder:
        logger.warning("If you want to use `XLMRobertaLMHeadModel` as a standalone, add `is_decoder=True.`")

    # 初始化 XLM-RoBERTa 模型和语言建模头部
    self.roberta = XLMRobertaModel(config, add_pooling_layer=False)
    self.lm_head = XLMRobertaLMHead(config)

    # 初始化权重并进行最终处理
    self.post_init()

def get_output_embeddings(self):
    # 返回语言建模头部的解码器权重
    return self.lm_head.decoder

def set_output_embeddings(self, new_embeddings):
    # 设置语言建模头部的解码器权重为新的嵌入
    self.lm_head.decoder = new_embeddings

@add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
def forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.FloatTensor] = None,
    token_type_ids: Optional[torch.LongTensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    head_mask: Optional[torch.FloatTensor] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    encoder_hidden_states: Optional[torch.FloatTensor] = None,
    encoder_attention_mask: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    past_key_values: Tuple[Tuple[torch.FloatTensor]] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
# 准备生成过程中的输入,接受输入的ID,过去的键值(用于存储中间状态),注意力掩码以及模型关键字参数
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
    # 获取输入ID的形状信息
    input_shape = input_ids.shape
    # 如果未提供注意力掩码,则创建一个全为1的张量,形状与输入ID相同
    if attention_mask is None:
        attention_mask = input_ids.new_ones(input_shape)

    # 如果提供了过去的键值(用于缓存中间状态)
    if past_key_values is not None:
        # 获取过去键值中每层的长度(通常对应解码器状态)
        past_length = past_key_values[0][0].shape[2]

        # 如果输入ID的长度大于过去的长度,说明需要裁剪输入ID
        if input_ids.shape[1] > past_length:
            remove_prefix_length = past_length  # 裁剪长度设为过去长度
        else:
            # 否则,默认行为:仅保留最后一个ID
            remove_prefix_length = input_ids.shape[1] - 1

        # 从输入ID中裁剪掉指定长度的前缀部分
        input_ids = input_ids[:, remove_prefix_length:]

    # 返回一个包含输入ID、注意力掩码和过去键值的字典
    return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}

# 重新排序缓存数据,根据给定的beam索引
def _reorder_cache(self, past_key_values, beam_idx):
    # 创建一个空的元组用于存储重新排序后的过去键值
    reordered_past = ()
    # 遍历每层的过去键值
    for layer_past in past_key_values:
        # 对每个过去状态根据beam索引进行重新排序,并添加到重新排序后的过去状态元组中
        reordered_past += (
            tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
        )
    # 返回重新排序后的过去状态元组
    return reordered_past
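
The cropping in `prepare_inputs_for_generation` is what makes cached generation incremental: once `past_key_values` covers the first `past_length` tokens, only the tokens the cache has not seen yet are fed back into the model. A standalone walkthrough with made-up ids:

import torch

input_ids = torch.tensor([[0, 31414, 232, 2]])    # 4 tokens so far
past_length = 3                                    # keys/values cached for the first 3 tokens

if input_ids.shape[1] > past_length:
    remove_prefix_length = past_length             # keep only the not-yet-cached suffix
else:
    remove_prefix_length = input_ids.shape[1] - 1  # default: keep only the last token

print(input_ids[:, remove_prefix_length:])         # tensor([[2]])
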

# Add a docstring describing the XLM-RoBERTa model with a language modeling head on top
@add_start_docstrings(
    """XLM-RoBERTa Model with a language modeling head on top.""",
    XLM_ROBERTA_START_DOCSTRING,
)

# Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA
class XLMRobertaForMaskedLM(XLMRobertaPreTrainedModel):
    _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

# 初始化函数,接受一个配置对象 config
def __init__(self, config):
    super().__init__(config)

    # 如果配置中设置为 decoder,则发出警告信息,建议设置为 bi-directional self-attention
    if config.is_decoder:
        logger.warning(
            "If you want to use `XLMRobertaForMaskedLM` make sure `config.is_decoder=False` for "
            "bi-directional self-attention."
        )

    # 创建 XLM-RoBERTa 模型,设置不添加 pooling 层
    self.roberta = XLMRobertaModel(config, add_pooling_layer=False)
    # 创建 XLM-RoBERTa 的语言建模头部
    self.lm_head = XLMRobertaLMHead(config)

    # 初始化权重并应用最终处理
    self.post_init()

# 获取输出嵌入的函数
def get_output_embeddings(self):
    return self.lm_head.decoder

# 设置输出嵌入的函数
def set_output_embeddings(self, new_embeddings):
    self.lm_head.decoder = new_embeddings

# 前向传播函数,接受多个输入参数并返回输出结果
@add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
    checkpoint=_CHECKPOINT_FOR_DOC,
    output_type=MaskedLMOutput,
    config_class=_CONFIG_FOR_DOC,
    mask="<mask>",
    expected_output="' Paris'",
    expected_loss=0.1,
)
def forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.FloatTensor] = None,
    token_type_ids: Optional[torch.LongTensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    head_mask: Optional[torch.FloatTensor] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    encoder_hidden_states: Optional[torch.FloatTensor] = None,
    encoder_attention_mask: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
    r"""
    labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
        config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
        loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
    kwargs (`Dict[str, any]`, optional, defaults to *{}*):
        Used to hide legacy arguments that have been deprecated.
    """
    # 根据是否有 return_dict 参数决定是否返回字典格式的输出
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    # 使用 RoBERTa 模型进行前向传播
    outputs = self.roberta(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        encoder_hidden_states=encoder_hidden_states,
        encoder_attention_mask=encoder_attention_mask,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    # 获取 RoBERTa 输出的序列输出
    sequence_output = outputs[0]
    # 使用语言模型头部对序列输出进行预测
    prediction_scores = self.lm_head(sequence_output)

    masked_lm_loss = None
    if labels is not None:
        # 将标签移到正确的设备以支持模型并行计算
        labels = labels.to(prediction_scores.device)
        # 定义交叉熵损失函数
        loss_fct = CrossEntropyLoss()
        # 计算掩码语言模型损失
        masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

    if not return_dict:
        # 如果不返回字典格式的输出,则返回额外的输出参数
        output = (prediction_scores,) + outputs[2:]
        return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

    # 返回 MaskedLMOutput 对象,包含损失、预测结果、隐藏状态和注意力权重
    return MaskedLMOutput(
        loss=masked_lm_loss,
        logits=prediction_scores,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )
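
The `-100` convention in the `labels` docstring works because `CrossEntropyLoss` ignores index `-100` by default, so only the genuinely masked positions contribute to the MLM loss. A tiny standalone check (random scores, made-up label layout):

import torch
from torch.nn import CrossEntropyLoss

vocab_size = 10
prediction_scores = torch.randn(1, 4, vocab_size)   # (batch, seq_len, vocab)
labels = torch.tensor([[-100, 7, -100, -100]])       # only position 1 is a masked token

loss_fct = CrossEntropyLoss()                         # ignore_index defaults to -100
loss = loss_fct(prediction_scores.view(-1, vocab_size), labels.view(-1))
print(loss)                                           # computed from position 1 only
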

# Copied from transformers.models.roberta.modeling_roberta.RobertaLMHead
class XLMRobertaLMHead(nn.Module):
    """Roberta Head for masked language modeling."""

def __init__(self, config):
    super().__init__()
    # Linear layer for transforming hidden states to vocab size
    self.dense = nn.Linear(config.hidden_size, config.hidden_size)
    # Layer normalization for stabilizing learning
    self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    # Linear layer for decoding to vocabulary size
    self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
    # Bias parameter for the decoder
    self.bias = nn.Parameter(torch.zeros(config.vocab_size))
    self.decoder.bias = self.bias

def forward(self, features, **kwargs):
    # Project features to hidden size using dense layer
    x = self.dense(features)
    # Apply GELU activation function
    x = gelu(x)
    # Normalize using layer normalization
    x = self.layer_norm(x)

    # Project back to size of vocabulary with bias
    x = self.decoder(x)

    return x

def _tie_weights(self):
    # Tie weights to prevent disconnection (TPU or bias resizing)
    if self.decoder.bias.device.type == "meta":
        self.decoder.bias = self.bias
    else:
        self.bias = self.decoder.bias

@add_start_docstrings(
"""
XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
pooled output) e.g. for GLUE tasks.
""",
XLM_ROBERTA_START_DOCSTRING,
)

# Copied from transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA
class XLMRobertaForSequenceClassification(XLMRobertaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        # Number of labels in the classification task
        self.num_labels = config.num_labels
        self.config = config

        # XLM-RoBERTa model without pooling layer
        self.roberta = XLMRobertaModel(config, add_pooling_layer=False)
        # Classification head for XLM-RoBERTa
        self.classifier = XLMRobertaClassificationHead(config)

        # Initialize weights and apply final processing
        self.post_init()

@add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
    checkpoint="cardiffnlp/twitter-roberta-base-emotion",
    output_type=SequenceClassifierOutput,
    config_class=_CONFIG_FOR_DOC,
    expected_output="'optimism'",
    expected_loss=0.08,
)
def forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.FloatTensor] = None,
    token_type_ids: Optional[torch.LongTensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    head_mask: Optional[torch.FloatTensor] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
    r"""
    labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
        Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
        config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
        `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
    """
    # 确定是否使用返回字典,如果未指定则使用配置中的默认设置
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    # 使用 RoBERTa 模型进行前向传播
    outputs = self.roberta(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    # 获取模型输出的序列表示
    sequence_output = outputs[0]
    # 将序列表示传递给分类器以获取 logits
    logits = self.classifier(sequence_output)

    # 初始化损失为 None
    loss = None
    if labels is not None:
        # 将标签移到正确的设备上,以便启用模型的并行计算
        labels = labels.to(logits.device)
        # 如果未指定问题类型,则根据标签类型和数量确定问题类型
        if self.config.problem_type is None:
            if self.num_labels == 1:
                self.config.problem_type = "regression"
            elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                self.config.problem_type = "single_label_classification"
            else:
                self.config.problem_type = "multi_label_classification"

        # 根据问题类型计算损失
        if self.config.problem_type == "regression":
            loss_fct = MSELoss()
            if self.num_labels == 1:
                loss = loss_fct(logits.squeeze(), labels.squeeze())
            else:
                loss = loss_fct(logits, labels)
        elif self.config.problem_type == "single_label_classification":
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        elif self.config.problem_type == "multi_label_classification":
            loss_fct = BCEWithLogitsLoss()
            loss = loss_fct(logits, labels)

    # 如果不需要返回字典,则返回分类器的输出
    if not return_dict:
        output = (logits,) + outputs[2:]  # 保留 logits 和额外的输出(如隐藏状态)
        return ((loss,) + output) if loss is not None else output

    # 如果需要返回字典,则返回 SequenceClassifierOutput 对象
    return SequenceClassifierOutput(
        loss=loss,
        logits=logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )
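The `problem_type` dispatch above can be exercised in isolation; the following sketch uses dummy tensors (all names and sizes are illustrative):

```python
import torch
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

num_labels = 3
logits = torch.randn(4, num_labels)

# single_label_classification: integer class indices -> cross-entropy
labels = torch.tensor([0, 2, 1, 2])
print(CrossEntropyLoss()(logits.view(-1, num_labels), labels.view(-1)))

# regression (num_labels == 1): mean-squared error on squeezed logits
reg_logits, reg_labels = torch.randn(4, 1), torch.randn(4)
print(MSELoss()(reg_logits.squeeze(), reg_labels.squeeze()))

# multi_label_classification: float multi-hot targets -> BCE with logits
multi_hot = torch.tensor([[1.0, 0.0, 1.0]] * 4)
print(BCEWithLogitsLoss()(logits, multi_hot))
```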

@add_start_docstrings(
    """
    XLM-RoBERTa Model with a multiple choice classification head on top (a linear layer on top of the pooled output and
    a softmax) e.g. for RocStories/SWAG tasks.
    """,
    XLM_ROBERTA_START_DOCSTRING,
)

# Copied from transformers.models.roberta.modeling_roberta.RobertaForMultipleChoice with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA

class XLMRobertaForMultipleChoice(XLMRobertaPreTrainedModel):
def __init__(self, config):
    super().__init__(config)

    # Initialize XLM-RoBERTa model based on provided configuration
    self.roberta = XLMRobertaModel(config)
    # Dropout layer with dropout probability from configuration
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    # Linear layer for classification with input size as hidden size from configuration, output size 1
    self.classifier = nn.Linear(config.hidden_size, 1)

    # Initialize weights and perform final processing after model setup
    self.post_init()

@add_start_docstrings_to_model_forward(
    XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
@add_code_sample_docstrings(
    checkpoint=_CHECKPOINT_FOR_DOC,
    output_type=MultipleChoiceModelOutput,
    config_class=_CONFIG_FOR_DOC,
)
# Forward method for XLM-RoBERTa multiple choice model
def forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    token_type_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    head_mask: Optional[torch.FloatTensor] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
    r"""
    labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
        Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
        num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
        `input_ids` above)
    """
    # 根据 return_dict 参数决定是否使用返回字典
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
    # 计算 num_choices,如果 input_ids 存在则取其第二维的大小,否则取 inputs_embeds 的第二维大小
    num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

    # 将输入数据展平,以便于传入模型
    flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
    flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
    flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
    flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
    flat_inputs_embeds = (
        inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
        if inputs_embeds is not None
        else None
    )

    # 将展平后的输入传入 RoBERTa 模型
    outputs = self.roberta(
        flat_input_ids,
        position_ids=flat_position_ids,
        token_type_ids=flat_token_type_ids,
        attention_mask=flat_attention_mask,
        head_mask=head_mask,
        inputs_embeds=flat_inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    # 获取池化后的输出
    pooled_output = outputs[1]

    # 应用 dropout 层
    pooled_output = self.dropout(pooled_output)
    # 通过分类器获取 logits
    logits = self.classifier(pooled_output)
    # 将 logits 重塑为 (batch_size, num_choices)
    reshaped_logits = logits.view(-1, num_choices)

    loss = None
    if labels is not None:
        # 将 labels 移动到正确的设备上以支持模型并行处理
        labels = labels.to(reshaped_logits.device)
        # 定义交叉熵损失函数
        loss_fct = CrossEntropyLoss()
        # 计算交叉熵损失
        loss = loss_fct(reshaped_logits, labels)

    if not return_dict:
        # 如果不使用返回字典,则返回一个元组
        output = (reshaped_logits,) + outputs[2:]
        return ((loss,) + output) if loss is not None else output

    # 如果使用返回字典,则返回 MultipleChoiceModelOutput 对象
    return MultipleChoiceModelOutput(
        loss=loss,
        logits=reshaped_logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )
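The key trick above is folding the choice dimension into the batch before the encoder and unfolding it again for the softmax over choices; a minimal sketch with made-up sizes:

```python
import torch

batch_size, num_choices, seq_len = 2, 4, 6
input_ids = torch.randint(5, 100, (batch_size, num_choices, seq_len))

flat_input_ids = input_ids.view(-1, input_ids.size(-1))   # (8, 6) fed to the encoder
logits = torch.randn(batch_size * num_choices, 1)          # per-choice score from the classifier
reshaped_logits = logits.view(-1, num_choices)              # (2, 4) -> cross-entropy over choices
print(flat_input_ids.shape, reshaped_logits.shape)
```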

"""
XLM-RoBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
for Named-Entity-Recognition (NER) tasks.

This class inherits from XLMRobertaPreTrainedModel and adds a token classification layer on top of XLM-RoBERTa.
"""
@add_start_docstrings(
"""
XLM-RoBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
for Named-Entity-Recognition (NER) tasks.
""",
XLM_ROBERTA_START_DOCSTRING,
)

# Copied from transformers.models.roberta.modeling_roberta.RobertaForTokenClassification with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA

class XLMRobertaForTokenClassification(XLMRobertaPreTrainedModel):
def __init__(self, config):
    """
    Initializes the XLMRobertaForTokenClassification model.

    Args:
        config (XLMRobertaConfig): Configuration class instance defining the model architecture and parameters.
    """
    super().__init__(config)
    # Number of output labels for token classification
    self.num_labels = config.num_labels

    # XLM-RoBERTa model without pooling layer
    self.roberta = XLMRobertaModel(config, add_pooling_layer=False)
    
    # Dropout layer with specified dropout rate
    classifier_dropout = (
        config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
    )
    self.dropout = nn.Dropout(classifier_dropout)
    
    # Linear layer for token classification
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    # Initialize weights and perform final model setup
    self.post_init()

@add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
    checkpoint="Jean-Baptiste/roberta-large-ner-english",
    output_type=TokenClassifierOutput,
    config_class=_CONFIG_FOR_DOC,
    expected_output="['O', 'ORG', 'ORG', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'LOC', 'LOC']",
    expected_loss=0.01,
)
def forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.FloatTensor] = None,
    token_type_ids: Optional[torch.LongTensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    head_mask: Optional[torch.FloatTensor] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
    r"""
    labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
    """
    # 确定是否返回字典类型的输出
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    # 使用 RoBERTa 模型进行前向传播
    outputs = self.roberta(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    # 获取模型输出的序列特征向量
    sequence_output = outputs[0]

    # 对序列特征向量应用 dropout 操作
    sequence_output = self.dropout(sequence_output)
    
    # 将处理后的特征向量输入分类器,得到分类 logits
    logits = self.classifier(sequence_output)

    # 初始化损失值为 None
    loss = None
    # 如果提供了标签,则计算交叉熵损失
    if labels is not None:
        # 将标签移到正确的设备以实现模型并行计算
        labels = labels.to(logits.device)
        loss_fct = CrossEntropyLoss()
        # 计算交叉熵损失
        loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

    # 如果不返回字典类型的输出,则重新构造输出元组
    if not return_dict:
        output = (logits,) + outputs[2:]
        return ((loss,) + output) if loss is not None else output

    # 返回 TokenClassifierOutput 对象,包括损失、logits、隐藏状态和注意力权重
    return TokenClassifierOutput(
        loss=loss,
        logits=logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )
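In practice, labels for padding and special tokens are usually set to `-100` so that `CrossEntropyLoss` skips them; a short sketch:

```python
import torch
from torch.nn import CrossEntropyLoss

num_labels = 5
logits = torch.randn(2, 4, num_labels)      # (batch, seq_len, num_labels)
labels = torch.tensor([[1, 2, -100, -100],  # -100 on padding / special tokens
                       [0, -100, 3, 4]])
print(CrossEntropyLoss()(logits.view(-1, num_labels), labels.view(-1)))
```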

# Copied from transformers.models.roberta.modeling_roberta.RobertaClassificationHead with Roberta->XLMRoberta

class XLMRobertaClassificationHead(nn.Module):
"""用于句子级分类任务的头部。"""

def __init__(self, config):
    super().__init__()
    self.dense = nn.Linear(config.hidden_size, config.hidden_size)  # 定义一个全连接层
    # 根据配置选择分类器的 dropout,如果未指定则使用隐藏层的 dropout
    classifier_dropout = (
        config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
    )
    self.dropout = nn.Dropout(classifier_dropout)  # 定义一个 dropout 层
    self.out_proj = nn.Linear(config.hidden_size, config.num_labels)  # 输出层,映射到标签数量的维度

def forward(self, features, **kwargs):
    x = features[:, 0, :]  # 取 <s> 标记(等同于 [CLS])
    x = self.dropout(x)  # 应用 dropout
    x = self.dense(x)  # 应用全连接层
    x = torch.tanh(x)  # 应用 tanh 激活函数
    x = self.dropout(x)  # 再次应用 dropout
    x = self.out_proj(x)  # 应用输出投影层
    return x
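Assuming the class above and `torch`/`torch.nn` are in scope, a toy config (built with `SimpleNamespace`, purely for illustration) shows that only the `<s>` token representation feeds the classifier:

```python
import torch
from types import SimpleNamespace

cfg = SimpleNamespace(hidden_size=8, num_labels=3, classifier_dropout=None, hidden_dropout_prob=0.1)
head = XLMRobertaClassificationHead(cfg)

hidden_states = torch.randn(2, 6, cfg.hidden_size)  # encoder output (batch, seq_len, hidden)
logits = head(hidden_states)                         # uses hidden_states[:, 0, :] internally
print(logits.shape)                                  # torch.Size([2, 3])
```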

@add_start_docstrings(
    """
    XLM-RoBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
    linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    XLM_ROBERTA_START_DOCSTRING,
)

# Copied from transformers.models.roberta.modeling_roberta.RobertaForQuestionAnswering with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA

class XLMRobertaForQuestionAnswering(XLMRobertaPreTrainedModel):
def __init__(self, config):
    super().__init__(config)
    self.num_labels = config.num_labels

    self.roberta = XLMRobertaModel(config, add_pooling_layer=False)  # XLM-RoBERTa 模型,不包含池化层
    self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)  # 线性层,映射到标签数量的维度

    # 初始化权重并应用最终处理
    self.post_init()

@add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
    checkpoint="deepset/roberta-base-squad2",
    output_type=QuestionAnsweringModelOutput,
    config_class=_CONFIG_FOR_DOC,
    expected_output="' puppet'",
    expected_loss=0.86,
)
def forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.FloatTensor] = None,
    token_type_ids: Optional[torch.LongTensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    head_mask: Optional[torch.FloatTensor] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    start_positions: Optional[torch.LongTensor] = None,
    end_positions: Optional[torch.LongTensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
    r"""
    start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
        Labels for position (index) of the start of the labelled span for computing the token classification loss.
        Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
        are not taken into account for computing the loss.
    end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
        Labels for position (index) of the end of the labelled span for computing the token classification loss.
        Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
        are not taken into account for computing the loss.
    """
    # Determine whether to use the provided return_dict or default to the model's configuration
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    # Pass inputs through the RoBERTa model and retrieve outputs
    outputs = self.roberta(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    # Extract the sequence output from RoBERTa's output
    sequence_output = outputs[0]

    # Compute logits for the Question Answering task
    logits = self.qa_outputs(sequence_output)
    
    # Split logits into start and end logits
    start_logits, end_logits = logits.split(1, dim=-1)
    
    # Squeeze unnecessary dimensions from logits
    start_logits = start_logits.squeeze(-1).contiguous()
    end_logits = end_logits.squeeze(-1).contiguous()

    total_loss = None
    if start_positions is not None and end_positions is not None:
        # Ensure start_positions and end_positions are properly shaped
        if len(start_positions.size()) > 1:
            start_positions = start_positions.squeeze(-1)
        if len(end_positions.size()) > 1:
            end_positions = end_positions.squeeze(-1)
        
        # Define the ignored index and clamp positions within valid range
        ignored_index = start_logits.size(1)
        start_positions = start_positions.clamp(0, ignored_index)
        end_positions = end_positions.clamp(0, ignored_index)

        # Compute CrossEntropyLoss for start and end positions
        loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
        start_loss = loss_fct(start_logits, start_positions)
        end_loss = loss_fct(end_logits, end_positions)
        total_loss = (start_loss + end_loss) / 2

    # If return_dict is False, return a tuple containing the loss and other outputs
    if not return_dict:
        output = (start_logits, end_logits) + outputs[2:]
        return ((total_loss,) + output) if total_loss is not None else output

    # If return_dict is True, return a QuestionAnsweringModelOutput object
    return QuestionAnsweringModelOutput(
        loss=total_loss,
        start_logits=start_logits,
        end_logits=end_logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )
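The clamping logic above makes out-of-range answer positions harmless: they are clamped to `sequence_length`, which is also the `ignore_index` of the loss. A standalone sketch:

```python
import torch
from torch.nn import CrossEntropyLoss

seq_len = 8
start_logits, end_logits = torch.randn(2, seq_len), torch.randn(2, seq_len)
start_positions = torch.tensor([1, 12])     # 12 is deliberately out of range
end_positions = torch.tensor([3, 15])

ignored_index = start_logits.size(1)                        # 8
start_positions = start_positions.clamp(0, ignored_index)   # [1, 8]
end_positions = end_positions.clamp(0, ignored_index)       # [3, 8]

loss_fct = CrossEntropyLoss(ignore_index=ignored_index)     # class index 8 contributes no loss
total_loss = (loss_fct(start_logits, start_positions) + loss_fct(end_logits, end_positions)) / 2
print(total_loss)
```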

def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1; padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.
    """
    # Boolean mask marking non-padding positions as 1 and padding positions as 0
    mask = input_ids.ne(padding_idx).int()
    # Cumulative indices over non-padding positions; past_key_values_length accounts for cached keys/values
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    # Shift by padding_idx so position numbers start at padding_idx + 1
    return incremental_indices.long() + padding_idx
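A quick check of the function on a padded batch (token ids are made up; `1` is the padding index, as in the XLM-RoBERTa vocabulary):

```python
import torch

input_ids = torch.tensor([[0, 31414, 232, 2, 1, 1],
                          [0, 2345, 2, 1, 1, 1]])
print(create_position_ids_from_input_ids(input_ids, padding_idx=1))
# tensor([[2, 3, 4, 5, 1, 1],
#         [2, 3, 4, 1, 1, 1]])
```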


.\models\xlm_roberta\tokenization_xlm_roberta.py

# coding=utf-8
# 指定文件编码为 UTF-8

# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
# 版权声明,列出了部分作者和团队

# Licensed under the Apache License, Version 2.0 (the "License");
# 遵循 Apache 2.0 许可证

# you may not use this file except in compliance with the License.
# 除非符合许可证要求,否则不得使用此文件。

# You may obtain a copy of the License at
# 可以从上述链接获取许可证的副本

#     http://www.apache.org/licenses/LICENSE-2.0
#     许可证详细信息网址

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 除非法律要求或书面同意,否则按原样提供分发,无论是明示还是暗示的任何保证或条件。

# See the License for the specific language governing permissions and
# limitations under the License
# 请参阅许可证了解具体的权限和限制。

""" Tokenization classes for XLM-RoBERTa model."""
# XLM-RoBERTa 模型的分词类定义

import os
# 导入操作系统相关模块
from shutil import copyfile
# 导入文件复制功能模块
from typing import Any, Dict, List, Optional, Tuple
# 导入类型提示相关模块

import sentencepiece as spm
# 导入 sentencepiece 库

from ...tokenization_utils import AddedToken, PreTrainedTokenizer
# 从 tokenization_utils 中导入 AddedToken 和 PreTrainedTokenizer 类
from ...utils import logging
# 从 utils 中导入 logging 模块

logger = logging.get_logger(__name__)
# 获取当前模块的日志记录器

SPIECE_UNDERLINE = "▁"
# 定义特殊标记 "▁"

VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
# 词汇表文件名字典,包含一个键值对

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "FacebookAI/xlm-roberta-base": "https://huggingface.co/FacebookAI/xlm-roberta-base/resolve/main/sentencepiece.bpe.model",
        "FacebookAI/xlm-roberta-large": "https://huggingface.co/FacebookAI/xlm-roberta-large/resolve/main/sentencepiece.bpe.model",
        "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch": (
            "https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll02-dutch/resolve/main/sentencepiece.bpe.model"
        ),
        "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish": (
            "https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll02-spanish/resolve/main/sentencepiece.bpe.model"
        ),
        "FacebookAI/xlm-roberta-large-finetuned-conll03-english": (
            "https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll03-english/resolve/main/sentencepiece.bpe.model"
        ),
        "FacebookAI/xlm-roberta-large-finetuned-conll03-german": (
            "https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll03-german/resolve/main/sentencepiece.bpe.model"
        ),
    }
}
# 预训练模型的词汇文件映射字典,包含多个模型名到 URL 的映射

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "FacebookAI/xlm-roberta-base": 512,
    "FacebookAI/xlm-roberta-large": 512,
    "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch": 512,
    "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish": 512,
    "FacebookAI/xlm-roberta-large-finetuned-conll03-english": 512,
    "FacebookAI/xlm-roberta-large-finetuned-conll03-german": 512,
}
# 预训练模型的位置嵌入大小字典,包含多个模型名到嵌入大小的映射

class XLMRobertaTokenizer(PreTrainedTokenizer):
    """
    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
    [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.
    """
    # XLM-RoBERTa 分词器类继承自 PreTrainedTokenizer 类
    # 定义函数,用于加载词汇表文件,并配置特殊标记的默认值
    def __init__(
        vocab_file: str,
        bos_token: str = "<s>",   # 开始序列的特殊标记,默认为 "<s>"
        eos_token: str = "</s>",   # 结束序列的特殊标记,默认为 "</s>"
        sep_token: str = "</s>",   # 分隔符的特殊标记,默认为 "</s>"
        cls_token: str = "<s>",    # 分类器标记,在使用特殊标记构建序列时为序列的第一个标记,默认为 "<s>"
        unk_token: str = "<unk>",  # 未知标记,当词汇表中不存在某个词时使用,默认为 "<unk>"
        pad_token: str = "<pad>",  # 填充标记,用于处理不同长度的序列,默认为 "<pad>"
        mask_token: str = "<mask>",# 掩码标记,用于掩盖值,模型训练中会预测该标记,默认为 "<mask>"
        sp_model_kwargs: dict = None # 传递给 SentencePieceProcessor.__init__() 方法的参数,用于配置 SentencePiece 模型
    ):
        """
        初始化函数,用于配置特殊标记的默认值和加载词汇表文件。
    
        Args:
            vocab_file (`str`): 词汇表文件的路径。
            bos_token (`str`, *optional*, defaults to `"<s>"`): 预训练期间使用的序列开始标记,也可用作序列分类器标记。
            eos_token (`str`, *optional*, defaults to `"</s>"`): 序列结束标记。
            sep_token (`str`, *optional*, defaults to `"</s>"`): 分隔符标记,用于构建多序列或特殊标记序列的最后一个标记。
            cls_token (`str`, *optional*, defaults to `"<s>"`): 序列分类时的分类器标记,用于整体序列分类。
            unk_token (`str`, *optional*, defaults to `"<unk>"`): 未知标记,用于词汇表中不存在的词。
            pad_token (`str`, *optional*, defaults to `"<pad>"`): 填充标记,用于处理不同长度序列的填充。
            mask_token (`str`, *optional*, defaults to `"<mask>"`): 掩码标记,模型预测时使用的标记。
            sp_model_kwargs (`dict`, *optional*): 将传递给 `SentencePieceProcessor.__init__()` 方法的参数。
        """
        pass
    Attributes:
        sp_model (`SentencePieceProcessor`):
            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> None:
        # Mask token behave like a normal word, i.e. include the space before it
        # 如果 mask_token 是字符串,则设置为 AddedToken 对象,它会去除左侧空格并被视为特殊标记
        mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token

        # 如果 sp_model_kwargs 为 None,则设为空字典,否则使用提供的参数
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        # 使用 SentencePieceProcessor 初始化 sp_model 对象,并加载给定的 vocab_file
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(str(vocab_file))
        self.vocab_file = vocab_file

        # 确保 fairseq 和 spm 的词汇表对齐,以便进行 token-to-id 映射
        self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}

        # fairseq 的偏移量,用于调整 fairseq 和 spm 的 token-to-id 映射关系
        self.fairseq_offset = 1

        # 添加 <mask> 到 token-to-id 映射中,使用 fairseq 的偏移量和 spm 的词汇表长度
        self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + self.fairseq_offset
        # 创建 fairseq 的 id-to-token 映射
        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}

        # 调用父类的初始化方法,传递所有参数
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            sp_model_kwargs=self.sp_model_kwargs,
            **kwargs,
        )

    def __getstate__(self):
        # 创建对象状态的拷贝
        state = self.__dict__.copy()
        # 将 sp_model 设为 None,以防止序列化时存储 sp_model 对象
        state["sp_model"] = None
        # 存储 sp_model 的序列化模型 proto
        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
        return state

    def __setstate__(self, d):
        # 恢复对象状态
        self.__dict__ = d

        # 兼容旧版本的处理
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        # 重新创建 sp_model 对象,并从序列化的 proto 中加载模型
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. An XLM-RoBERTa sequence has the following format:

        - single sequence: `<s> X </s>`
        - pair of sequences: `<s> A </s></s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """

        # If only one sequence is provided, concatenate with special tokens <s> and </s>
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        
        # Define special tokens for beginning (CLS) and separation (SEP)
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        
        # For a pair of sequences, concatenate with appropriate special tokens
        return cls + token_ids_0 + sep + sep + token_ids_1 + sep


    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """

        # If the tokens already include special tokens, delegate to superclass method
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )
        
        # Initialize the mask with special token (1) for CLS token
        special_tokens_mask = [1]
        
        # Append sequence token (0) for each token in token_ids_0
        special_tokens_mask += [0] * len(token_ids_0)
        
        # Append special token (1) for SEP token
        special_tokens_mask += [1]
        
        # If token_ids_1 exists, append special tokens for separation and tokens in token_ids_1
        if token_ids_1 is not None:
            special_tokens_mask += [1, 1] + [0] * len(token_ids_1)
            special_tokens_mask += [1]
        
        return special_tokens_mask
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does
        not make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.

        """

        # Define the separator token
        sep = [self.sep_token_id]
        # Define the classification token
        cls = [self.cls_token_id]

        # If only one sequence is provided (no token_ids_1), return the mask length based on token_ids_0
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        
        # If two sequences are provided, return the mask length based on both sequences concatenated
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

    @property
    def vocab_size(self):
        # Calculate and return the vocabulary size including an offset for additional tokens (like <mask>)
        return len(self.sp_model) + self.fairseq_offset + 1  # Add the <mask> token

    def get_vocab(self):
        # Generate a dictionary mapping tokens to their corresponding IDs in the vocabulary
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        # Update the vocabulary with any additional tokens not in the SentencePiece model
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text: str) -> List[str]:
        # Tokenize input text into a list of subword strings using SentencePiece model
        # TODO check if the t5/llama PR also applies here
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) into an ID using the vocabulary."""
        # Check if the token exists in the fairseq vocabulary mappings
        if token in self.fairseq_tokens_to_ids:
            return self.fairseq_tokens_to_ids[token]
        # If not found, convert using the SentencePiece model
        spm_id = self.sp_model.PieceToId(token)

        # Return an unknown token ID if SentencePiece model returns 0
        return spm_id + self.fairseq_offset if spm_id else self.unk_token_id

    def _convert_id_to_token(self, index):
        """Converts an ID (integer) into a token (str) using the vocabulary."""
        # Check if the ID exists in fairseq mappings
        if index in self.fairseq_ids_to_tokens:
            return self.fairseq_ids_to_tokens[index]
        # Convert ID to a token using the SentencePiece model, adjusting for fairseq offset
        return self.sp_model.IdToPiece(index - self.fairseq_offset)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) into a single string."""
        # Concatenate tokens into a string, replacing SPIECE_UNDERLINE with spaces and stripping extra spaces
        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
        return out_string

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        # Ensure save_directory exists and is a directory; otherwise, log an error
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        
        # Define the output vocabulary file path
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        # If the current vocabulary file differs from the output path and exists, copy it
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        # If the current vocabulary file doesn't exist, write the serialized SentencePiece model to the output file
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        # Return the path to the saved vocabulary file
        return (out_vocab_file,)
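To make the fairseq/SentencePiece id alignment and the special-token layout above concrete, here is a self-contained sketch with a toy stand-in for the SentencePiece model (all ids are illustrative):

```python
fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}
fairseq_offset = 1  # spm piece ids are shifted by 1 to make room for the fairseq specials

def spm_piece_to_id(piece):
    # Stand-in for sp_model.PieceToId(); SentencePiece returns 0 for unknown pieces.
    toy_vocab = {"▁Hello": 10, "▁world": 11}
    return toy_vocab.get(piece, 0)

def convert_token_to_id(token):
    if token in fairseq_tokens_to_ids:
        return fairseq_tokens_to_ids[token]
    spm_id = spm_piece_to_id(token)
    return spm_id + fairseq_offset if spm_id else fairseq_tokens_to_ids["<unk>"]

print(convert_token_to_id("<s>"))       # 0  (fairseq special token)
print(convert_token_to_id("▁Hello"))    # 11 (spm id 10 + fairseq_offset)
print(convert_token_to_id("▁missing"))  # 3  (<unk>)

# Special-token layout produced by build_inputs_with_special_tokens:
cls_id, sep_id = 0, 2
token_ids_0, token_ids_1 = [10, 11], [20, 21]
print([cls_id] + token_ids_0 + [sep_id])                                   # <s> A </s>
print([cls_id] + token_ids_0 + [sep_id, sep_id] + token_ids_1 + [sep_id])  # <s> A </s></s> B </s>
```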

.\models\xlm_roberta\tokenization_xlm_roberta_fast.py

# 设置 Python 文件的编码格式为 UTF-8
# 版权声明和许可证信息
# 本模块提供了 XLM-RoBERTa 模型的分词类

# 导入必要的模块和函数
import os
from shutil import copyfile
from typing import List, Optional, Tuple

# 导入自定义的分词相关工具函数和类
from ...tokenization_utils import AddedToken
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import is_sentencepiece_available, logging

# 如果安装了 sentencepiece 库,则导入 XLMRobertaTokenizer 类
if is_sentencepiece_available():
    from .tokenization_xlm_roberta import XLMRobertaTokenizer
else:
    XLMRobertaTokenizer = None

# 获取日志记录器
logger = logging.get_logger(__name__)

# 定义用于 XLM-RoBERTa 模型的词汇文件名称映射
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}

# 定义预训练模型和对应的词汇文件 URL 映射关系
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "FacebookAI/xlm-roberta-base": "https://huggingface.co/FacebookAI/xlm-roberta-base/resolve/main/sentencepiece.bpe.model",
        "FacebookAI/xlm-roberta-large": "https://huggingface.co/FacebookAI/xlm-roberta-large/resolve/main/sentencepiece.bpe.model",
        "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch": (
            "https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll02-dutch/resolve/main/sentencepiece.bpe.model"
        ),
        "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish": (
            "https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll02-spanish/resolve/main/sentencepiece.bpe.model"
        ),
        "FacebookAI/xlm-roberta-large-finetuned-conll03-english": (
            "https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll03-english/resolve/main/sentencepiece.bpe.model"
        ),
        "FacebookAI/xlm-roberta-large-finetuned-conll03-german": (
            "https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll03-german/resolve/main/sentencepiece.bpe.model"
        ),
    },
    # 定义一个字典,包含多个键值对,每个键是模型名称,对应的值是该模型的 tokenizer.json 文件的 URL
    "tokenizer_file": {
        "FacebookAI/xlm-roberta-base": "https://huggingface.co/FacebookAI/xlm-roberta-base/resolve/main/tokenizer.json",
        "FacebookAI/xlm-roberta-large": "https://huggingface.co/FacebookAI/xlm-roberta-large/resolve/main/tokenizer.json",
        "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch": (
            "https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll02-dutch/resolve/main/tokenizer.json"
        ),
        "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish": (
            "https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll02-spanish/resolve/main/tokenizer.json"
        ),
        "FacebookAI/xlm-roberta-large-finetuned-conll03-english": (
            "https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll03-english/resolve/main/tokenizer.json"
        ),
        "FacebookAI/xlm-roberta-large-finetuned-conll03-german": (
            "https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll03-german/resolve/main/tokenizer.json"
        ),
    },
}

# 定义一个预训练位置嵌入大小的字典,不同的模型对应不同的嵌入大小
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "FacebookAI/xlm-roberta-base": 512,  # XLM-RoBERTa base 模型的位置嵌入大小为 512
    "FacebookAI/xlm-roberta-large": 512,  # XLM-RoBERTa large 模型的位置嵌入大小为 512
    "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch": 512,  # 细调为荷兰语的 XLM-RoBERTa large 模型的位置嵌入大小为 512
    "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish": 512,  # 细调为西班牙语的 XLM-RoBERTa large 模型的位置嵌入大小为 512
    "FacebookAI/xlm-roberta-large-finetuned-conll03-english": 512,  # 细调为英语的 XLM-RoBERTa large 模型的位置嵌入大小为 512
    "FacebookAI/xlm-roberta-large-finetuned-conll03-german": 512,  # 细调为德语的 XLM-RoBERTa large 模型的位置嵌入大小为 512
}


class XLMRobertaTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" XLM-RoBERTa tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from
    [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
    [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.

        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining.
            Can be used a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.

        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.

        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.

        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.

        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.

        additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
            Additional special tokens used by the tokenizer.
    """
    
    # Define constants
    vocab_files_names = VOCAB_FILES_NAMES  # Constant mapping vocabulary file names
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP  # Mapping of pretrained vocabulary files
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES  # Maximum model input sizes
    model_input_names = ["input_ids", "attention_mask"]  # Names of model input tensors
    slow_tokenizer_class = XLMRobertaTokenizer  # Tokenizer class used

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        **kwargs,
    ):
        """
        Constructor for the tokenizer class.

        Args:
            vocab_file (str, optional): Path to the vocabulary file.
            tokenizer_file (str, optional): Path to the tokenizer file.
            bos_token (str, optional): The beginning of sequence token.
            eos_token (str, optional): The end of sequence token.
            sep_token (str, optional): The separator token.
            cls_token (str, optional): The classifier token.
            unk_token (str, optional): The unknown token.
            pad_token (str, optional): The padding token.
            mask_token (str, optional): The masking token.
            **kwargs: Additional keyword arguments.
        """
        # 如果 mask_token 是字符串,创建一个剥离左边空格的 AddedToken 对象
        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

        # 调用父类的初始化方法,传入参数进行初始化
        super().__init__(
            vocab_file,
            tokenizer_file=tokenizer_file,
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            cls_token=cls_token,
            unk_token=unk_token,
            pad_token=pad_token,
            mask_token=mask_token,
            **kwargs,
        )

        # 将 vocab_file 赋值给 self.vocab_file
        self.vocab_file = vocab_file

    @property
    def can_save_slow_tokenizer(self) -> bool:
        # 如果 self.vocab_file 存在且是文件,则返回 True,否则返回 False
        return os.path.isfile(self.vocab_file) if self.vocab_file else False

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        通过连接和添加特殊 token,为序列分类任务构建模型输入。XLM-RoBERTa 序列的格式如下:

        - 单个序列: `<s> X </s>`
        - 序列对: `<s> A </s></s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                要添加特殊 token 的 ID 列表。
            token_ids_1 (`List[int]`, *optional*):
                可选的第二个序列的 ID 列表,用于序列对。

        Returns:
            `List[int]`: 带有适当特殊 token 的输入 ID 列表。
        """

        if token_ids_1 is None:
            # 返回包含特殊 token 的单个序列的输入 ID 列表
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        # 返回包含特殊 token 的序列对的输入 ID 列表
        return cls + token_ids_0 + sep + sep + token_ids_1 + sep

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        创建用于序列对分类任务的 mask。XLM-RoBERTa 不使用 token 类型 ID,因此返回一个全为零的列表。

        Args:
            token_ids_0 (`List[int]`):
                ID 列表。
            token_ids_1 (`List[int]`, *optional*):
                可选的第二个序列的 ID 列表,用于序列对。

        Returns:
            `List[int]`: 全为零的列表。

        """

        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            # 返回单个序列的 token 类型 ID 列表,全为零
            return len(cls + token_ids_0 + sep) * [0]
        # 返回序列对的 token 类型 ID 列表,全为零
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
    # 定义保存词汇表的方法,接受一个保存目录路径和可选的文件名前缀参数,返回一个包含文件路径字符串的元组
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        # 如果无法保存慢速分词器的词汇表,则引发值错误异常
        if not self.can_save_slow_tokenizer:
            raise ValueError(
                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
                "tokenizer."
            )

        # 如果保存目录不存在,记录错误日志并返回空
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
            return

        # 组合输出词汇表文件的路径,结合可选的文件名前缀和标准的词汇表文件名
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        # 如果当前词汇表文件路径与输出路径不一致,则复制当前词汇表文件到输出路径
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        # 返回包含输出词汇表文件路径的元组
        return (out_vocab_file,)
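For reference, a typical end-to-end use of the fast tokenizer (assumes network access or a local cache of the checkpoint):

```python
from transformers import XLMRobertaTokenizerFast

tokenizer = XLMRobertaTokenizerFast.from_pretrained("FacebookAI/xlm-roberta-base")

# Single sequence and sequence pair; XLM-RoBERTa does not use token_type_ids,
# and a pair is encoded as <s> A </s></s> B </s>.
single = tokenizer("Hello world")
pair = tokenizer("Hello world", "How are you?")
print(single["input_ids"])
print(pair["input_ids"])

# save_pretrained writes tokenizer.json (and sentencepiece.bpe.model when available).
tokenizer.save_pretrained("./xlmr-tokenizer")
```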

.\models\xlm_roberta\__init__.py

# 导入必要的类型检查模块
from typing import TYPE_CHECKING

# 从相对路径导入工具函数和类
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_flax_available,
    is_sentencepiece_available,
    is_tf_available,
    is_tokenizers_available,
    is_torch_available,
)

# 定义一个字典,用于存储不同模块的导入结构
_import_structure = {
    "configuration_xlm_roberta": [
        "XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "XLMRobertaConfig",
        "XLMRobertaOnnxConfig",
    ],
}

# 检查是否安装了 sentencepiece 库,如果没有则抛出异常
try:
    if not is_sentencepiece_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果可用,则将 XLMRobertaTokenizer 添加到导入结构中
    _import_structure["tokenization_xlm_roberta"] = ["XLMRobertaTokenizer"]

# 检查是否安装了 tokenizers 库,如果没有则抛出异常
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果可用,则将 XLMRobertaTokenizerFast 添加到导入结构中
    _import_structure["tokenization_xlm_roberta_fast"] = ["XLMRobertaTokenizerFast"]

# 检查是否安装了 torch 库,如果没有则抛出异常
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果可用,则将 XLMRoBerta 相关模型和类添加到导入结构中
    _import_structure["modeling_xlm_roberta"] = [
        "XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST",
        "XLMRobertaForCausalLM",
        "XLMRobertaForMaskedLM",
        "XLMRobertaForMultipleChoice",
        "XLMRobertaForQuestionAnswering",
        "XLMRobertaForSequenceClassification",
        "XLMRobertaForTokenClassification",
        "XLMRobertaModel",
        "XLMRobertaPreTrainedModel",
    ]

# 检查是否安装了 tensorflow 库,如果没有则抛出异常
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果可用,则将 TFXLMRoberta 相关模型和类添加到导入结构中
    _import_structure["modeling_tf_xlm_roberta"] = [
        "TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TFXLMRobertaForCausalLM",
        "TFXLMRobertaForMaskedLM",
        "TFXLMRobertaForMultipleChoice",
        "TFXLMRobertaForQuestionAnswering",
        "TFXLMRobertaForSequenceClassification",
        "TFXLMRobertaForTokenClassification",
        "TFXLMRobertaModel",
        "TFXLMRobertaPreTrainedModel",
    ]

# 检查是否安装了 flax 库,如果没有则抛出异常
try:
    if not is_flax_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 将一组模块名称添加到 _import_structure 字典中的 "modeling_flax_xlm_roberta" 键下
    _import_structure["modeling_flax_xlm_roberta"] = [
        "FLAX_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST",
        "FlaxXLMRobertaForMaskedLM",
        "FlaxXLMRobertaForCausalLM",
        "FlaxXLMRobertaForMultipleChoice",
        "FlaxXLMRobertaForQuestionAnswering",
        "FlaxXLMRobertaForSequenceClassification",
        "FlaxXLMRobertaForTokenClassification",
        "FlaxXLMRobertaModel",
        "FlaxXLMRobertaPreTrainedModel",
    ]
if TYPE_CHECKING:
    # 引入需要的配置和模型类映射
    from .configuration_xlm_roberta import (
        XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
        XLMRobertaConfig,
        XLMRobertaOnnxConfig,
    )

    try:
        # 检查是否安装了 sentencepiece
        if not is_sentencepiece_available():
            # 如果未安装,抛出可选依赖不可用的异常
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 如果安装了 sentencepiece,引入 XLMRobertaTokenizer
        from .tokenization_xlm_roberta import XLMRobertaTokenizer

    try:
        # 检查是否安装了 tokenizers
        if not is_tokenizers_available():
            # 如果未安装,抛出可选依赖不可用的异常
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 如果安装了 tokenizers,引入 XLMRobertaTokenizerFast
        from .tokenization_xlm_roberta_fast import XLMRobertaTokenizerFast

    try:
        # 检查是否安装了 torch
        if not is_torch_available():
            # 如果未安装,抛出可选依赖不可用的异常
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 如果安装了 torch,引入 XLM-Roberta 模型和相关类
        from .modeling_xlm_roberta import (
            XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
            XLMRobertaForCausalLM,
            XLMRobertaForMaskedLM,
            XLMRobertaForMultipleChoice,
            XLMRobertaForQuestionAnswering,
            XLMRobertaForSequenceClassification,
            XLMRobertaForTokenClassification,
            XLMRobertaModel,
            XLMRobertaPreTrainedModel,
        )

    try:
        # 检查是否安装了 tensorflow
        if not is_tf_available():
            # 如果未安装,抛出可选依赖不可用的异常
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 如果安装了 tensorflow,引入 TF 版本的 XLM-Roberta 模型和相关类
        from .modeling_tf_xlm_roberta import (
            TF_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFXLMRobertaForCausalLM,
            TFXLMRobertaForMaskedLM,
            TFXLMRobertaForMultipleChoice,
            TFXLMRobertaForQuestionAnswering,
            TFXLMRobertaForSequenceClassification,
            TFXLMRobertaForTokenClassification,
            TFXLMRobertaModel,
            TFXLMRobertaPreTrainedModel,
        )

    try:
        # 检查是否安装了 flax
        if not is_flax_available():
            # 如果未安装,抛出可选依赖不可用的异常
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 如果安装了 flax,引入 Flax 版本的 XLM-Roberta 模型和相关类
        from .modeling_flax_xlm_roberta import (
            FLAX_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
            FlaxXLMRobertaForCausalLM,
            FlaxXLMRobertaForMaskedLM,
            FlaxXLMRobertaForMultipleChoice,
            FlaxXLMRobertaForQuestionAnswering,
            FlaxXLMRobertaForSequenceClassification,
            FlaxXLMRobertaForTokenClassification,
            FlaxXLMRobertaModel,
            FlaxXLMRobertaPreTrainedModel,
        )

else:
    # 如果不是类型检查阶段,则将当前模块设置为懒加载模块
    import sys

    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
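A minimal sketch (not the actual `_LazyModule` implementation) of the deferred-import idea used here: submodules are only imported when one of their attributes is first accessed, and the result is cached on the module object.

```python
import importlib
import types

class LazyModule(types.ModuleType):
    """Toy illustration of deferring submodule imports until first attribute access."""

    def __init__(self, name, import_structure):
        super().__init__(name)
        # attribute name -> fully qualified module that defines it
        self._attr_to_module = {attr: mod for mod, attrs in import_structure.items() for attr in attrs}

    def __getattr__(self, attr):
        if attr not in self._attr_to_module:
            raise AttributeError(attr)
        module = importlib.import_module(self._attr_to_module[attr])
        value = getattr(module, attr)
        setattr(self, attr, value)   # cache so __getattr__ is not hit again
        return value

# e.g. LazyModule("toy", {"json": ["dumps", "loads"]}).dumps({"a": 1}) imports json lazily
```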

.\models\xlm_roberta_xl\configuration_xlm_roberta_xl.py

# coding=utf-8
# 声明文件的编码格式为 UTF-8

# Copyright 2022 The HuggingFace Inc. team.
# 版权声明,版权归 HuggingFace 公司所有,日期为 2022 年

# Licensed under the Apache License, Version 2.0 (the "License");
# 根据 Apache License, Version 2.0 许可证授权使用本文件

# you may not use this file except in compliance with the License.
# 除非符合许可证,否则不得使用本文件

# You may obtain a copy of the License at
# 可以在上述许可证链接获取许可证的副本

# http://www.apache.org/licenses/LICENSE-2.0
# 许可证链接地址

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 除非适用法律要求或书面同意,否则按"原样"分发软件,不附带任何明示或暗示的保证或条件

# See the License for the specific language governing permissions and
# limitations under the License.
# 查看许可证以了解具体的语言控制权限和限制

""" XLM_ROBERTA_XL configuration"""

# 导入必要的模块
from collections import OrderedDict  # 导入 OrderedDict 类
from typing import Mapping  # 导入 Mapping 类型提示

# 导入配置工具函数和 ONNX 配置
from ...configuration_utils import PretrainedConfig  # 导入 PretrainedConfig 类
from ...onnx import OnnxConfig  # 导入 OnnxConfig 类
from ...utils import logging  # 导入 logging 模块

# 获取 logger 对象
logger = logging.get_logger(__name__)

# 预训练模型及其配置文件映射
XLM_ROBERTA_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "facebook/xlm-roberta-xl": "https://huggingface.co/facebook/xlm-roberta-xl/resolve/main/config.json",
    "facebook/xlm-roberta-xxl": "https://huggingface.co/facebook/xlm-roberta-xxl/resolve/main/config.json",
    # 查看所有 XLM-RoBERTa-XL 模型的链接地址
}

# Definition of the XLMRobertaXLConfig class, which inherits from PretrainedConfig
class XLMRobertaXLConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`XLMRobertaXLModel`] or a [`TFXLMRobertaXLModel`].
    It is used to instantiate a XLM_ROBERTA_XL model according to the specified arguments, defining the model
    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the
    XLM_ROBERTA_XL [facebook/xlm-roberta-xl](https://huggingface.co/facebook/xlm-roberta-xl) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Examples:

    ```
    >>> from transformers import XLMRobertaXLConfig, XLMRobertaXLModel

    >>> # Initializing a XLM_ROBERTA_XL google-bert/bert-base-uncased style configuration
    >>> configuration = XLMRobertaXLConfig()

    >>> # Initializing a model (with random weights) from the google-bert/bert-base-uncased style configuration
    >>> model = XLMRobertaXLModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
    
    # 模型类型定义为 "xlm-roberta-xl"
    model_type = "xlm-roberta-xl"
    # 定义一个初始化方法,用于初始化一个 Transformer 模型的参数
    def __init__(
        self,
        vocab_size=250880,  # 词汇表大小,默认为 250880
        hidden_size=2560,  # 隐藏层大小,默认为 2560
        num_hidden_layers=36,  # 隐藏层的数量,默认为 36
        num_attention_heads=32,  # 注意力头的数量,默认为 32
        intermediate_size=10240,  # 中间层的大小,默认为 10240
        hidden_act="gelu",  # 隐藏层的激活函数,默认为 GELU
        hidden_dropout_prob=0.1,  # 隐藏层的 dropout 概率,默认为 0.1
        attention_probs_dropout_prob=0.1,  # 注意力概率的 dropout 概率,默认为 0.1
        max_position_embeddings=514,  # 最大位置编码数,默认为 514
        type_vocab_size=1,  # 类型词汇表大小,默认为 1
        initializer_range=0.02,  # 初始化范围,默认为 0.02
        layer_norm_eps=1e-05,  # 层归一化的 epsilon,默认为 1e-05
        pad_token_id=1,  # 填充 token 的 id,默认为 1
        bos_token_id=0,  # 起始 token 的 id,默认为 0
        eos_token_id=2,  # 结束 token 的 id,默认为 2
        position_embedding_type="absolute",  # 位置嵌入的类型,默认为绝对位置编码
        use_cache=True,  # 是否使用缓存,默认为 True
        classifier_dropout=None,  # 分类器的 dropout,初始为 None,可以后续设置
        **kwargs,  # 其他未明确指定的参数
    ):
        # 调用父类的初始化方法,设置填充、起始和结束 token 的 id
        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
        # 设置模型的各种参数
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.position_embedding_type = position_embedding_type
        self.use_cache = use_cache
        self.classifier_dropout = classifier_dropout
# 从 transformers.models.roberta.configuration_roberta.RobertaOnnxConfig 复制代码,并将 Roberta 替换为 XLMRobertaXL
class XLMRobertaXLOnnxConfig(OnnxConfig):
    
    # 定义 inputs 属性,返回一个字典,其中包含动态轴的映射关系
    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        # 如果任务是多选题 ("multiple-choice"),设置动态轴为 {0: "batch", 1: "choice", 2: "sequence"}
        if self.task == "multiple-choice":
            dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
        # 否则,设置动态轴为 {0: "batch", 1: "sequence"}
        else:
            dynamic_axis = {0: "batch", 1: "sequence"}
        
        # 返回有序字典,包含 input_ids 和 attention_mask 作为键,对应的动态轴作为值
        return OrderedDict(
            [
                ("input_ids", dynamic_axis),
                ("attention_mask", dynamic_axis),
            ]
        )
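
下面补充一个简短的示例(非原文件内容,仅作演示),展示如何实例化上述配置并查看 ONNX 导出时 `inputs` 属性给出的动态轴定义:

```python
from transformers import XLMRobertaXLConfig
from transformers.models.xlm_roberta_xl.configuration_xlm_roberta_xl import XLMRobertaXLOnnxConfig

config = XLMRobertaXLConfig()
onnx_config = XLMRobertaXLOnnxConfig(config, task="default")  # task 也可设为 "multiple-choice"
print(onnx_config.inputs)
# OrderedDict([('input_ids', {0: 'batch', 1: 'sequence'}),
#              ('attention_mask', {0: 'batch', 1: 'sequence'})])
```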

.\models\xlm_roberta_xl\convert_xlm_roberta_xl_original_pytorch_checkpoint_to_pytorch.py

# 指定 Python 文件的编码格式为 UTF-8

# 导入必要的库和模块
import argparse  # 解析命令行参数的库
import pathlib   # 处理路径的库

import fairseq   # 引入 fairseq 库
import torch     # 引入 PyTorch 库
from fairseq.models.roberta import RobertaModel as FairseqRobertaModel  # 导入 fairseq 中的 RoBERTa 模型
from fairseq.modules import TransformerSentenceEncoderLayer  # 导入 fairseq 中的 TransformerSentenceEncoderLayer 模块
from packaging import version  # 用于处理版本号的库

# 从 transformers 库中导入相关模块和类
from transformers import XLMRobertaConfig, XLMRobertaXLForMaskedLM, XLMRobertaXLForSequenceClassification
from transformers.models.bert.modeling_bert import (
    BertIntermediate,   # 导入 BERT 模型中的 BertIntermediate 类
    BertLayer,          # 导入 BERT 模型中的 BertLayer 类
    BertOutput,         # 导入 BERT 模型中的 BertOutput 类
    BertSelfAttention,  # 导入 BERT 模型中的 BertSelfAttention 类
    BertSelfOutput,     # 导入 BERT 模型中的 BertSelfOutput 类
)
from transformers.models.roberta.modeling_roberta import RobertaAttention  # 导入 RoBERTa 模型中的 RobertaAttention 类
from transformers.utils import logging  # 导入 transformers 库中的日志记录模块

# 检查 fairseq 版本是否符合要求
if version.parse(fairseq.__version__) < version.parse("1.0.0a"):
    raise Exception("requires fairseq >= 1.0.0a")

# 设置日志记录的详细程度为 info 级别
logging.set_verbosity_info()
# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)

# 定义一个示例文本
SAMPLE_TEXT = "Hello world! cécé herlolip"

# 定义函数:将 XLM-RoBERTa XL 的检查点转换为 PyTorch 模型
def convert_xlm_roberta_xl_checkpoint_to_pytorch(
    roberta_checkpoint_path: str,  # RoBERTa 检查点文件路径
    pytorch_dump_folder_path: str,  # 转换后的 PyTorch 模型保存路径
    classification_head: bool  # 是否包含分类头
):
    """
    复制/粘贴/调整 RoBERTa 的权重到我们的 BERT 结构。
    """
    # 从预训练的 RoBERTa 模型加载权重
    roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path)
    # 将模型设置为评估模式,禁用 dropout
    roberta.eval()
    # 获取 RoBERTa 模型中的句子编码器
    roberta_sent_encoder = roberta.model.encoder.sentence_encoder
    # 根据 RoBERTa 模型的配置创建 XLM-RoBERTa 的配置
    config = XLMRobertaConfig(
        vocab_size=roberta_sent_encoder.embed_tokens.num_embeddings,  # 词汇表大小
        hidden_size=roberta.cfg.model.encoder_embed_dim,  # 隐藏层大小
        num_hidden_layers=roberta.cfg.model.encoder_layers,  # 编码器层数
        num_attention_heads=roberta.cfg.model.encoder_attention_heads,  # 注意力头数
        intermediate_size=roberta.cfg.model.encoder_ffn_embed_dim,  # 中间层大小
        max_position_embeddings=514,  # 最大位置嵌入
        type_vocab_size=1,  # 类型词汇表大小
        layer_norm_eps=1e-5,  # 层归一化的 epsilon 值,与 fairseq 使用的 PyTorch 默认值相同
    )
    # 如果包含分类头,则设置配置中的标签数目
    if classification_head:
        config.num_labels = roberta.model.classification_heads["mnli"].out_proj.weight.shape[0]

    # 打印配置信息
    print("Our RoBERTa config:", config)

    # 根据是否包含分类头选择相应的 XLM-RoBERTa 模型
    model = XLMRobertaXLForSequenceClassification(config) if classification_head else XLMRobertaXLForMaskedLM(config)
    # 将模型设置为评估模式
    model.eval()

    # 开始复制所有权重。
    # 复制嵌入层的权重
    model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight
    model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight
    model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
        model.roberta.embeddings.token_type_embeddings.weight
    )  # 将 RoBERTa 模型的 token_type_embeddings 权重置零,因为 RoBERTa 不使用它们。

    model.roberta.encoder.LayerNorm.weight = roberta_sent_encoder.layer_norm.weight
    model.roberta.encoder.LayerNorm.bias = roberta_sent_encoder.layer_norm.bias

    for i in range(config.num_hidden_layers):
        # 循环遍历每一层的编码器

        # 获取当前层的 BertLayer 对象和对应的 TransformerSentenceEncoderLayer 对象
        layer: BertLayer = model.roberta.encoder.layer[i]
        roberta_layer: TransformerSentenceEncoderLayer = roberta_sent_encoder.layers[i]

        # 设置注意力层的权重和偏置
        attention: RobertaAttention = layer.attention
        attention.self_attn_layer_norm.weight = roberta_layer.self_attn_layer_norm.weight
        attention.self_attn_layer_norm.bias = roberta_layer.self_attn_layer_norm.bias

        # 设置自注意力机制的权重和偏置
        self_attn: BertSelfAttention = layer.attention.self
        assert (
            roberta_layer.self_attn.k_proj.weight.data.shape
            == roberta_layer.self_attn.q_proj.weight.data.shape
            == roberta_layer.self_attn.v_proj.weight.data.shape
            == torch.Size((config.hidden_size, config.hidden_size))
        )
        self_attn.query.weight.data = roberta_layer.self_attn.q_proj.weight
        self_attn.query.bias.data = roberta_layer.self_attn.q_proj.bias
        self_attn.key.weight.data = roberta_layer.self_attn.k_proj.weight
        self_attn.key.bias.data = roberta_layer.self_attn.k_proj.bias
        self_attn.value.weight.data = roberta_layer.self_attn.v_proj.weight
        self_attn.value.bias.data = roberta_layer.self_attn.v_proj.bias

        # 设置自注意力机制输出的权重和偏置
        self_output: BertSelfOutput = layer.attention.output
        assert self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape
        self_output.dense.weight = roberta_layer.self_attn.out_proj.weight
        self_output.dense.bias = roberta_layer.self_attn.out_proj.bias

        # 设置最终的层归一化的权重和偏置
        layer.LayerNorm.weight = roberta_layer.final_layer_norm.weight
        layer.LayerNorm.bias = roberta_layer.final_layer_norm.bias

        # 设置中间层的全连接层的权重和偏置
        intermediate: BertIntermediate = layer.intermediate
        assert intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape
        intermediate.dense.weight = roberta_layer.fc1.weight
        intermediate.dense.bias = roberta_layer.fc1.bias

        # 设置输出层的权重和偏置
        bert_output: BertOutput = layer.output
        assert bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape
        bert_output.dense.weight = roberta_layer.fc2.weight
        bert_output.dense.bias = roberta_layer.fc2.bias
        # 层结束
    # 如果有分类头,则复制 RoBERTa 模型的分类头参数到当前模型的分类器中
    if classification_head:
        # 复制权重和偏置
        model.classifier.dense.weight = roberta.model.classification_heads["mnli"].dense.weight
        model.classifier.dense.bias = roberta.model.classification_heads["mnli"].dense.bias
        # 复制输出投影的权重和偏置
        model.classifier.out_proj.weight = roberta.model.classification_heads["mnli"].out_proj.weight
        model.classifier.out_proj.bias = roberta.model.classification_heads["mnli"].out_proj.bias
    else:
        # 如果没有分类头,则复制 RoBERTa 模型的语言模型头参数到当前模型的语言模型头中
        # 复制权重和偏置
        model.lm_head.dense.weight = roberta.model.encoder.lm_head.dense.weight
        model.lm_head.dense.bias = roberta.model.encoder.lm_head.dense.bias
        # 复制 LayerNorm 的权重和偏置
        model.lm_head.layer_norm.weight = roberta.model.encoder.lm_head.layer_norm.weight
        model.lm_head.layer_norm.bias = roberta.model.encoder.lm_head.layer_norm.bias
        # 复制解码器的权重和偏置
        model.lm_head.decoder.weight = roberta.model.encoder.lm_head.weight
        model.lm_head.decoder.bias = roberta.model.encoder.lm_head.bias

    # 检查模型输出是否一致
    input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0)  # 将输入编码为张量,并增加一个维度作为批处理的大小为1
    our_output = model(input_ids)[0]  # 获取当前模型的输出
    if classification_head:
        their_output = roberta.model.classification_heads["mnli"](roberta.extract_features(input_ids))
    else:
        their_output = roberta.model(input_ids)[0]  # 获取 RoBERTa 模型的输出
    print(our_output.shape, their_output.shape)  # 打印两个模型输出的形状
    max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()  # 计算输出之间的最大绝对差异
    print(f"max_absolute_diff = {max_absolute_diff}")  # 打印最大绝对差异,预期约为 1e-7
    success = torch.allclose(our_output, their_output, atol=1e-3)  # 检查两个模型输出是否在指定误差范围内一致
    print("Do both models output the same tensors?", "🔥" if success else "💩")  # 打印是否两个模型输出相同
    if not success:
        raise Exception("Something went wRoNg")  # 如果输出不一致,则抛出异常

    # 确保路径存在并创建 PyTorch 模型保存文件夹
    pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
    print(f"Saving model to {pytorch_dump_folder_path}")  # 打印模型保存的路径
    model.save_pretrained(pytorch_dump_folder_path)  # 将当前模型保存到指定路径
if __name__ == "__main__":
    # 如果脚本被直接执行而非作为模块导入,则执行以下代码块

    parser = argparse.ArgumentParser()
    # 创建一个参数解析器对象

    # 必填参数
    parser.add_argument(
        "--roberta_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump."
    )
    # 添加一个必填参数:RoBERTa 模型的检查点路径,必须是字符串类型,用户必须提供,帮助信息指明它是官方 PyTorch dump 的路径

    parser.add_argument(
        "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
    )
    # 添加一个必填参数:输出 PyTorch 模型的文件夹路径,必须是字符串类型,用户必须提供,帮助信息指明它是输出 PyTorch 模型的路径

    parser.add_argument(
        "--classification_head", action="store_true", help="Whether to convert a final classification head."
    )
    # 添加一个标志参数:是否转换最终的分类头部,当存在该参数时设置其值为 True,帮助信息说明了这个参数的作用

    # 解析命令行参数
    args = parser.parse_args()

    # 调用函数,将 XLM-RoBERTa XL 模型的检查点转换为 PyTorch 格式
    convert_xlm_roberta_xl_checkpoint_to_pytorch(
        args.roberta_checkpoint_path, args.pytorch_dump_folder_path, args.classification_head
    )
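
作为补充,下面给出一个示意性的调用草图(路径均为假设值,需要本地已存在 fairseq 检查点才能实际运行),效果等价于用命令行参数执行本脚本:

```python
# 假设 /path/to/fairseq_ckpt 是 fairseq 格式的 XLM-RoBERTa-XL 检查点目录(假设路径)
convert_xlm_roberta_xl_checkpoint_to_pytorch(
    roberta_checkpoint_path="/path/to/fairseq_ckpt",
    pytorch_dump_folder_path="/path/to/hf_output",   # 转换后模型的保存目录(假设路径)
    classification_head=False,                        # 转换语言模型头而非分类头
)
```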

.\models\xlm_roberta_xl\modeling_xlm_roberta_xl.py

# 设置文件编码为 UTF-8
# 版权声明,指明版权归 HuggingFace Inc. 团队所有,使用 Apache License, Version 2.0 许可
# 详细许可信息可在 http://www.apache.org/licenses/LICENSE-2.0 获取
# 根据适用法律或书面同意,本软件是基于“原样”分发,不提供任何明示或暗示的保证或条件
# 请查阅许可证,了解具体的法律条款和限制条件

"""PyTorch XLM RoBERTa xl,xxl model."""
# 导入所需模块和类型注解
import math
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

# 导入激活函数及模型输出类
from ...activations import ACT2FN, gelu
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
    MaskedLMOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
# 导入模型工具函数及预训练模型基类
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
# 导入工具函数和日志记录函数
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
# 导入 XLM-Roberta XL 配置文件
from .configuration_xlm_roberta_xl import XLMRobertaXLConfig

# 获取全局日志记录器
logger = logging.get_logger(__name__)

# 用于文档的检查点和配置信息
_CHECKPOINT_FOR_DOC = "facebook/xlm-roberta-xl"
_CONFIG_FOR_DOC = "XLMRobertaXLConfig"

# XLM-RoBERTa XL 预训练模型存档列表
XLM_ROBERTA_XL_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "facebook/xlm-roberta-xl",
    "facebook/xlm-roberta-xxl",
    # 更多的 RoBERTa 模型可在 https://huggingface.co/models?filter=xlm-roberta-xl 查看
]

# XLM-RoBERTa XL 嵌入层定义
class XLMRobertaXLEmbeddings(nn.Module):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """
    # 初始化函数,用于初始化一个新的实例对象
    def __init__(self, config):
        # 调用父类的初始化方法
        super().__init__()
        # 创建词嵌入层,vocab_size表示词汇表大小,hidden_size表示隐藏单元的大小,
        # padding_idx表示填充标记的索引位置,用于处理变长序列
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        
        # 创建位置嵌入层,max_position_embeddings表示最大的位置编码数,
        # hidden_size表示隐藏单元的大小,用于表示单词在句子中的位置信息
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        
        # 创建token类型嵌入层,type_vocab_size表示token类型的数量,
        # hidden_size表示隐藏单元的大小,用于区分不同类型的token
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # 创建Dropout层,用于随机将一部分元素置为0,以防止过拟合
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        
        # position_embedding_type用于指定位置编码的类型,默认为"absolute"
        # 将position_ids张量注册为模型的缓冲区,包含从0到max_position_embeddings-1的位置编码
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        
        # 将token_type_ids张量注册为模型的缓冲区,初始化为全0的张量,形状与position_ids相同
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

        # End copy
        # 设置padding_idx属性为config.pad_token_id,用于词嵌入层和位置嵌入层
        self.padding_idx = config.pad_token_id
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )

    # 前向传播函数,定义了模型的数据流向
    def forward(
        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        # 如果位置 ID 为 None
        if position_ids is None:
            if input_ids is not None:
                # 从输入的 token IDs 创建位置 IDs。任何填充的 token 仍然保持填充状态。
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
            else:
                # 否则,从输入嵌入创建位置 IDs
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        # 如果输入的 token IDs 不为 None
        if input_ids is not None:
            # 获取输入 token IDs 的形状
            input_shape = input_ids.size()
        else:
            # 否则,获取输入嵌入的形状,去掉最后一维
            input_shape = inputs_embeds.size()[:-1]

        # 获取序列长度,即输入的第二维度的大小
        seq_length = input_shape[1]

        # 将 token_type_ids 设置为构造函数中注册的缓冲区,通常为全零。这通常在自动生成时发生,
        # 注册的缓冲区有助于在不传递 token_type_ids 的情况下跟踪模型,解决问题 #5664
        if token_type_ids is None:
            # 如果 self 中有 "token_type_ids" 属性
            if hasattr(self, "token_type_ids"):
                # 从 self.token_type_ids 中获取缓冲的 token_type_ids,并截取到序列长度的部分
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                # 扩展 buffered_token_type_ids 以匹配输入形状的第一维和序列长度
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                # 否则,创建全零的 token_type_ids,dtype 为 long 类型,设备为 self.position_ids 的设备
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        # 如果输入嵌入为 None
        if inputs_embeds is None:
            # 使用 self.word_embeddings 对输入 token IDs 进行嵌入
            inputs_embeds = self.word_embeddings(input_ids)
        # 获取 token_type_embeddings
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        # 计算最终的嵌入表示,包括输入嵌入和 token_type_embeddings
        embeddings = inputs_embeds + token_type_embeddings

        # 如果位置嵌入类型为 "absolute"
        if self.position_embedding_type == "absolute":
            # 计算位置嵌入并添加到 embeddings 中
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings

        # 对 embeddings 应用 dropout
        embeddings = self.dropout(embeddings)
        # 返回最终的嵌入表示
        return embeddings

    # 从 transformers.models.roberta.modeling_roberta.RobertaEmbeddings.create_position_ids_from_inputs_embeds 复制而来
    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        """
        # 获取输入嵌入的形状,去掉最后一维
        input_shape = inputs_embeds.size()[:-1]
        # 获取序列长度
        sequence_length = input_shape[1]

        # 生成从 padding_idx + 1 到 sequence_length + padding_idx + 1 的位置 IDs
        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        # 将 position_ids 扩展为与 input_shape 相同的形状
        return position_ids.unsqueeze(0).expand(input_shape)
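
为说明位置 ID 的编号规则,这里用几行独立代码复现 `create_position_ids_from_input_ids`(定义于本文件其他位置)的效果,属于演示性质的草图:

```python
import torch

input_ids = torch.tensor([[0, 31414, 232, 2, 1, 1]])  # 末尾两个 1 为填充 token
padding_idx = 1
mask = input_ids.ne(padding_idx).int()
# 非填充位置从 padding_idx + 1 开始递增编号,填充位置保持为 padding_idx
position_ids = torch.cumsum(mask, dim=1) * mask + padding_idx
print(position_ids)  # tensor([[2, 3, 4, 5, 1, 1]])
```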
# 从 transformers.models.bert.modeling_bert.BertSelfAttention 复制并修改为 XLMRobertaXLSelfAttention 类
class XLMRobertaXLSelfAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        # 如果隐藏层大小不能被注意力头数整除且配置中没有嵌入大小属性,则引发异常
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # 创建查询、键、值的线性层
        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        # 注意力概率的dropout
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        # 位置嵌入类型,默认为绝对位置编码
        self.position_embedding_type = position_embedding_type or getattr(
            config, "position_embedding_type", "absolute"
        )
        # 如果使用相对键(relative_key)或相对键查询(relative_key_query),则创建距离嵌入
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

        # 是否为解码器
        self.is_decoder = config.is_decoder

    # 将输入张量重新排列为注意力分数的形状
    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)
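
下面用一个小的形状演示(假设 hidden_size=2560、32 个注意力头)说明 `transpose_for_scores` 的作用:把 `(batch, seq_len, hidden)` 重排为 `(batch, num_heads, seq_len, head_size)`:

```python
import torch

batch, seq_len, hidden = 2, 7, 2560
num_heads = 32
head_size = hidden // num_heads  # 80
x = torch.randn(batch, seq_len, hidden)
x = x.view(batch, seq_len, num_heads, head_size).permute(0, 2, 1, 3)
print(x.shape)  # torch.Size([2, 32, 7, 80])
```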

    # 前向传播函数
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,


# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->XLMRobertaXL
class XLMRobertaXLSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 密集层,输入和输出大小均为隐藏层大小
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # dropout 层
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    # 前向传播函数
    def forward(self, hidden_states, input_tensor):
        # 通过密集层
        hidden_states = self.dense(hidden_states)
        # dropout
        hidden_states = self.dropout(hidden_states)
        # 与输入张量相加(残差连接)并返回
        hidden_states = hidden_states + input_tensor
        return hidden_states

class XLMRobertaXLAttention(nn.Module):
    # 初始化函数,用于创建一个新的自注意力模块实例
    def __init__(self, config, position_embedding_type=None):
        # 调用父类的初始化方法
        super().__init__()
        # 创建自注意力层的 LayerNorm 层,用于归一化隐藏状态
        self.self_attn_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # 创建自注意力层实例,传入配置和位置嵌入类型
        self.self = XLMRobertaXLSelfAttention(config, position_embedding_type=position_embedding_type)
        # 创建自注意力输出层实例
        self.output = XLMRobertaXLSelfOutput(config)
        # 存储需要被剪枝的注意力头的索引
        self.pruned_heads = set()

    # 头部剪枝函数,用于剪掉自注意力模块中的某些注意力头
    def prune_heads(self, heads):
        # 若待剪枝头部列表为空,则直接返回
        if len(heads) == 0:
            return
        # 调用辅助函数找到可剪枝的头部及其索引
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # 剪枝线性层中的查询、键、值和输出层
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # 更新超参数并记录已剪枝的头部
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    # 前向传播函数,接收隐藏状态和多种注意力相关参数,并输出前向传播结果
    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        # 先对输入的隐藏状态进行 LayerNorm 归一化处理(pre-LayerNorm 结构)
        intermediate = self.self_attn_layer_norm(hidden_states)
        # 调用自注意力层进行前向传播,得到自注意力层的输出
        self_outputs = self.self(
            intermediate,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        # 使用自注意力输出层处理自注意力层的输出和原始隐藏状态,得到最终的注意力输出
        attention_output = self.output(self_outputs[0], hidden_states)
        # 如果需要输出注意力权重,则将其附加到输出中
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        # 返回模块的输出结果
        return outputs

# 从 transformers.models.bert.modeling_bert.BertIntermediate 复制而来
class XLMRobertaXLIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 创建一个全连接层,输入维度为 config.hidden_size,输出维度为 config.intermediate_size
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # 根据配置选择激活函数:若 config.hidden_act 是字符串,则从预定义映射 ACT2FN 中取对应函数;否则直接使用 config.hidden_act
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 将输入 hidden_states 传入全连接层 self.dense,进行线性变换
        hidden_states = self.dense(hidden_states)
        # 将线性变换后的结果传入激活函数 self.intermediate_act_fn 进行非线性变换
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states

class XLMRobertaXLOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 创建一个全连接层,输入维度为 config.intermediate_size,输出维度为 config.hidden_size
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states, input_tensor):
        # 将输入 hidden_states 传入全连接层 self.dense,进行线性变换
        hidden_states = self.dense(hidden_states)
        # 将线性变换后的结果与输入 input_tensor 相加(残差连接)
        hidden_states = hidden_states + input_tensor
        return hidden_states

class XLMRobertaXLLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 设定用于分块前馈传播的块大小
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        # 序列长度所在的维度为 1
        self.seq_len_dim = 1
        # 创建 XLMRobertaXLAttention 对象并赋值给 self.attention
        self.attention = XLMRobertaXLAttention(config)
        # 是否为解码器
        self.is_decoder = config.is_decoder
        # 是否添加交叉注意力机制
        self.add_cross_attention = config.add_cross_attention
        if self.add_cross_attention:
            # 如果添加交叉注意力机制但不是解码器,则抛出错误
            if not self.is_decoder:
                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
            # 创建使用绝对位置嵌入类型的 XLMRobertaXLAttention 对象并赋值给 self.crossattention
            self.crossattention = XLMRobertaXLAttention(config, position_embedding_type="absolute")
        # 创建 XLMRobertaXLIntermediate 对象并赋值给 self.intermediate
        self.intermediate = XLMRobertaXLIntermediate(config)
        # 创建 XLMRobertaXLOutput 对象并赋值给 self.output
        self.output = XLMRobertaXLOutput(config)
        # 创建具有指定参数的 LayerNorm 对象并赋值给 self.LayerNorm
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

def forward(
    self,
    hidden_states,
    attention_mask=None,
    head_mask=None,
    encoder_hidden_states=None,
    encoder_attention_mask=None,
    past_key_value=None,
    output_attentions=False,
):
    # 如果过去的键/值对存在,则只保留自注意力部分的前两个位置
    self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
    # 调用自注意力层进行计算
    self_attention_outputs = self.attention(
        hidden_states,
        attention_mask,
        head_mask,
        output_attentions=output_attentions,
        past_key_value=self_attn_past_key_value,
    )
    # 获取自注意力层的输出
    attention_output = self_attention_outputs[0]

    # 如果当前层是解码器层,则最后一个输出是自注意力缓存的元组
    if self.is_decoder:
        # 提取除了自注意力缓存以外的所有输出
        outputs = self_attention_outputs[1:-1]
        # 获取当前自注意力的键/值对
        present_key_value = self_attention_outputs[-1]
    else:
        # 否则,包括自注意力权重输出
        outputs = self_attention_outputs[1:]  # 如果输出注意力权重,还要包括自注意力
      
    # 初始化交叉注意力的键/值对为 None
    cross_attn_present_key_value = None
    # 如果是解码器并且存在编码器的隐藏状态
    if self.is_decoder and encoder_hidden_states is not None:
        # 如果模型没有交叉注意力层,则抛出错误
        if not hasattr(self, "crossattention"):
            raise ValueError(
                f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
                " by setting `config.add_cross_attention=True`"
            )

        # 如果过去的键/值对存在,则提取交叉注意力部分的位置 3,4
        cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
        # 调用交叉注意力层进行计算
        cross_attention_outputs = self.crossattention(
            attention_output,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            cross_attn_past_key_value,
            output_attentions,
        )
        # 获取交叉注意力层的输出
        attention_output = cross_attention_outputs[0]
        # 添加交叉注意力的输出到总输出中,排除注意力权重以外的部分
        outputs = outputs + cross_attention_outputs[1:-1]

        # 将交叉注意力缓存添加到当前键/值对的位置 3,4
        cross_attn_present_key_value = cross_attention_outputs[-1]
        present_key_value = present_key_value + cross_attn_present_key_value

    # 应用块分片处理函数到前向输出
    layer_output = apply_chunking_to_forward(
        self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
    )
    # 将处理后的层输出添加到总输出中
    outputs = (layer_output,) + outputs

    # 如果是解码器,则将注意力的键/值对作为最后一个输出返回
    if self.is_decoder:
        outputs = outputs + (present_key_value,)

    return outputs

# 定义前馈块处理函数
def feed_forward_chunk(self, attention_output):
    # 对注意力输出进行层归一化
    intermediate_output = self.LayerNorm(attention_output)
    # 应用中间层
    intermediate_output = self.intermediate(intermediate_output)
    # 应用输出层
    layer_output = self.output(intermediate_output, attention_output)
    return layer_output
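
`apply_chunking_to_forward` 会沿 `seq_len_dim` 把输入切块后逐块执行前馈函数,再拼接结果;对逐位置独立的前馈层而言,这与一次性计算完全等价,只是峰值显存更小。下面是一个独立的小演示(非原文件内容):

```python
import torch
from transformers.pytorch_utils import apply_chunking_to_forward

def feed_forward(x):
    # 一个逐位置独立的简单前馈函数,仅用于演示
    return x * 2 + 1

hidden_states = torch.randn(2, 8, 4)
full = feed_forward(hidden_states)
chunked = apply_chunking_to_forward(feed_forward, 4, 1, hidden_states)  # chunk_size=4, seq_len_dim=1
print(torch.allclose(full, chunked))  # True
```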

# 定义用于编码的 XLM-RoBERTa-XL 模型编码器类
class XLMRobertaXLEncoder(nn.Module):
    # 初始化方法,接收一个配置对象作为参数
    def __init__(self, config):
        # 调用父类的初始化方法
        super().__init__()
        # 将配置对象保存到实例变量中
        self.config = config
        # 创建一个由多个 XLM-RoBERTa-XL 层组成的模块列表
        self.layer = nn.ModuleList([XLMRobertaXLLayer(config) for _ in range(config.num_hidden_layers)])
        # 创建一个 LayerNorm 层,用于对最终隐藏状态进行归一化
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # 是否启用梯度检查点技术,默认为关闭状态
        self.gradient_checkpointing = False

    # 前向传播方法
    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        # 如果启用了梯度检查点且处于训练模式下
        if self.gradient_checkpointing and self.training:
            # 如果 use_cache 设置为 True,则与梯度检查点不兼容,发出警告并强制设置为 False
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False
        # 如果不需要输出隐藏状态,则初始化空的隐藏状态元组
        all_hidden_states = () if output_hidden_states else None
        # 如果不需要输出注意力权重,则初始化空的自注意力权重元组
        all_self_attentions = () if output_attentions else None
        # 如果不需要输出注意力权重或者不含交叉注意力层,则初始化空的交叉注意力权重元组
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

        # 如果 use_cache 设置为 False,则初始化空的下一个解码器缓存元组
        next_decoder_cache = () if use_cache else None
        # 遍历每个 Transformer 层
        for i, layer_module in enumerate(self.layer):
            # 如果需要输出隐藏状态,则将当前隐藏状态添加到 all_hidden_states 中
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # 获取当前层的头部掩码,如果未提供则为 None
            layer_head_mask = head_mask[i] if head_mask is not None else None
            # 获取过去的键值对,如果未提供则为 None
            past_key_value = past_key_values[i] if past_key_values is not None else None

            # 如果启用了梯度检查点且处于训练模式下
            if self.gradient_checkpointing and self.training:
                # 使用梯度检查点函数调用当前层模块,传入相关参数
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )
            else:
                # 否则直接调用当前层模块,传入相关参数
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )

            # 更新隐藏状态为当前层模块的输出的第一个元素
            hidden_states = layer_outputs[0]
            # 如果 use_cache 设置为 True,则将当前层模块的输出的最后一个元素添加到 next_decoder_cache 中
            if use_cache:
                next_decoder_cache += (layer_outputs[-1],)
            # 如果需要输出注意力权重,则将当前层模块的输出的第二个元素添加到 all_self_attentions 中
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                # 如果模型配置中包含交叉注意力层,则将当前层模块的输出的第三个元素添加到 all_cross_attentions 中
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        # 对最终的隐藏状态应用 LayerNorm 归一化
        hidden_states = self.LayerNorm(hidden_states)

        # 如果需要输出隐藏状态,则将最终隐藏状态添加到 all_hidden_states 中
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        # 如果不需要返回字典形式的输出
        if not return_dict:
            # 返回包含非 None 值的元组,包括隐藏状态、下一个解码器缓存、所有隐藏状态、所有自注意力权重、所有交叉注意力权重
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        # 否则返回包含所有输出的 BaseModelOutputWithPastAndCrossAttentions 对象
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )
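
补充一个开启梯度检查点的用法草图(假设 XLMRobertaXLPreTrainedModel 与多数模型一样声明了 `supports_gradient_checkpointing = True`;配置参数为便于演示而缩小,属于假设值):

```python
from transformers import XLMRobertaXLConfig, XLMRobertaXLModel

config = XLMRobertaXLConfig(
    vocab_size=100, hidden_size=64, num_hidden_layers=2,
    num_attention_heads=4, intermediate_size=128,
)
model = XLMRobertaXLModel(config)
model.gradient_checkpointing_enable()          # PreTrainedModel 提供的接口
print(model.encoder.gradient_checkpointing)    # True:编码器中的标志被置位
```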

# Copied from transformers.models.bert.modeling_bert.BertPooler
class XLMRobertaXLPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0]
        # Pass the first token's hidden state through a linear layer
        pooled_output = self.dense(first_token_tensor)
        # Apply activation function (Tanh) to the pooled output
        pooled_output = self.activation(pooled_output)
        return pooled_output

class XLMRobertaXLPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = XLMRobertaXLConfig
    base_model_prefix = "roberta"

    # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            # Initialize weights of a linear layer with normal distribution
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                # Initialize biases to zeros
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            # Initialize embedding weights with normal distribution
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                # If a padding index is specified, zero out its embedding vector
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            # Initialize LayerNorm biases to zeros and weights to ones
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

XLM_ROBERTA_XL_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.) This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
    general usage and behavior.

    Parameters:
        config ([`XLMRobertaXLConfig`]): Model configuration class with all the parameters of the
            model. Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

XLM_ROBERTA_XL_INPUTS_DOCSTRING = r"""
Args:
    input_ids (`torch.LongTensor` of shape `({0})`):
        # 输入序列中词汇表中的标记索引。可以使用 [`AutoTokenizer`] 获得索引。参见 [`PreTrainedTokenizer.encode`] 和
        # [`PreTrainedTokenizer.__call__`] 获取详细信息。[什么是输入 ID?](../glossary#input-ids)
    attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
        # 避免在填充标记索引上执行注意力操作的掩码。掩码取值范围 `[0, 1]`:

        # - 1 表示**未被掩盖**的标记,
        # - 0 表示**被掩盖**的标记。
        [什么是注意力掩码?](../glossary#attention-mask)
    token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
        # 分段标记索引,指示输入的第一部分和第二部分。索引取值 `[0, 1]`:

        # - 0 对应*句子 A* 的标记,
        # - 1 对应*句子 B* 的标记。
        [什么是分段标记 ID?](../glossary#token-type-ids)
    position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
        # 输入序列中每个标记的位置索引,在位置嵌入中选择范围 `[0, config.max_position_embeddings - 1]`。[什么是位置 ID?](../glossary#position-ids)
    head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
        # 用于空置自注意力模块中选择头部的掩码。掩码取值范围 `[0, 1]`:

        # - 1 表示**未被掩盖**的头部,
        # - 0 表示**被掩盖**的头部。
    inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
        # 可选,可以直接传递嵌入表示,而不是传递 `input_ids`。如果要比模型内部的嵌入查找矩阵更精确地控制如何将 `input_ids` 索引转换为相关向量,则此选项很有用。
    output_attentions (`bool`, *optional*):
        # 是否返回所有注意力层的注意力张量。查看返回张量下的 `attentions` 获取更多详细信息。
    output_hidden_states (`bool`, *optional*):
        # 是否返回所有层的隐藏状态。查看返回张量下的 `hidden_states` 获取更多详细信息。
    return_dict (`bool`, *optional*):
        # 是否返回 [`~utils.ModelOutput`] 而不是简单的元组。
"""
# 定义 XLM-RoBERTa-XL 模型类,继承自 XLMRobertaXLPreTrainedModel
@add_start_docstrings(
    "The bare XLM-RoBERTa-XL Model transformer outputting raw hidden-states without any specific head on top.",
    XLM_ROBERTA_XL_START_DOCSTRING,
)
class XLMRobertaXLModel(XLMRobertaXLPreTrainedModel):
    """
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin. To behave as a decoder the model needs to be initialized with the `is_decoder`
    argument of the configuration set to `True`. To be used in a Seq2Seq model, the model needs to be initialized with
    both `is_decoder` argument and `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as
    an input to the forward pass. .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
    """

    # 从 transformers.models.bert.modeling_bert.BertModel.__init__ 复制并修改为 XLM-RoBERTa-XL
    def __init__(self, config, add_pooling_layer=True):
        # 调用父类构造函数
        super().__init__(config)
        # 初始化模型配置
        self.config = config

        # 初始化词嵌入层
        self.embeddings = XLMRobertaXLEmbeddings(config)
        # 初始化编码器
        self.encoder = XLMRobertaXLEncoder(config)

        # 添加池化层,如果指定要添加
        self.pooler = XLMRobertaXLPooler(config) if add_pooling_layer else None

        # 执行初始化权重和最终处理
        self.post_init()

    # 获取输入词嵌入
    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    # 设置输入词嵌入
    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    # 剪枝模型中的注意力头部
    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)
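
下面是 `_prune_heads` 的一个用法草图(通过 `PreTrainedModel.prune_heads` 间接调用;配置为假设的小型配置,仅作演示):

```python
from transformers import XLMRobertaXLConfig, XLMRobertaXLModel

config = XLMRobertaXLConfig(
    vocab_size=100, hidden_size=64, num_hidden_layers=2,
    num_attention_heads=4, intermediate_size=128,
)
model = XLMRobertaXLModel(config)
model.prune_heads({0: [0, 1]})  # 剪掉第 0 层的第 0、1 个注意力头
print(model.encoder.layer[0].attention.self.num_attention_heads)  # 2
```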

    @add_start_docstrings_to_model_forward(XLM_ROBERTA_XL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPoolingAndCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    # 从 transformers.models.bert.modeling_bert.BertModel.forward 复制
# 定义一个方法 `forward`,用于模型的前向传播
def forward(
    self,
    # 输入的 token IDs 张量,可选
    input_ids: Optional[torch.Tensor] = None,
    # 注意力掩码张量,指示输入中哪些是 padding 的,可选
    attention_mask: Optional[torch.Tensor] = None,
    # 分段 ID 张量,用于区分不同句子或片段,可选
    token_type_ids: Optional[torch.Tensor] = None,
    # 位置 ID 张量,标识输入中每个 token 的位置信息,可选
    position_ids: Optional[torch.Tensor] = None,
    # 头部掩码张量,用于屏蔽特定的注意力头部,可选
    head_mask: Optional[torch.Tensor] = None,
    # 嵌入的输入张量,用于直接提供输入的嵌入表示,可选
    inputs_embeds: Optional[torch.Tensor] = None,
    # 编码器的隐藏状态张量,可选
    encoder_hidden_states: Optional[torch.Tensor] = None,
    # 编码器的注意力掩码张量,用于屏蔽编码器注意力,可选
    encoder_attention_mask: Optional[torch.Tensor] = None,
    # 过去的键值列表,用于生成缓存,可选
    past_key_values: Optional[List[torch.FloatTensor]] = None,
    # 是否使用缓存,可选
    use_cache: Optional[bool] = None,
    # 是否输出注意力权重,可选
    output_attentions: Optional[bool] = None,
    # 是否输出隐藏状态,可选
    output_hidden_states: Optional[bool] = None,
    # 是否以字典形式返回结果,可选
    return_dict: Optional[bool] = None,
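
在进入下一个类之前,补充一个基座模型的调用草图(facebook/xlm-roberta-xl 体积很大,此处仅演示接口用法):

```python
import torch
from transformers import AutoTokenizer, XLMRobertaXLModel

tokenizer = AutoTokenizer.from_pretrained("facebook/xlm-roberta-xl")
model = XLMRobertaXLModel.from_pretrained("facebook/xlm-roberta-xl")

inputs = tokenizer("Hello world!", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (1, 序列长度, 2560)
```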

@add_start_docstrings(
"""XLM-RoBERTa-XL Model with a language modeling head on top for CLM fine-tuning.""",
XLM_ROBERTA_XL_START_DOCSTRING,
)
class XLMRobertaXLForCausalLM(XLMRobertaXLPreTrainedModel):
    _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

    def __init__(self, config):
        # 调用父类构造函数
        super().__init__(config)

        # 如果配置没有设为解码器,给出提示
        if not config.is_decoder:
            logger.warning("If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`")

        # 初始化 XLM-RoBERTa-XL 主体模型,不添加池化层
        self.roberta = XLMRobertaXLModel(config, add_pooling_layer=False)
        # 创建语言模型的头部
        self.lm_head = XLMRobertaXLLMHead(config)

        # 初始化模型权重
        self.init_weights()

    def get_output_embeddings(self):
        # 返回语言模型头部的解码器部分
        return self.lm_head.decoder

    def set_output_embeddings(self, new_embeddings):
        # 设置语言模型头部的解码器部分为新的嵌入层
        self.lm_head.decoder = new_embeddings

    @add_start_docstrings_to_model_forward(XLM_ROBERTA_XL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """
        模型的前向传播函数,支持条件语言建模(CLM)。
        """
        # 省略具体的前向传播逻辑
        pass

    # 为生成过程准备输入,包括输入的 ID、注意力掩码和过去的键值对
    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
        input_shape = input_ids.shape
        # 如果注意力掩码为空,则创建全为 1 的注意力掩码
        if attention_mask is None:
            attention_mask = input_ids.new_ones(input_shape)

        # 如果传入了过去的键值对,则裁剪输入 ID
        if past_key_values is not None:
            past_length = past_key_values[0][0].shape[2]

            # 有些生成方法可能只传入最后一个输入 ID
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                # 默认行为:只保留最后一个 ID
                remove_prefix_length = input_ids.shape[1] - 1

            input_ids = input_ids[:, remove_prefix_length:]

        # 返回输入字典,包含裁剪后的输入 ID、注意力掩码和过去的键值对
        return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}
    # 定义一个方法 `_reorder_cache`,用于在 beam search 时重新排序缓存中的过去键值
    def _reorder_cache(self, past_key_values, beam_idx):
        # 初始化一个空元组 `reordered_past`,用于存储重新排序后的过去键值
        reordered_past = ()
        # 遍历每个层级的过去键值
        for layer_past in past_key_values:
            # 对每个层级的过去状态按 beam_idx 重新排序,并添加到 `reordered_past` 中
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        # 返回重新排序后的过去键值 `reordered_past`
        return reordered_past
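
补充一个将该模型作为解码器做生成的草图:`prepare_inputs_for_generation` 与 `_reorder_cache` 会在 `generate()` 内部被调用。注意官方权重并非为自回归生成而训练,此处仅演示接口:

```python
from transformers import AutoTokenizer, XLMRobertaXLConfig, XLMRobertaXLForCausalLM

config = XLMRobertaXLConfig.from_pretrained("facebook/xlm-roberta-xl")
config.is_decoder = True  # 作为独立解码器使用时需要设置
tokenizer = AutoTokenizer.from_pretrained("facebook/xlm-roberta-xl")
model = XLMRobertaXLForCausalLM.from_pretrained("facebook/xlm-roberta-xl", config=config)

inputs = tokenizer("Hello", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=5, num_beams=2)
print(tokenizer.decode(outputs[0]))
```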

# 为 XLM-RoBERTa-XL 模型添加一个在顶部的语言建模头部
@add_start_docstrings(
    """XLM-RoBERTa-XL Model with a language modeling head on top.""", XLM_ROBERTA_XL_START_DOCSTRING
)
class XLMRobertaXLForMaskedLM(XLMRobertaXLPreTrainedModel):
    # 定义与语言建模头部共享权重的键列表
    _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

    def __init__(self, config):
        # 调用父类的初始化方法
        super().__init__(config)

        # 如果配置指定为解码器,发出警告,建议设置为双向自注意力模型
        if config.is_decoder:
            logger.warning(
                "If you want to use `RobertaForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )

        # 初始化 XLM-RoBERTa-XL 模型,不添加池化层
        self.roberta = XLMRobertaXLModel(config, add_pooling_layer=False)
        # 初始化语言建模头部
        self.lm_head = XLMRobertaXLLMHead(config)

        # 初始化模型权重
        self.init_weights()

    # 获取输出嵌入
    def get_output_embeddings(self):
        return self.lm_head.decoder

    # 设置输出嵌入
    def set_output_embeddings(self, new_embeddings):
        self.lm_head.decoder = new_embeddings

# 定义前向传播方法,接受一系列输入参数和返回值,并用注释来描述每个参数和返回值的含义
@add_start_docstrings_to_model_forward(XLM_ROBERTA_XL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
    checkpoint=_CHECKPOINT_FOR_DOC,
    output_type=MaskedLMOutput,
    config_class=_CONFIG_FOR_DOC,
    mask="<mask>",
)
def forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.FloatTensor] = None,
    token_type_ids: Optional[torch.LongTensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    head_mask: Optional[torch.FloatTensor] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    encoder_hidden_states: Optional[torch.Tensor] = None,
    encoder_attention_mask: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    ) -> Union[Tuple, MaskedLMOutput]:
    r"""
    labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
        config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
        loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
    kwargs (`Dict[str, any]`, optional, defaults to *{}*):
        Used to hide legacy arguments that have been deprecated.
    """
    # 根据 return_dict 参数确定是否返回字典类型的输出
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    # 将输入传递给 RoBERTa 模型进行前向传播,获取输出
    outputs = self.roberta(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        encoder_hidden_states=encoder_hidden_states,
        encoder_attention_mask=encoder_attention_mask,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    # 获取 RoBERTa 模型的序列输出
    sequence_output = outputs[0]
    # 使用语言模型头部对序列输出进行预测得分计算
    prediction_scores = self.lm_head(sequence_output)

    masked_lm_loss = None
    # 如果提供了 labels,计算掩码语言模型的损失
    if labels is not None:
        loss_fct = CrossEntropyLoss()
        masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

    # 如果 return_dict 为 False,则按照非字典类型的方式返回输出
    if not return_dict:
        output = (prediction_scores,) + outputs[2:]
        return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

    # 如果 return_dict 为 True,则按照 MaskedLMOutput 类型返回输出
    return MaskedLMOutput(
        loss=masked_lm_loss,
        logits=prediction_scores,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )
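
补充一个掩码语言建模的用法草图,预测 `<mask>` 位置的词:

```python
import torch
from transformers import AutoTokenizer, XLMRobertaXLForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("facebook/xlm-roberta-xl")
model = XLMRobertaXLForMaskedLM.from_pretrained("facebook/xlm-roberta-xl")

inputs = tokenizer("Paris is the <mask> of France.", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

mask_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
predicted_id = logits[0, mask_index].argmax(dim=-1)
print(tokenizer.decode(predicted_id))  # 预期为与 "capital" 含义相近的词
```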

class XLMRobertaXLLMHead(nn.Module):
"""XLM-RoBERTa-XL Head for masked language modeling."""

def __init__(self, config):
    super().__init__()
    # 线性层,用于将输入特征从隐藏大小转换为隐藏大小
    self.dense = nn.Linear(config.hidden_size, config.hidden_size)
    # 归一化层,对输入进行归一化处理
    self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    # 解码层,将隐藏特征映射到词汇表大小
    self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
    # 偏置参数,与解码层相关联
    self.bias = nn.Parameter(torch.zeros(config.vocab_size))
    self.decoder.bias = self.bias

def forward(self, features, **kwargs):
    # 线性转换
    x = self.dense(features)
    # GELU激活函数
    x = gelu(x)
    # 归一化
    x = self.layer_norm(x)

    # 使用解码层映射到词汇表大小
    x = self.decoder(x)

    return x

def _tie_weights(self):
    # 如果偏置被重新设置(如在TPU上或偏置大小变化时),将解码层的偏置与模型的偏置参数关联
    self.bias = self.decoder.bias

@add_start_docstrings(
"""
XLM-RoBERTa-XL Model transformer with a sequence classification/regression head on top (a linear layer on top
of the pooled output) e.g. for GLUE tasks.
""",
XLM_ROBERTA_XL_START_DOCSTRING,
)
class XLMRobertaXLForSequenceClassification(XLMRobertaXLPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        # 类别数目
        self.num_labels = config.num_labels
        self.config = config

        # XLM-RoBERTa-XL 模型,不添加池化层
        self.roberta = XLMRobertaXLModel(config, add_pooling_layer=False)
        # 分类头,用于在顶部进行序列分类
        self.classifier = XLMRobertaXLClassificationHead(config)

        # 初始化模型权重
        self.init_weights()

@add_start_docstrings_to_model_forward(XLM_ROBERTA_XL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
    checkpoint=_CHECKPOINT_FOR_DOC,
    output_type=SequenceClassifierOutput,
    config_class=_CONFIG_FOR_DOC,
)
def forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.FloatTensor] = None,
    token_type_ids: Optional[torch.LongTensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    head_mask: Optional[torch.FloatTensor] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    # 模型前向传播函数,详细参数说明见上方输入文档字符串
) -> Union[Tuple, SequenceClassifierOutput]:
    r"""
    labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
        Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
        config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
        `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
    """
    # 确保 return_dict 不为 None,则使用 self.config.use_return_dict,否则设为 None
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    # 使用 Roberta 模型处理输入数据,并获取输出
    outputs = self.roberta(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    # 从 RoBERTa 输出中获取序列输出
    sequence_output = outputs[0]
    # 使用分类器对序列输出进行分类得到 logits
    logits = self.classifier(sequence_output)

    loss = None
    # 如果 labels 不为 None,则计算损失
    if labels is not None:
        # 如果未指定问题类型,则根据条件自动确定
        if self.config.problem_type is None:
            if self.num_labels == 1:
                self.config.problem_type = "regression"
            elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                self.config.problem_type = "single_label_classification"
            else:
                self.config.problem_type = "multi_label_classification"

        # 根据问题类型选择相应的损失函数进行计算
        if self.config.problem_type == "regression":
            loss_fct = MSELoss()  # 使用均方误差损失函数
            if self.num_labels == 1:
                loss = loss_fct(logits.squeeze(), labels.squeeze())
            else:
                loss = loss_fct(logits, labels)
        elif self.config.problem_type == "single_label_classification":
            loss_fct = CrossEntropyLoss()  # 使用交叉熵损失函数
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        elif self.config.problem_type == "multi_label_classification":
            loss_fct = BCEWithLogitsLoss()  # 使用带 logits 的二元交叉熵损失函数
            loss = loss_fct(logits, labels)

    # 如果不要求返回字典,则返回输出和损失
    if not return_dict:
        output = (logits,) + outputs[2:]
        return ((loss,) + output) if loss is not None else output

    # 返回带有损失、logits、隐藏状态和注意力权重的序列分类器输出对象
    return SequenceClassifierOutput(
        loss=loss,
        logits=logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )
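
补充一个序列分类头的训练前向草图(使用假设的小型随机初始化配置,仅演示 `problem_type` 的自动判定与损失计算):

```python
import torch
from transformers import XLMRobertaXLConfig, XLMRobertaXLForSequenceClassification

config = XLMRobertaXLConfig(
    vocab_size=100, hidden_size=64, num_hidden_layers=2,
    num_attention_heads=4, intermediate_size=128, num_labels=3,
)
model = XLMRobertaXLForSequenceClassification(config)

input_ids = torch.tensor([[0, 10, 20, 2]])
labels = torch.tensor([1])                  # 整数标签 -> single_label_classification
outputs = model(input_ids=input_ids, labels=labels)
print(outputs.logits.shape)                 # torch.Size([1, 3])
print(config.problem_type)                  # "single_label_classification"
```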

"""
XLM-RoBERTa-XL Model with a multiple choice classification head on top (a linear layer on top of the pooled
output and a softmax) e.g. for RocStories/SWAG tasks.
"""

基于 XLM-RoBERTa-XL 模型,添加一个多选分类头部(池化输出之上的线性层和 softmax),例如用于 RocStories/SWAG 任务

@add_start_docstrings(
XLM_ROBERTA_XL_START_DOCSTRING,
)

XLM-RoBERTa-XLForMultipleChoice 类定义,继承自 XLMRobertaXLPreTrainedModel

class XLMRobertaXLForMultipleChoice(XLMRobertaXLPreTrainedModel):

# 初始化方法
def __init__(self, config):
    # 调用父类的初始化方法
    super().__init__(config)
    
    # 初始化 XLM-RoBERTa-XL 模型
    self.roberta = XLMRobertaXLModel(config)
    
    # Dropout 层
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    
    # 分类器:线性层,输入尺寸为隐藏层大小,输出尺寸为 1,即为每个候选项打一个分(而非二元分类)
    self.classifier = nn.Linear(config.hidden_size, 1)
    
    # 初始化权重
    self.init_weights()

# forward 方法
@add_start_docstrings_to_model_forward(
    XLM_ROBERTA_XL_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
)
@add_code_sample_docstrings(
    checkpoint=_CHECKPOINT_FOR_DOC,
    output_type=MultipleChoiceModelOutput,
    config_class=_CONFIG_FOR_DOC,
)
def forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    token_type_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    head_mask: Optional[torch.FloatTensor] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    # 参数说明(详见上方输入文档字符串):input_ids 为输入的 token IDs,token_type_ids 为 token 类型 IDs,
    # attention_mask 为注意力遮罩,labels 为标签,position_ids 为位置 IDs,head_mask 为头部遮罩,
    # inputs_embeds 为输入嵌入;output_attentions、output_hidden_states、return_dict 控制输出内容与格式。
    # 返回值:MultipleChoiceModelOutput,多选分类模型的输出。
    ) -> Union[Tuple, MultipleChoiceModelOutput]:
    r"""
    labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
        Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
        num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
        `input_ids` above)
    """
    # Resolve return_dict: fall back to the config default when not explicitly set
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
    # The number of choices is the size of the second dimension of input_ids (or inputs_embeds)
    num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

    # Flatten input_ids to 2D (batch_size * num_choices, seq_len), if provided
    flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
    # Flatten position_ids to 2D, if provided
    flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
    # Flatten token_type_ids to 2D, if provided
    flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
    # Flatten attention_mask to 2D, if provided
    flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
    # Flatten inputs_embeds to 3D (batch_size * num_choices, seq_len, hidden_size), if provided
    flat_inputs_embeds = (
        inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
        if inputs_embeds is not None
        else None
    )

    # Run the RoBERTa encoder on the flattened inputs
    outputs = self.roberta(
        flat_input_ids,
        position_ids=flat_position_ids,
        token_type_ids=flat_token_type_ids,
        attention_mask=flat_attention_mask,
        head_mask=head_mask,
        inputs_embeds=flat_inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    # Take the pooled output from the encoder
    pooled_output = outputs[1]

    # Apply dropout to the pooled output
    pooled_output = self.dropout(pooled_output)
    # Score each choice with the classifier to get logits
    logits = self.classifier(pooled_output)
    # Reshape the logits to (batch_size, num_choices)
    reshaped_logits = logits.view(-1, num_choices)

    # If labels are provided, compute the cross-entropy loss over the choices
    loss = None
    if labels is not None:
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(reshaped_logits, labels)

    # Without return_dict, return a tuple of reshaped_logits plus any extra outputs
    if not return_dict:
        output = (reshaped_logits,) + outputs[2:]
        return ((loss,) + output) if loss is not None else output

    # Otherwise, wrap everything in a MultipleChoiceModelOutput
    return MultipleChoiceModelOutput(
        loss=loss,
        logits=reshaped_logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )
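
As a quick sanity check of the (batch_size, num_choices, seq_len) convention handled by the flattening and reshaping above, here is a hedged usage sketch; the checkpoint name is illustrative, and any public XLM-RoBERTa-XL checkpoint with a compatible config would do (note the XL model is several billion parameters, so this is a sketch rather than a recommended workflow):

import torch
from transformers import AutoTokenizer, XLMRobertaXLForMultipleChoice

tokenizer = AutoTokenizer.from_pretrained("facebook/xlm-roberta-xl")  # illustrative checkpoint
model = XLMRobertaXLForMultipleChoice.from_pretrained("facebook/xlm-roberta-xl")

prompt = "The chef tasted the soup and"
choices = ["added more salt.", "flew to the moon."]

# Encode each (prompt, choice) pair, then add a batch dimension:
# every tensor becomes (batch_size=1, num_choices, seq_len)
enc = tokenizer([prompt] * len(choices), choices, return_tensors="pt", padding=True)
inputs = {k: v.unsqueeze(0) for k, v in enc.items()}
labels = torch.tensor([0])  # index of the correct choice

outputs = model(**inputs, labels=labels)
print(outputs.logits.shape)  # torch.Size([1, 2]) -> one score per choice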

@add_start_docstrings(
"""
XLM-RoBERTa-XL Model with a token classification head on top (a linear layer on top of the hidden-states
output) e.g. for Named-Entity-Recognition (NER) tasks.
""",
XLM_ROBERTA_XL_START_DOCSTRING,
)
class XLMRobertaXLForTokenClassification(XLMRobertaXLPreTrainedModel):
"""
XLM-RoBERTa-XL模型,顶部带有一个标记分类头部(在隐藏状态输出之上的线性层),例如用于命名实体识别(NER)任务。
"""

def __init__(self, config):
    super().__init__(config)
    self.num_labels = config.num_labels

    # The XLM-RoBERTa-XL encoder, without the pooling layer
    self.roberta = XLMRobertaXLModel(config, add_pooling_layer=False)

    # Use the classifier-specific dropout if configured, otherwise fall back to the hidden dropout
    classifier_dropout = (
        config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
    )
    self.dropout = nn.Dropout(classifier_dropout)

    # Linear classifier mapping hidden_size to the number of labels
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    # Initialize the model weights
    self.init_weights()

@add_start_docstrings_to_model_forward(XLM_ROBERTA_XL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
    checkpoint=_CHECKPOINT_FOR_DOC,
    output_type=TokenClassifierOutput,
    config_class=_CONFIG_FOR_DOC,
)
def forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.FloatTensor] = None,
    token_type_ids: Optional[torch.LongTensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    head_mask: Optional[torch.FloatTensor] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple, TokenClassifierOutput]:
    r"""
    labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
    """
    # Resolve return_dict: fall back to the config default when not explicitly set
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    # Run the RoBERTa encoder
    outputs = self.roberta(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    # The sequence output (hidden states of the last layer)
    sequence_output = outputs[0]

    # Apply dropout to the sequence output
    sequence_output = self.dropout(sequence_output)

    # Classify each token of the processed sequence output
    logits = self.classifier(sequence_output)

    loss = None
    if labels is not None:
        loss_fct = CrossEntropyLoss()
        # Only keep the part of the loss corresponding to active positions of the attention_mask
        if attention_mask is not None:
            active_loss = attention_mask.view(-1) == 1
            active_logits = logits.view(-1, self.num_labels)
            active_labels = torch.where(
                active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
            )
            loss = loss_fct(active_logits, active_labels)
        else:
            # Compute the loss over all logits and labels
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

    if not return_dict:
        # Return a plain tuple when return_dict is disabled
        output = (logits,) + outputs[2:]
        return ((loss,) + output) if loss is not None else output

    # Otherwise, build and return a TokenClassifierOutput
    return TokenClassifierOutput(
        loss=loss,
        logits=logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )
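
The masking trick in the loss above (rewriting inactive positions to ignore_index so CrossEntropyLoss skips them) can be seen in isolation with toy tensors; the shapes and values below are assumptions chosen purely for illustration:

import torch
from torch.nn import CrossEntropyLoss

num_labels = 3
logits = torch.randn(2, 4, num_labels)               # (batch_size, seq_len, num_labels)
labels = torch.tensor([[1, 0, 2, 2], [0, 1, 2, 0]])  # (batch_size, seq_len)
attention_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])

loss_fct = CrossEntropyLoss()  # ignore_index defaults to -100
active_loss = attention_mask.view(-1) == 1
# Labels at padded positions are replaced with ignore_index, so they do not contribute to the loss
active_labels = torch.where(
    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
)
loss = loss_fct(logits.view(-1, num_labels), active_labels)
print(loss)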

class XLMRobertaXLClassificationHead(nn.Module):
"""Head for sentence-level classification tasks."""

def __init__(self, config):
    super().__init__()
    # A fully connected layer whose input and output size are both config.hidden_size
    self.dense = nn.Linear(config.hidden_size, config.hidden_size)
    # Use the classifier-specific dropout rate if configured, otherwise the hidden dropout rate
    classifier_dropout = (
        config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
    )
    self.dropout = nn.Dropout(classifier_dropout)
    # Output projection from config.hidden_size to config.num_labels
    self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

def forward(self, features, **kwargs):
    # Take the hidden state of the first token, i.e. the <s> token (equivalent to [CLS])
    x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
    x = self.dropout(x)  # dropout on the pooled hidden state
    x = self.dense(x)  # dense projection
    x = torch.tanh(x)  # tanh activation
    x = self.dropout(x)  # dropout again
    x = self.out_proj(x)  # project to label scores
    return x
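
A toy forward pass (with a stand-in config object, purely for illustration) shows that the head only reads the first (<s>) position of the sequence output and maps it to num_labels scores:

import torch
from types import SimpleNamespace

# Stand-in config carrying only the fields the head reads; the values are arbitrary
toy_config = SimpleNamespace(
    hidden_size=8, num_labels=2, classifier_dropout=None, hidden_dropout_prob=0.1
)
head = XLMRobertaXLClassificationHead(toy_config)

features = torch.randn(4, 10, toy_config.hidden_size)  # (batch_size, seq_len, hidden_size)
print(head(features).shape)  # torch.Size([4, 2])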

@add_start_docstrings(
"""
XLM-RoBERTa-XL Model with a span classification head on top for extractive question-answering tasks like SQuAD
(linear layers on top of the hidden-states output to compute span start logits and span end logits).
""",
XLM_ROBERTA_XL_START_DOCSTRING,
)
class XLMRobertaXLForQuestionAnswering(XLMRobertaXLPreTrainedModel):
def __init__(self, config):
    super().__init__(config)
    self.num_labels = config.num_labels

    # The XLM-RoBERTa-XL encoder, without the pooling layer
    self.roberta = XLMRobertaXLModel(config, add_pooling_layer=False)
    # QA output layer: a linear layer mapping config.hidden_size to config.num_labels (start/end logits)
    self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

    self.init_weights()  # Initialize the model weights

@add_start_docstrings_to_model_forward(XLM_ROBERTA_XL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
    checkpoint=_CHECKPOINT_FOR_DOC,
    output_type=QuestionAnsweringModelOutput,
    config_class=_CONFIG_FOR_DOC,
)
def forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.FloatTensor] = None,
    token_type_ids: Optional[torch.LongTensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    head_mask: Optional[torch.FloatTensor] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    start_positions: Optional[torch.LongTensor] = None,
    end_positions: Optional[torch.LongTensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple, QuestionAnsweringModelOutput]:
    r"""
    start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
        Labels for position (index) of the start of the labelled span for computing the token classification loss.
        Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
        are not taken into account for computing the loss.
    end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
        Labels for position (index) of the end of the labelled span for computing the token classification loss.
        Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
        are not taken into account for computing the loss.
    """
    # Initialize return_dict to either the provided value or the default from model configuration
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    # Forward pass through the Roberta model with specified inputs and optional arguments
    outputs = self.roberta(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    # Extract the sequence output from the model outputs
    sequence_output = outputs[0]

    # Pass sequence output through QA output layer to get logits
    logits = self.qa_outputs(sequence_output)
    
    # Split logits into start and end logits along the last dimension
    start_logits, end_logits = logits.split(1, dim=-1)
    
    # Squeeze out unnecessary dimensions and ensure contiguous memory layout
    start_logits = start_logits.squeeze(-1).contiguous()
    end_logits = end_logits.squeeze(-1).contiguous()

    total_loss = None
    if start_positions is not None and end_positions is not None:
        # Adjust start_positions and end_positions if they have extra dimensions
        if len(start_positions.size()) > 1:
            start_positions = start_positions.squeeze(-1)
        if len(end_positions.size()) > 1:
            end_positions = end_positions.squeeze(-1)
        
        # Clamp positions to ignore indices outside of model input length
        ignored_index = start_logits.size(1)
        start_positions = start_positions.clamp(0, ignored_index)
        end_positions = end_positions.clamp(0, ignored_index)

        # Compute CrossEntropyLoss for start and end positions
        loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
        start_loss = loss_fct(start_logits, start_positions)
        end_loss = loss_fct(end_logits, end_positions)
        
        # Calculate total loss as the average of start and end losses
        total_loss = (start_loss + end_loss) / 2

    if not return_dict:
        # Prepare output tuple if return_dict is False
        output = (start_logits, end_logits) + outputs[2:]
        return ((total_loss,) + output) if total_loss is not None else output

    # Return structured output using QuestionAnsweringModelOutput class if return_dict is True
    return QuestionAnsweringModelOutput(
        loss=total_loss,
        start_logits=start_logits,
        end_logits=end_logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )
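
Decoding an answer span from the start/end logits produced above typically looks like the sketch below; the checkpoint name is illustrative, and since the base checkpoint carries no fine-tuned QA head, the decoded span is only meaningful for a fine-tuned model:

import torch
from transformers import AutoTokenizer, XLMRobertaXLForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("facebook/xlm-roberta-xl")  # illustrative checkpoint
model = XLMRobertaXLForQuestionAnswering.from_pretrained("facebook/xlm-roberta-xl")

question = "Where do penguins live?"
context = "Penguins live almost exclusively in the Southern Hemisphere."
inputs = tokenizer(question, context, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

# Greedy span decoding: take the argmax of the start and end logits
start = int(outputs.start_logits.argmax())
end = int(outputs.end_logits.argmax())
print(tokenizer.decode(inputs["input_ids"][0, start : end + 1]))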

# Helper: build position IDs from input_ids for the model's position embeddings. Non-padding tokens get
# position numbers counting up from padding_idx + 1; padding tokens are ignored. Adapted from fairseq's
# utils.make_positions.

def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's utils.make_positions.

Args:
    input_ids: torch.Tensor, the input token IDs
    padding_idx: int, the index of the padding token
    past_key_values_length: int, length of the past key values, used for incremental indexing

Returns:
    torch.Tensor of position IDs
"""
# Mask that is 1 at non-padding positions and 0 at padding positions
mask = input_ids.ne(padding_idx).int()
# Incremental position indices; the type casts and cumsum are deliberately arranged
# to stay compatible with ONNX export and XLA
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
# The final position IDs are the incremental indices offset by padding_idx, cast to long
return incremental_indices.long() + padding_idx
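
A small worked example (padding_idx=1, the RoBERTa convention) makes the arithmetic concrete:

import torch

input_ids = torch.tensor([[0, 31414, 232, 2, 1, 1]])  # the last two tokens are padding
print(create_position_ids_from_input_ids(input_ids, padding_idx=1))
# tensor([[2, 3, 4, 5, 1, 1]]): non-padding tokens count up from padding_idx + 1,
# while padding positions keep padding_idx itself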