Transformers Source Code Walkthrough (Part 134)

.\tf_utils.py

# Import the required libraries and modules
from typing import List, Optional, Union

import numpy as np  # NumPy, used for numerical computation
import tensorflow as tf  # TensorFlow, used to build and train models

from .feature_extraction_utils import BatchFeature  # Feature-extractor output container
from .tokenization_utils_base import BatchEncoding  # Tokenizer output container
from .utils import logging  # The library's logging utilities

logger = logging.get_logger(__name__)  # Logger for the current module

def shape_list(tensor: Union[tf.Tensor, np.ndarray]) -> List[int]:
    """
    处理 TensorFlow 中的动态形状。

    Args:
        tensor (`tf.Tensor` or `np.ndarray`): 要获取形状的张量或数组。

    Returns:
        `List[int]`: 张量的形状列表。
    """
    if isinstance(tensor, np.ndarray):
        return list(tensor.shape)  # 返回数组的形状列表

    dynamic = tf.shape(tensor)  # 获取 TensorFlow 张量的动态形状

    if tensor.shape == tf.TensorShape(None):  # 如果张量的静态形状未知
        return dynamic  # 返回动态形状

    static = tensor.shape.as_list()  # 获取张量的静态形状列表

    return [dynamic[i] if s is None else s for i, s in enumerate(static)]  # 返回静态形状或动态形状的组合
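
# Example (illustrative sketch, not part of the library source): inside a `tf.function` the
# batch dimension is often only known at runtime, so `shape_list` returns a mix of Python
# ints (static dims) and scalar tensors (dynamic dims). `_shape_list_demo` is a made-up name.
@tf.function(input_signature=[tf.TensorSpec(shape=[None, 8], dtype=tf.float32)])
def _shape_list_demo(x):
    batch, features = shape_list(x)  # batch is a scalar tensor, features is the int 8
    return tf.zeros([batch, features * 2])
# _shape_list_demo(tf.ones((3, 8)))  # works for any batch size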

def stable_softmax(logits: tf.Tensor, axis: Optional[int] = None, name: Optional[str] = None) -> tf.Tensor:
    """
    稳定的 softmax 函数,用于解决 TensorFlow 在 CPU 上与 XLA 结合时的问题。

    Args:
        logits (`tf.Tensor`): 输入的对数概率张量。
        axis (`int`, *optional*): 执行 softmax 操作的维度,默认为 -1 表示最后一个维度。
        name (`str`, *optional*): 操作的名称。

    Returns:
        `tf.Tensor`: 与 logits 具有相同类型和形状的张量,经过 softmax 处理。
    """
    # TODO: 当上述问题得到解决后,检查 TensorFlow 版本并使用原始函数,最终移除这个函数。
    return tf.nn.softmax(logits=logits + 1e-9, axis=axis, name=name)  # 添加一个小量以确保数值稳定性后进行 softmax 操作

def functional_layernorm(inputs, weight, bias, epsilon=1e-5, axis=-1):
    # This is a simplified functional layernorm, designed to duplicate the behavior of PyTorch's
    # nn.functional.layer_norm when it is needed to port models to TensorFlow.
    # Only 1-D weight/bias tensors and a single integer axis are supported; anything else raises.
    if weight.shape.rank != 1 or bias.shape.rank != 1 or not isinstance(axis, int):
        raise NotImplementedError("Only 1D weight and bias tensors are supported for now, with only a single axis.")

    # Compute the mean and variance of the inputs along the requested axis
    mean, variance = tf.nn.moments(inputs, axes=[axis], keepdims=True)

    if axis != -1:
        # If the normalization axis is not the last one, reshape weight and bias so that they have the same rank as
        # the inputs, with size 1 on every dimension except the normalization axis
        shape = [1] * inputs.shape.rank
        shape[axis] = shape_list(inputs)[axis]
        weight = tf.reshape(weight, shape)
        bias = tf.reshape(bias, shape)

    # Compute layer normalization using the batch normalization primitive
    outputs = tf.nn.batch_normalization(
        inputs,
        mean,
        variance,
        offset=bias,
        scale=weight,
        variance_epsilon=epsilon,
    )
    # Return the normalized output
    return outputs
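
# Example (illustrative sketch, not part of the library source): with unit weight and zero bias,
# `functional_layernorm` should agree with `tf.keras.layers.LayerNormalization` using the same
# epsilon. The `_ln_demo_*` names are made up for the demo.
_ln_demo_x = tf.random.normal((2, 4, 16))
_ln_demo_out = functional_layernorm(_ln_demo_x, tf.ones((16,)), tf.zeros((16,)), axis=-1)
_ln_demo_ref = tf.keras.layers.LayerNormalization(axis=-1, epsilon=1e-5)(_ln_demo_x)
# tf.debugging.assert_near(_ln_demo_out, _ln_demo_ref, atol=1e-4)  # should hold up to numerical precision
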
def flatten(input, start_dim=0, end_dim=-1):
    # Replicates the behavior of torch.flatten in TF

    # If end_dim or start_dim is negative, count them from the end
    if end_dim < 0:
        end_dim += input.shape.rank
    if start_dim < 0:
        start_dim += input.shape.rank

    # Return input tensor if start_dim equals end_dim
    if start_dim == end_dim:
        return input

    # Get the shape of the input tensor
    in_shape = tf.shape(input)
    
    # Calculate the total size of the flattened dimensions
    flattened_dim = tf.math.reduce_prod(in_shape[start_dim : end_dim + 1])
    
    # Construct the output shape with the flattened dimensions
    out_shape = tf.concat([in_shape[:start_dim], [flattened_dim], in_shape[end_dim + 1 :]], axis=0)
    
    # Reshape the input tensor to the calculated output shape
    return tf.reshape(input, out_shape)
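
# Example (illustrative sketch, not part of the library source): `flatten` mirrors `torch.flatten`,
# collapsing dimensions `start_dim` through `end_dim` into a single one.
_flatten_demo = flatten(tf.zeros((2, 3, 4, 5)), start_dim=1, end_dim=2)
# _flatten_demo.shape == (2, 12, 5), matching torch.flatten(x, 1, 2)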


def invert_attention_mask(encoder_attention_mask: tf.Tensor) -> tf.Tensor:
    """
    Invert an attention mask (e.g., switches 0. and 1.).

    Args:
        encoder_attention_mask (`tf.Tensor`): An attention mask.

    Returns:
        `tf.Tensor`: The inverted attention mask.
    """
    if not isinstance(encoder_attention_mask, tf.Tensor):
        encoder_attention_mask = tf.convert_to_tensor(encoder_attention_mask)  # Catches stray NumPy inputs
    
    # Extend the attention mask tensor based on its rank
    if encoder_attention_mask.shape.rank == 3:
        encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
    if encoder_attention_mask.shape.rank == 2:
        encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
    
    # Invert the extended attention mask values
    encoder_extended_attention_mask = (
        tf.cast(1, encoder_attention_mask.dtype) - encoder_extended_attention_mask
    ) * encoder_extended_attention_mask.dtype.min

    return encoder_extended_attention_mask
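
# Example (illustrative sketch, not part of the library source): a 2-D padding mask becomes a
# broadcastable additive bias, 0 where attention is allowed and a very large negative number
# (dtype.min, about -3.4e38 for float32) where attention is masked out.
_mask_demo = invert_attention_mask(tf.constant([[1.0, 1.0, 0.0]]))
# _mask_demo.shape == (1, 1, 1, 3); its values are [0., 0., float32 min]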


def check_embeddings_within_bounds(tensor: tf.Tensor, embed_dim: int, tensor_name: str = "input_ids") -> None:
    """
    `tf.gather`, on which TF embedding layers are based, won't check positive out of bound indices on GPU, returning
    zeros instead. This function adds a check against that dangerous silent behavior.

    Args:
        tensor (`tf.Tensor`): The tensor of indices to check.
        embed_dim (`int`): The embedding dimension.
        tensor_name (`str`, *optional*): The name of the tensor to use in the error message.
    """
    # Assert that all indices in tensor are less than embed_dim
    tf.debugging.assert_less(
        tensor,
        tf.cast(embed_dim, dtype=tensor.dtype),
        message=(
            f"The maximum value of {tensor_name} ({tf.math.reduce_max(tensor)}) must be smaller than the embedding "
            f"layer's input dimension ({embed_dim}). The likely cause is some problem at tokenization time."
        ),
    )
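
# Example (illustrative sketch, not part of the library source): with a vocabulary of size 10, an
# input id of 12 would silently gather zeros on GPU; this check turns it into a clear error.
# The call is left commented because it raises on purpose:
# check_embeddings_within_bounds(tf.constant([[1, 2, 12]]), embed_dim=10)  # raises InvalidArgumentError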


def save_attributes_to_hdf5_group(group, name, data):
    """Saves attributes (data) of the specified name into the HDF5 group.
    
    This function saves attributes (data) with a given name into the specified HDF5 group.

    Args:
        group: HDF5 group where the attributes will be saved.
        name: Name of the attribute to save.
        data: Data to be saved as the attribute.
    """
    """
    This method deals with an inherent problem of HDF5 file which is not able to store data larger than
    HDF5_OBJECT_HEADER_LIMIT bytes.

    Args:
        group: A pointer to a HDF5 group.
        name: A name of the attributes to save.
        data: Attributes data to store.

    Raises:
        RuntimeError: If any single attribute is too large to be saved.

    Copied from Keras to Transformers to avoid versioning issues.
    """
    # Limit on the size of an HDF5 object header, in bytes
    HDF5_OBJECT_HEADER_LIMIT = 64512

    # Check that no single item is larger than HDF5_OBJECT_HEADER_LIMIT bytes;
    # if one is, it cannot be saved no matter how the data is chunked
    bad_attributes = [x for x in data if len(x) > HDF5_OBJECT_HEADER_LIMIT]

    # Raise a RuntimeError if any attribute exceeds the limit
    if bad_attributes:
        raise RuntimeError(
            "The following attributes cannot be saved to HDF5 file because "
            f"they are larger than {HDF5_OBJECT_HEADER_LIMIT} "
            f"bytes: {bad_attributes}"
        )

    # Convert the data to a NumPy array
    data_npy = np.asarray(data)

    # Start with a single chunk
    num_chunks = 1

    # Split the data into chunks
    chunked_data = np.array_split(data_npy, num_chunks)

    # Keep increasing the number of chunks until every chunk fits under HDF5_OBJECT_HEADER_LIMIT
    while any(x.nbytes > HDF5_OBJECT_HEADER_LIMIT for x in chunked_data):
        num_chunks += 1
        chunked_data = np.array_split(data_npy, num_chunks)

    # If more than one chunk is needed, save each chunk under a numbered attribute name
    if num_chunks > 1:
        for chunk_id, chunk_data in enumerate(chunked_data):
            group.attrs["%s%d" % (name, chunk_id)] = chunk_data
    else:
        # Otherwise save the data directly under the given name
        group.attrs[name] = data


# Load attributes with the given name from the specified HDF5 group
def load_attributes_from_hdf5_group(group, name):
    """Loads attributes of the specified name from the HDF5 group.

    This method deals with an inherent problem of HDF5 file which is not able to store data larger than
    HDF5_OBJECT_HEADER_LIMIT bytes.

    Args:
        group: A pointer to a HDF5 group.
        name: A name of the attributes to load.

    Returns:
        data: Attributes data.

    Copied from Keras to Transformers to avoid versioning issues.
    """
    # Check whether the attribute name exists directly in the HDF5 group's attributes
    if name in group.attrs:
        # Decode byte attributes to UTF-8 strings; leave other values as-is
        data = [n.decode("utf8") if hasattr(n, "decode") else n for n in group.attrs[name]]
    else:
        data = []  # The attribute was chunked (or is absent); start with an empty list
        chunk_id = 0
        # Keep reading attributes named `name + chunk_id` until no more chunks are found
        while "%s%d" % (name, chunk_id) in group.attrs:
            # Decode each chunk's byte entries to UTF-8 strings and extend the data list
            data.extend(
                [n.decode("utf8") if hasattr(n, "decode") else n for n in group.attrs["%s%d" % (name, chunk_id)]]
            )
            chunk_id += 1
    # Return the loaded attribute data
    return data
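

def _hdf5_attrs_demo(path="demo_weights.h5"):
    # Example (illustrative sketch, not part of the library source): round-trip a list of layer
    # names through the two helpers above using h5py. The file and group names are made up.
    import h5py

    with h5py.File(path, "w") as f:
        group = f.create_group("model")
        save_attributes_to_hdf5_group(group, "layer_names", [b"dense_1", b"dense_2"])

    with h5py.File(path, "r") as f:
        names = load_attributes_from_hdf5_group(f["model"], "layer_names")
    return names  # ["dense_1", "dense_2"]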


# Expand 1-dimensional tensors into 2-dimensional tensors
def expand_1d(data):
    """Expands 1-dimensional `Tensor`s into 2-dimensional `Tensor`s.
    Copied from Keras to here to avoid versioning issues."""

    def _expand_single_1d_tensor(t):
        # If the input is a 1-D TensorFlow tensor, add a trailing dimension to make it 2-D
        if isinstance(t, tf.Tensor) and t.shape.rank == 1:
            return tf.expand_dims(t, axis=-1)
        return t

    # Apply _expand_single_1d_tensor to every element of the (possibly nested) data structure
    return tf.nest.map_structure(_expand_single_1d_tensor, data)
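
# Example (illustrative sketch, not part of the library source): Keras losses expect labels with a
# trailing feature dimension, so a 1-D label tensor of shape (8,) becomes shape (8, 1).
_expand_demo = expand_1d({"labels": tf.zeros((8,), dtype=tf.int32)})
# _expand_demo["labels"].shape == (8, 1)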


# Convert HF BatchEncoding/BatchFeature objects into plain dicts that Keras understands
def convert_batch_encoding(*args, **kwargs):
    # If the first positional argument is a BatchEncoding or BatchFeature, convert it to a dict
    if args and isinstance(args[0], (BatchEncoding, BatchFeature)):
        args = list(args)
        args[0] = dict(args[0])
    # If the keyword argument "x" is a BatchEncoding or BatchFeature, convert it to a dict
    elif "x" in kwargs and isinstance(kwargs["x"], (BatchEncoding, BatchFeature)):
        kwargs["x"] = dict(kwargs["x"])
    # Return the converted positional and keyword arguments
    return args, kwargs

.\time_series_utils.py

# coding=utf-8
# Copyright Amazon.com, Inc. or its affiliates. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for
# the specific language governing permissions and limitations under the License.
"""
Time series distributional output classes and utilities.
"""
from typing import Callable, Dict, Optional, Tuple

import torch
from torch import nn
from torch.distributions import (
    AffineTransform,
    Distribution,
    Independent,
    NegativeBinomial,
    Normal,
    StudentT,
    TransformedDistribution,
)

class AffineTransformed(TransformedDistribution):
    def __init__(self, base_distribution: Distribution, loc=None, scale=None, event_dim=0):
        # Default loc to 0.0 if not provided
        self.loc = 0.0 if loc is None else loc
        # Default scale to 1.0 if not provided
        self.scale = 1.0 if scale is None else scale

        # Call the parent constructor, applying an AffineTransform with loc and scale to the base distribution
        super().__init__(base_distribution, [AffineTransform(loc=self.loc, scale=self.scale, event_dim=event_dim)])

    @property
    def mean(self):
        """
        返回分布的均值。
        """
        return self.base_dist.mean * self.scale + self.loc

    @property
    def variance(self):
        """
        返回分布的方差。
        """
        return self.base_dist.variance * self.scale**2

    @property
    def stddev(self):
        """
        返回分布的标准差。
        """
        return self.variance.sqrt()
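
# Example (illustrative sketch, not part of the library source): wrapping a standard Normal in
# AffineTransformed shifts and rescales it, and the closed-form mean/stddev above match.
_affine_demo_base = Normal(torch.zeros(3), torch.ones(3))
_affine_demo = AffineTransformed(_affine_demo_base, loc=torch.tensor(2.0), scale=torch.tensor(5.0))
# _affine_demo.mean   -> tensor([2., 2., 2.])
# _affine_demo.stddev -> tensor([5., 5., 5.])
# _affine_demo.log_prob(torch.tensor([2.0, 7.0, -3.0])) gives log-densities under N(2, 5**2)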


class ParameterProjection(nn.Module):
    def __init__(
        self, in_features: int, args_dim: Dict[str, int], domain_map: Callable[..., Tuple[torch.Tensor]], **kwargs
    ) -> None:
        super().__init__(**kwargs)
        # Dictionary mapping each distribution parameter name to its dimension
        self.args_dim = args_dim
        # One nn.Linear per parameter, projecting the input features to that parameter's dimension
        self.proj = nn.ModuleList([nn.Linear(in_features, dim) for dim in args_dim.values()])
        # Domain-mapping function that maps the unconstrained projections into each parameter's valid domain
        self.domain_map = domain_map

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor]:
        # Apply every linear projection to x, producing the unconstrained parameters
        params_unbounded = [proj(x) for proj in self.proj]

        # Map the unconstrained parameters into their valid domains and return them as a tuple
        return self.domain_map(*params_unbounded)


class LambdaLayer(nn.Module):
    def __init__(self, function):
        super().__init__()
        # Store the function passed to the LambdaLayer
        self.function = function

    def forward(self, x, *args):
        # Call the stored function with x and any extra positional arguments
        return self.function(x, *args)


class DistributionOutput:
    # The distribution class this output constructs
    distribution_class: type
    # Dimensionality of the input features
    in_features: int
    # Dictionary mapping parameter names to their dimensions
    args_dim: Dict[str, int]

    def __init__(self, dim: int = 1) -> None:
        # Store the output dimension of the distribution
        self.dim = dim
        # Scale every parameter dimension by dim
        self.args_dim = {k: dim * self.args_dim[k] for k in self.args_dim}

    # Build a distribution from the given parameters; for dim == 1 return the distribution directly, otherwise wrap
    # it in Independent so the last dimension is treated as an event dimension
    def _base_distribution(self, distr_args):
        if self.dim == 1:
            return self.distribution_class(*distr_args)
        else:
            return Independent(self.distribution_class(*distr_args), 1)

    # Build the final distribution, optionally wrapping the base distribution in an affine transform
    def distribution(
        self,
        distr_args,
        loc: Optional[torch.Tensor] = None,
        scale: Optional[torch.Tensor] = None,
    ) -> Distribution:
        distr = self._base_distribution(distr_args)
        if loc is None and scale is None:
            return distr
        else:
            return AffineTransformed(distr, loc=loc, scale=scale, event_dim=self.event_dim)

    # Event shape: an empty tuple for dim == 1, otherwise (dim,)
    @property
    def event_shape(self) -> Tuple:
        r"""
        Shape of each individual event contemplated by the distributions that this object constructs.
        """
        return () if self.dim == 1 else (self.dim,)

    # Number of event dimensions, i.e. the length of the event_shape tuple
    @property
    def event_dim(self) -> int:
        r"""
        Number of event dimensions, i.e., length of the `event_shape` tuple, of the distributions that this object
        constructs.
        """
        return len(self.event_shape)

    # A value inside the distribution's support, used when padding data series (defaults to 0.0)
    @property
    def value_in_support(self) -> float:
        r"""
        A float that will have a valid numeric value when computing the log-loss of the corresponding distribution. By
        default 0.0. This value will be used when padding data series.
        """
        return 0.0

    # Return the projection layer that maps the input to the distribution's parameters
    def get_parameter_projection(self, in_features: int) -> nn.Module:
        r"""
        Return the parameter projection layer that maps the input to the appropriate parameters of the distribution.
        """
        return ParameterProjection(
            in_features=in_features,
            args_dim=self.args_dim,
            domain_map=LambdaLayer(self.domain_map),
        )

    # Convert raw arguments to the right shape and domain; the trailing axis is reshaped so that the returned
    # tensors define a distribution with the correct event_shape
    def domain_map(self, *args: torch.Tensor):
        r"""
        Converts arguments to the right shape and domain. The domain depends on the type of distribution, while the
        correct shape is obtained by reshaping the trailing axis in such a way that the returned tensors define a
        distribution of the right event_shape.
        """
        raise NotImplementedError()

    # Static helper that maps inputs to the positive orthant via the square-plus operation
    @staticmethod
    def squareplus(x: torch.Tensor) -> torch.Tensor:
        r"""
        Helper to map inputs to the positive orthant by applying the square-plus operation. Reference:
        https://twitter.com/jon_barron/status/1387167648669048833
        """
        return (x + torch.sqrt(torch.square(x) + 4.0)) / 2.0
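
# Worked values (illustrative, not part of the library source): squareplus is a smooth map from
# the real line onto the positive reals, similar to softplus but cheaper, and close to x for
# large positive x:
#   squareplus(0)  == 1.0
#   squareplus(-5) ~= 0.19  (small but strictly positive)
#   squareplus(5)  ~= 5.19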

# Student-T distribution output class, inheriting from DistributionOutput
class StudentTOutput(DistributionOutput):
    """
    Student-T distribution output class.
    """

    # Parameter dimensions: degrees of freedom df, location loc, and scale
    args_dim: Dict[str, int] = {"df": 1, "loc": 1, "scale": 1}
    # The underlying distribution class is StudentT
    distribution_class: type = StudentT

    @classmethod
    def domain_map(cls, df: torch.Tensor, loc: torch.Tensor, scale: torch.Tensor):
        # Map scale onto the positive reals with squareplus and clamp it away from zero
        scale = cls.squareplus(scale).clamp_min(torch.finfo(scale.dtype).eps)
        # Map df onto (2, inf) so the variance of the Student-T is finite
        df = 2.0 + cls.squareplus(df)
        # Squeeze the trailing parameter dimension and return df, loc, scale
        return df.squeeze(-1), loc.squeeze(-1), scale.squeeze(-1)


# Normal distribution output class, inheriting from DistributionOutput
class NormalOutput(DistributionOutput):
    """
    Normal distribution output class.
    """

    # Parameter dimensions: location loc and scale
    args_dim: Dict[str, int] = {"loc": 1, "scale": 1}
    # The underlying distribution class is Normal
    distribution_class: type = Normal

    @classmethod
    def domain_map(cls, loc: torch.Tensor, scale: torch.Tensor):
        # Map scale onto the positive reals with squareplus and clamp it away from zero
        scale = cls.squareplus(scale).clamp_min(torch.finfo(scale.dtype).eps)
        # Squeeze the trailing parameter dimension and return loc, scale
        return loc.squeeze(-1), scale.squeeze(-1)


# Negative Binomial distribution output class, inheriting from DistributionOutput
class NegativeBinomialOutput(DistributionOutput):
    """
    Negative Binomial distribution output class.
    """

    # Parameter dimensions: total_count and logits
    args_dim: Dict[str, int] = {"total_count": 1, "logits": 1}
    # The underlying distribution class is NegativeBinomial
    distribution_class: type = NegativeBinomial

    @classmethod
    def domain_map(cls, total_count: torch.Tensor, logits: torch.Tensor):
        # Map total_count onto the positive reals with squareplus
        total_count = cls.squareplus(total_count)
        # Squeeze the trailing parameter dimension and return total_count, logits
        return total_count.squeeze(-1), logits.squeeze(-1)

    # Override the parent method: build the base distribution from total_count and logits
    def _base_distribution(self, distr_args) -> Distribution:
        total_count, logits = distr_args
        # For dim == 1, return the NegativeBinomial distribution directly
        if self.dim == 1:
            return self.distribution_class(total_count=total_count, logits=logits)
        else:
            # Otherwise wrap it in Independent over the last dimension
            return Independent(self.distribution_class(total_count=total_count, logits=logits), 1)

    # Override the parent method: adjust the logits with the scale before building the distribution
    def distribution(
        self, distr_args, loc: Optional[torch.Tensor] = None, scale: Optional[torch.Tensor] = None
    ) -> Distribution:
        total_count, logits = distr_args

        if scale is not None:
            # Fold the scale into the logits (see the scaling property of the Gamma distribution)
            logits += scale.log()

        # Return the base distribution built from the adjusted parameters
        return self._base_distribution((total_count, logits))
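
# Example (illustrative sketch, not part of the library source): a DistributionOutput is typically
# used by projecting a model's hidden state to raw parameters, mapping them into the valid domain,
# and building a distribution to score targets with log_prob. The `_ts_demo_*` names are made up.
_ts_demo_output = StudentTOutput(dim=1)
_ts_demo_proj = _ts_demo_output.get_parameter_projection(in_features=32)
_ts_demo_hidden = torch.randn(4, 32)              # e.g. one hidden state per series in a batch of 4
_ts_demo_params = _ts_demo_proj(_ts_demo_hidden)  # (df, loc, scale), each of shape (4,)
_ts_demo_dist = _ts_demo_output.distribution(_ts_demo_params, loc=torch.zeros(4), scale=torch.ones(4))
_ts_demo_nll = -_ts_demo_dist.log_prob(torch.randn(4)).mean()  # negative log-likelihood loss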

.\tokenization_utils.py

# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for
# the specific language governing permissions and limitations under the License.
"""
Tokenization classes for Python tokenizers. For fast tokenizers (provided by HuggingFace's tokenizers library), see
tokenization_utils_fast.py.
"""
import bisect  # Efficient insertion into and search of sorted lists
import itertools  # Iterator building blocks
import re  # Regular expressions
import unicodedata  # Access to the Unicode character database
from collections import OrderedDict  # Ordered dictionary
from typing import Any, Dict, List, Optional, Tuple, Union, overload  # Type hints

from .tokenization_utils_base import (  # Symbols shared with the base tokenization module
    ENCODE_KWARGS_DOCSTRING,
    ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING,
    INIT_TOKENIZER_DOCSTRING,
    AddedToken,
    BatchEncoding,
    EncodedInput,
    EncodedInputPair,
    PreTokenizedInput,
    PreTokenizedInputPair,
    PreTrainedTokenizerBase,
    TextInput,
    TextInputPair,
    TruncationStrategy,
)
from .utils import PaddingStrategy, TensorType, add_end_docstrings, logging  # Utility symbols

logger = logging.get_logger(__name__)  # Logger for the current module

# Slow tokenizers are saved in a vocabulary plus three separated files
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"  # File holding the special tokens map
ADDED_TOKENS_FILE = "added_tokens.json"  # File holding the added tokens
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"  # File holding the tokenizer configuration


class Trie:
    """
    Trie(字典树)的实现。基于给定的单词列表创建 Trie 结构,用于在一个步骤中分割 `added_tokens`。
    参考资料 https://en.wikipedia.org/wiki/Trie
    """

    def __init__(self):
        self.data = {}  # Nested-dict representation of the trie
        self._tokens = set()  # Set of all tokens added so far

    def add(self, word: str):
        """
        将给定单词添加到 Trie 中。
        通过每个字符(UTF-8 字符)递归地添加到内部 `data` Trie 表示中。
        使用特殊键 `""` 表示终止状态。
        
        此函数是幂等的,添加两次相同的单词不会改变 Trie 结构。

        示例:

        ```
        >>> trie = Trie()
        >>> trie.add("Hello 友達")
        >>> trie.data
        {"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}}

        >>> trie.add("Hello")
        >>> trie.data
        {"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}}
        ```
        """
        if not word:
            # Skip empty strings
            return

        self._tokens.add(word)  # Remember the token in the set of added tokens
        ref = self.data  # Start from the root of the trie
        for char in word:
            ref[char] = ref.get(char, {})  # Create a new empty dict if the char is not in the trie yet
            ref = ref[char]  # Descend into the child node for this char
        ref[""] = 1  # Mark the end of a complete word at the last char
    # Split `text` into pieces at the given `offsets` and return the pieces
    def cut_text(self, text, offsets):
        # Append the total length of the text as the final offset so the tail is included
        offsets.append(len(text))
        # List collecting the resulting text pieces (tokens)
        tokens = []
        # Current start position in the text
        start = 0
        # Walk over the offsets
        for end in offsets:
            # start > end indicates a bug in the trie matching; log it and try to recover
            if start > end:
                logger.error(
                    "There was a bug in Trie algorithm in tokenization. Attempting to recover. Please report it"
                    " anyway."
                )
                # Move on to the next offset
                continue
            # start == end means a match at index 0 or a zero-width cut between consecutive matches; skip it
            elif start == end:
                continue
            # Add the slice from start to end to the list of pieces
            tokens.append(text[start:end])
            # The next piece starts where this one ended
            start = end

        # Return the list of text pieces
        return tokens
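
# Example (illustrative sketch, not part of the library source): `cut_text` simply slices the text
# at the given offsets; the offsets themselves come from the trie's matching logic.
_trie_demo = Trie()
_trie_demo.add("[CLS]")
# Cutting "[CLS] Hello" at offset 5 isolates the added token from the rest of the text:
# _trie_demo.cut_text("[CLS] Hello", [5]) -> ["[CLS]", " Hello"]
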
# Check whether a character is a whitespace character
def _is_whitespace(char):
    """Checks whether `char` is a whitespace character."""
    # \t, \n, and \r are technically control characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    # Get the character's Unicode category
    cat = unicodedata.category(char)
    # "Zs" (space separator) counts as whitespace
    if cat == "Zs":
        return True
    return False


# Check whether a character is a control character
def _is_control(char):
    """Checks whether `char` is a control character."""
    # These are technically control characters but we count them as whitespace
    # characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    # Get the character's Unicode category
    cat = unicodedata.category(char)
    # Categories starting with "C" are control characters
    if cat.startswith("C"):
        return True
    return False


# Check whether a character is a punctuation character
def _is_punctuation(char):
    """Checks whether `char` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    # Check whether the character falls in one of the ASCII punctuation ranges
    if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
        return True
    # Get the character's Unicode category
    cat = unicodedata.category(char)
    # Categories starting with "P" are punctuation
    if cat.startswith("P"):
        return True
    return False


# Check whether the last character of the text is punctuation, a control character, or whitespace
def _is_end_of_word(text):
    """Checks whether the last character in text is one of a punctuation, control or whitespace character."""
    # Look at the last character of the text
    last_char = text[-1]
    # True if it is a control, punctuation, or whitespace character
    return bool(_is_control(last_char) | _is_punctuation(last_char) | _is_whitespace(last_char))


# Check whether the first character of the text is punctuation, a control character, or whitespace
def _is_start_of_word(text):
    """Checks whether the first character in text is one of a punctuation, control or whitespace character."""
    # Look at the first character of the text
    first_char = text[0]
    # True if it is a control, punctuation, or whitespace character
    return bool(_is_control(first_char) | _is_punctuation(first_char) | _is_whitespace(first_char))


# Insert a new token into a sorted list, unless it is already present
def _insert_one_token_to_ordered_list(token_list: List[str], new_token: str):
    """
    Inserts one token to an ordered list if it does not already exist. Note: token_list must be sorted.
    """
    # Find the insertion position with a binary search
    insertion_idx = bisect.bisect_left(token_list, new_token)
    # Check whether the token already exists in the sorted token_list
    if insertion_idx < len(token_list) and token_list[insertion_idx] == new_token:
        # If it does, return without inserting
        return
    else:
        # Otherwise insert the new token at the computed position
        token_list.insert(insertion_idx, new_token)
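
# Example (illustrative sketch, not part of the library source): the list stays sorted and
# duplicates are ignored.
_sorted_demo = ["apple", "cherry"]
_insert_one_token_to_ordered_list(_sorted_demo, "banana")
_insert_one_token_to_ordered_list(_sorted_demo, "banana")
# _sorted_demo == ["apple", "banana", "cherry"]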
    """
    specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
    """

    # 1. Constructor of the class
    def __init__(self, **kwargs):
        # 2. Initialize `self.tokens_trie`, a Trie data structure
        self.tokens_trie = Trie()

        # 3. If a subclass has not initialized `_added_tokens_decoder`, start with an empty dict
        if not hasattr(self, "_added_tokens_decoder"):
            self._added_tokens_decoder: Dict[int, AddedToken] = {}

        # 4. If `added_tokens_decoder` is passed (i.e. we are loading a saved tokenizer), merge it into
        #    `_added_tokens_decoder`
        self._added_tokens_decoder.update(kwargs.pop("added_tokens_decoder", {}))
        # Build `_added_tokens_encoder` from `_added_tokens_decoder`, mapping AddedToken content (str) to int ids
        self._added_tokens_encoder: Dict[str, int] = {k.content: v for v, k in self._added_tokens_decoder.items()}

        # 5. Call the parent constructor
        super().__init__(**kwargs)

        # 6. If some special tokens are not part of the vocab, add them to the end of the vocabulary,
        #    in the order of `self.SPECIAL_TOKENS_ATTRIBUTES`, relying on the `tokenizers` objects
        self._add_tokens(
            [token for token in self.all_special_tokens_extended if token not in self._added_tokens_encoder],
            special_tokens=True,
        )

        # 7. Set the `_decode_use_source_tokenizer` flag to False
        self._decode_use_source_tokenizer = False

    @property
    def is_fast(self) -> bool:
        # Slow tokenizers always return False here
        return False

    @property
    def vocab_size(self) -> int:
        """
        `int`: 基础词汇表的大小(不包括添加的特殊标记)。
        """
        # 抛出未实现错误,要求子类实现这个属性
        raise NotImplementedError

    @property
    def added_tokens_encoder(self) -> Dict[str, int]:
        """
        返回从字符串到索引的排序映射。为了性能优化,缓存了慢速分词器中的 `_added_tokens_encoder`。
        """
        # 将 `_added_tokens_decoder` 按索引排序并转换为字符串到整数的映射返回
        return {k.content: v for v, k in sorted(self._added_tokens_decoder.items(), key=lambda item: item[0])}

    @property
    def added_tokens_decoder(self) -> Dict[int, AddedToken]:
        """
        返回词汇表中的添加标记,作为索引到 AddedToken 的字典。

        Returns:
            `Dict[str, int]`: 添加的标记。
        """
        # 按索引排序并返回 `_added_tokens_decoder` 的内容
        return dict(sorted(self._added_tokens_decoder.items(), key=lambda item: item[0]))

    @added_tokens_decoder.setter
    def added_tokens_decoder(self, value: Dict[int, Union[AddedToken, str]]) -> Dict[int, AddedToken]:
        # Validate the provided mapping: indices must be ints and tokens must be str or AddedToken, otherwise raise
        for index, token in value.items():
            if not isinstance(token, (str, AddedToken)) or not isinstance(index, int):
                raise ValueError(
                    f"The provided `added_tokens_decoder` has an element of type {index.__class__, token.__class__}, should be a dict of {int, Union[AddedToken, str]}"
                )

            # If the token is a plain string, wrap it in an AddedToken before storing it
            self._added_tokens_decoder[index] = AddedToken(token) if isinstance(token, str) else token
            # Update `_added_tokens_encoder`, mapping the token string to its index
            self._added_tokens_encoder[str(token)] = index
    def get_added_vocab(self) -> Dict[str, int]:
        """
        返回已添加到词汇表中的词汇作为一个字典,键为词汇,值为索引。结果可能与快速调用不同,因为我们当前总是添加这些词汇,即使它们已经存在于词汇表中。这是我们应该改变的事情。

        Returns:
            `Dict[str, int]`: 已添加的词汇表。
        """
        return self._added_tokens_encoder

    def __len__(self):
        """
        返回包含已添加词汇的完整词汇表的大小。计数的是 `keys` 而不是 `values`,因为如果词汇表中有空洞,我们会在错误的索引处添加分词器。
        """
        return len(set(self.get_vocab().keys()))

    def _update_trie(self, unique_no_split_tokens: Optional[str] = []):
        """
        更新 Trie 树,将新增的无需分割的词汇加入到词汇表中。

        Args:
            unique_no_split_tokens (`Optional[str]`, *optional*, defaults to `[]`):
                需要添加到 Trie 树中的唯一词汇列表。
        """
        for token in self._added_tokens_decoder.values():
            if token not in self.tokens_trie._tokens:
                self.tokens_trie.add(token.content)
        for token in unique_no_split_tokens:
            if token not in self.tokens_trie._tokens:
                self.tokens_trie.add(token)

    def num_special_tokens_to_add(self, pair: bool = False) -> int:
        """
        返回在编码序列时添加的特殊标记数量。

        <Tip>

        这会对一个虚拟输入进行编码,并检查添加的特殊标记数量,因此不是效率高的操作。不要将其放在训练循环内。

        </Tip>

        Args:
            pair (`bool`, *optional*, defaults to `False`):
                是否计算序列对或单序列中添加的特殊标记数量。

        Returns:
            `int`: 添加到序列中的特殊标记数量。
        """
        token_ids_0 = []
        token_ids_1 = []
        return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None))

    def _tokenize(self, text, **kwargs):
        """
        使用分词器将字符串转换为一系列标记(字符串)。基于词汇表的单词分割或基于子词的分割(BPE/SentencePieces/WordPieces)。

        不处理已添加的标记。

        Args:
            text (str): 要分词的文本。
            **kwargs: 其他参数传递给分词器的选项。

        Raises:
            NotImplementedError: 如果子类没有实现这个方法。
        """
        raise NotImplementedError
    # Convert tokens to ids, returning a single id or a list of ids depending on the input type
    def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
        """
        Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the
        vocabulary.

        Args:
            tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s).

        Returns:
            `int` or `List[int]`: The token id or list of token ids.
        """
        # Return None if no tokens were given
        if tokens is None:
            return None

        # A single string token is converted directly
        if isinstance(tokens, str):
            return self._convert_token_to_id_with_added_voc(tokens)

        # For a list of tokens, convert each one in turn
        ids = []
        for token in tokens:
            ids.append(self._convert_token_to_id_with_added_voc(token))
        return ids

    # Look up the id of a token, checking the added-tokens encoder first and falling back to `_convert_token_to_id`
    def _convert_token_to_id_with_added_voc(self, token):
        if token is None:
            return None
        
        # If the token is in the added-tokens encoder, return its id
        if token in self._added_tokens_encoder:
            return self._added_tokens_encoder[token]
        
        # Otherwise fall back to `_convert_token_to_id`
        return self._convert_token_to_id(token)

    # Converts a token to an id using the vocabulary; subclasses must implement the actual logic
    def _convert_token_to_id(self, token):
        raise NotImplementedError

    # Encode a single text (or text pair), handling special tokens, padding, truncation, and the other options below
    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        # Inner helper: return the list of input ids for a given text input
        def get_input_ids(text):
            # If the input is a string, tokenize it and convert the tokens to ids
            if isinstance(text, str):
                tokens = self.tokenize(text, **kwargs)
                return self.convert_tokens_to_ids(tokens)
            # If the input is a list/tuple of strings, handle it according to `is_split_into_words`
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
                if is_split_into_words:
                    # Tokenize each pre-split word, chain the resulting tokens, and convert them to ids
                    tokens = list(
                        itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text))
                    )
                    return self.convert_tokens_to_ids(tokens)
                else:
                    # Treat the list/tuple entries as tokens and convert them to ids directly
                    return self.convert_tokens_to_ids(text)
            # If the input is already a list/tuple of ints, return it unchanged
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
                return text
            else:
                # With `is_split_into_words=True`, the input must be a string or a list/tuple of strings
                if is_split_into_words:
                    raise ValueError(
                        f"Input {text} is not valid. Should be a string or a list/tuple of strings when"
                        " `is_split_into_words=True`."
                    )
                else:
                    # Otherwise the input must be a string, a list/tuple of strings, or a list/tuple of integers
                    raise ValueError(
                        f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of"
                        " integers."
                    )

        # `return_offsets_mapping` is not supported by Python (slow) tokenizers
        if return_offsets_mapping:
            raise NotImplementedError(
                "return_offset_mapping is not available when using Python tokenizers. "
                "To use this feature, change your tokenizer to one deriving from "
                "transformers.PreTrainedTokenizerFast. "
                "More information on available tokenizers at "
                "https://github.com/huggingface/transformers/pull/2674"
            )

        # Input ids for the first text
        first_ids = get_input_ids(text)
        # Input ids for the second text, if any
        second_ids = get_input_ids(text_pair) if text_pair is not None else None

        # Let `prepare_for_model` build the final model inputs (special tokens, truncation, padding, tensors, ...)
        return self.prepare_for_model(
            first_ids,
            pair_ids=second_ids,
            add_special_tokens=add_special_tokens,
            padding=padding_strategy.value,
            truncation=truncation_strategy.value,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            prepend_batch_axis=True,
            return_attention_mask=return_attention_mask,
            return_token_type_ids=return_token_type_ids,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_length=return_length,
            verbose=verbose,
        )
    # Batch version of `_encode_plus`: encode a list of texts or text pairs
    def _batch_encode_plus(
        self,
        # `batch_text_or_text_pairs` can be a list of single texts, text pairs, pre-tokenized inputs, or encoded inputs
        batch_text_or_text_pairs: Union[
            List[TextInput],
            List[TextInputPair],
            List[PreTokenizedInput],
            List[PreTokenizedInputPair],
            List[EncodedInput],
            List[EncodedInputPair],
        ],
        # Whether to add the special tokens (defaults to True)
        add_special_tokens: bool = True,
        # Padding strategy (defaults to no padding)
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        # Truncation strategy (defaults to no truncation)
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        # Optional maximum length
        max_length: Optional[int] = None,
        # Stride used for overflowing tokens (defaults to 0)
        stride: int = 0,
        # Whether the inputs are already split into words (defaults to False)
        is_split_into_words: bool = False,
        # Pad to a multiple of this value, if given
        pad_to_multiple_of: Optional[int] = None,
        # Type of tensors to return, if any
        return_tensors: Optional[Union[str, TensorType]] = None,
        # Whether to return token type ids
        return_token_type_ids: Optional[bool] = None,
        # Whether to return the attention mask
        return_attention_mask: Optional[bool] = None,
        # Whether to return overflowing tokens (defaults to False)
        return_overflowing_tokens: bool = False,
        # Whether to return the special tokens mask (defaults to False)
        return_special_tokens_mask: bool = False,
        # Whether to return the offsets mapping (defaults to False)
        return_offsets_mapping: bool = False,
        # Whether to return the sequence lengths (defaults to False)
        return_length: bool = False,
        # Whether to be verbose (defaults to True)
        verbose: bool = True,
        # Extra keyword arguments
        **kwargs,
    ) -> BatchEncoding:
        # Inner helper: convert a text or pre-tokenized input into a list of input ids
        def get_input_ids(text):
            # If the input is a string, tokenize it and convert the tokens to ids
            if isinstance(text, str):
                tokens = self.tokenize(text, **kwargs)
                return self.convert_tokens_to_ids(tokens)
            # If the input is a list/tuple of strings, handle it according to `is_split_into_words`
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
                if is_split_into_words:
                    # Tokenize each pre-split word, chain the tokens, and convert them to ids
                    tokens = list(
                        itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text))
                    )
                    return self.convert_tokens_to_ids(tokens)
                else:
                    # Treat the list/tuple entries as tokens and convert them to ids directly
                    return self.convert_tokens_to_ids(text)
            # If the input is already a list/tuple of ints, return it unchanged
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
                return text
            else:
                # Invalid input: raise a ValueError
                raise ValueError(
                    "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
                )

        # `return_offsets_mapping` is not supported by Python (slow) tokenizers
        if return_offsets_mapping:
            raise NotImplementedError(
                "return_offset_mapping is not available when using Python tokenizers. "
                "To use this feature, change your tokenizer to one deriving from "
                "transformers.PreTrainedTokenizerFast."
            )

        # Collect the (first_ids, second_ids) pairs for the whole batch
        input_ids = []
        # Iterate over the batch of texts or text pairs
        for ids_or_pair_ids in batch_text_or_text_pairs:
            # A bare element (not a list/tuple) is a single text with no pair
            if not isinstance(ids_or_pair_ids, (list, tuple)):
                ids, pair_ids = ids_or_pair_ids, None
            # With `is_split_into_words`, a list whose first element is not a list/tuple is also a single text
            elif is_split_into_words and not isinstance(ids_or_pair_ids[0], (list, tuple)):
                ids, pair_ids = ids_or_pair_ids, None
            else:
                # Otherwise the element is a (text, text_pair) tuple
                ids, pair_ids = ids_or_pair_ids

            # Input ids for the first text
            first_ids = get_input_ids(ids)
            # Input ids for the second text, if any
            second_ids = get_input_ids(pair_ids) if pair_ids is not None else None
            # Store the pair of id lists
            input_ids.append((first_ids, second_ids))

        # Delegate to `_batch_prepare_for_model`, which adds special tokens, truncates, and pads the whole batch
        batch_outputs = self._batch_prepare_for_model(
            input_ids,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
            return_token_type_ids=return_token_type_ids,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_length=return_length,
            return_tensors=return_tensors,
            verbose=verbose,
        )

        # Wrap the processed batch outputs in a BatchEncoding and return it
        return BatchEncoding(batch_outputs)

    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def _batch_prepare_for_model(
        self,
        batch_ids_pairs: List[Union[PreTokenizedInputPair, Tuple[List[int], None]]],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> BatchEncoding:
        """
        Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It
        adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
        manages a moving window (with user defined stride) for overflowing tokens

        Args:
            batch_ids_pairs: list of tokenized input ids or input ids pairs
        """

        # Initialize an empty dictionary to store batch outputs
        batch_outputs = {}
        
        # Iterate through each pair of input IDs in batch_ids_pairs
        for first_ids, second_ids in batch_ids_pairs:
            # Call prepare_for_model to process the input ids pairs
            outputs = self.prepare_for_model(
                first_ids,
                second_ids,
                add_special_tokens=add_special_tokens,
                padding=PaddingStrategy.DO_NOT_PAD.value,  # we pad in batch afterward
                truncation=truncation_strategy.value,
                max_length=max_length,
                stride=stride,
                pad_to_multiple_of=None,  # we pad in batch afterward
                return_attention_mask=False,  # we pad in batch afterward
                return_token_type_ids=return_token_type_ids,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_length=return_length,
                return_tensors=None,  # We convert the whole batch to tensors at the end
                prepend_batch_axis=False,
                verbose=verbose,
            )

            # Aggregate outputs into batch_outputs dictionary
            for key, value in outputs.items():
                if key not in batch_outputs:
                    batch_outputs[key] = []
                batch_outputs[key].append(value)

        # Pad batch outputs based on padding_strategy
        batch_outputs = self.pad(
            batch_outputs,
            padding=padding_strategy.value,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )

        # Convert batch_outputs to BatchEncoding object with specified tensor_type
        batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)

        # Return the final batch outputs as a BatchEncoding object
        return batch_outputs

    def prepare_for_tokenization(
        self, text: str, is_split_into_words: bool = False, **kwargs
    ) -> Tuple[str, Dict[str, Any]]:
        """
        Performs any necessary transformations before tokenization.

        This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the
        `kwargs` at the end of the encoding process to be sure all the arguments have been used.

        Args:
            text (`str`):
                The text to prepare.
            is_split_into_words (`bool`, *optional*, defaults to `False`):
                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
                which it will tokenize. This is useful for NER or token classification.
            kwargs (`Dict[str, Any]`, *optional*):
                Keyword arguments to use for the tokenization.

        Returns:
            `Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
        """
        return (text, kwargs)

    def get_special_tokens_mask(
        self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`List[int]`):
                List of ids of the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                List of ids of the second sequence.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            # When the ids already contain special tokens, a second sequence must not be provided
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model."
                )

            # Delegate to the base class, which computes the mask from the known special token ids
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )
        
        # Without special tokens added, every position is a regular sequence token, so the mask is all zeros
        return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))

    @overload
    def convert_ids_to_tokens(self, ids: int, skip_special_tokens: bool = False) -> str:
        ...

    @overload
    def convert_ids_to_tokens(self, ids: List[int], skip_special_tokens: bool = False) -> List[str]:
        ...

    def convert_ids_to_tokens(
        self, ids: Union[int, List[int]], skip_special_tokens: bool = False
    ) -> Union[str, List[str]]:
        """
        Converts token ids into strings.

        Args:
            ids (`int` or `List[int]`):
                Token ids to convert.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether to skip special tokens during conversion.

        Returns:
            `str` or `List[str]`: Converted token(s) into string(s).
        """
    def convert_ids_to_tokens(self, ids: Union[int, List[int]]) -> Union[str, List[str]]:
        """
        Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
        added tokens.

        Args:
            ids (`int` or `List[int]`):
                The token id (or token ids) to convert to tokens.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.

        Returns:
            `str` or `List[str]`: The decoded token(s).
        """
        # Check if the input ids is a single integer
        if isinstance(ids, int):
            # Check if the integer id corresponds to an added token
            if ids in self._added_tokens_decoder:
                # Return the content of the added token
                return self._added_tokens_decoder[ids].content
            else:
                # Otherwise, convert the integer id to a token using the vocabulary
                return self._convert_id_to_token(ids)
        
        # If ids is a list of integers, process each index
        tokens = []
        for index in ids:
            index = int(index)  # Ensure index is treated as an integer
            # Skip special tokens if specified and the index is in special token ids
            if skip_special_tokens and index in self.all_special_ids:
                continue
            # Check if the index corresponds to an added token
            if index in self._added_tokens_decoder:
                # Append the content of the added token to tokens
                tokens.append(self._added_tokens_decoder[index].content)
            else:
                # Otherwise, convert the index to a token using the vocabulary
                tokens.append(self._convert_id_to_token(index))
        
        # Return the list of tokens
        return tokens

    def _convert_id_to_token(self, index: int) -> str:
        # Placeholder method to convert an integer index to a token
        raise NotImplementedError

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        # Joins a list of tokens into a single string separated by spaces
        return " ".join(tokens)

    def _decode(
        self,
        token_ids: List[int],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = None,
        spaces_between_special_tokens: bool = True,
        **kwargs,
    ) -> str:
        # Pop "use_source_tokenizer" from kwargs and store it on self._decode_use_source_tokenizer
        self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)

        # Convert the token ids to tokens, filtering special tokens according to skip_special_tokens
        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)

        # Legacy added tokens: added tokens that are not special tokens, plus additional special tokens whose id
        # falls outside the base vocabulary
        legacy_added_tokens = set(self._added_tokens_encoder.keys()) - set(self.all_special_tokens) | {
            token for token in self.additional_special_tokens if self.convert_tokens_to_ids(token) >= self.vocab_size
        }

        # To avoid mixing byte-level and unicode for byte-level BPT, the string is built separately for added tokens
        # and byte-level tokens. See https://github.com/huggingface/transformers/issues/1133
        sub_texts = []
        current_sub_text = []

        # TODO @ArthurZ in version 5, special tokens should be handled in convert_tokens_to_string, while _convert_tokens_to_string
        for token in filtered_tokens:
            # Skip special tokens when skip_special_tokens is True
            if skip_special_tokens and token in self.all_special_ids:
                continue
            # Legacy added tokens are emitted as-is, flushing the current sub-text first
            if token in legacy_added_tokens:
                if current_sub_text:
                    # Convert the accumulated sub-text tokens into a string and store it
                    string = self.convert_tokens_to_string(current_sub_text)
                    if len(string) > 0:
                        sub_texts.append(string)
                    current_sub_text = []
                # Append the added token itself, untouched
                sub_texts.append(token)
            else:
                # Accumulate regular tokens into current_sub_text
                current_sub_text.append(token)

        # Flush any remaining sub-text tokens
        if current_sub_text:
            sub_texts.append(self.convert_tokens_to_string(current_sub_text))

        # Join the sub-texts with spaces if spaces_between_special_tokens is True
        if spaces_between_special_tokens:
            text = " ".join(sub_texts)
        else:
            # Otherwise concatenate them directly
            text = "".join(sub_texts)

        # Use the explicit clean_up_tokenization_spaces argument if given, otherwise the tokenizer's default
        clean_up_tokenization_spaces = (
            clean_up_tokenization_spaces
            if clean_up_tokenization_spaces is not None
            else self.clean_up_tokenization_spaces
        )

        # If requested, clean up tokenization artifacts (e.g. spaces before punctuation) before returning
        if clean_up_tokenization_spaces:
            clean_text = self.clean_up_tokenization(text)
            return clean_text
        else:
            # Otherwise return the text as-is
            return text

.\tokenization_utils_base.py

# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
"""
Base classes common to both the slow and the fast tokenization classes:
- PreTrainedTokenizerBase: all the user-facing encoding methods
- Special token mixing: the special-token handling logic
- BatchEncoding: a dict-like wrapper around the output of fast tokenizers, with special methods
"""

import copy  # Shallow and deep copies
import json  # JSON serialization and deserialization
import os  # Operating-system utilities
import re  # Regular expressions
import warnings  # Warning handling
from collections import UserDict  # User-defined dict base class
from collections.abc import Mapping, Sized  # Abstract base classes for mappings and sized collections
from contextlib import contextmanager  # Context-manager decorator
from dataclasses import dataclass  # dataclass decorator
from functools import lru_cache  # lru_cache decorator
from typing import (  # Type hints
    TYPE_CHECKING,
    Any,
    Dict,
    List,
    NamedTuple,
    Optional,
    Sequence,
    Tuple,
    Union,
)

import numpy as np  # NumPy
from packaging import version  # Version parsing utilities

from . import __version__  # Version of the transformers package
from .dynamic_module_utils import custom_object_save  # Helper to save custom objects
from .utils import (  # Utility functions
    ExplicitEnum,
    PaddingStrategy,
    PushToHubMixin,
    TensorType,
    add_end_docstrings,
    add_model_info_to_auto_map,
    cached_file,
    copy_func,
    download_url,
    extract_commit_hash,
    is_flax_available,
    is_jax_tensor,
    is_mlx_available,
    is_numpy_array,
    is_offline_mode,
    is_remote_url,
    is_tf_available,
    is_tf_tensor,
    is_tokenizers_available,
    is_torch_available,
    is_torch_device,
    is_torch_tensor,
    logging,
    requires_backends,
    to_py_obj,
)

if TYPE_CHECKING:  # Only executed by static type checkers
    if is_torch_available():  # If PyTorch is installed
        import torch  # PyTorch
    if is_tf_available():  # If TensorFlow is installed
        import tensorflow as tf  # TensorFlow
    if is_flax_available():  # If Flax is installed
        import jax.numpy as jnp  # JAX NumPy interface, used only for type checking
        from .pipelines.conversational import Conversation  # Conversational pipeline type

if is_tokenizers_available():  # If the `tokenizers` library is installed
    from tokenizers import AddedToken  # The AddedToken class from `tokenizers`
    from tokenizers import Encoding as EncodingFast  # The Encoding class from `tokenizers`, aliased as EncodingFast
else:  # Fallback definitions when `tokenizers` is not installed
    @dataclass(frozen=False, eq=True)
    # AddedToken represents a token to be added to a Tokenizer; it can carry special options
    # defining the way it should behave
    class AddedToken:
        """
        AddedToken represents a token to be added to a Tokenizer An AddedToken can have special options defining the
        way it should behave.

        The `normalized` will default to `not special` if it is not specified, similarly to the definition in
        `tokenizers`.
        """

        # Constructor setting the AddedToken attributes
        def __init__(
            self, content: str, single_word=False, lstrip=False, rstrip=False, special=False, normalized=None
        ):
            self.content = content  # The token string itself
            self.single_word = single_word  # Whether the token must match a whole word
            self.lstrip = lstrip  # Whether to strip whitespace on the left
            self.rstrip = rstrip  # Whether to strip whitespace on the right
            self.special = special  # Whether this is a special token
            self.normalized = normalized if normalized is not None else not special  # Defaults to `not special`, as in `tokenizers`

        # Return the object's state for serialization
        def __getstate__(self):
            return self.__dict__

        # The string form of the token is its content
        def __str__(self):
            return self.content

    # Dummy EncodingFast data class
    @dataclass
    class EncodingFast:
        """This is dummy class because without the `tokenizers` library we don't have these objects anyway"""
        
        pass  # Without the `tokenizers` library these objects do not exist anyway, so this is just a placeholder


# Logger for the current module
logger = logging.get_logger(__name__)

# A very large integer used as the max input length for models that have no real limit
VERY_LARGE_INTEGER = int(1e30)
# A large integer, somewhat smaller than VERY_LARGE_INTEGER, for cases that need "large but not huge"
LARGE_INTEGER = int(1e20)

# Type aliases and named tuples
TextInput = str
PreTokenizedInput = List[str]
EncodedInput = List[int]
TextInputPair = Tuple[str, str]
PreTokenizedInputPair = Tuple[List[str], List[str]]
EncodedInputPair = Tuple[List[int], List[int]]

# 旧版慢速分词器保存在三个单独的文件中
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
ADDED_TOKENS_FILE = "added_tokens.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"

# 快速分词器(由HuggingFace tokenizer库提供)可以保存在单个文件中
FULL_TOKENIZER_FILE = "tokenizer.json"
_re_tokenizer_file = re.compile(r"tokenizer\.(.*)\.json")

class TruncationStrategy(ExplicitEnum):
    """
    `PreTrainedTokenizerBase.__call__` 方法中 `truncation` 参数的可能取值。
    在IDE中进行选项补全时非常有用。
    """
    ONLY_FIRST = "only_first"
    ONLY_SECOND = "only_second"
    LONGEST_FIRST = "longest_first"
    DO_NOT_TRUNCATE = "do_not_truncate"
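
下面是一个简化的用法示意(文档补充,非源码):`truncation` 参数的取值与上面的 `TruncationStrategy` 枚举一一对应。示例假设已安装 transformers 且可以加载 "bert-base-uncased"(模型名仅为示例)。

```
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # 模型名仅为示例

# "only_second" 只截断句对中的第二个序列,常用于问答场景中截断过长的上下文
encoded = tokenizer(
    "What is the capital of France?",
    "Paris is the capital and most populous city of France. " * 50,
    truncation="only_second",   # 等价于 TruncationStrategy.ONLY_SECOND
    max_length=64,
)
print(len(encoded["input_ids"]))  # 不会超过 64
```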

class CharSpan(NamedTuple):
    """
    原始字符串中的字符范围。

    Args:
        start (`int`): 原始字符串中第一个字符的索引。
        end (`int`): 原始字符串中最后一个字符后面的字符的索引。
    """
    start: int
    end: int

class TokenSpan(NamedTuple):
    """
    编码字符串(token列表)中的token范围。

    Args:
        start (`int`): 范围中第一个token的索引。
        end (`int`): 范围中最后一个token后面的token的索引。
    """
    start: int
    end: int
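
CharSpan 和 TokenSpan 只是普通的 NamedTuple,下面是一个最小示意(文档补充,非源码),展示如何用它们的 start/end 字段切片原始字符串:

```
from transformers.tokenization_utils_base import CharSpan, TokenSpan

text = "Hello world"
span = CharSpan(start=6, end=11)          # 对应子串 "world"
print(text[span.start : span.end])        # -> world

start, end = TokenSpan(1, 3)              # NamedTuple 可以直接解包
print(start, end)                         # -> 1 3
```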

class BatchEncoding(UserDict):
    """
    [`~tokenization_utils_base.PreTrainedTokenizerBase.__call__`],
    [`~tokenization_utils_base.PreTrainedTokenizerBase.encode_plus`] 和
    [`~tokenization_utils_base.PreTrainedTokenizerBase.batch_encode_plus`] 方法的输出(tokens, attention_masks等)。

    这个类继承自Python字典类,可以像字典一样使用。此外,这个类还提供了从单词/字符空间到token空间的映射工具方法。
    """
    """
    Args:
        data (`dict`, *optional*):
            Dictionary of lists/arrays/tensors returned by the `__call__`/`encode_plus`/`batch_encode_plus` methods
            ('input_ids', 'attention_mask', etc.).
        encoding (`tokenizers.Encoding` or `Sequence[tokenizers.Encoding]`, *optional*):
            If the tokenizer is a fast tokenizer which outputs additional information like mapping from word/character
            space to token space the `tokenizers.Encoding` instance or list of instance (for batches) hold this
            information.
        tensor_type (`Union[None, str, TensorType]`, *optional*):
            You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at
            initialization.
        prepend_batch_axis (`bool`, *optional*, defaults to `False`):
            Whether or not to add a batch axis when converting to tensors (see `tensor_type` above).
        n_sequences (`Optional[int]`, *optional*):
            You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at
            initialization.
    """

    # 初始化方法,用于将输入数据转换为张量并存储相关编码信息
    def __init__(
        self,
        data: Optional[Dict[str, Any]] = None,
        encoding: Optional[Union[EncodingFast, Sequence[EncodingFast]]] = None,
        tensor_type: Union[None, str, TensorType] = None,
        prepend_batch_axis: bool = False,
        n_sequences: Optional[int] = None,
    ):
        # 调用父类初始化方法,传入数据字典
        super().__init__(data)

        # 如果 encoding 是 EncodingFast 类型,则转换为列表形式
        if isinstance(encoding, EncodingFast):
            encoding = [encoding]

        # 存储编码信息到实例变量 _encodings 中
        self._encodings = encoding

        # 如果 n_sequences 为 None,并且 encoding 不为 None 且非空列表,则从第一个编码对象获取 n_sequences
        if n_sequences is None and encoding is not None and len(encoding):
            n_sequences = encoding[0].n_sequences

        # 存储 n_sequences 到实例变量 _n_sequences 中
        self._n_sequences = n_sequences

        # 调用 convert_to_tensors 方法,将输入数据转换为张量(PyTorch/TensorFlow/Numpy),根据 tensor_type 和 prepend_batch_axis 参数进行处理
        self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)

    @property
    def n_sequences(self) -> Optional[int]:
        """
        `Optional[int]`: The number of sequences used to generate each sample from the batch encoded in this
        [`BatchEncoding`]. Currently can be one of `None` (unknown), `1` (a single sentence) or `2` (a pair of
        sentences)
        """
        # 返回存储在 _n_sequences 实例变量中的序列数信息
        return self._n_sequences

    @property
    def is_fast(self) -> bool:
        """
        `bool`: Indicate whether this [`BatchEncoding`] was generated from the result of a [`PreTrainedTokenizerFast`]
        or not.
        """
        # 返回一个布尔值,指示 _encodings 实例变量是否为 None(即是否由 PreTrainedTokenizerFast 生成了该 BatchEncoding 实例)
        return self._encodings is not None
    def __getitem__(self, item: Union[int, str]) -> Union[Any, EncodingFast]:
        """
        If the key is a string, returns the value of the dict associated to `key` ('input_ids', 'attention_mask',
        etc.).

        If the key is an integer, get the `tokenizers.Encoding` for batch item with index `key`.

        If the key is a slice, returns the value of the dict associated to `key` ('input_ids', 'attention_mask', etc.)
        with the constraint of slice.
        """
        # 如果 `item` 是字符串,则返回与该键关联的字典值(如 'input_ids'、'attention_mask' 等)
        if isinstance(item, str):
            return self.data[item]
        # 如果 `item` 是整数,并且 `_encodings` 不为 None,则返回索引为 `item` 的批次的 `tokenizers.Encoding`
        elif self._encodings is not None:
            return self._encodings[item]
        # 如果 `item` 是切片对象,则返回满足切片条件的字典值(如 'input_ids', 'attention_mask' 等)
        elif isinstance(item, slice):
            return {key: self.data[key][item] for key in self.data.keys()}
        # 如果 `item` 类型不符合上述三种情况,则引发 KeyError
        else:
            raise KeyError(
                "Invalid key. Only three types of key are available: "
                "(1) string, (2) integers for backend Encoding, and (3) slices for data subsetting."
            )
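
下面给出三种键访问方式的简化示意(文档补充,非源码),假设已安装 transformers 并加载了一个快速分词器("bert-base-uncased" 仅为示例,实际输出可能因版本而异):

```
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # 模型名仅为示例
enc = tokenizer(["hello world", "goodbye"])

print(enc["input_ids"])   # 字符串键:返回 data 中对应字段(这里是两条样本的 id 列表)
print(enc[0].tokens)      # 整数键:返回第 0 条样本对应的 tokenizers.Encoding 对象
# 注意上面的分支顺序:只有当 _encodings 为 None(慢速分词器)时,
# 切片键才会落到 "对 data 的每个字段做切片" 的分支
```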

    def __getattr__(self, item: str):
        try:
            # 尝试从 `self.data` 中获取属性 `item` 的值
            return self.data[item]
        except KeyError:
            # 如果 `item` 不存在于 `self.data` 中,则引发 AttributeError
            raise AttributeError

    def __getstate__(self):
        # 返回对象的序列化状态,包括 `self.data` 和 `_encodings`
        return {"data": self.data, "encodings": self._encodings}

    def __setstate__(self, state):
        # 如果 `state` 中包含 `data`,则将其赋值给 `self.data`
        if "data" in state:
            self.data = state["data"]

        # 如果 `state` 中包含 `_encodings`,则将其赋值给 `self._encodings`
        if "encodings" in state:
            self._encodings = state["encodings"]

    def keys(self):
        # 返回 `self.data` 的键列表
        return self.data.keys()

    def values(self):
        # 返回 `self.data` 的值列表
        return self.data.values()

    def items(self):
        # 返回 `self.data` 的键值对列表
        return self.data.items()

    # After this point:
    # Extended properties and methods only available for fast (Rust-based) tokenizers
    # provided by HuggingFace tokenizers library.

    @property
    def encodings(self) -> Optional[List[EncodingFast]]:
        """
        `Optional[List[tokenizers.Encoding]]`: The list all encodings from the tokenization process. Returns `None` if
        the input was tokenized through Python (i.e., not a fast) tokenizer.
        """
        # 返回 `_encodings`,即快速(基于 Rust 的)分词器生成的编码列表;如果是通过 Python 进行分词(非快速分词器),则返回 None
        return self._encodings

    def tokens(self, batch_index: int = 0) -> List[str]:
        """
        Return the list of tokens (sub-parts of the input strings after word/subword splitting and before conversion to
        integer indices) at a given batch index (only works for the output of a fast tokenizer).

        Args:
            batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.

        Returns:
            `List[str]`: The list of tokens at that index.
        """
        # 如果 `_encodings` 为 None,则抛出 ValueError,说明不支持 `tokens()` 方法
        if not self._encodings:
            raise ValueError(
                "tokens() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast`"
                " class)."
            )
        # 返回指定批次索引处的 token 列表
        return self._encodings[batch_index].tokens
    # 返回一个列表,将每个 token 映射到其原始句子的 id:
    # - 对于在序列周围或序列之间添加的特殊 token,映射为 `None`。
    # - 对于第一个序列中的单词对应的 token,映射为 `0`。
    # - 当对一对序列进行联合编码时,对于第二个序列中单词对应的 token,映射为 `1`。
    def sequence_ids(self, batch_index: int = 0) -> List[Optional[int]]:
        """
        Return a list mapping the tokens to the id of their original sentences:

            - `None` for special tokens added around or between sequences,
            - `0` for tokens corresponding to words in the first sequence,
            - `1` for tokens corresponding to words in the second sequence when a pair of sequences was jointly
              encoded.

        Args:
            batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.

        Returns:
            `List[Optional[int]]`: A list indicating the sequence id corresponding to each token. Special tokens added
            by the tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding
            sequence.
        """
        # 如果没有 `_encodings` 属性,抛出 ValueError 异常,提示无法使用该方法
        if not self._encodings:
            raise ValueError(
                "sequence_ids() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast`"
                " class)."
            )
        # 返回指定批次索引处的 `_encodings` 对象的 sequence_ids 属性
        return self._encodings[batch_index].sequence_ids
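
sequence_ids 的典型用途是区分句对中两个序列对应的 token。下面是一个简化示意(文档补充,非源码,"bert-base-uncased" 仅为示例):

```
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # 模型名仅为示例
enc = tokenizer("How many continents are there?", "There are seven continents.")

# 特殊 token 为 None,第一句的 token 为 0,第二句的 token 为 1
print(enc.sequence_ids())
```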

    # 返回一个列表,将每个 token 映射到其初始句子中的实际单词(仅适用于快速分词器)
    def words(self, batch_index: int = 0) -> List[Optional[int]]:
        """
        Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.

        Args:
            batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.

        Returns:
            `List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by the
            tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding word
            (several tokens will be mapped to the same word index if they are parts of that word).
        """
        # 如果没有 `_encodings` 属性,抛出 ValueError 异常,提示无法使用该方法
        if not self._encodings:
            raise ValueError(
                "words() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast`"
                " class)."
            )
        # 发出警告,提示 `words()` 方法已被废弃,建议使用 `word_ids()` 方法
        warnings.warn(
            "`BatchEncoding.words()` property is deprecated and should be replaced with the identical, "
            "but more self-explanatory `BatchEncoding.word_ids()` property.",
            FutureWarning,
        )
        # 返回调用 `word_ids()` 方法的结果,传入指定的批次索引
        return self.word_ids(batch_index)
    # 返回一个列表,将每个token映射到其在初始句子中的实际单词,适用于快速分词器。
    def word_ids(self, batch_index: int = 0) -> List[Optional[int]]:
        """
        Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.

        Args:
            batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.

        Returns:
            `List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by the
            tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding word
            (several tokens will be mapped to the same word index if they are parts of that word).
        """
        # 如果_encodings为空,则抛出值错误异常
        if not self._encodings:
            raise ValueError(
                "word_ids() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast`"
                " class)."
            )
        # 返回指定批次索引的word_ids列表
        return self._encodings[batch_index].word_ids
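
word_ids 常用于把预切分单词上的标签对齐到子词 token(例如 NER 任务)。下面是一个简化示意(文档补充,非源码,"bert-base-uncased" 仅为示例,实际切分结果可能不同):

```
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # 模型名仅为示例
enc = tokenizer(["New", "York", "City"], is_split_into_words=True)

print(enc.tokens())    # 例如 ['[CLS]', 'new', 'york', 'city', '[SEP]']
print(enc.word_ids())  # 例如 [None, 0, 1, 2, None]:特殊 token 为 None,其余为原始单词下标
```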

    # 获取给定token所表示的序列的索引
    def token_to_sequence(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
        """
        Get the index of the sequence represented by the given token. In the general use case, this method returns `0`
        for a single sequence or the first sequence of a pair, and `1` for the second sequence of a pair

        Can be called as:

        - `self.token_to_sequence(token_index)` if batch size is 1
        - `self.token_to_sequence(batch_index, token_index)` if batch size is greater than 1

        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e.,
        words are defined by the user). In this case it allows to easily associate encoded tokens with provided
        tokenized words.

        Args:
            batch_or_token_index (`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                the token in the sequence.
            token_index (`int`, *optional*):
                If a batch index is provided in *batch_or_token_index*, this can be the index of the token in the
                sequence.

        Returns:
            `int`: Index of the word in the input sequence.
        """

        # 如果_encodings为空,则抛出值错误异常
        if not self._encodings:
            raise ValueError("token_to_sequence() is not available when using Python based tokenizers")
        # 如果token_index不为None,则batch_index为batch_or_token_index
        if token_index is not None:
            batch_index = batch_or_token_index
        else:
            batch_index = 0
            token_index = batch_or_token_index
        # 如果batch_index小于0,则将其转换为有效的索引
        if batch_index < 0:
            batch_index = self._batch_size + batch_index
        # 如果token_index小于0,则将其转换为有效的索引
        if token_index < 0:
            token_index = self._seq_len + token_index
        # 返回指定编码中指定token的序列索引
        return self._encodings[batch_index].token_to_sequence(token_index)
    def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
        """
        Get the index of the word corresponding (i.e. comprising) to an encoded token in a sequence of the batch.

        Can be called as:

        - `self.token_to_word(token_index)` if batch size is 1
        - `self.token_to_word(batch_index, token_index)` if batch size is greater than 1

        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e.,
        words are defined by the user). In this case it allows to easily associate encoded tokens with provided
        tokenized words.

        Args:
            batch_or_token_index (`int`):
                Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of
                the token in the sequence.
            token_index (`int`, *optional*):
                If a batch index is provided in *batch_or_token_index*, this can be the index of the token in the
                sequence.

        Returns:
            `int`: Index of the word in the input sequence.
        """

        # 如果没有编码信息,则抛出错误,Python 基础的分词器不支持 token_to_word 方法
        if not self._encodings:
            raise ValueError("token_to_word() is not available when using Python based tokenizers")

        # 确定 batch_index 和 token_index 的值
        if token_index is not None:
            batch_index = batch_or_token_index
        else:
            batch_index = 0  # 默认的 batch_index 如果只有一个序列
            token_index = batch_or_token_index

        # 处理负数的 batch_index 和 token_index
        if batch_index < 0:
            batch_index = self._batch_size + batch_index
        if token_index < 0:
            token_index = self._seq_len + token_index

        # 调用具体编码对象的 token_to_word 方法,返回单词在输入序列中的索引
        return self._encodings[batch_index].token_to_word(token_index)

    def word_to_tokens(
        self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0
    ) -> Optional[TokenSpan]:
        """
        Get the encoded token span corresponding to a word in a sequence of the batch.

        Token spans are returned as a [`~tokenization_utils_base.TokenSpan`] with:

        - **start** -- Index of the first token.
        - **end** -- Index of the token following the last token.

        Can be called as:

        - `self.word_to_tokens(word_index, sequence_index: int = 0)` if batch size is 1
        - `self.word_to_tokens(batch_index, word_index, sequence_index: int = 0)` if batch size is greater or equal to
          1

        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
        are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized
        words.

        Args:
            batch_or_word_index (`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                the word in the sequence.
            word_index (`int`, *optional*):
                If a batch index is provided in *batch_or_token_index*, this can be the index of the word in the
                sequence.
            sequence_index (`int`, *optional*, defaults to 0):
                If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0
                or 1) the provided word index belongs to.

        Returns:
            ([`~tokenization_utils_base.TokenSpan`], *optional*): Span of tokens in the encoded sequence. Returns
            `None` if no tokens correspond to the word. This can happen especially when the token is a special token
            that has been used to format the tokenization. For example when we add a class token at the very beginning
            of the tokenization.
        """

        # Check if encodings are available; raise an error if not
        if not self._encodings:
            raise ValueError("word_to_tokens() is not available when using Python based tokenizers")

        # Determine whether batch_index or word_index was provided
        if word_index is not None:
            batch_index = batch_or_word_index
        else:
            batch_index = 0
            word_index = batch_or_word_index

        # Adjust negative batch_index to account for batch size
        if batch_index < 0:
            batch_index = self._batch_size + batch_index

        # Adjust negative word_index to account for sequence length
        if word_index < 0:
            word_index = self._seq_len + word_index

        # Retrieve the token span for the specified word and sequence index
        span = self._encodings[batch_index].word_to_tokens(word_index, sequence_index)

        # Return the TokenSpan object constructed from span, or None if span is None
        return TokenSpan(*span) if span is not None else None
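
word_to_tokens 返回某个单词被切分成的 token 区间,下面是一个简化示意(文档补充,非源码,"bert-base-uncased" 仅为示例):

```
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # 模型名仅为示例
enc = tokenizer(["huggingface", "transformers", "library"], is_split_into_words=True)

span = enc.word_to_tokens(0)                    # 第 0 个单词对应的 token 区间
if span is not None:
    print(enc.tokens()[span.start : span.end])  # "huggingface" 被切分出的全部子词
```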
    def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = None) -> CharSpan:
        """
        Get the character span corresponding to an encoded token in a sequence of the batch.

        Character spans are returned as a [`~tokenization_utils_base.CharSpan`] with:

        - **start** -- Index of the first character in the original string associated to the token.
        - **end** -- Index of the character following the last character in the original string associated to the
          token.

        Can be called as:

        - `self.token_to_chars(token_index)` if batch size is 1
        - `self.token_to_chars(batch_index, token_index)` if batch size is greater or equal to 1

        Args:
            batch_or_token_index (`int`):
                Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of
                the token in the sequence.
            token_index (`int`, *optional*):
                If a batch index is provided in *batch_or_token_index*, this can be the index of the token or tokens in
                the sequence.

        Returns:
            [`~tokenization_utils_base.CharSpan`]: Span of characters in the original string, or None, if the token
            (e.g. <s>, </s>) doesn't correspond to any chars in the origin string.
        """

        # 如果没有编码信息,则抛出错误,Python 版本的分词器不支持 token_to_chars()
        if not self._encodings:
            raise ValueError("token_to_chars() is not available when using Python based tokenizers")

        # 如果 token_index 不为 None,则说明参数中包含 batch_index 和 token_index
        if token_index is not None:
            batch_index = batch_or_token_index
        else:
            # 如果 token_index 为 None,则参数中只有 batch_or_token_index,此时 batch_index 设为 0
            batch_index = 0
            token_index = batch_or_token_index
        
        # 获取字符跨度的起始和结束索引
        span_indices = self._encodings[batch_index].token_to_chars(token_index)

        # 如果 span_indices 不为 None,则返回 CharSpan 对象,否则返回 None
        return CharSpan(*span_indices) if span_indices is not None else None
    def char_to_token(
        self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0
    ) -> int:
        """
        Get the index of the token in the encoded output comprising a character in the original string for a sequence
        of the batch.

        Can be called as:

        - `self.char_to_token(char_index)` if batch size is 1
        - `self.char_to_token(batch_index, char_index)` if batch size is greater or equal to 1

        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
        are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized
        words.

        Args:
            batch_or_char_index (`int`):
                Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of
                the word in the sequence
            char_index (`int`, *optional*):
                If a batch index is provided in *batch_or_token_index*, this can be the index of the word in the
                sequence.
            sequence_index (`int`, *optional*, defaults to 0):
                If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0
                or 1) the provided character index belongs to.


        Returns:
            `int`: Index of the token.
        """

        # 如果没有编码信息,则抛出异常,因为在使用基于 Python 的分词器时无法使用 char_to_token()
        if not self._encodings:
            raise ValueError("char_to_token() is not available when using Python based tokenizers")

        # 根据参数情况确定 batch_index 和 char_index 的值
        if char_index is not None:
            batch_index = batch_or_char_index
        else:
            batch_index = 0
            char_index = batch_or_char_index

        # 调用内部编码对象的 char_to_token 方法,返回字符对应的 token 索引
        return self._encodings[batch_index].char_to_token(char_index, sequence_index)
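
char_to_token 常用于把原文中的字符位置(例如问答任务里答案的字符偏移)映射成 token 下标。下面是一个简化示意(文档补充,非源码,"bert-base-uncased" 仅为示例):

```
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # 模型名仅为示例
context = "Paris is the capital of France."
enc = tokenizer(context)

start_char = context.index("capital")
end_char = start_char + len("capital") - 1
start_tok = enc.char_to_token(start_char)       # 批大小为 1 时只需传字符下标
end_tok = enc.char_to_token(end_char)
print(enc.tokens()[start_tok : end_tok + 1])    # 覆盖 "capital" 的 token 片段
```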
    # 定义一个方法,用于获取给定批次中的序列中指定单词在原始字符串中的字符跨度

    def word_to_chars(
        self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0
    ) -> CharSpan:
        """
        获取给定批次中的序列中指定单词在原始字符串中的字符跨度。

        字符跨度以 CharSpan 命名元组的形式返回,具有以下字段:
        - start: 原始字符串中第一个字符的索引
        - end: 原始字符串中最后一个字符之后的索引

        可以按以下方式调用:
        - `self.word_to_chars(word_index)` 如果批次大小为 1
        - `self.word_to_chars(batch_index, word_index)` 如果批次大小大于等于 1

        参数:
            batch_or_word_index (`int`):
                批次中序列的索引。如果批次只包含一个序列,则可以是序列中单词的索引。
            word_index (`int`, *optional*):
                如果在 `batch_or_word_index` 中提供了批次索引,则可以是序列中单词的索引。
            sequence_index (`int`, *optional*, 默认为 0):
                如果批次编码了一对序列,则可以用来指定所提供单词索引属于哪一个序列 (0 或 1)。

        返回:
            `CharSpan` 或 `List[CharSpan]`: 字符串中相关字符或字符组的跨度。CharSpan 是一个命名元组,具有以下字段:
            - start: 原始字符串中与令牌关联的第一个字符的索引
            - end: 原始字符串中与令牌关联的最后一个字符之后的索引
        """

        # 如果未提供编码,则抛出 ValueError
        if not self._encodings:
            raise ValueError("word_to_chars() 在使用基于 Python 的分词器时不可用")
        
        # 根据参数 word_index 的存在与否,确定 batch_index 的值
        if word_index is not None:
            batch_index = batch_or_word_index
        else:
            batch_index = 0
            word_index = batch_or_word_index
        
        # 调用 _encodings 中相应批次和序列索引的 word_to_chars 方法,并返回其结果
        return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index, sequence_index)))
    def to(self, device: Union[str, "torch.device"]) -> "BatchEncoding":
        """
        将所有值发送到指定设备,通过调用 `v.to(device)` (仅适用于 PyTorch)。

        Args:
            device (`str` or `torch.device`): 要放置张量的设备。

        Returns:
            [`BatchEncoding`]: 修改后的相同实例。
        """
        # 检查是否使用了必需的后端
        requires_backends(self, ["torch"])

        # 这个检查捕捉到像 APEX 在模块的所有输入上盲目调用 "to" 的情况
        # 否则它将继续向下传递,并将包含标记索引的 LongTensor 转换为 HalfTensor
        if isinstance(device, str) or is_torch_device(device) or isinstance(device, int):
            # 将数据字典中的每个值都转移到指定设备上
            self.data = {k: v.to(device=device) for k, v in self.data.items()}
        else:
            # 如果尝试将 BatchEncoding 转换为不支持的类型,则发出警告
            logger.warning(f"Attempting to cast a BatchEncoding to type {str(device)}. This is not supported.")
        # 返回修改后的实例
        return self
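
下面是一个把 BatchEncoding 中的张量搬到 GPU 的简化示意(文档补充,非源码),假设已安装 torch,"bert-base-uncased" 仅为示例:

```
import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # 模型名仅为示例
enc = tokenizer("hello world", return_tensors="pt")

device = "cuda" if torch.cuda.is_available() else "cpu"
enc = enc.to(device)                 # 把 data 中的所有张量移动到目标设备,并返回自身
print(enc["input_ids"].device)
```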
class SpecialTokensMixin:
    """
    A mixin derived by [`PreTrainedTokenizer`] and [`PreTrainedTokenizerFast`] to handle specific behaviors related to
    special tokens. In particular, this class hold the attributes which can be used to directly access these special
    tokens in a model-independent manner and allow to set and update the special tokens.

    Args:
        bos_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token representing the beginning of a sentence.
        eos_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token representing the end of a sentence.
        unk_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token representing an out-of-vocabulary token.
        sep_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token separating two different sentences in the same input (used by BERT for instance).
        pad_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
            attention mechanisms or loss computation.
        cls_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token representing the class of the input (used by BERT for instance).
        mask_token (`str` or `tokenizers.AddedToken`, *optional*):
            A special token representing a masked token (used by masked-language modeling pretraining objectives, like
            BERT).
        additional_special_tokens (tuple or list of `str` or `tokenizers.AddedToken`, *optional*):
            A tuple or a list of additional tokens, which will be marked as `special`, meaning that they will be
            skipped when decoding if `skip_special_tokens` is set to `True`.
    """

    # Define a class attribute listing the names of all special token attributes
    SPECIAL_TOKENS_ATTRIBUTES = [
        "bos_token",
        "eos_token",
        "unk_token",
        "sep_token",
        "pad_token",
        "cls_token",
        "mask_token",
        "additional_special_tokens",
    ]
    # 初始化函数,用于创建一个新的对象实例
    def __init__(self, verbose=False, **kwargs):
        # 下面的属性用于存储特殊 token 的值,初始设为 None
        self._bos_token = None  # 开始 token
        self._eos_token = None  # 结束 token
        self._unk_token = None  # 未知 token
        self._sep_token = None  # 分隔 token
        self._pad_token = None  # 填充 token
        self._cls_token = None  # 类别 token
        self._mask_token = None  # 掩码 token
        self._pad_token_type_id = 0  # 填充 token 的类型 ID,默认为 0
        self._additional_special_tokens = []  # 额外的特殊 token 列表
        self.verbose = verbose  # 是否为详细模式

        # 直接设置隐藏的值以允许初始化特殊 token,这些 token 可能尚未在词汇表中,用于序列化/反序列化
        # TODO 在某些时候清理此处代码(可能通过切换到快速分词器实现)

        # 遍历关键字参数 kwargs
        for key, value in kwargs.items():
            # 如果值为 None,则跳过
            if value is None:
                continue
            # 检查是否为特殊 token 属性之一
            if key in self.SPECIAL_TOKENS_ATTRIBUTES:
                # 如果是额外特殊 token 的情况
                if key == "additional_special_tokens":
                    assert isinstance(value, (list, tuple)), f"Value {value} is not a list or tuple"
                    assert all(
                        isinstance(t, (str, AddedToken)) for t in value
                    ), "One of the tokens is not a string or an AddedToken"
                    setattr(self, key, value)  # 设置额外特殊 token 列表
                # 如果值为字符串或 AddedToken 对象,则直接设置
                elif isinstance(value, (str, AddedToken)):
                    setattr(self, key, value)  # 设置特殊 token 的值
                else:
                    raise TypeError(f"Special token {key} has to be either str or AddedToken but got: {type(value)}")

    # 将新 token 加入分词器词汇表的公共入口
    def add_tokens(
        self, new_tokens: Union[str, AddedToken, List[Union[str, AddedToken]]], special_tokens: bool = False
    ) -> int:
        """
        Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to
        it with indices starting from length of the current vocabulary and will be isolated before the tokenization
        algorithm is applied. Added tokens and tokens from the vocabulary of the tokenization algorithm are therefore
        not treated in the same way.

        Note, when adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix
        of the model so that its embedding matrix matches the tokenizer.

        In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method.

        Args:
            new_tokens (`str`, `tokenizers.AddedToken` or a list of *str* or `tokenizers.AddedToken`):
                Tokens are only added if they are not already in the vocabulary. `tokenizers.AddedToken` wraps a string
                token to let you personalize its behavior: whether this token should only match against a single word,
                whether this token should strip all potential whitespaces on the left side, whether this token should
                strip all potential whitespaces on the right side, etc.
            special_tokens (`bool`, *optional*, defaults to `False`):
                Can be used to specify if the token is a special token. This mostly changes the normalization behavior
                (special tokens like CLS or [MASK] are usually not lower-cased for instance).

                See details for `tokenizers.AddedToken` in HuggingFace tokenizers library.

        Returns:
            `int`: Number of tokens added to the vocabulary.

        Examples:

        ```
        # Let's see how to increase the vocabulary of Bert model and tokenizer
        tokenizer = BertTokenizerFast.from_pretrained("google-bert/bert-base-uncased")
        model = BertModel.from_pretrained("google-bert/bert-base-uncased")

        num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
        print("We have added", num_added_toks, "tokens")
        # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
        model.resize_token_embeddings(len(tokenizer))
        ```"""
        if not new_tokens:
            return 0  # 如果没有新的token要添加,则直接返回0

        if not isinstance(new_tokens, (list, tuple)):
            new_tokens = [new_tokens]  # 确保new_tokens是列表或元组形式

        return self._add_tokens(new_tokens, special_tokens=special_tokens)  # 调用内部方法_add_tokens来实际添加token

    def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
        raise NotImplementedError  # 这是一个占位符方法,需要在子类中实现
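
下面补充一个使用 `tokenizers.AddedToken` 选项的简化示意(文档补充,非源码),假设已安装 tokenizers 库,"bert-base-uncased" 与 "<obj>" 均仅为示例:

```
from transformers import AutoTokenizer
from tokenizers import AddedToken

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # 模型名仅为示例
new_token = AddedToken("<obj>", lstrip=True, rstrip=True)       # 允许匹配时吞掉两侧空白
num_added = tokenizer.add_tokens([new_token], special_tokens=True)
print(num_added, tokenizer.convert_tokens_to_ids("<obj>"))
# 若随后要送入模型,记得调用 model.resize_token_embeddings(len(tokenizer))
```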

    @property
    def bos_token(self) -> str:
        """
        `str`: Beginning of sentence token. Log an error if used while not having been set.
        """
        # 如果未设置开始句子的标记,则记录错误并返回 None
        if self._bos_token is None:
            if self.verbose:
                logger.error("Using bos_token, but it is not set yet.")
            return None
        # 返回开始句子的标记
        return str(self._bos_token)

    @property
    def eos_token(self) -> str:
        """
        `str`: End of sentence token. Log an error if used while not having been set.
        """
        # 如果未设置结束句子的标记,则记录错误并返回 None
        if self._eos_token is None:
            if self.verbose:
                logger.error("Using eos_token, but it is not set yet.")
            return None
        # 返回结束句子的标记
        return str(self._eos_token)

    @property
    def unk_token(self) -> str:
        """
        `str`: Unknown token. Log an error if used while not having been set.
        """
        # 如果未设置未知标记,则记录错误并返回 None
        if self._unk_token is None:
            if self.verbose:
                logger.error("Using unk_token, but it is not set yet.")
            return None
        # 返回未知标记
        return str(self._unk_token)

    @property
    def sep_token(self) -> str:
        """
        `str`: Separation token, to separate context and query in an input sequence. Log an error if used while not
        having been set.
        """
        # 如果未设置分隔标记,则记录错误并返回 None
        if self._sep_token is None:
            if self.verbose:
                logger.error("Using sep_token, but it is not set yet.")
            return None
        # 返回分隔标记
        return str(self._sep_token)

    @property
    def pad_token(self) -> str:
        """
        `str`: Padding token. Log an error if used while not having been set.
        """
        # 如果未设置填充标记,则记录错误并返回 None
        if self._pad_token is None:
            if self.verbose:
                logger.error("Using pad_token, but it is not set yet.")
            return None
        # 返回填充标记
        return str(self._pad_token)

    @property
    def cls_token(self) -> str:
        """
        `str`: Classification token, to extract a summary of an input sequence leveraging self-attention along the full
        depth of the model. Log an error if used while not having been set.
        """
        # 如果未设置分类标记,则记录错误并返回 None
        if self._cls_token is None:
            if self.verbose:
                logger.error("Using cls_token, but it is not set yet.")
            return None
        # 返回分类标记
        return str(self._cls_token)

    @property
    def mask_token(self) -> str:
        """
        `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not
        having been set.
        """
        # 如果未设置掩码标记,则记录错误并返回 None
        if self._mask_token is None:
            if self.verbose:
                logger.error("Using mask_token, but it is not set yet.")
            return None
        # 返回掩码标记
        return str(self._mask_token)

    @property
    def additional_special_tokens(self) -> List[str]:
        """
        `List[str]`: The list of additional special tokens. Logs an error and returns `None` if used while not having
        been set.
        """
        # 如果 _additional_special_tokens 为 None,则打印错误信息并返回 None
        if self._additional_special_tokens is None:
            if self.verbose:
                logger.error("Using additional_special_tokens, but it is not set yet.")
            return None
        # 将 _additional_special_tokens 转换成字符串列表并返回
        return [str(tok) for tok in self._additional_special_tokens]

    @bos_token.setter
    def bos_token(self, value):
        # 如果 value 不是字符串或 AddedToken 对象,并且不为 None,则抛出值错误异常
        if not isinstance(value, (str, AddedToken)) and value is not None:
            raise ValueError("Cannot set a non-string value as the BOS token")
        # 设置 _bos_token 属性为给定值
        self._bos_token = value

    @eos_token.setter
    def eos_token(self, value):
        # 如果 value 不是字符串或 AddedToken 对象,并且不为 None,则抛出值错误异常
        if not isinstance(value, (str, AddedToken)) and value is not None:
            raise ValueError("Cannot set a non-string value as the EOS token")
        # 设置 _eos_token 属性为给定值
        self._eos_token = value

    @unk_token.setter
    def unk_token(self, value):
        # 如果 value 不是字符串或 AddedToken 对象,并且不为 None,则抛出值错误异常
        if not isinstance(value, (str, AddedToken)) and value is not None:
            raise ValueError("Cannot set a non-string value as the UNK token")
        # 设置 _unk_token 属性为给定值
        self._unk_token = value

    @sep_token.setter
    def sep_token(self, value):
        # 如果 value 不是字符串或 AddedToken 对象,并且不为 None,则抛出值错误异常
        if not isinstance(value, (str, AddedToken)) and value is not None:
            raise ValueError("Cannot set a non-string value as the SEP token")
        # 设置 _sep_token 属性为给定值
        self._sep_token = value

    @pad_token.setter
    def pad_token(self, value):
        # 如果 value 不是字符串或 AddedToken 对象,并且不为 None,则抛出值错误异常
        if not isinstance(value, (str, AddedToken)) and value is not None:
            raise ValueError("Cannot set a non-string value as the PAD token")
        # 设置 _pad_token 属性为给定值
        self._pad_token = value

    @cls_token.setter
    def cls_token(self, value):
        # 如果 value 不是字符串或 AddedToken 对象,并且不为 None,则抛出值错误异常
        if not isinstance(value, (str, AddedToken)) and value is not None:
            raise ValueError("Cannot set a non-string value as the CLS token")
        # 设置 _cls_token 属性为给定值
        self._cls_token = value

    @mask_token.setter
    def mask_token(self, value):
        # 如果 value 不是字符串或 AddedToken 对象,并且不为 None,则抛出值错误异常
        if not isinstance(value, (str, AddedToken)) and value is not None:
            raise ValueError("Cannot set a non-string value as the MASK token")
        # 设置 _mask_token 属性为给定值
        self._mask_token = value

    @additional_special_tokens.setter
    def additional_special_tokens(self, value):
        # 设置 _additional_special_tokens 属性为给定值,如果 value 是 None,则设置为 None
        self._additional_special_tokens = value if value is not None else None

    @property
    def bos_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Returns the ID of the beginning of sentence token in the vocabulary. Returns `None` if the
        token has not been set.
        """
        # 如果 _bos_token 为 None,则返回 None
        if self._bos_token is None:
            return None
        # 调用 convert_tokens_to_ids 方法将 _bos_token 转换成对应的 ID 并返回
        return self.convert_tokens_to_ids(self.bos_token)

    @property
    def eos_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Returns the ID of the end of sentence token in the vocabulary. Returns `None` if the token has
        not been set.
        """
        # 如果 _eos_token 为 None,则返回 None
        if self._eos_token is None:
            return None
        # 调用 convert_tokens_to_ids 方法将 _eos_token 转换成对应的 ID 并返回
        return self.convert_tokens_to_ids(self.eos_token)
    @property
    def unk_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the unknown token in the vocabulary. Returns `None` if the token has not been set.
        """
        # 如果未设置未知标记,则返回 None
        if self._unk_token is None:
            return None
        # 否则,将未知标记转换为其对应的 id 并返回
        return self.convert_tokens_to_ids(self.unk_token)
    
    @property
    def sep_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the separation token in the vocabulary, to separate context and query in an input
        sequence. Returns `None` if the token has not been set.
        """
        # 如果未设置分隔标记,则返回 None
        if self._sep_token is None:
            return None
        # 否则,将分隔标记转换为其对应的 id 并返回
        return self.convert_tokens_to_ids(self.sep_token)
    
    @property
    def pad_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the padding token in the vocabulary. Returns `None` if the token has not been set.
        """
        # 如果未设置填充标记,则返回 None
        if self._pad_token is None:
            return None
        # 否则,将填充标记转换为其对应的 id 并返回
        return self.convert_tokens_to_ids(self.pad_token)
    
    @property
    def pad_token_type_id(self) -> int:
        """
        `int`: Id of the padding token type in the vocabulary.
        """
        # 直接返回填充标记类型的 id
        return self._pad_token_type_id
    
    @property
    def cls_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the classification token in the vocabulary, to extract a summary of an input sequence
        leveraging self-attention along the full depth of the model.
    
        Returns `None` if the token has not been set.
        """
        # 如果未设置分类标记,则返回 None
        if self._cls_token is None:
            return None
        # 否则,将分类标记转换为其对应的 id 并返回
        return self.convert_tokens_to_ids(self.cls_token)
    
    @property
    def mask_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the mask token in the vocabulary, used when training a model with masked-language
        modeling. Returns `None` if the token has not been set.
        """
        # 如果未设置掩码标记,则返回 None
        if self._mask_token is None:
            return None
        # 否则,将掩码标记转换为其对应的 id 并返回
        return self.convert_tokens_to_ids(self.mask_token)
    
    @property
    def additional_special_tokens_ids(self) -> List[int]:
        """
        `List[int]`: Ids of all the additional special tokens in the vocabulary. Log an error if used while not having
        been set.
        """
        # 返回所有额外特殊标记的 id 列表
        return self.convert_tokens_to_ids(self.additional_special_tokens)
    
    @bos_token_id.setter
    def bos_token_id(self, value):
        # 如果设置了开始标记的 id,则将其转换为对应的 token 并存储
        self._bos_token = self.convert_ids_to_tokens(value) if value is not None else None
    
    @eos_token_id.setter
    def eos_token_id(self, value):
        # 如果设置了结束标记的 id,则将其转换为对应的 token 并存储
        self._eos_token = self.convert_ids_to_tokens(value) if value is not None else None
    
    @unk_token_id.setter
    def unk_token_id(self, value):
        # 如果设置了未知标记的 id,则将其转换为对应的 token 并存储
        self._unk_token = self.convert_ids_to_tokens(value) if value is not None else None
    
    @sep_token_id.setter
    def sep_token_id(self, value):
        # 如果设置了分隔标记的 id,则将其转换为对应的 token 并存储
        self._sep_token = self.convert_ids_to_tokens(value) if value is not None else None
    
    @pad_token_id.setter
    def pad_token_id(self, value):
        # 设置填充标记的 ID,并转换成对应的标记字符串,如果值为 None,则将 _pad_token 设为 None
        self._pad_token = self.convert_ids_to_tokens(value) if value is not None else None

    @cls_token_id.setter
    def cls_token_id(self, value):
        # 设置类别标记的 ID,并转换成对应的标记字符串,如果值为 None,则将 _cls_token 设为 None
        self._cls_token = self.convert_ids_to_tokens(value) if value is not None else None

    @mask_token_id.setter
    def mask_token_id(self, value):
        # 设置掩码标记的 ID,并转换成对应的标记字符串,如果值为 None,则将 _mask_token 设为 None
        self._mask_token = self.convert_ids_to_tokens(value) if value is not None else None

    @additional_special_tokens_ids.setter
    def additional_special_tokens_ids(self, values):
        # 设置额外特殊标记的 ID 列表,并逐个转换成对应的标记字符串
        self._additional_special_tokens = [self.convert_ids_to_tokens(value) for value in values]

    @property
    def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]:
        """
        `Dict[str, Union[str, List[str]]]`: 将特殊标记类属性(如 `cls_token`、`unk_token` 等)映射到它们的值(如 `'<unk>'`、`'<cls>'` 等)的字典。

        将 `tokenizers.AddedToken` 类型的潜在标记转换为字符串。
        """
        set_attr = {}
        for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
            attr_value = getattr(self, attr)
            if attr_value:
                set_attr[attr] = attr_value
        return set_attr

    @property
    def special_tokens_map_extended(self) -> Dict[str, Union[str, AddedToken, List[Union[str, AddedToken]]]]:
        """
        `Dict[str, Union[str, tokenizers.AddedToken, List[Union[str, tokenizers.AddedToken]]]]`: 将特殊标记类属性(如 `cls_token`、`unk_token` 等)映射到它们的值(如 `'<unk>'`、`'<cls>'` 等)的字典。

        不将 `tokenizers.AddedToken` 类型的标记转换为字符串,以便更精细地控制特殊标记的分词。
        """
        set_attr = {}
        for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
            attr_value = getattr(self, "_" + attr)
            if attr_value:
                set_attr[attr] = attr_value
        return set_attr

    @property
    def all_special_tokens_extended(self) -> List[Union[str, AddedToken]]:
        """
        `List[Union[str, tokenizers.AddedToken]]`: 返回所有特殊标记(如 `<unk>`、`<cls>` 等)的列表,
        其顺序与每个标记的索引无关。如果需要正确的索引,请查看 `self.added_tokens_encoder`。
        由于这些键是 `AddedToken` 对象而不是字符串,该列表无法再按索引顺序构建。

        不要将 `tokenizers.AddedToken` 类型的标记转换为字符串,以便更精细地控制特殊标记的分词过程。
        """
        all_tokens = []
        seen = set()
        for value in self.special_tokens_map_extended.values():
            if isinstance(value, (list, tuple)):
                tokens_to_add = [token for token in value if str(token) not in seen]
            else:
                tokens_to_add = [value] if str(value) not in seen else []
            seen.update(map(str, tokens_to_add))
            all_tokens.extend(tokens_to_add)
        return all_tokens

    @property
    def all_special_tokens(self) -> List[str]:
        """
        `List[str]`: 返回唯一特殊标记(`'<unk>'`、`'<cls>'` 等)的列表。

        将 `tokenizers.AddedToken` 类型的标记转换为字符串。
        """
        all_toks = [str(s) for s in self.all_special_tokens_extended]
        return all_toks

    @property
    def all_special_ids(self) -> List[int]:
        """
        `List[int]`: 返回特殊标记(`'<unk>'`、`'<cls>'` 等)映射到类属性的 id 列表。
        """
        all_toks = self.all_special_tokens
        all_ids = self.convert_tokens_to_ids(all_toks)
        return all_ids
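
下面是这几个特殊 token 相关属性的简化用法示意(文档补充,非源码,"bert-base-uncased" 仅为示例,实际值以加载的分词器为准):

```
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # 模型名仅为示例
print(tokenizer.special_tokens_map)   # 如 {'unk_token': '[UNK]', 'sep_token': '[SEP]', ...}
print(tokenizer.all_special_tokens)   # 如 ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']
print(tokenizer.all_special_ids)      # 上述特殊 token 对应的 id 列表
```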
"""

"""


INIT_TOKENIZER_DOCSTRING = r"""
    Class attributes (overridden by derived classes)

        - **vocab_files_names** (`Dict[str, str]`) -- A dictionary with, as keys, the `__init__` keyword name of each
          vocabulary file required by the model, and as associated values, the filename for saving the associated file
          (string).
        - **pretrained_vocab_files_map** (`Dict[str, Dict[str, str]]`) -- A dictionary of dictionaries, with the
          high-level keys being the `__init__` keyword name of each vocabulary file required by the model, the
          low-level being the `short-cut-names` of the pretrained models with, as associated values, the `url` to the
          associated pretrained vocabulary file.
        - **max_model_input_sizes** (`Dict[str, Optional[int]]`) -- A dictionary with, as keys, the `short-cut-names`
          of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model,
          or `None` if the model has no maximum input size.
        - **pretrained_init_configuration** (`Dict[str, Dict[str, Any]]`) -- A dictionary with, as keys, the
          `short-cut-names` of the pretrained models, and as associated values, a dictionary of specific arguments to
          pass to the `__init__` method of the tokenizer class for this pretrained model when loading the tokenizer
          with the [`~tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`] method.
        - **model_input_names** (`List[str]`) -- A list of inputs expected in the forward pass of the model.
        - **padding_side** (`str`) -- The default value for the side on which the model should have padding applied.
          Should be `'right'` or `'left'`.
        - **truncation_side** (`str`) -- The default value for the side on which the model should have truncation
          applied. Should be `'right'` or `'left'`.

"""

# 通过 add_end_docstrings 装饰器,把上面定义的 INIT_TOKENIZER_DOCSTRING 追加到类的文档字符串末尾
@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
    """
    Base class for [`PreTrainedTokenizer`] and [`PreTrainedTokenizerFast`].

    Handles shared (mostly boiler plate) methods for those two classes.
    """

    # 字典,存储每个词汇文件的初始化关键字名和文件名
    vocab_files_names: Dict[str, str] = {}

    # 字典的字典,存储预训练模型的初始化关键字名和对应的预训练模型名称及其 URL
    pretrained_vocab_files_map: Dict[str, Dict[str, str]] = {}

    # 字典,存储每个预训练模型的初始化关键字名和模型输入的最大长度
    max_model_input_sizes: Dict[str, Optional[int]] = {}

    # 字典的字典,存储每个预训练模型的初始化关键字名和加载 tokenizer 时传递给其 `__init__` 方法的特定参数
    pretrained_init_configuration: Dict[str, Dict[str, Any]] = {}

    # 列表,存储模型前向传播时期望的输入名称
    model_input_names: List[str] = ["input_ids", "token_type_ids", "attention_mask"]

    # 字符串,表示默认的填充方向,应为 `'right'` 或 `'left'`
    padding_side: str = "right"

    # 字符串,表示默认的截断方向,应为 `'right'` 或 `'left'`
    truncation_side: str = "right"

    # 类变量,指向慢速 tokenizer 类(如果有的话)
    slow_tokenizer_class = None
    # 定义类初始化方法,接受可选参数进行对象初始化
    def __init__(self, **kwargs):
        # 初始化输入参数以及用于保存和重新加载参数(见 from_pretrained 和 save_pretrained 方法)
        self.init_inputs = ()
        # 深复制 kwargs 参数,用于后续重载操作
        self.init_kwargs = copy.deepcopy(kwargs)
        # 提取 name_or_path 参数,默认为空字符串
        self.name_or_path = kwargs.pop("name_or_path", "")
        # 获取并设置 model_max_length 参数,默认为非常大的值
        model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None))
        self.model_max_length = model_max_length if model_max_length is not None else VERY_LARGE_INTEGER
    
        # 设置默认的数据填充和剪裁方向,右填充和左填充可以由子类覆盖。根据提供的 kwargs 参数进行调整
        self.padding_side = kwargs.pop("padding_side", self.padding_side)  # 默认是右侧填充
        if self.padding_side not in ["right", "left"]:
            raise ValueError(
                f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}"
            )
        self.truncation_side = kwargs.pop("truncation_side", self.truncation_side)  # 默认是右侧剪裁
        if self.truncation_side not in ["right", "left"]:
            raise ValueError(
                f"Truncation side should be selected between 'right' and 'left', current value: {self.truncation_side}"
            )
        # 获取并存储模型输入名称参数,默认为当前类定义的模型输入名称列表
        self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)
    
        # 默认对快速和慢速分词器都在解码后清理分词产生的多余空格(clean_up_tokenization_spaces)
        self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", True)
    
        # 是否在分词时将特殊 token 当作普通文本拆分,默认为 False
        self.split_special_tokens = kwargs.pop("split_special_tokens", False)
    
        # 初始化已注意到的弃用警告字典(避免重复警告)
        self.deprecation_warnings = {}
        self._in_target_context_manager = False
    
        # 存储一个 Jinja 模板对象,用于格式化对话历史为可分词的字符串
        self.chat_template = kwargs.pop("chat_template", None)
        if isinstance(self.chat_template, (list, tuple)):
            # 当 chat_template 是一个列表或元组时,将其转换成单个字典结构,便于后续操作
            self.chat_template = {template["name"]: template["template"] for template in self.chat_template}
    
        # 调用父类的 __init__ 方法,使用余下的参数
        super().__init__(**kwargs)
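
上面的这些初始化参数都可以在加载时通过 kwargs 覆盖。下面是一个简化示意(文档补充,非源码,"bert-base-uncased" 仅为示例):

```
from transformers import AutoTokenizer

# padding_side / truncation_side / model_max_length 等参数可以在加载时作为 kwargs 覆盖默认值
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased",              # 模型名仅为示例
    padding_side="left",
    model_max_length=128,
)
print(tokenizer.padding_side, tokenizer.model_max_length)
```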
    
    # 单个句子可输入模型的最大长度(模型最大长度减去需要添加的特殊 token 数)
    @property
    def max_len_single_sentence(self) -> int:
        """
        `int`: 单句可以输入到模型的最大长度。
        """
        return self.model_max_length - self.num_special_tokens_to_add(pair=False)

    # 一对句子可输入模型的最大组合长度
    @property
    def max_len_sentences_pair(self) -> int:
        """
        `int`: 一对句子可以输入到模型的最大结合长度。
        """
        return self.model_max_length - self.num_special_tokens_to_add(pair=True)

    # max_len_single_sentence 的 setter:该属性现在由模型最大长度自动推导,直接设置已被弃用
    @max_len_single_sentence.setter
    def max_len_single_sentence(self, value) -> int:
        # 检查是否为向后兼容性,允许设置 'max_len_single_sentence'
        if value == self.model_max_length - self.num_special_tokens_to_add(pair=False) and self.verbose:
            # 如果设置的值符合向后兼容性要求且 verbose 为真,则发出警告
            if not self.deprecation_warnings.get("max_len_single_sentence", False):
                logger.warning(
                    "Setting 'max_len_single_sentence' is now deprecated. This value is automatically set up."
                )
            # 标记 'max_len_single_sentence' 已发出过警告
            self.deprecation_warnings["max_len_single_sentence"] = True
        else:
            # 如果设置的值不符合向后兼容性要求,则抛出 ValueError 异常
            raise ValueError(
                "Setting 'max_len_single_sentence' is now deprecated. This value is automatically set up."
            )

    # max_len_sentences_pair 的 setter:该属性现在由模型最大长度自动推导,直接设置已被弃用
    @max_len_sentences_pair.setter
    def max_len_sentences_pair(self, value) -> int:
        # 检查是否为向后兼容性,允许设置 'max_len_sentences_pair'
        if value == self.model_max_length - self.num_special_tokens_to_add(pair=True) and self.verbose:
            # 如果设置的值符合向后兼容性要求且 verbose 为真,则发出警告
            if not self.deprecation_warnings.get("max_len_sentences_pair", False):
                logger.warning(
                    "Setting 'max_len_sentences_pair' is now deprecated. This value is automatically set up."
                )
            # 标记 'max_len_sentences_pair' 已发出过警告
            self.deprecation_warnings["max_len_sentences_pair"] = True
        else:
            # 如果设置的值不符合向后兼容性要求,则抛出 ValueError 异常
            raise ValueError("Setting 'max_len_sentences_pair' is now deprecated. This value is automatically set up.")

    # 设置 _processor_class 属性的私有方法,用于设置处理器类别
    def _set_processor_class(self, processor_class: str):
        """Sets processor class as an attribute."""
        self._processor_class = processor_class

    # 定义属性 added_tokens_decoder 的 getter 方法,返回一个字典,表示添加的特殊标记的解码器
    @property
    def added_tokens_decoder(self) -> Dict[int, AddedToken]:
        # 抛出 NotImplementedError,因为该方法需要在子类中实现具体逻辑
        raise NotImplementedError()

    # 定义对象的字符串表示形式,用于返回对象的详细描述信息
    def __repr__(self) -> str:
        # 将 added_tokens_decoder 属性的内容转换成字符串形式
        added_tokens_decoder_rep = "\n\t".join([f"{k}: {v.__repr__()}," for k, v in self.added_tokens_decoder.items()])
        # 返回对象的字符串表示形式,包括对象的各种属性信息和 added_tokens_decoder 的内容
        return (
            f"{self.__class__.__name__}(name_or_path='{self.name_or_path}',"
            f" vocab_size={self.vocab_size}, model_max_length={self.model_max_length}, is_fast={self.is_fast},"
            f" padding_side='{self.padding_side}', truncation_side='{self.truncation_side}',"
            f" special_tokens={self.special_tokens_map}, clean_up_tokenization_spaces={self.clean_up_tokenization_spaces}), "
            " added_tokens_decoder={\n\t" + added_tokens_decoder_rep + "\n}"
        )

    # 定义对象的长度方法,返回对象的长度信息
    def __len__(self) -> int:
        # 抛出 NotImplementedError,因为该方法需要在子类中实现具体逻辑
        raise NotImplementedError()

    # 获取词汇表的方法,返回一个字典,表示 token 到 index 的映射
    def get_vocab(self) -> Dict[str, int]:
        """
        Returns the vocabulary as a dictionary of token to index.

        `tokenizer.get_vocab()[token]` is equivalent to `tokenizer.convert_tokens_to_ids(token)` when `token` is in the
        vocab.

        Returns:
            `Dict[str, int]`: The vocabulary.
        """
        # 抛出 NotImplementedError,因为该方法需要在子类中实现具体逻辑
        raise NotImplementedError()
    def apply_chat_template(
        self,
        conversation: Union[List[Dict[str, str]], "Conversation"],
        chat_template: Optional[str] = None,
        add_generation_prompt: bool = False,
        tokenize: bool = True,
        padding: bool = False,
        truncation: bool = False,
        max_length: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_dict: bool = False,
        tokenizer_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Applies a chat template to format a conversation.

        Args:
            conversation (Union[List[Dict[str, str]], "Conversation"]): The conversation data to format.
            chat_template (Optional[str]): Optional template string to format messages.
            add_generation_prompt (bool): Whether to add a generation prompt at the end.
            tokenize (bool): Whether to tokenize the formatted output.
            padding (bool): Whether to apply padding to the tokens.
            truncation (bool): Whether to truncate tokens if exceeding max_length.
            max_length (Optional[int]): Maximum length of the formatted output.
            return_tensors (Optional[Union[str, TensorType]]): Return type for tokenized outputs.
            return_dict (bool): Whether to return the output as a dictionary.
            tokenizer_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments for the tokenizer.
            **kwargs: Additional keyword arguments.

        Returns:
            Formatted conversation based on the provided chat template.
        """
        # Simplified walkthrough body: resolve the template, render the conversation, then
        # optionally tokenize. (Handling of named templates stored as a dict is omitted here.)
        if hasattr(conversation, "messages"):
            # A `Conversation` object: work on its list of message dicts
            conversation = conversation.messages

        if tokenizer_kwargs is None:
            tokenizer_kwargs = {}

        # Template priority: explicit argument > tokenizer.chat_template > class default
        if chat_template is None:
            if self.chat_template is not None:
                chat_template = self.chat_template
            else:
                chat_template = self.default_chat_template

        # Compile the template (cached) and render it with the conversation and special tokens
        compiled_template = self._compile_jinja_template(chat_template)
        rendered = compiled_template.render(
            messages=conversation, add_generation_prompt=add_generation_prompt, **self.special_tokens_map
        )

        if not tokenize:
            return rendered

        # Special tokens are assumed to be handled by the template itself, hence add_special_tokens=False
        if return_dict:
            return self(
                rendered,
                padding=padding,
                truncation=truncation,
                max_length=max_length,
                add_special_tokens=False,
                return_tensors=return_tensors,
                **tokenizer_kwargs,
            )
        return self.encode(
            rendered,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            add_special_tokens=False,
            return_tensors=return_tensors,
            **tokenizer_kwargs,
        )

    @lru_cache
    def _compile_jinja_template(self, chat_template):
        """
        Compiles a Jinja template using a sandboxed environment.

        Args:
            chat_template (str): The Jinja template string to compile.

        Returns:
            Jinja Template: Compiled Jinja template object.
        """
        try:
            import jinja2
            from jinja2.exceptions import TemplateError
            from jinja2.sandbox import ImmutableSandboxedEnvironment
        except ImportError:
            raise ImportError("_compile_jinja_template requires jinja2 to be installed.")

        if version.parse(jinja2.__version__) < version.parse("3.0.0"):
            raise ImportError(
                "_compile_jinja_template requires jinja2>=3.0.0 to be installed. Your version is " f"{jinja2.__version__}."
            )

        def raise_exception(message):
            """
            Helper function to raise a TemplateError with a specified message.

            Args:
                message (str): Error message to raise.

            Raises:
                TemplateError: Exception with the provided message.
            """
            raise TemplateError(message)

        jinja_env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True)
        jinja_env.globals["raise_exception"] = raise_exception
        return jinja_env.from_string(chat_template)
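
A minimal sketch of the same sandboxed compile-and-render flow, assuming `jinja2>=3.0.0` is installed; it mirrors what `_compile_jinja_template` does and how the compiled template is used afterwards:

from jinja2.sandbox import ImmutableSandboxedEnvironment

env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True)
template = env.from_string(
    "{% for m in messages %}{{ m['role'] }}: {{ m['content'] }}\n{% endfor %}"
)
print(template.render(messages=[{"role": "user", "content": "Hello!"}]))
# -> "user: Hello!"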

    @property
    def default_chat_template(self):
        """
        Property representing the default chat template in ChatML format.

        Returns:
            str: Default chat template.
        """
        logger.warning_once(
            "\nNo chat template is defined for this tokenizer - using a default chat template "
            "that implements the ChatML format (without BOS/EOS tokens!). If the default is not appropriate for "
            "your model, please set `tokenizer.chat_template` to an appropriate template. "
            "See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
        )
        return (
            "{% for message in messages %}"
            "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
            "{% endfor %}"
            "{% if add_generation_prompt %}"
            "{{ '<|im_start|>assistant\n' }}"
            "{% endif %}"
        )
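
A usage sketch of chat templating from the caller's side (the checkpoint name is only a placeholder for any tokenizer that ships a `chat_template`):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")  # placeholder checkpoint
messages = [{"role": "user", "content": "What is a chat template?"}]
# tokenize=False returns the rendered prompt string instead of token ids
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)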

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        *init_inputs,
        cache_dir: Optional[Union[str, os.PathLike]] = None,
        force_download: bool = False,
        local_files_only: bool = False,
        token: Optional[Union[str, bool]] = None,
        revision: str = "main",
        trust_remote_code=False,
        **kwargs,
    ):
        """
        Creates an instance of the class from a pretrained model or path.

        Args:
            pretrained_model_name_or_path (Union[str, os.PathLike]): Name or path of the pretrained model.
            *init_inputs: Additional positional arguments for initialization.
            cache_dir (Optional[Union[str, os.PathLike]]): Optional directory to cache downloaded files.
            force_download (bool): Whether to force download the model files.
            local_files_only (bool): Whether to use only local files without downloading.
            token (Optional[Union[str, bool]]): Token to authenticate access to the pretrained model.
            revision (str): Revision of the pretrained model to use.
            trust_remote_code (bool): Whether to trust remote code for model initialization.
            **kwargs: Additional keyword arguments.

        Returns:
            Instance of the class initialized with the pretrained model.
        """
    @staticmethod
    def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_length, init_max_model_length):
        # This method should be deleted in Transformers v5
        # Its only purpose is to potentially throw a warning
        # that incorrectly defined max lengths of T5's tokenizer are used
        # which we will correct in Transformers v5.
        # 返回最大模型长度,这个方法在 Transformers v5 中应该被删除,只是用来可能发出警告,
        # 告知使用了错误定义的 T5 分词器的最大长度,我们将在 Transformers v5 中进行更正。
        return max_model_length

    @classmethod
    def convert_added_tokens(cls, obj: Union[AddedToken, Any], save=False, add_type_field=True):
        # 如果 obj 是字典且包含 "__type" 键且其值为 "AddedToken"
        if isinstance(obj, dict) and "__type" in obj and obj["__type"] == "AddedToken":
            obj.pop("__type")  # 移除 "__type" 键
            return AddedToken(**obj)  # 返回一个 AddedToken 对象
        # 如果 obj 是 AddedToken 对象且需要保存
        if isinstance(obj, AddedToken) and save:
            obj = obj.__getstate__()  # 获取对象状态
            if add_type_field:
                obj["__type"] = "AddedToken"  # 添加 "__type" 字段
            else:
                # 不保存 "special" 字段,适用于之前的分词器
                obj.pop("special")
            return obj
        elif isinstance(obj, (list, tuple)):
            # 如果 obj 是列表或元组,则递归地转换列表中的每个元素
            return [cls.convert_added_tokens(o, save=save, add_type_field=add_type_field) for o in obj]
        elif isinstance(obj, dict):
            # 如果 obj 是字典,则递归地转换字典中的每个值
            return {k: cls.convert_added_tokens(v, save=save, add_type_field=add_type_field) for k, v in obj.items()}
        return obj  # 返回原始对象

    def save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        legacy_format: Optional[bool] = None,
        filename_prefix: Optional[str] = None,
        push_to_hub: bool = False,
        **kwargs,
    ):
        # Saves the full tokenizer state (vocabulary, added tokens, special tokens map and
        # configuration) under `save_directory`; the body is omitted in this walkthrough.

    def _save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        file_names: Tuple[str],
        legacy_format: Optional[bool] = None,
        filename_prefix: Optional[str] = None,
    ) -> Tuple[str]:
        """
        Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens.

        Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} using the
        specific [`~tokenization_utils_fast.PreTrainedTokenizerFast._save_pretrained`]
        """
        # 如果不是传统格式,则抛出数值错误异常
        if legacy_format is False:
            raise ValueError(
                "Only fast tokenizers (instances of PreTrainedTokenizerFast) can be saved in non legacy format."
            )

        # 将保存目录转换为字符串类型
        save_directory = str(save_directory)

        # 构建添加的 tokens 文件路径,包括可选的前缀和固定的文件名后缀 ADDED_TOKENS_FILE
        added_tokens_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE
        )
        # 获取添加的 tokens 的词汇表,仅包括索引大于等于词汇表大小的 token
        added_vocab = {tok: index for tok, index in self.added_tokens_encoder.items() if index >= self.vocab_size}
        # 如果存在添加的 tokens,则写入到文件中
        if added_vocab:
            with open(added_tokens_file, "w", encoding="utf-8") as f:
                out_str = json.dumps(added_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
                f.write(out_str)
                logger.info(f"added tokens file saved in {added_tokens_file}")

        # 调用实例方法保存词汇表文件,并返回文件路径的元组
        vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)

        # 返回文件名列表和词汇表文件路径的元组,包括添加的 tokens 文件路径
        return file_names + vocab_files + (added_tokens_file,)
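
A usage sketch of the save path described above: saving writes the vocabulary, added-tokens and configuration files, and reloading restores them (checkpoint name and directory are placeholders):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased")  # placeholder checkpoint
tok.add_tokens(["<new_token>"])
saved_files = tok.save_pretrained("./my_tokenizer")  # tuple of the file paths written
reloaded = AutoTokenizer.from_pretrained("./my_tokenizer")
assert "<new_token>" in reloaded.get_vocab()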

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save only the vocabulary of the tokenizer (vocabulary + added tokens).

        This method won't save the configuration and special token mappings of the tokenizer. Use
        [`~PreTrainedTokenizerFast._save_pretrained`] to save the whole state of the tokenizer.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                An optional prefix to add to the named of the saved files.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        # 抛出未实现错误,提示子类应该实现该方法
        raise NotImplementedError
    @add_end_docstrings(
        ENCODE_KWARGS_DOCSTRING,
        """
            **kwargs: Passed along to the `.tokenize()` method.
        """,
        """
        Returns:
            `List[int]`, `torch.Tensor`, `tf.Tensor` or `np.ndarray`: The tokenized ids of the text.
        """,
    )

    # The decorator above appends ENCODE_KWARGS_DOCSTRING, the `**kwargs` note and the return
    # description (`List[int]`, `torch.Tensor`, `tf.Tensor` or `np.ndarray`) to `encode`'s docstring.

    def encode(
        self,
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ) -> List[int]:

        """
        Converts a string into a sequence of ids (integers), using the tokenizer and vocabulary.

        Args:
            text (`str`, `List[str]` or `List[int]`):
                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
                `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
                method).
            text_pair (`str`, `List[str]` or `List[int]`, *optional*):
                Optional second sequence to be encoded, accepted in the same formats as `text`.
            add_special_tokens (`bool`, *optional*, defaults to `True`):
                Whether to add the model-specific special tokens.
            **kwargs:
                Padding, truncation, `max_length`, `stride`, `return_tensors` and any extra keyword arguments are
                passed along to `encode_plus`.
        """
        encoded_inputs = self.encode_plus(
            text,
            text_pair=text_pair,
            add_special_tokens=add_special_tokens,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            stride=stride,
            return_tensors=return_tensors,
            **kwargs,
        )

        # `encode_plus` returns a BatchEncoding; `encode` only exposes its "input_ids" entry
        return encoded_inputs["input_ids"]
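
A usage sketch showing that `encode` is simply the `input_ids` view of a full `__call__` (the checkpoint name is a placeholder):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased")  # placeholder checkpoint
ids = tok.encode("Hello world", add_special_tokens=True)
assert ids == tok("Hello world")["input_ids"]
print(tok.convert_ids_to_tokens(ids))  # ['[CLS]', 'hello', 'world', '[SEP]']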
    # Number of special tokens added around a single sequence (or a pair); implemented in subclasses
    def num_special_tokens_to_add(self, pair: bool = False) -> int:
        raise NotImplementedError

    # 定义一个方法用于获取填充和截断策略,返回策略参数
    def _get_padding_truncation_strategies(
        self, padding=False, truncation=None, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs
    ):
    
    # 装饰器:添加文档字符串到 __call__ 方法,使用默认和额外的编码参数文档字符串
    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    # 定义 __call__ 方法,用于对输入进行编码处理
    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
        text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        text_pair_target: Optional[
            Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]
        ] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ):
    
    # 定义 _call_one 方法,用于对单个输入进行编码处理
    def _call_one(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ):
    
    # 装饰器:添加文档字符串到 _call_one 方法,使用默认和额外的编码参数文档字符串
    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    # 定义一个方法 `encode_plus`,用于处理文本及其可选的配对文本的编码和处理
    def encode_plus(
        self,
        # 输入文本,可以是单一文本、预处理后的输入或已编码的输入
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
        # 可选的配对文本,可以是单一文本、预处理后的输入或已编码的输入
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
        # 是否添加特殊标记,如 `[CLS]` 和 `[SEP]`
        add_special_tokens: bool = True,
        # 是否进行填充
        padding: Union[bool, str, PaddingStrategy] = False,
        # 是否进行截断
        truncation: Union[bool, str, TruncationStrategy] = None,
        # 最大长度限制
        max_length: Optional[int] = None,
        # 滑动窗口步长
        stride: int = 0,
        # 输入是否已经分割成单词
        is_split_into_words: bool = False,
        # 填充到指定的倍数
        pad_to_multiple_of: Optional[int] = None,
        # 返回的张量类型,如 'pt' 表示 PyTorch 张量
        return_tensors: Optional[Union[str, TensorType]] = None,
        # 是否返回 token 类型 ID
        return_token_type_ids: Optional[bool] = None,
        # 是否返回注意力掩码
        return_attention_mask: Optional[bool] = None,
        # 是否返回超出最大长度的 token
        return_overflowing_tokens: bool = False,
        # 是否返回特殊 token 掩码
        return_special_tokens_mask: bool = False,
        # 是否返回 token 的偏移映射
        return_offsets_mapping: bool = False,
        # 是否返回编码后的长度
        return_length: bool = False,
        # 是否启用详细模式,控制是否输出详细信息
        verbose: bool = True,
        # 其他参数
        **kwargs,
    ) -> BatchEncoding:
        """
        Tokenize and prepare for the model a sequence or a pair of sequences.

        <Tip warning={true}>

        This method is deprecated, `__call__` should be used instead.

        </Tip>

        Args:
            text (`str`, `List[str]` or `List[int]` (the latter only for not-fast tokenizers)):
                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
                `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
                method).
            text_pair (`str`, `List[str]` or `List[int]`, *optional*):
                Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
                the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
                method).
        """

        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
        # 获取填充和截断策略,以及最大长度和其他参数
        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            verbose=verbose,
            **kwargs,
        )

        # 调用内部方法 `_encode_plus` 进行编码
        return self._encode_plus(
            text=text,
            text_pair=text_pair,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            is_split_into_words=is_split_into_words,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )
    # 抽象方法,用于派生类实现,用于编码给定文本或文本对的方法
    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        # 抛出未实现错误,提醒子类需要实现这个方法
        raise NotImplementedError

    # 使用指定的文档字符串装饰器添加文档注释
    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    # 批量编码给定文本或文本对的方法
    def batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput],
            List[TextInputPair],
            List[PreTokenizedInput],
            List[PreTokenizedInputPair],
            List[EncodedInput],
            List[EncodedInputPair],
        ],
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        """
        Tokenize and prepare for the model a list of sequences or a list of pairs of sequences.

        <Tip warning={true}>

        This method is deprecated, `__call__` should be used instead.

        </Tip>

        Args:
            batch_text_or_text_pairs (`List[str]`, `List[Tuple[str, str]]`, `List[List[str]]`, `List[Tuple[List[str], List[str]]]`, and for not-fast tokenizers, also `List[List[int]]`, `List[Tuple[List[int], List[int]]]`):
                Batch of sequences or pair of sequences to be encoded. This can be a list of
                string/string-sequences/int-sequences or a list of pair of string/string-sequences/int-sequence (see
                details in `encode_plus`).
        """

        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
        # 获取填充和截断策略以及最大长度,以及其他参数设置
        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            verbose=verbose,
            **kwargs,
        )

        # 调用实际的编码方法,返回编码后的结果
        return self._batch_encode_plus(
            batch_text_or_text_pairs=batch_text_or_text_pairs,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            is_split_into_words=is_split_into_words,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )
    # 重写父类中的方法,用于批量编码输入文本或文本对
    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput],
            List[TextInputPair],
            List[PreTokenizedInput],
            List[PreTokenizedInputPair],
            List[EncodedInput],
            List[EncodedInputPair],
        ],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        # 抛出未实现的错误,强制子类实现该方法
        raise NotImplementedError

    # 在批量编码过程中对输入进行填充
    def pad(
        self,
        encoded_inputs: Union[
            BatchEncoding,
            List[BatchEncoding],
            Dict[str, EncodedInput],
            Dict[str, List[EncodedInput]],
            List[Dict[str, EncodedInput]],
        ],
        padding: Union[bool, str, PaddingStrategy] = True,
        max_length: Optional[int] = None,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        verbose: bool = True,
    ):
        # 未实现的填充方法,应在子类中实现具体逻辑
        raise NotImplementedError

    # 根据输入的token_ids构建特殊的token类型标识
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create the token type IDs corresponding to the sequences passed. [What are token type
        IDs?](../glossary#token-type-ids)

        Should be overridden in a subclass if the model has a special way of building those.

        Args:
            token_ids_0 (`List[int]`): The first tokenized sequence.
            token_ids_1 (`List[int]`, *optional*): The second tokenized sequence.

        Returns:
            `List[int]`: The token type ids.
        """
        # 如果只有一个序列,所有token type为0
        if token_ids_1 is None:
            return len(token_ids_0) * [0]
        # 如果有两个序列,第一个序列的token type为0,第二个序列的token type为1
        return [0] * len(token_ids_0) + [1] * len(token_ids_1)
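
A minimal illustration of this default behaviour (models such as BERT override it to also cover their special tokens):

token_ids_0 = [7, 8, 9]
token_ids_1 = [10, 11]
token_type_ids = [0] * len(token_ids_0) + [1] * len(token_ids_1)
print(token_type_ids)  # [0, 0, 0, 1, 1]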

    # Build a model input sequence that includes the model's special tokens
    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens.

        This implementation does not add special tokens and this method should be overridden in a subclass.

        Args:
            token_ids_0 (`List[int]`): The first tokenized sequence.
            token_ids_1 (`List[int]`, *optional*): The second tokenized sequence.

        Returns:
            `List[int]`: The model input with special tokens.
        """
        # 如果没有第二个序列,直接返回第一个序列
        if token_ids_1 is None:
            return token_ids_0
        # 否则将两个序列连接起来
        return token_ids_0 + token_ids_1
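
A sketch of how a concrete tokenizer typically overrides this hook; the BERT-style layout and the ids below are purely illustrative:

def bert_style_inputs(cls_id, sep_id, token_ids_0, token_ids_1=None):
    # [CLS] A [SEP] for a single sequence, [CLS] A [SEP] B [SEP] for a pair
    if token_ids_1 is None:
        return [cls_id] + token_ids_0 + [sep_id]
    return [cls_id] + token_ids_0 + [sep_id] + token_ids_1 + [sep_id]

print(bert_style_inputs(101, 102, [7, 8], [9]))  # [101, 7, 8, 102, 9, 102]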

    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def prepare_for_model(
        self,
        ids: List[int],
        pair_ids: Optional[List[int]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        prepend_batch_axis: bool = False,
        **kwargs,
    ):
        """
        Placeholder method that should be overridden in a subclass to prepare model inputs with special tokens.

        This method includes various options for processing inputs such as padding, truncation, and returning
        tensors in specific formats.

        Args:
            ids (`List[int]`): List of input token IDs.
            pair_ids (`Optional[List[int]]`, *optional*): List of token IDs for the second sequence in pair inputs.
            add_special_tokens (`bool`, *optional*): Whether to add special tokens to the input sequences.
            padding (`Union[bool, str, PaddingStrategy]`, *optional*): Padding strategy or boolean for padding sequences.
            truncation (`Union[bool, str, TruncationStrategy]`, *optional*): Truncation strategy or boolean for truncating sequences.
            max_length (`Optional[int]`, *optional*): Maximum length of the sequences after processing.
            stride (`int`, *optional*): Stride to use when overflowing tokens.
            pad_to_multiple_of (`Optional[int]`, *optional*): Pad to a multiple of this value.
            return_tensors (`Optional[Union[str, TensorType]]`, *optional*): Return type of tensors (e.g., 'tf', 'pt').
            return_token_type_ids (`Optional[bool]`, *optional*): Whether to return token type IDs.
            return_attention_mask (`Optional[bool]`, *optional*): Whether to return attention mask.
            return_overflowing_tokens (`bool`, *optional*): Whether to return overflowing tokens.
            return_special_tokens_mask (`bool`, *optional*): Whether to return special tokens mask.
            return_offsets_mapping (`bool`, *optional*): Whether to return offsets mapping.
            return_length (`bool`, *optional*): Whether to return sequence lengths.
            verbose (`bool`, *optional*): Whether to print verbose logs.
            prepend_batch_axis (`bool`, *optional*): Whether to prepend batch axis to the returned tensors.
            **kwargs: Additional keyword arguments for specific implementations.

        Returns:
            `Dict[str, Union[torch.Tensor, tf.Tensor, np.ndarray]]`: Dictionary with model inputs prepared according to the specified arguments.
        """
        raise NotImplementedError

    def truncate_sequences(
        self,
        ids: List[int],
        pair_ids: Optional[List[int]] = None,
        num_tokens_to_remove: int = 0,
        truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
        stride: int = 0,
    ):
        """
        Truncate sequences of token IDs to a specified maximum length.

        Args:
            ids (`List[int]`): List of input token IDs.
            pair_ids (`Optional[List[int]]`, *optional*): List of token IDs for the second sequence in pair inputs.
            num_tokens_to_remove (`int`, *optional*): Number of tokens to remove from the sequences.
            truncation_strategy (`Union[str, TruncationStrategy]`, *optional*): Strategy for truncation ('longest_first' or 'only_first').
            stride (`int`, *optional*): Stride to use when overflowing tokens.
        """
        raise NotImplementedError

    def _pad(
        self,
        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
        max_length: Optional[int] = None,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
    ):
        """
        Internal method to pad encoded inputs to a specified length.

        Args:
            encoded_inputs (`Union[Dict[str, EncodedInput], BatchEncoding]`): Dictionary or BatchEncoding object containing encoded inputs.
            max_length (`Optional[int]`, *optional*): Maximum length to pad sequences to.
            padding_strategy (`PaddingStrategy`, *optional*): Strategy for padding sequences.
            pad_to_multiple_of (`Optional[int]`, *optional*): Pad to a multiple of this value.
            return_attention_mask (`Optional[bool]`, *optional*): Whether to return attention mask.

        Returns:
            `Dict[str, torch.Tensor]`: Dictionary containing padded inputs.
        """
        raise NotImplementedError

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """
        Converts a sequence of tokens into a single string representation.

        Args:
            tokens (`List[str]`): List of tokens to join.

        Returns:
            `str`: Joined string of tokens.
        """
        raise NotImplementedError

    def batch_decode(
        self,
        sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = None,
        **kwargs,
    ) -> List[str]:
        """
        Convert a list of lists of token ids into a list of strings by calling decode.

        Args:
            sequences (`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces. If `None`, will default to
                `self.clean_up_tokenization_spaces`.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.

        Returns:
            `List[str]`: The list of decoded sentences.
        """
        # Return a list comprehension that decodes each sequence in `sequences`
        return [
            self.decode(
                seq,
                skip_special_tokens=skip_special_tokens,
                clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                **kwargs,
            )
            for seq in sequences  # Iterate over each sequence in the input `sequences`
        ]
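
A usage sketch of the round trip between `__call__` and `batch_decode` (the checkpoint name is a placeholder):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # placeholder checkpoint
batch = tok(["first sentence", "second sentence"])
print(tok.batch_decode(batch["input_ids"], skip_special_tokens=True))
# ['first sentence', 'second sentence']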

    def decode(
        self,
        token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = None,
        **kwargs,
    ) -> str:
        """
        Converts a sequence of ids into a string, using the tokenizer and vocabulary with options to remove special
        tokens and clean up tokenization spaces.

        Args:
            token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces. If `None`, will default to
                `self.clean_up_tokenization_spaces`.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.

        Returns:
            `str`: The decoded sentence.
        """
        # Convert `token_ids` to Python list representation
        token_ids = to_py_obj(token_ids)

        # Call the internal decode method `_decode` with specified arguments
        return self._decode(
            token_ids=token_ids,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

    def _decode(
        self,
        token_ids: Union[int, List[int]],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = None,
        **kwargs,
    ) -> str:
        """
        Internal method to convert token ids into a string, with options to remove special tokens and clean up
        tokenization spaces.

        Args:
            token_ids (`Union[int, List[int]]`):
                List of tokenized input ids.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces. If `None`, will default to
                `self.clean_up_tokenization_spaces`.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.

        Returns:
            `str`: The decoded sentence.
        """
        # The base class does not implement decoding; slow and fast tokenizer subclasses override this method
        raise NotImplementedError


    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`List[int]`):
                List of ids of the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                List of ids of the second sequence.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        assert already_has_special_tokens and token_ids_1 is None, (
            "You cannot use ``already_has_special_tokens=False`` with this tokenizer. "
            "Please use a slow (full python) tokenizer to activate this argument. "
            "Or set `return_special_tokens_mask=True` when calling the encoding method "
            "to get the special tokens mask in any tokenizer. "
        )

        # Cache the ids of all special tokens defined on this tokenizer
        all_special_ids = self.all_special_ids

        # Mark each position with 1 if the token is a special token, 0 otherwise
        special_tokens_mask = [1 if token in all_special_ids else 0 for token in token_ids_0]

        return special_tokens_mask

    @staticmethod
    def clean_up_tokenization(out_string: str) -> str:
        """
        Clean up a list of simple English tokenization artifacts like spaces before punctuations and abbreviated forms.

        Args:
            out_string (`str`): The text to clean up.

        Returns:
            `str`: The cleaned-up string.
        """
        # Remove simple English tokenization artifacts such as spaces before punctuation and split contractions
        out_string = (
            out_string.replace(" .", ".")
            .replace(" ?", "?")
            .replace(" !", "!")
            .replace(" ,", ",")
            .replace(" ' ", "'")
            .replace(" n't", "n't")
            .replace(" 'm", "'m")
            .replace(" 's", "'s")
            .replace(" 've", "'ve")
            .replace(" 're", "'re")
        )
        return out_string

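
A minimal illustration of the clean-up rules above; since the method is a `staticmethod`, it can be called directly on the class:

from transformers import PreTrainedTokenizerBase

print(PreTrainedTokenizerBase.clean_up_tokenization("Hello , world ! It 's fine ."))
# "Hello, world! It's fine."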
    def _eventual_warn_about_too_long_sequence(self, ids: List[int], max_length: Optional[int], verbose: bool):
        """
        Depending on the input and internal state, this may trigger a warning when a sequence is longer than the
        maximum length specified for the model.

        Args:
            ids (`List[str]`): The ids produced by the tokenization
            max_length (`int`, *optional*): The desired maximum length (no warning is triggered if it is set)
            verbose (`bool`): Whether or not to print more information and warnings.

        """
        # 如果 max_length 未设置且 ids 的长度超过 self.model_max_length 并且 verbose 为 True
        if max_length is None and len(ids) > self.model_max_length and verbose:
            # 如果尚未记录过这个警告,则记录该警告并打印日志
            if not self.deprecation_warnings.get("sequence-length-is-longer-than-the-specified-maximum", False):
                logger.warning(
                    "Token indices sequence length is longer than the specified maximum sequence length "
                    f"for this model ({len(ids)} > {self.model_max_length}). Running this sequence through the model "
                    "will result in indexing errors"
                )
            self.deprecation_warnings["sequence-length-is-longer-than-the-specified-maximum"] = True

    def _switch_to_input_mode(self):
        """
        Private method to put the tokenizer in input mode (when it has different modes for input/outputs)
        """
        pass

    def _switch_to_target_mode(self):
        """
        Private method to put the tokenizer in target mode (when it has different modes for input/outputs)
        """
        pass

    @contextmanager
    def as_target_tokenizer(self):
        """
        Temporarily sets the tokenizer for encoding the targets. Useful for tokenizers associated with
        sequence-to-sequence models that need a slightly different processing for the labels.
        """
        warnings.warn(
            "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "
            "labels by using the argument `text_target` of the regular `__call__` method (either in the same call as "
            "your input texts if you use the same keyword arguments, or in a separate call."
        )
        self._switch_to_target_mode()  # 切换到目标模式
        self._in_target_context_manager = True  # 设置目标模式上下文管理器为 True
        yield  # 返回控制权给调用方
        self._in_target_context_manager = False  # 取消目标模式上下文管理器
        self._switch_to_input_mode()  # 切换回输入模式

    @classmethod
    def register_for_auto_class(cls, auto_class="AutoTokenizer"):
        """
        Register this class with a given auto class. This should only be used for custom tokenizers as the ones in the
        library are already mapped with `AutoTokenizer`.

        <Tip warning={true}>

        This API is experimental and may have some slight breaking changes in the next releases.

        </Tip>

        Args:
            auto_class (`str` or `type`, *optional*, defaults to `"AutoTokenizer"`):
                The auto class to register this new tokenizer with.
        """
        # 检查 auto_class 是否不是字符串类型,如果不是,则取其类名作为字符串
        if not isinstance(auto_class, str):
            auto_class = auto_class.__name__

        # 导入 transformers 模块中的 auto 子模块
        import transformers.models.auto as auto_module

        # 检查 auto_module 中是否存在给定名称的 auto_class 类或模块
        if not hasattr(auto_module, auto_class):
            raise ValueError(f"{auto_class} is not a valid auto class.")

        # 将注册的 auto_class 赋值给类属性 _auto_class
        cls._auto_class = auto_class

    def prepare_seq2seq_batch(
        self,
        src_texts: List[str],
        tgt_texts: Optional[List[str]] = None,
        max_length: Optional[int] = None,
        max_target_length: Optional[int] = None,
        padding: str = "longest",
        return_tensors: str = None,
        truncation: bool = True,
        **kwargs,
"""
`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.
"""
        # 发出未来警告,提示`prepare_seq2seq_batch`即将在 Transformers 版本 5 中移除
        warnings.warn(formatted_warning, FutureWarning)
        # mBART-specific kwargs that should be ignored by other models.
        # 移除仅适用于 mBART 的特定 kwargs,其他模型忽略这些参数
        kwargs.pop("src_lang", None)
        kwargs.pop("tgt_lang", None)
        # 如果未设置 max_length,则使用 self.model_max_length 的值
        if max_length is None:
            max_length = self.model_max_length
        # 调用当前对象(可能是模型)的 `__call__` 方法,准备模型的输入
        model_inputs = self(
            src_texts,
            add_special_tokens=True,
            return_tensors=return_tensors,
            max_length=max_length,
            padding=padding,
            truncation=truncation,
            **kwargs,
        )
        # 如果目标文本(tgt_texts)为 None,则返回模型输入
        if tgt_texts is None:
            return model_inputs
        # 处理目标文本(tgt_texts)
        # 如果未设置 max_target_length,则使用 max_length 的值
        if max_target_length is None:
            max_target_length = max_length
        # 使用当前对象的目标专用分词器上下文处理目标文本(tgt_texts)
        with self.as_target_tokenizer():
            labels = self(
                tgt_texts,
                add_special_tokens=True,
                return_tensors=return_tensors,
                padding=padding,
                max_length=max_target_length,
                truncation=truncation,
                **kwargs,
            )
        # 将标签(labels)的输入 ID 添加到模型输入字典中的 "labels" 键
        model_inputs["labels"] = labels["input_ids"]
        # 返回模型输入字典
        return model_inputs
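
A usage sketch of the recommended replacement shown in the warning above, tokenizing sources and targets in a single call via `text_target` (the T5 checkpoint is a placeholder for any seq2seq tokenizer):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("t5-small")  # placeholder checkpoint
model_inputs = tok(
    ["translate English to German: Hello"],
    text_target=["Hallo"],
    padding="longest",
)
print(model_inputs.keys())  # input_ids, attention_mask, labels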


def get_fast_tokenizer_file(tokenization_files: List[str]) -> str:
    """
    Get the tokenization file to use for this version of transformers.

    Args:
        tokenization_files (`List[str]`): The list of available configuration files.

    Returns:
        `str`: The tokenization file to use.
    """
    # 初始化一个空字典,用于存储文件名与版本号的映射关系
    tokenizer_files_map = {}
    # 遍历提供的 tokenization_files 列表
    for file_name in tokenization_files:
        # 使用正则表达式搜索文件名中的版本号信息
        search = _re_tokenizer_file.search(file_name)
        # 如果找到版本号信息
        if search is not None:
            # 提取版本号并作为字典的键,文件名作为值存储
            v = search.groups()[0]
            tokenizer_files_map[v] = file_name
    # 对版本号进行排序
    available_versions = sorted(tokenizer_files_map.keys())

    # 默认使用 FULL_TOKENIZER_FILE,然后尝试查找一些更新的版本
    tokenizer_file = FULL_TOKENIZER_FILE
    # 解析当前 transformers 版本号
    transformers_version = version.parse(__version__)
    # 遍历可用版本号列表
    for v in available_versions:
        # 如果当前版本号小于或等于 transformers 版本号
        if version.parse(v) <= transformers_version:
            # 更新 tokenizer_file 为对应版本号的文件名
            tokenizer_file = tokenizer_files_map[v]
        else:
            # 因为版本号已经排序,无需继续查找更高版本
            # 在此处退出循环
            break

    # 返回确定的 tokenizer 文件名
    return tokenizer_file
# 将 push_to_hub 方法的文档字符串更新,需要先复制该方法,以避免改变原始的文档字符串。
PreTrainedTokenizerBase.push_to_hub = copy_func(PreTrainedTokenizerBase.push_to_hub)

# 检查复制后的 push_to_hub 方法是否有文档字符串,如果有,则格式化其文档字符串,
# 将其中的占位符替换为指定的对象、对象类别和对象文件描述信息。
if PreTrainedTokenizerBase.push_to_hub.__doc__ is not None:
    PreTrainedTokenizerBase.push_to_hub.__doc__ = PreTrainedTokenizerBase.push_to_hub.__doc__.format(
        object="tokenizer", object_class="AutoTokenizer", object_files="tokenizer files"
    )

.\tokenization_utils_fast.py

# 设置脚本的字符编码为 UTF-8
# 版权声明:2020年由 HuggingFace Inc. 团队提供
#
# 根据 Apache 许可证版本 2.0(“许可证”)授权使用此文件;
# 除非符合许可证的规定,否则不得使用此文件。
# 您可以在以下网址获取许可证的副本:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意,否则本软件按“原样”分发,不提供任何明示或
# 暗示的担保或条件。
# 有关详细信息,请参阅许可证。
"""
 Tokenization classes for fast tokenizers (provided by HuggingFace's tokenizers library). For slow (python) tokenizers
 see tokenization_utils.py
"""

# 导入必要的库和模块
import copy
import json
import os
from collections import defaultdict
from typing import Any, Dict, List, Optional, Tuple, Union

# 导入 fast tokenizers 相关模块和类
import tokenizers.pre_tokenizers as pre_tokenizers_fast
from tokenizers import Encoding as EncodingFast
from tokenizers import Tokenizer as TokenizerFast
from tokenizers.decoders import Decoder as DecoderFast
from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer

# 导入其他模块和类
from .convert_slow_tokenizer import convert_slow_tokenizer
from .tokenization_utils import PreTrainedTokenizer
from .tokenization_utils_base import (
    INIT_TOKENIZER_DOCSTRING,
    AddedToken,
    BatchEncoding,
    PreTokenizedInput,
    PreTokenizedInputPair,
    PreTrainedTokenizerBase,
    SpecialTokensMixin,
    TextInput,
    TextInputPair,
    TruncationStrategy,
)
from .utils import PaddingStrategy, add_end_docstrings, logging

# 获取 logger 对象
logger = logging.get_logger(__name__)

# 定义文件名常量
# fast tokenizers 可以保存在单个文件中
TOKENIZER_FILE = "tokenizer.json"
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"

# slow tokenizers 需要额外的添加 tokens 文件
ADDED_TOKENS_FILE = "added_tokens.json"

# 更新 INIT_TOKENIZER_DOCSTRING 文档字符串,增加关于 tokenizer_object 和 tokenizer_file 的说明
INIT_TOKENIZER_DOCSTRING += """
        tokenizer_object ([`tokenizers.Tokenizer`]):
            A [`tokenizers.Tokenizer`] object from 🤗 tokenizers to instantiate from. See [Using tokenizers from 🤗
            tokenizers](../fast_tokenizers) for more information.
        tokenizer_file ([`str`]):
            A path to a local JSON file representing a previously serialized [`tokenizers.Tokenizer`] object from 🤗
            tokenizers.
"""

# 映射模型类型到对应的 Trainer 类
MODEL_TO_TRAINER_MAPPING = {
    "BPE": BpeTrainer,
    "Unigram": UnigramTrainer,
    "WordLevel": WordLevelTrainer,
    "WordPiece": WordPieceTrainer,
}

# 定义 VOCAB_FILES_NAMES 字典,指定了 tokenizer_file 的文件名
VOCAB_FILES_NAMES = {"tokenizer_file": TOKENIZER_FILE}


# 使用装饰器将 INIT_TOKENIZER_DOCSTRING 添加到类 PreTrainedTokenizerFast 上,并继承自 PreTrainedTokenizerBase
@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
    """
    Base class for all fast tokenizers (wrapping HuggingFace tokenizers library).

    Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`].

    Handles all the shared methods for tokenization and special tokens, as well as methods for
    downloading/caching/loading pretrained tokenizers, as well as adding tokens to the vocabulary.

    This class also contains the added tokens in a unified way on top of all tokenizers so we don't have to handle the
    specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
    """

    # 定义一个类属性,用于存储词汇表文件的名称
    vocab_files_names = VOCAB_FILES_NAMES
    # 慢速分词器类的类型提示,默认为 None
    slow_tokenizer_class: PreTrainedTokenizer = None

    @property
    def is_fast(self) -> bool:
        # 返回一个布尔值,指示是否使用了快速分词器
        return True

    @property
    def can_save_slow_tokenizer(self) -> bool:
        """
        `bool`: Whether or not the slow tokenizer can be saved. Usually for sentencepiece based slow tokenizer, this
        can only be `True` if the original `"sentencepiece.model"` was not deleted.
        """
        # 返回一个布尔值,指示是否可以保存慢速分词器
        return True

    @property
    def vocab_size(self) -> int:
        """
        `int`: Size of the base vocabulary (without the added tokens).
        """
        # 返回基本词汇表的大小(不包括添加的特殊标记)
        return self._tokenizer.get_vocab_size(with_added_tokens=False)

    def get_vocab(self) -> Dict[str, int]:
        # 返回包括添加的特殊标记在内的词汇表
        return self._tokenizer.get_vocab(with_added_tokens=True)

    @property
    def vocab(self) -> Dict[str, int]:
        # 返回包括添加的特殊标记在内的词汇表
        return self.get_vocab()

    @property
    def added_tokens_encoder(self) -> Dict[str, int]:
        """
        Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
        optimisation in `self._added_tokens_encoder` for the slow tokenizers.
        """
        # 返回从字符串到索引的排序映射,用于添加的特殊标记编码器
        return {k.content: v for v, k in sorted(self.added_tokens_decoder.items(), key=lambda item: item[0])}

    @property
    def added_tokens_decoder(self) -> Dict[int, AddedToken]:
        """
        Returns the added tokens in the vocabulary as a dictionary of index to AddedToken.

        Returns:
            `Dict[str, int]`: The added tokens.
        """
        # 返回词汇表中添加的特殊标记,作为索引到 AddedToken 对象的字典
        return self._tokenizer.get_added_tokens_decoder()

    def get_added_vocab(self) -> Dict[str, int]:
        """
        Returns the added tokens in the vocabulary as a dictionary of token to index.

        Returns:
            `Dict[str, int]`: The added tokens.
        """
        # 返回词汇表中添加的特殊标记,作为 token 到索引的字典
        return {k.content: v for v, k in sorted(self.added_tokens_decoder.items(), key=lambda item: item[0])}

    def __len__(self) -> int:
        """
        Size of the full vocabulary with the added tokens.
        """
        # 返回包括添加的特殊标记在内的词汇表的大小
        return self._tokenizer.get_vocab_size(with_added_tokens=True)
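
A usage sketch contrasting `vocab_size` (base vocabulary only) with `len(tokenizer)` (base vocabulary plus added tokens); the checkpoint name is a placeholder:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased")  # placeholder checkpoint
before = len(tok)
tok.add_tokens(["<brand_new_token>"])
assert len(tok) == before + 1        # the full vocabulary grew
assert tok.vocab_size <= len(tok)    # the base vocabulary size is unchanged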

    @property
    def backend_tokenizer(self) -> TokenizerFast:
        """
        `tokenizers.implementations.BaseTokenizer`: The Rust tokenizer used as a backend.
        """
        # 返回作为后端使用的 Rust 分词器对象
        return self._tokenizer

    @property
    def decoder(self) -> DecoderFast:
        """
        `tokenizers.decoders.Decoder`: The Rust decoder for this tokenizer.
        """
        # 返回用于此分词器的 Rust 解码器对象
        return self._tokenizer.decoder
    def _convert_encoding(
        self,
        encoding: EncodingFast,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> Tuple[Dict[str, Any], List[EncodingFast]]:
        """
        Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict and a list
        of encodings, take care of building a batch from overflowing tokens.

        Overflowing tokens are converted to additional examples (like batches) so the output values of the dict are
        lists (overflows) of lists (tokens).

        Output shape: (overflows, sequence length)
        """
        # Determine if `return_token_type_ids` should be inferred based on model input names
        if return_token_type_ids is None:
            return_token_type_ids = "token_type_ids" in self.model_input_names
        # Determine if `return_attention_mask` should be inferred based on model input names
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names

        # Initialize `encodings` with current encoding or handle overflowing tokens
        if return_overflowing_tokens and encoding.overflowing is not None:
            encodings = [encoding] + encoding.overflowing
        else:
            encodings = [encoding]

        # Initialize a defaultdict to collect various encoding attributes as lists
        encoding_dict = defaultdict(list)
        # Iterate over each encoding in `encodings`
        for e in encodings:
            # Append token ids to the `input_ids` list in `encoding_dict`
            encoding_dict["input_ids"].append(e.ids)

            # Append token type ids if `return_token_type_ids` is enabled
            if return_token_type_ids:
                encoding_dict["token_type_ids"].append(e.type_ids)
            # Append attention mask if `return_attention_mask` is enabled
            if return_attention_mask:
                encoding_dict["attention_mask"].append(e.attention_mask)
            # Append special tokens mask if `return_special_tokens_mask` is enabled
            if return_special_tokens_mask:
                encoding_dict["special_tokens_mask"].append(e.special_tokens_mask)
            # Append offset mappings if `return_offsets_mapping` is enabled
            if return_offsets_mapping:
                encoding_dict["offset_mapping"].append(e.offsets)
            # Append length of token ids if `return_length` is enabled
            if return_length:
                encoding_dict["length"].append(len(e.ids))

        # Return the collected encoding attributes as `encoding_dict` and the list of `encodings`
        return encoding_dict, encodings
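
    # A short sketch of what `_convert_encoding` produces when overflowing tokens are
    # requested: each overflow becomes an extra example, so every value in the returned
    # dict has shape (overflows, sequence length). This goes through the public
    # `__call__` API; "bert-base-uncased" is only an example checkpoint.
    #
    # from transformers import AutoTokenizer
    #
    # tok = AutoTokenizer.from_pretrained("bert-base-uncased")
    # enc = tok(
    #     "a fairly long sentence that will be cut into several overlapping windows " * 8,
    #     max_length=16,
    #     truncation=True,
    #     stride=4,
    #     return_overflowing_tokens=True,
    #     return_offsets_mapping=True,
    # )
    # print(len(enc["input_ids"]))         # number of windows (overflows + 1)
    # print(enc["offset_mapping"][0][:3])  # character offsets for the first window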

    def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
        """
        Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the
        vocabulary.

        Args:
            tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s).

        Returns:
            `int` or `List[int]`: The token id or list of token ids.
        """
        # If `tokens` is None, return None
        if tokens is None:
            return None

        # If `tokens` is a string, convert it to token id using `_convert_token_to_id_with_added_voc`
        if isinstance(tokens, str):
            return self._convert_token_to_id_with_added_voc(tokens)

        # If `tokens` is a list of strings, convert each token to token ids using `_convert_token_to_id_with_added_voc`
        return [self._convert_token_to_id_with_added_voc(token) for token in tokens]

    def _convert_token_to_id_with_added_voc(self, token: str) -> int:
        # Get the token id from `_tokenizer`, or return `unk_token_id` if token is unknown
        index = self._tokenizer.token_to_id(token)
        if index is None:
            return self.unk_token_id
        return index
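
    # A small sketch of the token <-> id conversion helpers above. Unknown tokens fall
    # back to `unk_token_id`; "bert-base-uncased" is only an example checkpoint.
    #
    # from transformers import AutoTokenizer
    #
    # tok = AutoTokenizer.from_pretrained("bert-base-uncased")
    # print(tok.convert_tokens_to_ids("hello"))             # a single id
    # print(tok.convert_tokens_to_ids(["hello", "world"]))  # a list of ids
    # print(tok.convert_tokens_to_ids("definitely-not-in-the-vocab") == tok.unk_token_id)  # True
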
    # 根据给定的索引将其转换为对应的标记(字符串)
    def _convert_id_to_token(self, index: int) -> Optional[str]:
        return self._tokenizer.id_to_token(int(index))

    # 向分词器添加新的标记(单词或特殊标记)
    def _add_tokens(self, new_tokens: List[Union[str, AddedToken]], special_tokens=False) -> int:
        if special_tokens:
            # 添加特殊标记到分词器
            return self._tokenizer.add_special_tokens(new_tokens)
        else:
            # 添加普通标记到分词器
            return self._tokenizer.add_tokens(new_tokens)

    # 返回编码序列时添加的特殊标记数量
    def num_special_tokens_to_add(self, pair: bool = False) -> int:
        """
        返回在编码序列时添加的特殊标记数量。

        <Tip>

        这会对虚拟输入进行编码并检查添加的标记数量,因此效率较低。不要将此函数放在训练循环中。

        </Tip>

        Args:
            pair (`bool`, *optional*, 默认为 `False`):
                是否在序列对(sequence pair)情况下计算添加的特殊标记数量,或单独序列的情况。

        Returns:
            `int`: 添加到序列中的特殊标记数量。
        """
        return self._tokenizer.num_special_tokens_to_add(pair)

    # 将给定的标记索引或索引列表转换为对应的标记或标记列表
    def convert_ids_to_tokens(
        self, ids: Union[int, List[int]], skip_special_tokens: bool = False
    ) -> Union[str, List[str]]:
        """
        使用词汇表和已添加的标记,将单个索引或索引序列转换为标记或标记序列。

        Args:
            ids (`int` 或 `List[int]`):
                要转换为标记或标记序列的标记 ID(或标记 IDs)。
            skip_special_tokens (`bool`, *optional*, 默认为 `False`):
                是否在解码时跳过特殊标记。

        Returns:
            `str` 或 `List[str]`: 解码后的标记(或标记列表)。
        """
        if isinstance(ids, int):
            return self._tokenizer.id_to_token(ids)
        tokens = []
        for index in ids:
            index = int(index)
            if skip_special_tokens and index in self.all_special_ids:
                continue
            tokens.append(self._tokenizer.id_to_token(index))
        return tokens

    # 对文本进行分词处理,返回标记列表
    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
        return self.encode_plus(text=text, text_pair=pair, add_special_tokens=add_special_tokens, **kwargs).tokens()
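
    # `tokenize` simply runs a full `encode_plus` and returns the token strings, so
    # special tokens only show up when `add_special_tokens=True`. A quick sketch
    # ("bert-base-uncased" is only an example checkpoint):
    #
    # from transformers import AutoTokenizer
    #
    # tok = AutoTokenizer.from_pretrained("bert-base-uncased")
    # print(tok.tokenize("Hello world!"))
    # # ['hello', 'world', '!']
    # print(tok.tokenize("Hello world!", add_special_tokens=True))
    # # ['[CLS]', 'hello', 'world', '!', '[SEP]']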

    # 设置截断和填充策略,以及相关的参数
    def set_truncation_and_padding(
        self,
        padding_strategy: PaddingStrategy,
        truncation_strategy: TruncationStrategy,
        max_length: int,
        stride: int,
        pad_to_multiple_of: Optional[int],
    ):
        """
        Define the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers
        library) and restore the tokenizer settings afterwards.

        The provided tokenizer has no padding / truncation strategy before the managed section. If your tokenizer set a
        padding / truncation strategy before, then it will be reset to no padding / truncation when exiting the managed
        section.

        Args:
            padding_strategy ([`~utils.PaddingStrategy`]):
                The kind of padding that will be applied to the input
            truncation_strategy ([`~tokenization_utils_base.TruncationStrategy`]):
                The kind of truncation that will be applied to the input
            max_length (`int`):
                The maximum size of a sequence.
            stride (`int`):
                The stride to use when handling overflow.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
        """
        # Preserve the current truncation and padding settings of the tokenizer
        _truncation = self._tokenizer.truncation
        _padding = self._tokenizer.padding

        # Set truncation strategy on the backend tokenizer
        if truncation_strategy == TruncationStrategy.DO_NOT_TRUNCATE:
            # If DO_NOT_TRUNCATE is specified, ensure no truncation is applied
            if _truncation is not None:
                self._tokenizer.no_truncation()
        else:
            # Define the target truncation settings
            target = {
                "max_length": max_length,
                "stride": stride,
                "strategy": truncation_strategy.value,
                "direction": self.truncation_side,
            }

            # Compare current truncation settings with the target settings
            if _truncation is None:
                current = None
            else:
                current = {k: _truncation.get(k, None) for k in target}

            # Enable truncation if current settings differ from the target settings
            if current != target:
                self._tokenizer.enable_truncation(**target)

        # Set padding strategy on the backend tokenizer
        if padding_strategy == PaddingStrategy.DO_NOT_PAD:
            # If DO_NOT_PAD is specified, ensure no padding is applied
            if _padding is not None:
                self._tokenizer.no_padding()
        else:
            # Define the target padding settings
            length = max_length if padding_strategy == PaddingStrategy.MAX_LENGTH else None
            target = {
                "length": length,
                "direction": self.padding_side,
                "pad_id": self.pad_token_id,
                "pad_token": self.pad_token,
                "pad_type_id": self.pad_token_type_id,
                "pad_to_multiple_of": pad_to_multiple_of,
            }

            # Compare current padding settings with the target settings
            if _padding != target:
                self._tokenizer.enable_padding(**target)
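
    # `set_truncation_and_padding` is called internally before every batch encode; from
    # the outside its effect is what `padding=...` / `truncation=...` do in `__call__`.
    # A hedged sketch of the `pad_to_multiple_of` behaviour described above
    # ("bert-base-uncased" is only an example checkpoint):
    #
    # from transformers import AutoTokenizer
    #
    # tok = AutoTokenizer.from_pretrained("bert-base-uncased")
    # batch = tok(
    #     ["short", "a slightly longer sentence"],
    #     padding=True,          # PaddingStrategy.LONGEST on the backend tokenizer
    #     pad_to_multiple_of=8,  # pad the longest sequence up to a multiple of 8
    # )
    # print([len(ids) for ids in batch["input_ids"]])  # e.g. [8, 8]
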
    # 定义一个方法用于批量编码文本或文本对
    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput], List[TextInputPair], List[PreTokenizedInput], List[PreTokenizedInputPair]
        ],
        add_special_tokens: bool = True,  # 是否添加特殊的标记符号,默认为True
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,  # 填充策略,默认不填充
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,  # 截断策略,默认不截断
        max_length: Optional[int] = None,  # 最大长度限制,默认为无限制
        stride: int = 0,  # 步长,默认为0
        is_split_into_words: bool = False,  # 输入是否已分成单词,默认为False
        pad_to_multiple_of: Optional[int] = None,  # 填充到指定的倍数,默认为不填充到倍数
        return_tensors: Optional[str] = None,  # 返回的张量类型,默认为None
        return_token_type_ids: Optional[bool] = None,  # 是否返回token类型ID,默认为None
        return_attention_mask: Optional[bool] = None,  # 是否返回注意力掩码,默认为None
        return_overflowing_tokens: bool = False,  # 是否返回溢出的token,默认为False
        return_special_tokens_mask: bool = False,  # 是否返回特殊token的掩码,默认为False
        return_offsets_mapping: bool = False,  # 是否返回偏移映射,默认为False
        return_length: bool = False,  # 是否返回长度,默认为False
        verbose: bool = True,  # 是否详细输出信息,默认为True
    ) -> BatchEncoding:
    
    # 定义一个方法用于编码单个文本或文本对
    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput],  # 输入的文本或预分词的文本
        text_pair: Optional[Union[TextInput, PreTokenizedInput]] = None,  # 可选的文本对
        add_special_tokens: bool = True,  # 是否添加特殊的标记符号,默认为True
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,  # 填充策略,默认不填充
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,  # 截断策略,默认不截断
        max_length: Optional[int] = None,  # 最大长度限制,默认为无限制
        stride: int = 0,  # 步长,默认为0
        is_split_into_words: bool = False,  # 输入是否已分成单词,默认为False
        pad_to_multiple_of: Optional[int] = None,  # 填充到指定的倍数,默认为不填充到倍数
        return_tensors: Optional[bool] = None,  # 返回的张量类型,默认为None
        return_token_type_ids: Optional[bool] = None,  # 是否返回token类型ID,默认为None
        return_attention_mask: Optional[bool] = None,  # 是否返回注意力掩码,默认为None
        return_overflowing_tokens: bool = False,  # 是否返回溢出的token,默认为False
        return_special_tokens_mask: bool = False,  # 是否返回特殊token的掩码,默认为False
        return_offsets_mapping: bool = False,  # 是否返回偏移映射,默认为False
        return_length: bool = False,  # 是否返回长度,默认为False
        verbose: bool = True,  # 是否详细输出信息,默认为True
        **kwargs,  # 其他关键字参数,用于扩展功能
    ) -> BatchEncoding:
        # 将输入文本和可能存在的文本对作为一个批次输入,根据需要包装成元组
        batched_input = [(text, text_pair)] if text_pair else [text]
        # 调用内部方法进行批量编码处理,生成批次输出
        batched_output = self._batch_encode_plus(
            batched_input,
            is_split_into_words=is_split_into_words,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )

        # 如果没有返回张量并且没有返回溢出的token,则移除前导的批次轴
        # 如果溢出的token作为一批输出返回,则在此情况下保留它们
        if return_tensors is None and not return_overflowing_tokens:
            # 重新处理批次输出,确保每个值正确处理为单个元素或列表的形式
            batched_output = BatchEncoding(
                {
                    key: value[0] if len(value) > 0 and isinstance(value[0], list) else value
                    for key, value in batched_output.items()
                },
                batched_output.encodings,
            )

        # 检查并警告序列长度是否超过设定的最大长度
        self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)

        # 返回处理后的批次输出
        return batched_output

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        # 使用后端的tokenizer decoder将token列表转换为字符串
        return self.backend_tokenizer.decoder.decode(tokens)

    def _decode(
        self,
        token_ids: Union[int, List[int]],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = None,
        **kwargs,
    ) -> str:
        # 检查是否需要使用源tokenizer进行解码
        self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)

        # 将token_ids转换为列表形式(如果输入为单个整数)
        if isinstance(token_ids, int):
            token_ids = [token_ids]
        # 使用内部的tokenizer解码token_ids,根据需要跳过特殊token
        text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)

        # 检查是否需要清理token化空间
        clean_up_tokenization_spaces = (
            clean_up_tokenization_spaces
            if clean_up_tokenization_spaces is not None
            else self.clean_up_tokenization_spaces
        )
        # 如果需要清理token化空间,则执行清理操作并返回清理后的文本
        if clean_up_tokenization_spaces:
            clean_text = self.clean_up_tokenization(text)
            return clean_text
        else:
            # 否则直接返回解码后的文本
            return text
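
    # `_decode` backs the public `decode` method; `clean_up_tokenization_spaces` controls
    # whether artifacts such as " ." or " n't" are merged back together. A small sketch
    # ("bert-base-uncased" is only an example checkpoint):
    #
    # from transformers import AutoTokenizer
    #
    # tok = AutoTokenizer.from_pretrained("bert-base-uncased")
    # ids = tok("Hello, world!")["input_ids"]
    # print(tok.decode(ids, skip_special_tokens=True))
    # # "hello, world!"
    # print(tok.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))
    # # "hello , world !"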

    def _save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        file_names: Tuple[str],
        legacy_format: Optional[bool] = None,
        filename_prefix: Optional[str] = None,
    ) -> Tuple[str]:
        """
        Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens as well as in a unique JSON
        file containing {config + vocab + added-tokens}.
        """
        # 将保存目录转换为字符串
        save_directory = str(save_directory)

        # 如果没有定义慢速分词器的类且需要遗留格式,则引发值错误
        if self.slow_tokenizer_class is None and legacy_format is True:
            raise ValueError(
                "Your tokenizer does not have a legacy version defined and therefore cannot register this version. You"
                " might consider leaving the legacy_format at `None` or setting it to `False`."
            )

        # 决定是否保存慢速分词器
        save_slow = (
            (legacy_format is None or legacy_format is True)
            and self.slow_tokenizer_class is not None
            and self.can_save_slow_tokenizer
        )
        # 决定是否保存快速分词器
        save_fast = legacy_format is None or legacy_format is False

        # 如果需要保存慢速分词器
        if save_slow:
            # 构造添加的标记文件路径
            added_tokens_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE
            )
            # 确保对未来兼容
            added_vocab = {tok: index for tok, index in self.added_tokens_encoder.items() if index >= self.vocab_size}
            # 如果有添加的词汇,写入JSON文件
            if added_vocab:
                with open(added_tokens_file, "w", encoding="utf-8") as f:
                    out_str = json.dumps(added_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
                    f.write(out_str)

            # 保存词汇表文件并获取文件名列表
            vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)
            file_names = file_names + vocab_files + (added_tokens_file,)

        # 如果需要保存快速分词器
        if save_fast:
            # 构造分词器文件路径
            tokenizer_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_FILE
            )
            # 调用后端分词器的保存方法
            self.backend_tokenizer.save(tokenizer_file)
            file_names = file_names + (tokenizer_file,)

        # 返回所有保存的文件名列表
        return file_names
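
# `_save_pretrained` above is what the public `save_pretrained` delegates to. A sketch
# of the three `legacy_format` modes; "bert-base-uncased" and the target directories are
# only illustrative.
#
# from transformers import AutoTokenizer
#
# tok = AutoTokenizer.from_pretrained("bert-base-uncased")
# tok.save_pretrained("tok-both")                       # legacy_format=None: slow + fast files
# tok.save_pretrained("tok-fast", legacy_format=False)  # fast serialization (tokenizer.json)
# tok.save_pretrained("tok-slow", legacy_format=True)   # slow/legacy files (vocab + added tokens)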

.\tools\agents.py

# 导入 Python 标准库和第三方模块
import importlib.util  # 导入模块的辅助函数
import json  # 导入 JSON 解析库
import os  # 提供与操作系统交互的功能
import time  # 提供时间相关的功能
from dataclasses import dataclass  # 提供创建数据类的装饰器
from typing import Dict  # 引入类型提示

import requests  # 提供方便的 HTTP 请求功能
from huggingface_hub import HfFolder, hf_hub_download, list_spaces  # 引入与 Hugging Face Hub 相关的功能

# 导入本地定义的模块和函数
from ..models.auto import AutoTokenizer  # 自动加载适合任务的 tokenizer
from ..utils import is_offline_mode, is_openai_available, is_torch_available, logging  # 导入实用工具函数和日志记录器
from .base import TASK_MAPPING, TOOL_CONFIG_FILE, Tool, load_tool, supports_remote  # 导入基础配置和函数定义
from .prompts import CHAT_MESSAGE_PROMPT, download_prompt  # 导入对话提示和下载提示
from .python_interpreter import evaluate  # 导入 Python 解释器相关功能

# 获取日志记录器
logger = logging.get_logger(__name__)

# 如果 OpenAI 可用,导入 openai 模块
if is_openai_available():
    import openai

# 如果 Torch 可用,导入相关模块
if is_torch_available():
    from ..generation import StoppingCriteria, StoppingCriteriaList  # 导入生成停止条件
    from ..models.auto import AutoModelForCausalLM  # 导入适合因果语言建模的自动模型
else:
    StoppingCriteria = object  # 否则定义一个基础对象作为停止条件

# 工具初始化标志
_tools_are_initialized = False

# 基础 Python 工具集合,包括常见内置函数和类型转换
BASE_PYTHON_TOOLS = {
    "print": print,
    "range": range,
    "float": float,
    "int": int,
    "bool": bool,
    "str": str,
}

# 数据类定义,用于表示预设工具的信息
@dataclass
class PreTool:
    task: str  # 工具的任务描述
    description: str  # 工具的描述信息
    repo_id: str  # 工具关联的存储库 ID

# Hugging Face 默认工具的空字典
HUGGINGFACE_DEFAULT_TOOLS = {}

# 从 Hugging Face Hub 导入的默认工具列表
HUGGINGFACE_DEFAULT_TOOLS_FROM_HUB = [
    "image-transformation",
    "text-download",
    "text-to-image",
    "text-to-video",
]

# 获取远程工具的函数定义,从指定组织的存储库中检索工具
def get_remote_tools(organization="huggingface-tools"):
    if is_offline_mode():  # 如果处于离线模式,则提示无法访问远程工具
        logger.info("You are in offline mode, so remote tools are not available.")
        return {}  # 返回空字典表示没有可用的远程工具

    spaces = list_spaces(author=organization)  # 获取指定组织的空间列表
    tools = {}  # 初始化工具字典
    for space_info in spaces:
        repo_id = space_info.id  # 获取存储库 ID
        resolved_config_file = hf_hub_download(repo_id, TOOL_CONFIG_FILE, repo_type="space")  # 下载解析后的配置文件
        with open(resolved_config_file, encoding="utf-8") as reader:
            config = json.load(reader)  # 加载配置文件内容

        task = repo_id.split("/")[-1]  # 提取任务描述
        tools[config["name"]] = PreTool(task=task, description=config["description"], repo_id=repo_id)  # 存储工具信息到字典

    return tools  # 返回工具字典

# 设置默认工具的函数定义
def _setup_default_tools():
    global HUGGINGFACE_DEFAULT_TOOLS  # 声明全局变量
    global _tools_are_initialized  # 声明全局变量

    if _tools_are_initialized:  # 如果工具已初始化,则直接返回,避免重复设置
        return

    main_module = importlib.import_module("transformers")  # 导入 transformers 主模块
    tools_module = main_module.tools  # 获取 tools 子模块

    remote_tools = get_remote_tools()  # 获取远程工具
    # 遍历任务映射中的每个任务名和对应的工具类名
    for task_name, tool_class_name in TASK_MAPPING.items():
        # 从tools_module中获取工具类对象,名称为tool_class_name
        tool_class = getattr(tools_module, tool_class_name)
        # 获取工具类对象的描述信息
        description = tool_class.description
        # 将预处理工具对象加入到HUGGINGFACE_DEFAULT_TOOLS字典中,键为工具类的名称,值为PreTool对象
        HUGGINGFACE_DEFAULT_TOOLS[tool_class.name] = PreTool(task=task_name, description=description, repo_id=None)
    
    # 如果不处于离线模式
    if not is_offline_mode():
        # 遍历需要从Hub获取的默认工具列表中的每个任务名
        for task_name in HUGGINGFACE_DEFAULT_TOOLS_FROM_HUB:
            found = False
            # 遍历远程工具字典中的每个工具名和工具对象
            for tool_name, tool in remote_tools.items():
                # 如果远程工具对象的任务名与当前任务名匹配
                if tool.task == task_name:
                    # 将远程工具对象添加到HUGGINGFACE_DEFAULT_TOOLS字典中,键为工具名
                    HUGGINGFACE_DEFAULT_TOOLS[tool_name] = tool
                    found = True
                    break
    
            # 如果未找到匹配的远程工具,抛出值错误异常
            if not found:
                raise ValueError(f"{task_name} is not implemented on the Hub.")
    
    # 设置工具初始化状态标志为True
    _tools_are_initialized = True
# 解析工具函数,根据给定的代码、工具箱和是否远程访问标志来解析工具
def resolve_tools(code, toolbox, remote=False, cached_tools=None):
    # 如果未提供缓存的工具列表,使用基础 Python 工具的副本作为起点
    if cached_tools is None:
        resolved_tools = BASE_PYTHON_TOOLS.copy()
    else:
        resolved_tools = cached_tools
    # 遍历工具箱中的每个工具项
    for name, tool in toolbox.items():
        # 如果工具名称不在给定代码中,或者已经在解析后的工具列表中,则跳过
        if name not in code or name in resolved_tools:
            continue

        # 如果工具是 Tool 类的实例,直接加入解析后的工具列表
        if isinstance(tool, Tool):
            resolved_tools[name] = tool
        else:
            # 否则根据工具的任务或仓库 ID 加载工具,并根据需要进行远程访问
            task_or_repo_id = tool.task if tool.repo_id is None else tool.repo_id
            _remote = remote and supports_remote(task_or_repo_id)
            resolved_tools[name] = load_tool(task_or_repo_id, remote=_remote)

    # 返回最终解析后的工具列表
    return resolved_tools


# 生成工具创建代码,根据给定的代码和工具箱
def get_tool_creation_code(code, toolbox, remote=False):
    # 初始化代码行,导入 load_tool 函数
    code_lines = ["from transformers import load_tool", ""]
    # 遍历工具箱中的每个工具项
    for name, tool in toolbox.items():
        # 如果工具名称不在给定代码中或者工具是 Tool 类的实例,则跳过
        if name not in code or isinstance(tool, Tool):
            continue

        # 根据工具的任务或仓库 ID 构建工具创建代码行
        task_or_repo_id = tool.task if tool.repo_id is None else tool.repo_id
        line = f'{name} = load_tool("{task_or_repo_id}"'
        # 如果需要远程访问,则设置 remote=True
        if remote:
            line += ", remote=True"
        line += ")"
        # 添加构建好的代码行到代码列表中
        code_lines.append(line)

    # 将所有代码行连接成一个字符串并返回
    return "\n".join(code_lines) + "\n"


# 清理代码以便于聊天展示
def clean_code_for_chat(result):
    # 拆分结果为解释部分和代码部分
    lines = result.split("\n")
    idx = 0
    # 寻找代码块的起始位置
    while idx < len(lines) and not lines[idx].lstrip().startswith("```"):
        idx += 1
    explanation = "\n".join(lines[:idx]).strip()
    if idx == len(lines):
        return explanation, None

    idx += 1
    start_idx = idx
    # 继续寻找代码块的结束位置
    while not lines[idx].lstrip().startswith("```"):
        idx += 1
    code = "\n".join(lines[start_idx:idx]).strip()

    # 返回清理后的解释部分和代码部分
    return explanation, code
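
# A sketch of the explanation/code split performed by `clean_code_for_chat`, fed with
# the kind of answer the LLM is prompted to produce (import path assumed to be
# `transformers.tools.agents`, as above).
#
# from transformers.tools.agents import clean_code_for_chat
#
# llm_answer = """I will generate an image.
#
# ```py
# image = image_generator(prompt="rivers and lakes")
# ```"""
#
# explanation, code = clean_code_for_chat(llm_answer)
# print(explanation)  # "I will generate an image."
# print(code)         # 'image = image_generator(prompt="rivers and lakes")'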


# 清理代码以便于运行
def clean_code_for_run(result):
    # 添加标记到结果中以便于识别和处理
    result = f"I will use the following {result}"
    # 拆分结果为解释部分和代码部分
    explanation, code = result.split("Answer:")
    explanation = explanation.strip()
    code = code.strip()

    # 分割代码行并去除首尾的代码块标记
    code_lines = code.split("\n")
    if code_lines[0] in ["```", "```py", "```python"]:
        code_lines = code_lines[1:]
    if code_lines[-1] == "```":
        code_lines = code_lines[:-1]
    code = "\n".join(code_lines)

    # 返回清理后的解释部分和代码部分
    return explanation, code


class Agent:
    """
    Base class for all agents which contains the main API methods.

    Args:
        chat_prompt_template (`str`, *optional*):
            Pass along your own prompt if you want to override the default template for the `chat` method. Can be the
            actual prompt template or a repo ID (on the Hugging Face Hub). The prompt should be in a file named
            `chat_prompt_template.txt` in this repo in this case.
        run_prompt_template (`str`, *optional*):
            Pass along your own prompt if you want to override the default template for the `run` method. Can be the
            actual prompt template or a repo ID (on the Hugging Face Hub). The prompt should be in a file named
            `run_prompt_template.txt` in this repo in this case.
        additional_tools ([`Tool`], list of tools or dictionary with tool values, *optional*):
            Any additional tools to include on top of the default ones. If you pass along a tool with the same name as
            one of the default tools, that default tool will be overridden.
    """

    # 定义一个类,用于处理对话生成模型的配置和工具
    def __init__(self, chat_prompt_template=None, run_prompt_template=None, additional_tools=None):
        # 设置默认工具集
        _setup_default_tools()

        # 获取当前类的名称作为代理名称
        agent_name = self.__class__.__name__

        # 下载指定的对话生成模板,用于聊天模式
        self.chat_prompt_template = download_prompt(chat_prompt_template, agent_name, mode="chat")

        # 下载指定的对话生成模板,用于运行模式
        self.run_prompt_template = download_prompt(run_prompt_template, agent_name, mode="run")

        # 复制默认的 Hugging Face 工具集到代理的工具箱
        self._toolbox = HUGGINGFACE_DEFAULT_TOOLS.copy()

        # 设置日志功能为打印输出
        self.log = print

        # 如果提供了额外的工具,则更新代理的工具箱
        if additional_tools is not None:
            # 如果 additional_tools 是列表或元组,则将其转换为字典,以工具名称作为键
            if isinstance(additional_tools, (list, tuple)):
                additional_tools = {t.name: t for t in additional_tools}
            # 如果 additional_tools 不是字典,则将其转换为包含单个工具的字典
            elif not isinstance(additional_tools, dict):
                additional_tools = {additional_tools.name: additional_tools}

            # 找出在 additional_tools 中已经存在于默认工具集中的工具,并将其用新工具替换
            replacements = {name: tool for name, tool in additional_tools.items() if name in HUGGINGFACE_DEFAULT_TOOLS}
            # 更新代理的工具箱
            self._toolbox.update(additional_tools)

            # 如果有工具被替换了,则记录警告信息
            if len(replacements) > 1:
                names = "\n".join([f"- {n}: {t}" for n, t in replacements.items()])
                logger.warning(
                    f"The following tools have been replaced by the ones provided in `additional_tools`:\n{names}."
                )
            elif len(replacements) == 1:
                name = list(replacements.keys())[0]
                logger.warning(f"{name} has been replaced by {replacements[name]} as provided in `additional_tools`.")

        # 准备进行新的聊天会话的初始化工作
        self.prepare_for_new_chat()

    @property
    def toolbox(self) -> Dict[str, Tool]:
        """Get all tool currently available to the agent"""
        # 返回当前代理可用的所有工具集合
        return self._toolbox
    def format_prompt(self, task, chat_mode=False):
        # 构建描述工具的字符串,每行包含工具名称和描述
        description = "\n".join([f"- {name}: {tool.description}" for name, tool in self.toolbox.items()])
        
        # 根据聊天模式选择不同的提示模板
        if chat_mode:
            # 如果历史记录为空,使用聊天提示模板并替换工具描述部分
            if self.chat_history is None:
                prompt = self.chat_prompt_template.replace("<<all_tools>>", description)
            else:
                # 否则,使用已有的聊天历史记录
                prompt = self.chat_history
            # 添加当前任务到聊天提示中
            prompt += CHAT_MESSAGE_PROMPT.replace("<<task>>", task)
        else:
            # 使用运行提示模板并替换工具描述和任务部分
            prompt = self.run_prompt_template.replace("<<all_tools>>", description)
            prompt = prompt.replace("<<prompt>>", task)
        
        # 返回生成的提示
        return prompt
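
    # `format_prompt` is plain string templating: the run template carries the
    # placeholders `<<all_tools>>` and `<<prompt>>`. A self-contained sketch with a
    # made-up template (the real templates are fetched by `download_prompt`):
    #
    # run_prompt_template = (
    #     "You have access to the following tools:\n<<all_tools>>\n\nTask: <<prompt>>\nAnswer:"
    # )
    # toolbox_descriptions = {
    #     "image_generator": "Creates an image from a text prompt.",
    #     "summarizer": "Summarizes a long text.",
    # }
    # description = "\n".join(f"- {name}: {desc}" for name, desc in toolbox_descriptions.items())
    # prompt = run_prompt_template.replace("<<all_tools>>", description)
    # prompt = prompt.replace("<<prompt>>", "Draw me a picture of rivers and lakes")
    # print(prompt)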

    def set_stream(self, streamer):
        """
        Set the function use to stream results (which is `print` by default).

        Args:
            streamer (`callable`): The function to call when streaming results from the LLM.
        """
        # 设置日志输出函数,用于从语言模型生成的结果流式输出
        self.log = streamer

    def chat(self, task, *, return_code=False, remote=False, **kwargs):
        """
        Sends a new request to the agent in a chat. Will use the previous ones in its history.

        Args:
            task (`str`): The task to perform
            return_code (`bool`, *optional*, defaults to `False`):
                Whether to just return code and not evaluate it.
            remote (`bool`, *optional*, defaults to `False`):
                Whether or not to use remote tools (inference endpoints) instead of local ones.
            kwargs (additional keyword arguments, *optional*):
                Any keyword argument to send to the agent when evaluating the code.
        """
        # 根据任务构建格式化后的提示字符串,用于发送给语言模型
        prompt = self.format_prompt(task, chat_mode=True)
        
        # 生成一个对话回复,停止条件为指定字符串
        result = self.generate_one(prompt, stop=["Human:", "====="])
        
        # 更新聊天历史记录,包含之前的提示和生成的结果
        self.chat_history = prompt + result.strip() + "\n"
        
        # 清理生成的代码,获取解释和代码
        explanation, code = clean_code_for_chat(result)
        
        # 输出语言模型生成的解释信息
        self.log(f"==Explanation from the agent==\n{explanation}")
        
        # 如果生成了代码,则输出生成的代码,并根据需求返回或评估代码结果
        if code is not None:
            self.log(f"\n\n==Code generated by the agent==\n{code}")
            if not return_code:
                # 解析工具并返回评估结果
                self.cached_tools = resolve_tools(code, self.toolbox, remote=remote, cached_tools=self.cached_tools)
                self.chat_state.update(kwargs)
                return evaluate(code, self.cached_tools, self.chat_state, chat_mode=True)
            else:
                # 获取生成代码的工具创建代码并返回完整的生成代码
                tool_code = get_tool_creation_code(code, self.toolbox, remote=remote)
                return f"{tool_code}\n{code}"
    def prepare_for_new_chat(self):
        """
        Clears the history of prior calls to [`~Agent.chat`].
        """
        # 清空聊天历史记录
        self.chat_history = None
        # 重置聊天状态
        self.chat_state = {}
        # 清空缓存的工具
        self.cached_tools = None

    def clean_code_for_run(self, result):
        """
        Override this method if you want to change the way the code is
        cleaned for the `run` method.
        """
        # 调用特定函数来清理运行代码
        return clean_code_for_run(result)

    def run(self, task, *, return_code=False, remote=False, **kwargs):
        """
        Sends a request to the agent.

        Args:
            task (`str`): The task to perform
            return_code (`bool`, *optional*, defaults to `False`):
                Whether to just return code and not evaluate it.
            remote (`bool`, *optional*, defaults to `False`):
                Whether or not to use remote tools (inference endpoints) instead of local ones.
            kwargs (additional keyword arguments, *optional*):
                Any keyword argument to send to the agent when evaluating the code.

        Example:

        ```
        from transformers import HfAgent

        agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder")
        agent.run("Draw me a picture of rivers and lakes")
        ```
        """
        # 格式化任务提示信息
        prompt = self.format_prompt(task)
        # 生成一次结果,返回解释和生成的代码
        result = self.generate_one(prompt, stop=["Task:"])
        explanation, code = self.clean_code_for_run(result)

        # 记录代理生成的解释
        self.log(f"==Explanation from the agent==\n{explanation}")

        # 记录代理生成的代码
        self.log(f"\n\n==Code generated by the agent==\n{code}")
        if not return_code:
            # 如果不仅返回代码而不评估,则解析工具并评估生成的代码
            self.log("\n\n==Result==")
            self.cached_tools = resolve_tools(code, self.toolbox, remote=remote, cached_tools=self.cached_tools)
            return evaluate(code, self.cached_tools, state=kwargs.copy())
        else:
            # 获取工具创建代码并返回与生成代码的组合结果
            tool_code = get_tool_creation_code(code, self.toolbox, remote=remote)
            return f"{tool_code}\n{code}"

    def generate_one(self, prompt, stop):
        # This is the method to implement in your custom agent.
        # 这是需要在自定义代理中实现的方法,抛出未实现错误
        raise NotImplementedError

    def generate_many(self, prompts, stop):
        # Override if you have a way to do batch generation faster than one by one
        # 如果有批量生成的更快方式,则重写此方法
        return [self.generate_one(prompt, stop) for prompt in prompts]
    """
    Agent that uses the openai API to generate code.

    <Tip warning={true}>
    
    The openAI models are used in generation mode, so even for the `chat()` API, it's better to use models like
    `"text-davinci-003"` over the chat-GPT variant. Proper support for chat-GPT models will come in a next version.
    
    </Tip>
    
    Args:
        model (`str`, *optional*, defaults to `"text-davinci-003"`):
            The name of the OpenAI model to use.
        api_key (`str`, *optional*):
            The API key to use. If unset, will look for the environment variable `"OPENAI_API_KEY"`.
        chat_prompt_template (`str`, *optional*):
            Pass along your own prompt if you want to override the default template for the `chat` method. Can be the
            actual prompt template or a repo ID (on the Hugging Face Hub). The prompt should be in a file named
            `chat_prompt_template.txt` in this repo in this case.
        run_prompt_template (`str`, *optional*):
            Pass along your own prompt if you want to override the default template for the `run` method. Can be the
            actual prompt template or a repo ID (on the Hugging Face Hub). The prompt should be in a file named
            `run_prompt_template.txt` in this repo in this case.
        additional_tools ([`Tool`], list of tools or dictionary with tool values, *optional*):
            Any additional tools to include on top of the default ones. If you pass along a tool with the same name as
            one of the default tools, that default tool will be overridden.
    
    Example:
    
    ```
    from transformers import OpenAiAgent
    
    agent = OpenAiAgent(model="text-davinci-003", api_key=xxx)
    agent.run("Is the following `text` (in Spanish) positive or negative?", text="¡Este es un API muy agradable!")
    ```
    """
    
    def __init__(
        self,
        model="text-davinci-003",
        api_key=None,
        chat_prompt_template=None,
        run_prompt_template=None,
        additional_tools=None,
    ):
        # 检查是否安装了 openai 库,如果没有则抛出 ImportError 异常
        if not is_openai_available():
            raise ImportError("Using `OpenAiAgent` requires `openai`: `pip install openai`.")
        
        # 如果未提供 API 密钥,则尝试从环境变量 "OPENAI_API_KEY" 中获取
        if api_key is None:
            api_key = os.environ.get("OPENAI_API_KEY", None)
        
        # 如果仍未设置 API 密钥,则抛出 ValueError 异常
        if api_key is None:
            raise ValueError(
                "You need an openai key to use `OpenAIAgent`. You can get one here: Get one here "
                "https://openai.com/api/`. If you have one, set it in your env with `os.environ['OPENAI_API_KEY'] = "
                "xxx."
            )
        else:
            # 设置 openai 的 API 密钥
            openai.api_key = api_key
        
        # 初始化父类 Agent,传入可能的自定义聊天和运行模板以及额外工具
        super().__init__(
            chat_prompt_template=chat_prompt_template,
            run_prompt_template=run_prompt_template,
            additional_tools=additional_tools,
        )
    # 根据给定的 prompts 列表和 stop 标志,生成多个聊天或完成文本
    def generate_many(self, prompts, stop):
        # 如果模型名称中包含 "gpt",则使用 _chat_generate 方法生成每个 prompt 的结果
        if "gpt" in self.model:
            return [self._chat_generate(prompt, stop) for prompt in prompts]
        else:
            # 否则,使用 _completion_generate 方法生成所有 prompts 的结果
            return self._completion_generate(prompts, stop)

    # 根据给定的 prompt 和 stop 标志,生成一个聊天或完成文本
    def generate_one(self, prompt, stop):
        # 如果模型名称中包含 "gpt",则使用 _chat_generate 方法生成结果
        if "gpt" in self.model:
            return self._chat_generate(prompt, stop)
        else:
            # 否则,使用 _completion_generate 方法生成结果,并返回第一个元素
            return self._completion_generate([prompt], stop)[0]

    # 使用 OpenAI 的聊天 API 生成文本
    def _chat_generate(self, prompt, stop):
        # 调用 OpenAI 的 chat.completions.create 方法,传入模型、消息内容、温度和停止条件
        result = openai.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            stop=stop,
        )
        # 返回生成的文本内容
        return result.choices[0].message.content

    # 使用 OpenAI 的 Completion API 生成文本
    def _completion_generate(self, prompts, stop):
        # 调用 OpenAI 的 Completion.create 方法,传入模型、prompt、温度、停止条件和最大 token 数
        result = openai.Completion.create(
            model=self.model,
            prompt=prompts,
            temperature=0,
            stop=stop,
            max_tokens=200,
        )
        # 返回生成的文本内容列表
        return [answer["text"] for answer in result["choices"]]
    """
    AzureOpenAiAgent 是一个继承自 Agent 的代理类,用于利用 Azure OpenAI 生成代码。参考官方文档来了解如何在 Azure 上部署 OpenAI 模型。

    Args:
        deployment_id (`str`):
            要使用的已部署 Azure OpenAI 模型的名称。
        api_key (`str`, *optional*):
            要使用的 API 密钥。如果未设置,将查找环境变量 `"AZURE_OPENAI_API_KEY"`。
        resource_name (`str`, *optional*):
            Azure OpenAI 资源的名称。如果未设置,将查找环境变量 `"AZURE_OPENAI_RESOURCE_NAME"`。
        api_version (`str`, *optional*, default to `"2022-12-01"`):
            该代理使用的 API 版本。
        is_chat_mode (`bool`, *optional*):
            是否使用聊天模型而非完成模型(参见上述注释,聊天模型的效率较低)。默认根据 `deployment_id` 是否包含 `'gpt'` 来判断。
        chat_prompt_template (`str`, *optional*):
            如果要覆盖 `chat` 方法的默认模板,请传递自定义的提示模板。可以是实际的提示模板或 Hugging Face Hub 上的 repo ID。在这种情况下,提示应该在该 repo 中命名为 `chat_prompt_template.txt`。
        run_prompt_template (`str`, *optional*):
            如果要覆盖 `run` 方法的默认模板,请传递自定义的提示模板。可以是实际的提示模板或 Hugging Face Hub 上的 repo ID。在这种情况下,提示应该在该 repo 中命名为 `run_prompt_template.txt`。
        additional_tools ([`Tool`], list of tools or dictionary with tool values, *optional*):
            除默认工具外要包含的任何附加工具。如果传递与默认工具同名的工具,将覆盖默认工具。

    Example:

    ```
    from transformers import AzureOpenAiAgent

    agent = AzureOpenAiAgent(deployment_id="Davinci-003", api_key=xxx, resource_name=yyy)
    agent.run("Is the following `text` (in Spanish) positive or negative?", text="¡Este es un API muy agradable!")
    ```
    """

    def __init__(
        self,
        deployment_id,
        api_key=None,
        resource_name=None,
        api_version="2022-12-01",
        is_chat_model=None,
        chat_prompt_template=None,
        run_prompt_template=None,
        additional_tools=None,
    ):
        """
        初始化 AzureOpenAiAgent 实例。

        Args:
            deployment_id (`str`):
                要使用的已部署 Azure OpenAI 模型的名称。
            api_key (`str`, *optional*):
                要使用的 API 密钥。如果未设置,将查找环境变量 `"AZURE_OPENAI_API_KEY"`。
            resource_name (`str`, *optional*):
                Azure OpenAI 资源的名称。如果未设置,将查找环境变量 `"AZURE_OPENAI_RESOURCE_NAME"`。
            api_version (`str`, *optional*, default to `"2022-12-01"`):
                该代理使用的 API 版本。
            is_chat_mode (`bool`, *optional*):
                是否使用聊天模型而非完成模型(参见上述注释,聊天模型的效率较低)。默认根据 `deployment_id` 是否包含 `'gpt'` 来判断。
            chat_prompt_template (`str`, *optional*):
                如果要覆盖 `chat` 方法的默认模板,请传递自定义的提示模板。可以是实际的提示模板或 Hugging Face Hub 上的 repo ID。在这种情况下,提示应该在该 repo 中命名为 `chat_prompt_template.txt`。
            run_prompt_template (`str`, *optional*):
                如果要覆盖 `run` 方法的默认模板,请传递自定义的提示模板。可以是实际的提示模板或 Hugging Face Hub 上的 repo ID。在这种情况下,提示应该在该 repo 中命名为 `run_prompt_template.txt`。
            additional_tools ([`Tool`], list of tools or dictionary with tool values, *optional*):
                除默认工具外要包含的任何附加工具。如果传递与默认工具同名的工具,将覆盖默认工具。
        """
        super().__init__()  # 调用父类 Agent 的构造函数
        self.deployment_id = deployment_id
        self.api_key = api_key if api_key else os.getenv("AZURE_OPENAI_API_KEY")  # 设置 API 密钥,如果未提供则从环境变量获取
        self.resource_name = resource_name if resource_name else os.getenv("AZURE_OPENAI_RESOURCE_NAME")  # 设置 Azure OpenAI 资源名称,如果未提供则从环境变量获取
        self.api_version = api_version  # 设置 API 版本
        self.is_chat_mode = is_chat_mode if is_chat_mode is not None else 'gpt' in deployment_id.lower()  # 设置是否为聊天模式,默认根据 deployment_id 是否包含 'gpt' 来判断
        self.chat_prompt_template = chat_prompt_template  # 设置聊天模式的提示模板
        self.run_prompt_template = run_prompt_template  # 设置运行模式的提示模板
        self.additional_tools = additional_tools  # 设置额外的工具列表或字典
    ):
        # 检查是否安装了 openai 库,如果未安装则抛出 ImportError 异常
        if not is_openai_available():
            raise ImportError("Using `OpenAiAgent` requires `openai`: `pip install openai`.")

        # 设置部署 ID
        self.deployment_id = deployment_id
        # 设置 OpenAI API 类型为 "azure"
        openai.api_type = "azure"
        
        # 如果 API 密钥未提供,则尝试从环境变量中获取 AZURE_OPENAI_API_KEY
        if api_key is None:
            api_key = os.environ.get("AZURE_OPENAI_API_KEY", None)
        # 如果仍然没有 API 密钥,则抛出 ValueError 异常
        if api_key is None:
            raise ValueError(
                "You need an Azure openAI key to use `AzureOpenAIAgent`. If you have one, set it in your env with "
                "`os.environ['AZURE_OPENAI_API_KEY'] = xxx."
            )
        else:
            # 设置 OpenAI API 密钥
            openai.api_key = api_key
        
        # 如果资源名称未提供,则尝试从环境变量中获取 AZURE_OPENAI_RESOURCE_NAME
        if resource_name is None:
            resource_name = os.environ.get("AZURE_OPENAI_RESOURCE_NAME", None)
        # 如果仍然没有资源名称,则抛出 ValueError 异常
        if resource_name is None:
            raise ValueError(
                "You need a resource_name to use `AzureOpenAIAgent`. If you have one, set it in your env with "
                "`os.environ['AZURE_OPENAI_RESOURCE_NAME'] = xxx."
            )
        else:
            # 设置 OpenAI API 基础 URL
            openai.api_base = f"https://{resource_name}.openai.azure.com"
        
        # 设置 OpenAI API 版本
        openai.api_version = api_version

        # 如果 is_chat_model 未提供,则根据 deployment_id 决定是否是聊天模型
        if is_chat_model is None:
            is_chat_model = "gpt" in deployment_id.lower()
        # 设置实例的 is_chat_model 属性
        self.is_chat_model = is_chat_model

        # 调用父类的构造函数,初始化实例
        super().__init__(
            chat_prompt_template=chat_prompt_template,
            run_prompt_template=run_prompt_template,
            additional_tools=additional_tools,
        )

    # 生成多个结果的方法
    def generate_many(self, prompts, stop):
        # 如果是聊天模型,则使用 _chat_generate 方法生成多个结果
        if self.is_chat_model:
            return [self._chat_generate(prompt, stop) for prompt in prompts]
        else:
            # 否则使用 _completion_generate 方法生成多个结果
            return self._completion_generate(prompts, stop)

    # 生成单个结果的方法
    def generate_one(self, prompt, stop):
        # 如果是聊天模型,则使用 _chat_generate 方法生成单个结果
        if self.is_chat_model:
            return self._chat_generate(prompt, stop)
        else:
            # 否则使用 _completion_generate 方法生成单个结果并返回第一个结果
            return self._completion_generate([prompt], stop)[0]

    # 聊天生成方法,使用 OpenAI ChatCompletion API
    def _chat_generate(self, prompt, stop):
        result = openai.ChatCompletion.create(
            engine=self.deployment_id,
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            stop=stop,
        )
        # 返回生成的聊天消息内容
        return result["choices"][0]["message"]["content"]

    # 完整生成方法,使用 OpenAI Completion API
    def _completion_generate(self, prompts, stop):
        result = openai.Completion.create(
            engine=self.deployment_id,
            prompt=prompts,
            temperature=0,
            stop=stop,
            max_tokens=200,
        )
        # 返回生成的每个答案的文本内容列表
        return [answer["text"] for answer in result["choices"]]
class HfAgent(Agent):
    """
    Agent that uses an inference endpoint to generate code.

    Args:
        url_endpoint (`str`):
            The name of the url endpoint to use.
        token (`str`, *optional*):
            The token to use as HTTP bearer authorization for remote files. If unset, will use the token generated when
            running `huggingface-cli login` (stored in `~/.huggingface`).
        chat_prompt_template (`str`, *optional*):
            Pass along your own prompt if you want to override the default template for the `chat` method. Can be the
            actual prompt template or a repo ID (on the Hugging Face Hub). The prompt should be in a file named
            `chat_prompt_template.txt` in this repo in this case.
        run_prompt_template (`str`, *optional*):
            Pass along your own prompt if you want to override the default template for the `run` method. Can be the
            actual prompt template or a repo ID (on the Hugging Face Hub). The prompt should be in a file named
            `run_prompt_template.txt` in this repo in this case.
        additional_tools ([`Tool`], list of tools or dictionary with tool values, *optional*):
            Any additional tools to include on top of the default ones. If you pass along a tool with the same name as
            one of the default tools, that default tool will be overridden.

    Example:

    ```
    from transformers import HfAgent

    agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder")
    agent.run("Is the following `text` (in Spanish) positive or negative?", text="¡Este es un API muy agradable!")
    ```
    """

    def __init__(
        self, url_endpoint, token=None, chat_prompt_template=None, run_prompt_template=None, additional_tools=None
    ):
        # 设置推理端点的URL
        self.url_endpoint = url_endpoint
        # 根据传入的token参数或者从本地获取的token来设置HTTP授权token
        if token is None:
            self.token = f"Bearer {HfFolder().get_token()}"
        elif token.startswith("Bearer") or token.startswith("Basic"):
            self.token = token
        else:
            self.token = f"Bearer {token}"
        # 调用父类构造函数初始化Agent基类
        super().__init__(
            chat_prompt_template=chat_prompt_template,
            run_prompt_template=run_prompt_template,
            additional_tools=additional_tools,
        )
    # 生成一段文本,使用给定的提示和停止条件
    def generate_one(self, prompt, stop):
        # 设置请求头,包含授权信息
        headers = {"Authorization": self.token}
        # 构造请求体,包含输入提示、生成参数和停止条件
        inputs = {
            "inputs": prompt,
            "parameters": {"max_new_tokens": 200, "return_full_text": False, "stop": stop},
        }

        # 发送 POST 请求到推理 API
        response = requests.post(self.url_endpoint, json=inputs, headers=headers)
        # 处理请求返回的状态码
        if response.status_code == 429:
            # 如果返回状态码为 429 表示请求过多,记录日志并等待一秒后重试
            logger.info("Getting rate-limited, waiting a tiny bit before trying again.")
            time.sleep(1)
            return self.generate_one(prompt, stop)
        elif response.status_code != 200:
            # 如果返回状态码不是 200,则抛出异常并附带错误信息
            raise ValueError(f"Error {response.status_code}: {response.json()}")

        # 解析响应内容,获取生成的文本结果
        result = response.json()[0]["generated_text"]
        # 检查生成的文本是否以任一停止序列结尾
        for stop_seq in stop:
            if result.endswith(stop_seq):
                # 如果是,则返回去掉停止序列部分的文本
                return result[: -len(stop_seq)]
        # 如果没有匹配到停止序列,则直接返回生成的文本结果
        return result
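
    # What `generate_one` sends over the wire is a standard text-generation call to the
    # Inference API, mirroring the payload built above. A hedged sketch; the endpoint
    # URL and token are placeholders you must supply yourself.
    #
    # import requests
    #
    # url_endpoint = "https://api-inference.huggingface.co/models/bigcode/starcoder"
    # headers = {"Authorization": "Bearer hf_xxx"}  # placeholder token
    # payload = {
    #     "inputs": "Task: say hello\nAnswer:",
    #     "parameters": {"max_new_tokens": 200, "return_full_text": False, "stop": ["Task:"]},
    # }
    # response = requests.post(url_endpoint, json=payload, headers=headers)
    # response.raise_for_status()
    # print(response.json()[0]["generated_text"])
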

class LocalAgent(Agent):
    @classmethod
    # 类方法装饰器,用于定义一个类的类方法,即可以不用实例化类就可以调用的方法
    def from_local(cls, model_name_or_path, chat_prompt_template=None, run_prompt_template=None, additional_tools=None):
        # 从本地加载模型和分词器
        model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        # 返回一个新的 LocalAgent 实例,初始化时会传入加载的模型和分词器
        return cls(model, tokenizer, chat_prompt_template, run_prompt_template, additional_tools)
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """
        Convenience method to build a `LocalAgent` from a pretrained checkpoint.

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                The name of a repo on the Hub or a local path to a folder containing both model and tokenizer.
            kwargs (`Dict[str, Any]`, *optional*):
                Keyword arguments passed along to [`~PreTrainedModel.from_pretrained`].

        Example:

        ```
        import torch
        from transformers import LocalAgent

        agent = LocalAgent.from_pretrained("bigcode/starcoder", device_map="auto", torch_dtype=torch.bfloat16)
        agent.run("Draw me a picture of rivers and lakes.")
        ```
        """
        # 使用预训练模型名称或路径创建 AutoModelForCausalLM 对象
        model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, **kwargs)
        # 使用预训练模型名称或路径创建 AutoTokenizer 对象
        tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
        # 返回使用创建的模型和分词器构建的 LocalAgent 对象
        return cls(model, tokenizer)

    @property
    def _model_device(self):
        # 检查模型是否有 hf_device_map 属性,返回第一个设备映射的值
        if hasattr(self.model, "hf_device_map"):
            return list(self.model.hf_device_map.values())[0]
        # 如果模型没有 hf_device_map 属性,则返回第一个参数的设备
        for param in self.model.parameters():
            return param.device

    def generate_one(self, prompt, stop):
        # 使用分词器对提示进行编码,返回张量,并移动到模型所在设备
        encoded_inputs = self.tokenizer(prompt, return_tensors="pt").to(self._model_device)
        # 计算输入序列的长度
        src_len = encoded_inputs["input_ids"].shape[1]
        # 创建停止条件列表
        stopping_criteria = StoppingCriteriaList([StopSequenceCriteria(stop, self.tokenizer)])
        # 使用模型生成文本序列,限制最大生成的新令牌数为200,并应用停止条件
        outputs = self.model.generate(
            encoded_inputs["input_ids"], max_new_tokens=200, stopping_criteria=stopping_criteria
        )

        # 解码生成的输出,去除原始输入部分并返回结果
        result = self.tokenizer.decode(outputs[0].tolist()[src_len:])
        # 如果结果以停止序列之一结尾,则去除停止序列并更新结果
        for stop_seq in stop:
            if result.endswith(stop_seq):
                result = result[: -len(stop_seq)]
        # 返回生成的结果文本
        return result
class StopSequenceCriteria(StoppingCriteria):
    """
    This class can be used to stop generation whenever a sequence of tokens is encountered.

    Args:
        stop_sequences (`str` or `List[str]`):
            The sequence (or list of sequences) on which to stop execution.
        tokenizer:
            The tokenizer used to decode the model outputs.
    """

    def __init__(self, stop_sequences, tokenizer):
        # 如果stop_sequences是单个字符串,则转换为列表
        if isinstance(stop_sequences, str):
            stop_sequences = [stop_sequences]
        # 初始化停止序列和分词器
        self.stop_sequences = stop_sequences
        self.tokenizer = tokenizer

    def __call__(self, input_ids, scores, **kwargs) -> bool:
        # 解码输入的模型输出,转换为字符串
        decoded_output = self.tokenizer.decode(input_ids.tolist()[0])
        # 检查解码后的输出是否以任何停止序列结尾
        return any(decoded_output.endswith(stop_sequence) for stop_sequence in self.stop_sequences)
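
# A sketch of plugging `StopSequenceCriteria` into `generate`, as `LocalAgent` does
# above. "gpt2" is only an illustrative checkpoint, and the import path of
# `StopSequenceCriteria` is assumed to be this module (`transformers.tools.agents`).
#
# from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteriaList
# from transformers.tools.agents import StopSequenceCriteria
#
# tokenizer = AutoTokenizer.from_pretrained("gpt2")
# model = AutoModelForCausalLM.from_pretrained("gpt2")
#
# prompt = "Task: write a haiku\nAnswer:"
# inputs = tokenizer(prompt, return_tensors="pt")
# stopping_criteria = StoppingCriteriaList([StopSequenceCriteria("Task:", tokenizer)])
#
# outputs = model.generate(
#     inputs["input_ids"], max_new_tokens=50, stopping_criteria=stopping_criteria
# )
# print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:]))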