Transformers Source Code Analysis (113)

.\models\tvlt\processing_tvlt.py

# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for TVLT.
"""

from ...processing_utils import ProcessorMixin

class TvltProcessor(ProcessorMixin):
    r"""
    构建一个 TVLT 处理器,将 TVLT 图像处理器和 TVLT 特征提取器包装成一个单一的处理器。

    [`TvltProcessor`] 提供了 [`TvltImageProcessor`] 和 [`TvltFeatureExtractor`] 的所有功能。查看
    [`~TvltProcessor.__call__`] 的文档字符串以获取更多信息。

    Args:
        image_processor (`TvltImageProcessor`):
            [`TvltImageProcessor`] 的实例。图像处理器是必需的输入。
        feature_extractor (`TvltFeatureExtractor`):
            [`TvltFeatureExtractor`] 的实例。特征提取器是必需的输入。
    """

    attributes = ["image_processor", "feature_extractor"]
    image_processor_class = "TvltImageProcessor"
    feature_extractor_class = "TvltFeatureExtractor"

    def __init__(self, image_processor, feature_extractor):
        super().__init__(image_processor=image_processor, feature_extractor=feature_extractor)

        self.image_processor = image_processor  # keep a direct handle to the image processor
        self.feature_extractor = feature_extractor  # keep a direct handle to the feature extractor

    def __call__(
        self,
        images=None,
        audio=None,
        images_mixed=None,
        sampling_rate=None,
        mask_audio=False,
        mask_pixel=False,
        *args,
        **kwargs,
    ):
        """
        Forwards the `images` argument to TvltImageProcessor's [`~TvltImageProcessor.preprocess`] and the `audio`
        argument to TvltFeatureExtractor's [`~TvltFeatureExtractor.__call__`]. Please refer to the docstring of the
        above two methods for more information.
        """

        # Both `images` and `audio` missing is an error: there is nothing to process
        if images is None and audio is None:
            raise ValueError("You need to specify either an `images` or `audio` input to process.")

        images_mixed_dict = None
        # Run the image processor on `images` if provided
        if images is not None:
            images_dict = self.image_processor(images, mask_pixel=mask_pixel, *args, **kwargs)

        # Run the image processor on `images_mixed` if provided, flagging the inputs as mixed
        if images_mixed is not None:
            images_mixed_dict = self.image_processor(images_mixed, is_mixed=True, *args, **kwargs)

        # Run the feature extractor on `audio` if provided
        if audio is not None:
            audio_dict = self.feature_extractor(
                audio, *args, sampling_rate=sampling_rate, mask_audio=mask_audio, **kwargs
            )

        # Merge the individual outputs into a single dictionary
        output_dict = {}
        if audio is not None:
            output_dict.update(audio_dict)
        if images is not None:
            output_dict.update(images_dict)
        if images_mixed_dict is not None:
            output_dict.update(images_mixed_dict)

        return output_dict

    @property
    def model_input_names(self):
        # Collect the input names declared by both sub-processors
        image_processor_input_names = self.image_processor.model_input_names
        feature_extractor_input_names = self.feature_extractor.model_input_names
        # Concatenate the two lists and drop duplicates while preserving order
        return list(dict.fromkeys(image_processor_input_names + feature_extractor_input_names))
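
As a quick illustration of how the two wrapped components cooperate, here is a minimal usage sketch (not part of the file above); the dummy shapes and the 44100 Hz sampling rate follow the components' defaults, and the exact output keys are an assumption based on the two sub-processors:

# Usage sketch (illustrative, not library source); shapes are assumptions.
import numpy as np
from transformers import TvltFeatureExtractor, TvltImageProcessor, TvltProcessor

processor = TvltProcessor(TvltImageProcessor(), TvltFeatureExtractor())
video = list(np.random.randint(0, 256, size=(8, 224, 224, 3), dtype=np.uint8))  # 8 RGB frames
audio = np.random.randn(10000).astype(np.float32)  # mono waveform
inputs = processor(images=video, audio=audio, sampling_rate=44100, return_tensors="pt")
print(sorted(inputs.keys()))  # expected: pixel-value and audio-value entries plus their masks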

.\models\tvlt\__init__.py

# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.

# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_torch_available,
    is_vision_available,
)

# _import_structure maps each submodule to the list of public symbols it exports
_import_structure = {
    "configuration_tvlt": ["TVLT_PRETRAINED_CONFIG_ARCHIVE_MAP", "TvltConfig"],
    "feature_extraction_tvlt": ["TvltFeatureExtractor"],
    "processing_tvlt": ["TvltProcessor"],
}

# If torch is not available, raise OptionalDependencyNotAvailable
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # torch is available: expose the symbols of the modeling_tvlt module
    _import_structure["modeling_tvlt"] = [
        "TVLT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TvltModel",
        "TvltForPreTraining",
        "TvltForAudioVisualClassification",
        "TvltPreTrainedModel",
    ]

# If vision is not available, raise OptionalDependencyNotAvailable
try:
    if not is_vision_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # vision is available: expose the symbols of the image_processing_tvlt module
    _import_structure["image_processing_tvlt"] = ["TvltImageProcessor"]

# Under TYPE_CHECKING, import the concrete symbols so static type checkers can see them
if TYPE_CHECKING:
    from .configuration_tvlt import TVLT_PRETRAINED_CONFIG_ARCHIVE_MAP, TvltConfig
    from .processing_tvlt import TvltProcessor
    from .feature_extraction_tvlt import TvltFeatureExtractor

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_tvlt import (
            TVLT_PRETRAINED_MODEL_ARCHIVE_LIST,
            TvltForAudioVisualClassification,
            TvltForPreTraining,
            TvltModel,
            TvltPreTrainedModel,
        )

    try:
        if not is_vision_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .image_processing_tvlt import TvltImageProcessor

# At runtime, register this module as a lazily-loaded _LazyModule instead
else:
    import sys

    # Register the current module as a lazy module; submodules load on first access
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
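
Because the module registers itself as a `_LazyModule`, none of the submodules are imported until one of their symbols is first accessed. A small sketch of the observable behavior (assuming `transformers` and `torch` are installed):

# Illustrative sketch: attribute access triggers the real import.
import transformers.models.tvlt as tvlt

print(type(tvlt).__name__)  # "_LazyModule", not a regular module class
config_cls = tvlt.TvltConfig  # first access imports configuration_tvlt behind the scenes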

.\models\tvp\configuration_tvp.py

# coding=utf-8
# Copyright 2023 The Intel AIA Team Authors, and HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" TVP model configuration"""

import copy  # used to deep-copy the config attributes in `to_dict`

from ...configuration_utils import PretrainedConfig  # base class for all model configurations
from ...utils import logging  # logging utilities
from ..auto import CONFIG_MAPPING  # maps model types to their configuration classes


logger = logging.get_logger(__name__)  # module-level logger


TVP_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "Intel/tvp-base": "https://huggingface.co/Intel/tvp-base/resolve/main/config.json",
}  # maps pretrained TVP model names to the URLs of their configuration files


class TvpConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`TvpModel`]. It is used to instantiate an Tvp
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Tvp
    [Intel/tvp-base](https://huggingface.co/Intel/tvp-base) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    """

    model_type = "tvp"  # 模型类型标识为"tvp"

    def __init__(
        self,
        backbone_config=None,  # configuration of the backbone, defaults to None
        backbone=None,  # name of the backbone model, defaults to None
        use_pretrained_backbone=False,  # whether to use a pretrained backbone, defaults to False
        use_timm_backbone=False,  # whether to use a backbone from the timm library, defaults to False
        backbone_kwargs=None,  # extra keyword arguments for the backbone, defaults to None
        distance_loss_weight=1.0,  # weight of the distance loss, defaults to 1.0
        duration_loss_weight=0.1,  # weight of the duration loss, defaults to 0.1
        visual_prompter_type="framepad",  # type of the visual prompter, defaults to "framepad"
        visual_prompter_apply="replace",  # how the visual prompt is applied, defaults to "replace"
        visual_prompt_size=96,  # size of the visual prompt, defaults to 96
        max_img_size=448,  # maximum image size, defaults to 448
        num_frames=48,  # number of video frames, defaults to 48
        vocab_size=30522,  # vocabulary size, defaults to 30522
        hidden_size=768,  # hidden layer size, defaults to 768
        intermediate_size=3072,  # intermediate layer size, defaults to 3072
        num_hidden_layers=12,  # number of hidden layers, defaults to 12
        num_attention_heads=12,  # number of attention heads, defaults to 12
        max_position_embeddings=512,  # maximum number of position embeddings, defaults to 512
        max_grid_col_position_embeddings=100,  # maximum grid column position embeddings, defaults to 100
        max_grid_row_position_embeddings=100,  # maximum grid row position embeddings, defaults to 100
        hidden_dropout_prob=0.1,  # dropout probability of hidden layers, defaults to 0.1
        hidden_act="gelu",  # activation function of hidden layers, defaults to "gelu"
        layer_norm_eps=1e-12,  # epsilon used by layer normalization, defaults to 1e-12
        initializer_range=0.02,  # weight initialization range, defaults to 0.02
        attention_probs_dropout_prob=0.1,  # dropout probability of attention probabilities, defaults to 0.1
        **kwargs,
    ):
        """
        Initialize the TvpConfig with specific model configuration parameters.

        Args:
            backbone_config (Optional): Configuration for the backbone, default is None.
            backbone (Optional): Backbone model, default is None.
            use_pretrained_backbone (bool): Whether to use a pretrained backbone model, default is False.
            use_timm_backbone (bool): Whether to use a backbone model from the timm library, default is False.
            backbone_kwargs (Optional): Additional parameters for the backbone model, default is None.
            distance_loss_weight (float): Weight for the distance loss, default is 1.0.
            duration_loss_weight (float): Weight for the duration loss, default is 0.1.
            visual_prompter_type (str): Type of visual prompter, default is "framepad".
            visual_prompter_apply (str): Application method of visual prompter, default is "replace".
            visual_prompt_size (int): Size of the visual prompt, default is 96.
            max_img_size (int): Maximum image size, default is 448.
            num_frames (int): Number of frames in the image, default is 48.
            vocab_size (int): Size of the vocabulary, default is 30522.
            hidden_size (int): Size of the hidden layers, default is 768.
            intermediate_size (int): Size of the intermediate layers, default is 3072.
            num_hidden_layers (int): Number of hidden layers, default is 12.
            num_attention_heads (int): Number of attention heads, default is 12.
            max_position_embeddings (int): Maximum position embeddings, default is 512.
            max_grid_col_position_embeddings (int): Maximum grid column position embeddings, default is 100.
            max_grid_row_position_embeddings (int): Maximum grid row position embeddings, default is 100.
            hidden_dropout_prob (float): Dropout probability for hidden layers, default is 0.1.
            hidden_act (str): Activation function for hidden layers, default is "gelu".
            layer_norm_eps (float): Epsilon value for layer normalization, default is 1e-12.
            initializer_range (float): Range for weight initialization, default is 0.02.
            attention_probs_dropout_prob (float): Dropout probability for attention probabilities, default is 0.1.
            **kwargs: Additional keyword arguments for potential future updates.
        """
        super().__init__(**kwargs)  # 调用父类PretrainedConfig的初始化方法,传入额外的关键字参数
    ):
        # 调用父类的初始化方法,传入所有的关键字参数
        super().__init__(**kwargs)
        # Using a pretrained backbone is not supported yet
        if use_pretrained_backbone:
            raise ValueError("Pretrained backbones are not supported yet.")

        # Specifying both a backbone config and a backbone name is ambiguous
        if backbone_config is not None and backbone is not None:
            raise ValueError("You can't specify both `backbone` and `backbone_config`.")

        # If neither is given, log it and fall back to the default ResNet backbone config
        if backbone_config is None and backbone is None:
            logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.")
            backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"])
        # If the backbone config is a plain dict, instantiate the matching config class from it
        elif isinstance(backbone_config, dict):
            backbone_model_type = backbone_config.get("model_type")
            config_class = CONFIG_MAPPING[backbone_model_type]
            backbone_config = config_class.from_dict(backbone_config)

        # Specifying both backbone kwargs and a backbone config is ambiguous
        if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None:
            raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.")

        # Store all configuration attributes on the instance
        self.backbone_config = backbone_config
        self.backbone = backbone
        self.use_pretrained_backbone = use_pretrained_backbone
        self.use_timm_backbone = use_timm_backbone
        self.backbone_kwargs = backbone_kwargs
        self.distance_loss_weight = distance_loss_weight
        self.duration_loss_weight = duration_loss_weight
        self.visual_prompter_type = visual_prompter_type
        self.visual_prompter_apply = visual_prompter_apply
        self.visual_prompt_size = visual_prompt_size
        self.max_img_size = max_img_size
        self.num_frames = num_frames
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.max_position_embeddings = max_position_embeddings
        self.max_grid_col_position_embeddings = max_grid_col_position_embeddings
        self.max_grid_row_position_embeddings = max_grid_row_position_embeddings
        self.layer_norm_eps = layer_norm_eps
        self.hidden_dropout_prob = hidden_dropout_prob
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.attention_probs_dropout_prob = attention_probs_dropout_prob

    @classmethod
    def from_backbone_config(cls, backbone_config: PretrainedConfig, **kwargs):
        """Instantiate a [`TvpConfig`] (or a derived class) from a pre-trained backbone model configuration.

        Args:
            backbone_config ([`PretrainedConfig`]):
                The backbone configuration.
        Returns:
            [`TvpConfig`]: An instance of a configuration object
        """
        # Instantiate a [`TvpConfig`] (or a derived class) from the given backbone configuration
        return cls(backbone_config=backbone_config, **kwargs)

    def to_dict(self):
        """
        将当前实例序列化为一个 Python 字典。重写默认的 [`~PretrainedConfig.to_dict`] 方法。

        Returns:
            `Dict[str, any]`: 包含此配置实例所有属性的字典,
        """
        # 深拷贝当前实例的所有属性到 output 变量中
        output = copy.deepcopy(self.__dict__)
        
        # 如果 backbone_config 属性不为 None,则将其转换为字典形式
        if output["backbone_config"] is not None:
            output["backbone_config"] = self.backbone_config.to_dict()
        
        # 将 model_type 属性设置为当前类的模型类型
        output["model_type"] = self.__class__.model_type
        
        # 返回序列化后的字典
        return output
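
A quick sketch of the default construction and the `to_dict` round trip described above; note how the nested backbone config is serialized recursively and re-hydrated by `__init__` when loaded back from a dict:

# Illustrative sketch of TvpConfig serialization.
from transformers import TvpConfig

config = TvpConfig()  # logs a message and falls back to the default ResNet backbone config
config_dict = config.to_dict()
print(config_dict["model_type"])                     # "tvp"
print(config_dict["backbone_config"]["model_type"])  # "resnet"
restored = TvpConfig.from_dict(config_dict)  # the backbone dict is rebuilt into a config object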

.\models\tvp\image_processing_tvp.py

# coding=utf-8
# Copyright 2023 The Intel AIA Team Authors, and HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Image processor class for TVP."""

from typing import Dict, Iterable, List, Optional, Tuple, Union

import numpy as np

# Shared base classes and helpers for image processors
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
# Image transformation helpers
from ...image_transforms import (
    PaddingMode,
    flip_channel_order,
    pad,
    resize,
    to_channel_dimension_format,
)
# Constants and helpers for working with images
from ...image_utils import (
    IMAGENET_STANDARD_MEAN,
    IMAGENET_STANDARD_STD,
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    get_image_size,
    is_valid_image,
    to_numpy_array,
    valid_images,
    validate_kwargs,
    validate_preprocess_arguments,
)
# Generic utilities
from ...utils import TensorType, is_vision_available, logging

# Import PIL only when the vision backend is available
if is_vision_available():
    import PIL

# Module-level logger
logger = logging.get_logger(__name__)


# Copied from transformers.models.vivit.image_processing_vivit.make_batched
def make_batched(videos) -> List[List[ImageInput]]:
    if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
        return videos
    elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
        return [videos]
    elif is_valid_image(videos):
        return [[videos]]
    # None of the accepted input shapes matched
    raise ValueError(f"Could not make batched video from {videos}")


def get_resize_output_image_size(
    input_image: np.ndarray,
    max_size: int = 448,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
    # Read the height and width of the input image
    height, width = get_image_size(input_image, input_data_format)
    # Scale the longest edge to `max_size` while preserving the aspect ratio
    if height >= width:
        ratio = width * 1.0 / height
        new_height = max_size
        new_width = int(new_height * ratio)
    else:
        ratio = height * 1.0 / width
        new_width = max_size
        new_height = int(new_width * ratio)
    size = (new_height, new_width)
    return size
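
A worked example of the aspect-ratio math: for a landscape 720x1280 frame, the width becomes the longest edge and the height is scaled proportionally:

# Worked example (sketch): height < width, so new_width = 448 and
# new_height = int(448 * 720 / 1280) = 252.
frame = np.zeros((720, 1280, 3), dtype=np.uint8)  # channels-last numpy frame
print(get_resize_output_image_size(frame, max_size=448))  # (252, 448)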


class TvpImageProcessor(BaseImageProcessor):
    r"""
    构建一个 Tvp 图像处理器。

    """

    # 模型输入的名称列表
    model_input_names = ["pixel_values"]
    def __init__(
        self,
        do_resize: bool = True,
        size: Dict[str, int] = None,
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        do_center_crop: bool = True,
        crop_size: Dict[str, int] = None,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_pad: bool = True,
        pad_size: Dict[str, int] = None,
        constant_values: Union[float, Iterable[float]] = 0,
        pad_mode: PaddingMode = PaddingMode.CONSTANT,
        do_normalize: bool = True,
        do_flip_channel_order: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        # Fall back to the default sizes when the corresponding arguments are not given
        size = size if size is not None else {"longest_edge": 448}
        crop_size = crop_size if crop_size is not None else {"height": 448, "width": 448}
        pad_size = pad_size if pad_size is not None else {"height": 448, "width": 448}

        # Store all processing options
        self.do_resize = do_resize  # whether to resize the input
        self.size = size  # target size specification for resizing
        self.do_center_crop = do_center_crop  # whether to center-crop the input
        self.crop_size = crop_size  # target size after cropping
        self.resample = resample  # resampling filter used when resizing
        self.do_rescale = do_rescale  # whether to rescale pixel values
        self.rescale_factor = rescale_factor  # scaling factor applied to pixel values
        self.do_pad = do_pad  # whether to pad the image
        self.pad_size = pad_size  # target size after padding
        self.constant_values = constant_values  # fill value used for constant padding
        self.pad_mode = pad_mode  # padding mode
        self.do_normalize = do_normalize  # whether to normalize the image
        self.do_flip_channel_order = do_flip_channel_order  # whether to flip the channel order (RGB -> BGR)
        self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN  # normalization mean
        self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD  # normalization std
        # Keyword arguments accepted by the processor, used for validation
        self._valid_processor_keys = [
            "videos",
            "do_resize",
            "size",
            "resample",
            "do_center_crop",
            "crop_size",
            "do_rescale",
            "rescale_factor",
            "do_pad",
            "pad_size",
            "constant_values",
            "pad_mode",
            "do_normalize",
            "do_flip_channel_order",
            "image_mean",
            "image_std",
            "return_tensors",
            "data_format",
            "input_data_format",
        ]

    def resize(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Resize an image.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`Dict[str, int]`):
                Size of the output image. If `size` is of the form `{"height": h, "width": w}`, the output image will
                have the size `(h, w)`. If `size` is of the form `{"longest_edge": s}`, the output image will have its
                longest edge of length `s` while keeping the aspect ratio of the original image.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
                Resampling filter to use when resizing the image.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
        """
        # Determine the actual size to resize the image to
        size = get_size_dict(size, default_to_square=False)

        # Check if both 'height' and 'width' are provided in the size dictionary
        if "height" in size and "width" in size:
            output_size = (size["height"], size["width"])
        # If only 'longest_edge' is provided, calculate the output size accordingly
        elif "longest_edge" in size:
            output_size = get_resize_output_image_size(image, size["longest_edge"], input_data_format)
        else:
            # Raise an error if neither 'height' and 'width' nor 'longest_edge' are specified
            raise ValueError(f"Size must have 'height' and 'width' or 'longest_edge' as keys. Got {size.keys()}")

        # Perform the resizing operation using specified parameters
        return resize(
            image,
            size=output_size,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )

    def pad_image(
        self,
        image: np.ndarray,
        pad_size: Dict[str, int] = None,
        constant_values: Union[float, Iterable[float]] = 0,
        pad_mode: PaddingMode = PaddingMode.CONSTANT,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        """
        Pad an image with zeros to the given size.

        Args:
            image (`np.ndarray`):
                Image to pad.
            pad_size (`Dict[str, int]`)
                Size of the output image with pad.
            constant_values (`Union[float, Iterable[float]]`)
                The fill value to use when padding the image.
            pad_mode (`PaddingMode`)
                The pad mode, default to PaddingMode.CONSTANT
            data_format (`ChannelDimension` or `str`, *optional*)
                The channel dimension format of the image. If not provided, it will be the same as the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
        """
        # Read the height and width of the input image
        height, width = get_image_size(image, channel_dim=input_data_format)

        # Target height and width to pad to (default to the current size)
        max_height = pad_size.get("height", height)
        max_width = pad_size.get("width", width)

        # Number of pixels to add on the right and at the bottom
        pad_right, pad_bottom = max_width - width, max_height - height

        # Negative padding means the image is already larger than the target size
        if pad_right < 0 or pad_bottom < 0:
            raise ValueError("The padding size must be greater than image size")

        # Padding specification: (top, bottom) for rows, (left, right) for columns
        padding = ((0, pad_bottom), (0, pad_right))

        # Apply the padding
        padded_image = pad(
            image,
            padding,
            mode=pad_mode,
            constant_values=constant_values,
            data_format=data_format,
            input_data_format=input_data_format,
        )

        return padded_image
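
A small numeric sketch of the padding math, assuming the signature reconstructed above: a 252x448 channels-first image padded to the default 448x448 target gains 196 zero rows at the bottom and no columns on the right:

# Sketch (assumes TvpImageProcessor defaults): bottom/right zero padding.
processor = TvpImageProcessor()
image = np.zeros((3, 252, 448), dtype=np.float32)  # channels-first
padded = processor.pad_image(image, pad_size={"height": 448, "width": 448})
print(padded.shape)  # expected: (3, 448, 448)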

    def _preprocess_image(
        self,
        image: ImageInput,
        do_resize: bool = None,
        size: Dict[str, int] = None,
        resample: PILImageResampling = None,
        do_center_crop: bool = None,
        crop_size: Dict[str, int] = None,
        do_rescale: bool = None,
        rescale_factor: float = None,
        do_pad: bool = True,
        pad_size: Dict[str, int] = None,
        constant_values: Union[float, Iterable[float]] = None,
        pad_mode: PaddingMode = None,
        do_normalize: bool = None,
        do_flip_channel_order: bool = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """Preprocesses a single image."""

        # Validate the preprocessing arguments
        validate_preprocess_arguments(
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            do_pad=do_pad,
            size_divisibility=pad_size,  # `pad()` here only needs the `pad_size` argument.
            do_center_crop=do_center_crop,
            crop_size=crop_size,
            do_resize=do_resize,
            size=size,
            resample=resample,
        )

        # All transformations expect numpy arrays as input
        image = to_numpy_array(image)

        if do_resize:
            # Resize the image to the requested size
            image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)

        if do_center_crop:
            # Center-crop the image to `crop_size`
            image = self.center_crop(image, size=crop_size, input_data_format=input_data_format)

        if do_rescale:
            # Rescale the pixel values by `rescale_factor`
            image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)

        if do_normalize:
            # Normalize the image with the given mean and std
            image = self.normalize(
                image=image.astype(np.float32), mean=image_mean, std=image_std, input_data_format=input_data_format
            )

        if do_pad:
            # Pad the image to `pad_size`
            image = self.pad_image(
                image=image,
                pad_size=pad_size,
                constant_values=constant_values,
                pad_mode=pad_mode,
                input_data_format=input_data_format,
            )

        # The pretrained checkpoints assume images in BGR format, not RGB
        if do_flip_channel_order:
            # Flip the channel order from RGB to BGR
            image = flip_channel_order(image=image, input_data_format=input_data_format)

        # Convert the image to the requested channel dimension format
        image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)

        return image
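
Although the batched `preprocess` entry point is not shown in this excerpt, the per-image pipeline above is what the public `__call__` runs for every frame. A hedged end-to-end sketch (the output shape is an expectation based on the default 448-pixel settings, not a guarantee):

# Sketch: preprocessing a 2-frame video through the public API.
video = [np.random.randint(0, 256, (360, 640, 3), dtype=np.uint8) for _ in range(2)]
batch = TvpImageProcessor()(video, return_tensors="pt")
print(batch["pixel_values"].shape)  # expected: torch.Size([1, 2, 3, 448, 448])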

.\models\tvp\modeling_tvp.py

# coding=utf-8
# Copyright 2023 The Intel AIA Team Authors, and HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch TVP Model"""

# Standard library and PyTorch imports
import math
from dataclasses import dataclass
from typing import Optional, Tuple

import torch
import torch.utils.checkpoint
from torch import nn

# Internal transformers utilities
from ...activations import ACT2FN
from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ModelOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import prune_linear_layer
from ...utils import logging
from ...utils.backbone_utils import load_backbone
from .configuration_tvp import TvpConfig

# Module-level logger
logger = logging.get_logger(__name__)

# List of pretrained TVP checkpoints
TVP_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "Intel/tvp-base",
    "Intel/tvp-base-ANet",
    # See all Tvp models at https://huggingface.co/models?filter=tvp
]

# Output type of the TVP video grounding head
@dataclass
class TvpVideoGroundingOutput(ModelOutput):
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Temporal-Distance IoU loss for video grounding.
        logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
            Contains start_time/duration and end_time/duration. It is the time slot of the videos corresponding to the
            input texts.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
            the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
    """
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None

class TvpLoss(nn.Module):
    """
    This class computes the losses for `TvpForVideoGrounding`. The process happens in two steps: 1) we compute
    hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of matched
    ground-truth / prediction (supervise class and box).

    Args:
        losses (`List[str]`):
            List of all the losses to be applied.
    """

    def __init__(self, losses):
        super().__init__()
        # Map each supported loss name to its implementation
        self.loss_map = {
            "iou": self.loss_iou,
            "distance": self.loss_distance,
            "duration": self.loss_duration,
        }
        # Reject any loss name that is not supported
        for loss in losses:
            if loss not in self.loss_map:
                raise ValueError(f"Loss {loss} not supported")

        self.losses = losses

    def loss_iou(self, start_time, end_time, candidates_start_time, candidates_end_time, duration):
        """
        Measure the intersection over union.
        """
        # Length of the intersection of the two time intervals
        inter = torch.min(candidates_end_time, end_time) - torch.max(candidates_start_time, start_time)
        # Length of their union
        union = torch.max(candidates_end_time, end_time) - torch.min(candidates_start_time, start_time)
        # IoU loss: 1 - IoU
        iou = 1 - inter.clamp(min=0) / union

        return iou

    def loss_distance(self, start_time, end_time, candidates_start_time, candidates_end_time, duration):
        """
        Measure the distance of mid points.
        """
        # Midpoint of the predicted interval
        mid_candidates = torch.div(torch.add(candidates_start_time, candidates_end_time), 2.0)
        # Midpoint of the ground-truth interval
        mid_groundtruth = torch.div(torch.add(start_time, end_time), 2.0)
        # Normalized midpoint distance, floored at 0.2
        distance_diff = torch.div(
            torch.max(mid_candidates, mid_groundtruth) - torch.min(mid_candidates, mid_groundtruth), duration
        ).clamp(min=0.2)

        return distance_diff

    def loss_duration(self, start_time, end_time, candidates_start_time, candidates_end_time, duration):
        """
        Measure the difference of duration.
        """
        # Duration of the predicted interval
        duration_candidates = torch.sub(candidates_end_time, candidates_start_time)
        # Duration of the ground-truth interval
        duration_groundtruth = torch.sub(end_time, start_time)
        # Squared normalized duration difference, floored at 0.4
        duration_diff = torch.square(torch.div(torch.sub(duration_candidates, duration_groundtruth), duration))
        duration_diff = duration_diff.clamp(min=0.4)

        return duration_diff
    def forward(self, logits, labels):
        """
        This performs the loss computation.

        Args:
            logits (`torch.FloatTensor`):
                The output logits of head module.
            labels (`List[torch.FloatTensor]`):
                List of tensors ([start, end, duration]), which contains start time, end time of the video corresponding to the text, and also the duration.
        """
        # Unpack duration, start time and end time from the labels
        duration, start_time, end_time = labels
        # Scale the normalized logits by the video duration to get candidate times
        candidates = torch.mul(logits, duration)
        # Split into candidate start and end times as float tensors
        candidates_start_time, candidates_end_time = candidates[:, 0].float(), candidates[:, 1].float()

        # Evaluate every requested loss and collect the results
        losses_dict = {}
        for loss in self.losses:
            losses_dict.update(
                {loss: self.loss_map[loss](start_time, end_time, candidates_start_time, candidates_end_time, duration)}
            )

        return losses_dict
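
A small sketch of the loss computation on a prediction that matches the ground truth exactly; note the clamp floors of 0.2 (distance) and 0.4 (duration) visible in the methods above:

# Sketch: TvpLoss on a perfect prediction for a 30-second video.
criterion = TvpLoss(["iou", "distance", "duration"])
logits = torch.tensor([[0.2, 0.6]])  # normalized (start, end) fractions
labels = [torch.tensor([30.0]), torch.tensor([6.0]), torch.tensor([18.0])]  # duration, start, end
print(criterion(logits, labels))  # iou -> 0.0, distance -> 0.2, duration -> 0.4 (the clamp floors)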

class TvpVisionModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Load the backbone specified by the config
        self.backbone = load_backbone(config)
        # Convolution that projects the backbone feature maps to the model hidden size
        self.grid_encoder_conv = nn.Conv2d(
            config.backbone_config.hidden_sizes[-1],  # input channels: last hidden size of the backbone
            config.hidden_size,  # output channels: the configured hidden size
            kernel_size=3,
            stride=1,
            padding=1,
            groups=1,
            bias=False,
        )

    def forward(self, pixel_values):
        # Unpack the input shape
        batch_size, num_frames, num_channels, height, width = pixel_values.shape
        # Fold the frame dimension into the batch: (batch_size * num_frames, num_channels, height, width)
        pixel_values = pixel_values.view(batch_size * num_frames, num_channels, height, width)
        # Run the backbone and take the first returned feature map
        grid_feat_outputs = self.backbone(pixel_values)["feature_maps"][0]
        # Project the feature map with the grid encoder convolution
        grid = self.grid_encoder_conv(grid_feat_outputs)
        # Downsample with 2x2 max pooling, stride 2
        grid = nn.functional.max_pool2d(grid, kernel_size=2, stride=2)
        # Apply ReLU in place
        grid = nn.functional.relu(grid, inplace=True)
        # Read the new channel, height and width
        new_channel, new_height, new_width = grid.shape[-3:]
        # Restore the frame dimension: (batch_size, num_frames, new_channel, new_height, new_width)
        grid = grid.view(batch_size, num_frames, new_channel, new_height, new_width)
        # Move channels last: (batch_size, num_frames, height, width, num_channels)
        grid = grid.permute(0, 1, 3, 4, 2)
        return grid


class TvpVisualInputEmbedding(nn.Module):
    """
    Takes input of both image and video (multi-frame)
    """

    def __init__(self, config):
        super().__init__()
        # Sequence position embeddings
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        # Row position embeddings for the 2-D grid
        self.row_position_embeddings = nn.Embedding(config.max_grid_row_position_embeddings, config.hidden_size)
        # Column position embeddings for the 2-D grid
        self.col_position_embeddings = nn.Embedding(config.max_grid_col_position_embeddings, config.hidden_size)
        # Token type embeddings (a single visual token type)
        self.token_type_embeddings = nn.Embedding(1, config.hidden_size)
        # Layer normalization over the hidden dimension
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # Dropout for regularization
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def add_2d_positional_embeddings(self, grid):
        """
        Args:
            grid: (batch_size, height, width, hidden_dim)
        Returns:
            grid + col_position_embeddings.view(*col_shape): (batch_size, *, height, width, hidden_dim)
        """
        batch_size, height, width, hidden_dim = grid.shape

        # Add row-wise position embeddings
        row_position_ids = torch.arange(height, dtype=torch.long, device=grid.device)  # (height, )
        row_position_embeddings = self.row_position_embeddings(row_position_ids)  # (height, hidden_dim)
        row_shape = (1,) * (len(grid.shape) - 3) + (height, 1, hidden_dim)  # (1, height, 1, hidden_dim)
        grid = grid + row_position_embeddings.view(*row_shape)  # broadcast automatically

        # Add column-wise position embeddings
        col_position_ids = torch.arange(width, dtype=torch.long, device=grid.device)  # (width, )
        col_position_embeddings = self.col_position_embeddings(col_position_ids)  # (width, hidden_dim)
        # Note: the view target must keep the element count of (width, hidden_dim), so the
        # leading dimensions are 1 and broadcasting handles the batch and height dimensions.
        col_shape = (1, 1, width, hidden_dim)
        return grid + col_position_embeddings.view(*col_shape)  # broadcast automatically

    def forward(self, grid):
        """
        Args:
            grid: Array of shape (batch_size, num_frames, height, width, num_channels).
                It contains processed frames extracted from videos, and is generated by Tvp image preprocessor. Note,
                num_frames can be 1

        Returns:
            embeddings: The embedding of grid with size (batch_size, height*width, num_channels)

        """
        batch_size, num_frames, height, width, num_channels = grid.shape
        # Temporal mean pooling, giving (batch_size, height, width, hidden_size)
        grid = grid.mean(1)
        grid = self.add_2d_positional_embeddings(grid)
        # Flatten to an image token sequence of shape (batch_size, height*width, num_channels)
        visual_tokens = grid.view(batch_size, -1, num_channels)
        visual_tokens_shape = visual_tokens.shape[:-1]
        device = visual_tokens.device

        # Image token type embeddings (all zeros: a single visual token type)
        token_type_ids = torch.zeros(visual_tokens_shape, dtype=torch.long, device=device)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = visual_tokens + token_type_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
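
A shape sketch for this module, assuming the default `TvpConfig` (hidden_size 768): a 7x7 grid over 4 frames collapses to 49 visual tokens after temporal mean pooling:

# Sketch: visual embedding shapes with the default config.
config = TvpConfig()
visual_embedding = TvpVisualInputEmbedding(config)
grid = torch.randn(1, 4, 7, 7, config.hidden_size)  # (batch, frames, height, width, hidden)
print(visual_embedding(grid).shape)  # torch.Size([1, 49, 768])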
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config):
        super().__init__()
        # Word embeddings: vocabulary size -> hidden size, with a padding index
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        # Position embeddings: maximum position count -> hidden size
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        # Token type embeddings: type vocabulary size -> hidden size
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
        # Layer normalization over the hidden dimension
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # Dropout with the configured hidden dropout probability
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]
        device = input_ids.device if input_ids is not None else inputs_embeds.device
        # Default position ids: 0 .. seq_length-1, expanded to the input shape
        if position_ids is None:
            position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0).expand(input_shape)
        # Default token type ids: all zeros
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # Look up word embeddings from the input ids when no precomputed embeddings are given
        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        # Look up position and token type embeddings
        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        # Sum the word, position and token type embeddings
        embeddings = inputs_embeds + position_embeddings + token_type_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


class TvpAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        # The hidden size must be divisible by the number of attention heads
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention heads {config.num_attention_heads}"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # Query, key and value projections from the hidden size to the combined head size
        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)
        # Dropout applied to the attention probabilities
        self.attn_dropout = nn.Dropout(config.attention_probs_dropout_prob)

        # Output projection and layer normalization
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # Dropout applied to the attention output
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Set of heads that have already been pruned
        self.pruned_heads = set()
    def prune_heads(self, heads):
        # Nothing to prune
        if len(heads) == 0:
            return
        # Mask of shape (num_attention_heads, attention_head_size); 1 = keep
        mask = torch.ones(self.num_attention_heads, self.attention_head_size)
        # Convert to a set and drop heads that were already pruned
        heads = set(heads) - self.pruned_heads
        for head in heads:
            # Shift the index by the number of already-pruned heads that come before it
            head = head - sum(1 if h < head else 0 for h in self.pruned_heads)
            # Zero out the mask entries of the head to prune
            mask[head] = 0
        # Flatten the mask and collect the indices of the remaining units
        mask = mask.view(-1).contiguous().eq(1)
        index = torch.arange(len(mask))[mask].long()

        # Prune the linear layers accordingly
        self.query = prune_linear_layer(self.query, index)
        self.key = prune_linear_layer(self.key, index)
        self.value = prune_linear_layer(self.value, index)
        self.dense = prune_linear_layer(self.dense, index, dim=1)

        # Update the hyperparameters and record the pruned heads
        self.num_attention_heads = self.num_attention_heads - len(heads)
        self.all_head_size = self.attention_head_size * self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    # Reshape (batch_size, sequence_length, all_head_size) into
    # (batch_size, num_attention_heads, sequence_length, attention_head_size)
    def _reshape(self, tensor: torch.Tensor, sequence_length: int, batch_size: int):
        return (
            tensor.view(batch_size, sequence_length, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)  # swap the sequence_length and num_attention_heads dimensions
            .contiguous()  # make sure the tensor memory is contiguous
        )

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        output_attentions: Optional[bool] = None,
    ):
        # Batch size and sequence length of the hidden states
        batch_size, sequence_length = hidden_states.shape[:2]

        # Project the hidden states to queries, keys and values
        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)

        # Split each projection into attention heads
        query_layer = self._reshape(mixed_query_layer, sequence_length, batch_size)
        key_layer = self._reshape(mixed_key_layer, sequence_length, batch_size)
        value_layer = self._reshape(mixed_value_layer, sequence_length, batch_size)

        # Raw attention scores: dot product of queries and keys, scaled by sqrt(head size)
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        # Apply the attention mask, if any
        if attention_mask is not None:
            attention_scores = attention_scores + attention_mask

        # Normalize the scores to attention probabilities
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # Apply attention dropout
        attention_probs = self.attn_dropout(attention_probs)

        # Apply the head mask, if any
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        # Weighted sum of the value vectors
        attn_output = torch.matmul(attention_probs, value_layer)
        attn_output = attn_output.transpose(1, 2).contiguous()

        # Merge the heads back into the hidden dimension
        attn_output = attn_output.reshape(batch_size, sequence_length, self.all_head_size)

        # Output projection, dropout, and residual layer normalization
        attn_output = self.dense(attn_output)
        attn_output = self.dropout(attn_output)
        attn_output = self.layer_norm(attn_output + hidden_states)

        # Optionally include the attention probabilities in the outputs
        outputs = (attn_output, attention_probs) if output_attentions else (attn_output,)
        return outputs
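
The head split done by `_reshape` is the standard multi-head layout change; a standalone sketch with hidden_size 768 and 12 heads of size 64:

# Sketch: (batch, seq, hidden) -> (batch, heads, seq, head_size).
x = torch.randn(2, 10, 768)
heads = x.view(2, 10, 12, 64).transpose(1, 2)
print(heads.shape)  # torch.Size([2, 12, 10, 64])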
# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->Tvp
class TvpIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Linear layer mapping hidden_size to intermediate_size
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # Resolve the activation: look up by name if given as a string, use it directly otherwise
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Linear projection followed by the activation function
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


# Output sublayer: linear projection, dropout, and residual layer normalization
class TvpOutputLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Linear layer mapping intermediate_size back to hidden_size
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        # Layer normalization over the hidden dimension
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # Dropout with the configured hidden dropout probability
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # Project, apply dropout, then layer-normalize the residual sum
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.layer_norm(hidden_states + input_tensor)
        return hidden_states


# Encoder layer: self-attention followed by the intermediate and output sublayers
class TvpEncodeLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.attention = TvpAttention(config)
        self.intermediate = TvpIntermediate(config)
        self.output = TvpOutputLayer(config)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        output_attentions: Optional[bool] = None,
    ):
        # Run self-attention on the hidden states
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
        # Feed the attention output through the intermediate and output sublayers
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        outputs = (layer_output,) + outputs
        return outputs


# Encoder: a stack of TvpEncodeLayer modules
class TvpEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        # num_hidden_layers encoder layers
        self.layer = nn.ModuleList([TvpEncodeLayer(config) for _ in range(config.num_hidden_layers)])
        # Gradient checkpointing is off by default
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        # Fall back to the config defaults when the flags are not given
        return_dict = return_dict if return_dict is not None else self.config.return_dict
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        # Accumulators for the hidden states and attention weights of every layer
        all_hidden_states = ()
        all_attentions = ()

        # Run the input through each encoder layer in turn
        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                # Record the hidden states entering this layer
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                # Recompute activations in the backward pass to save memory
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    attention_mask,
                    (head_mask[i] if head_mask is not None else None),
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states, attention_mask, (head_mask[i] if head_mask is not None else None), output_attentions
                )

            # The first element of the layer outputs is the new hidden states
            hidden_states = layer_outputs[0]
            if output_attentions:
                # Record the attention weights of this layer
                all_attentions = all_attentions + (layer_outputs[1],)

        # Record the hidden states produced by the last layer
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        # Tuple output: last hidden states plus the optional accumulators
        if not return_dict:
            outputs = (hidden_states,)
            if output_hidden_states:
                outputs = outputs + (all_hidden_states,)
            if output_attentions:
                outputs = outputs + (all_attentions,)
            return outputs

        # Structured output
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states if output_hidden_states else None,
            attentions=all_attentions if output_attentions else None,
        )
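
A minimal sketch of running the encoder stack on dummy hidden states, assuming the default hidden size of 768:

# Sketch: a 2-layer TvpEncoder forward pass.
config = TvpConfig(num_hidden_layers=2)
encoder = TvpEncoder(config)
hidden_states = torch.randn(1, 5, config.hidden_size)
output = encoder(hidden_states, return_dict=True)
print(output.last_hidden_state.shape)  # torch.Size([1, 5, 768])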
# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->Tvp
class TvpPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Linear layer from hidden_size to hidden_size
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # Tanh activation
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Pool by taking the hidden state of the first token of each sample
        first_token_tensor = hidden_states[:, 0]
        # Apply the linear projection and the activation
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


class TvpPreTrainedModel(PreTrainedModel):
    """An abstract class to handle weights initialization and a simple interface for downloading and loading
    pretrained models."""

    config_class = TvpConfig  # configuration class for this model
    base_model_prefix = "model"  # prefix of the base model
    supports_gradient_checkpointing = True  # gradient checkpointing is supported

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Initialize weights from a normal distribution with mean 0 and std initializer_range
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, nn.LayerNorm):
            # Zero the bias and set the weight to 1
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

        if isinstance(module, nn.Linear) and module.bias is not None:
            # Zero the bias of linear layers
            module.bias.data.zero_()

        if isinstance(module, nn.Conv2d):
            # Kaiming-normal initialization for convolution weights
            nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
            if module.bias is not None:
                # Zero the convolution bias if present
                nn.init.constant_(module.bias, 0)


TVP_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`TvpConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

TVP_INPUTS_DOCSTRING = r"""
    # Function signature and parameter documentation
    def forward(
        input_ids: torch.LongTensor,
        pixel_values: torch.FloatTensor,
        attention_mask: torch.FloatTensor = None,
        head_mask: torch.FloatTensor = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True
    ):
        # Forward pass of the model, taking sequence token indices and image pixel values as inputs
        """
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`].
                See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
                [What are input IDs?](../glossary#input-ids)

            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
                Pixel values. Pixel values can be obtained using [`TvpImageProcessor`]. See
                [`TvpImageProcessor.__call__`] for details.

            attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)

            head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
                Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.

            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.

            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
"""
Pad frames extracted from videos in the surroundings.
"""

class TvpFrameDownPadPrompter(nn.Module):
    """
    Pad frames extracted from videos only at the bottom.
    """

    def __init__(self, config):
        # Check that `visual_prompter_apply` is one of the valid values ("add", "replace", "remove")
        if config.visual_prompter_apply not in ("add", "replace", "remove"):
            raise ValueError("`visual_prompter_apply` must be in (add, replace, remove)")

        super().__init__()
        # Store the visual prompt size, number of frames, maximum image size and apply mode
        self.visual_prompt_size = config.visual_prompt_size
        self.frame_num = config.frame_num
        self.max_img_size = config.max_img_size
        self.visual_prompter_apply = config.visual_prompter_apply

        # Create the trainable parameter used to pad the bottom of each frame
        self.pad_down = nn.Parameter(
            torch.randn([1, config.frame_num, 3, config.visual_prompt_size, config.max_img_size])
        )

    def forward(self, pixel_values):
        # Unless the apply mode is "add", create an all-ones mask and zero out its bottom rows
        if self.visual_prompter_apply != "add":
            visual_prompt_mask = torch.ones(
                [self.max_img_size, self.max_img_size], dtype=pixel_values.dtype, device=pixel_values.device
            )
            visual_prompt_mask[self.max_img_size - self.visual_prompt_size : self.max_img_size, :] = 0.0
            pixel_values *= visual_prompt_mask

        # Unless the apply mode is "remove", create a zero tensor and write the bottom padding into it
        if self.visual_prompter_apply != "remove":
            prompt = torch.zeros(
                [pixel_values.shape[0], pixel_values.shape[1], 3, self.max_img_size, self.max_img_size],
                device=pixel_values.device,
            )
            start_point = self.max_img_size - self.visual_prompt_size
            prompt[:, :, :, start_point : self.max_img_size, :] = self.pad_down
            pixel_values += prompt.to(pixel_values.dtype)

        return pixel_values
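
# Illustrative sketch, not part of the original file: applying the prompter with a
# hypothetical minimal config (visual_prompt_size=4, frame_num=2, max_img_size=16).
#
#     from types import SimpleNamespace
#     import torch
#
#     config = SimpleNamespace(
#         visual_prompt_size=4, frame_num=2, max_img_size=16, visual_prompter_apply="replace"
#     )
#     prompter = TvpFrameDownPadPrompter(config)
#     pixel_values = torch.randn(1, 2, 3, 16, 16)  # (batch, frames, channels, H, W)
#     out = prompter(pixel_values)                 # same shape; bottom 4 rows come from pad_down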


class TvpFramePadPrompter(nn.Module):
    """
    Pad frames extracted from videos in the surroundings.
    """
    # Initialization method, taking a configuration object `config`
    def __init__(self, config):
        # Check that `visual_prompter_apply` has a valid value
        if config.visual_prompter_apply not in ("add", "replace", "remove"):
            raise ValueError("`visual_prompter_apply` must be in (add, replace, remove)")

        # Call the parent class initializer
        super().__init__()

        # Store the settings on the instance
        self.num_frames = config.num_frames  # number of frames
        self.max_img_size = config.max_img_size  # maximum image size
        self.visual_prompter_apply = config.visual_prompter_apply  # visual prompter apply mode

        # Compute the base size from the configuration
        self.base_size = config.max_img_size - config.visual_prompt_size * 2

        # Learnable parameter: top border padding
        self.pad_up = nn.Parameter(
            torch.randn([1, config.num_frames, 3, config.visual_prompt_size, config.max_img_size])
        )
        # Learnable parameter: bottom border padding
        self.pad_down = nn.Parameter(
            torch.randn([1, config.num_frames, 3, config.visual_prompt_size, config.max_img_size])
        )
        # Learnable parameter: left border padding
        self.pad_left = nn.Parameter(
            torch.randn(
                [
                    1,
                    config.num_frames,
                    3,
                    config.max_img_size - config.visual_prompt_size * 2,
                    config.visual_prompt_size,
                ]
            )
        )
        # Learnable parameter: right border padding
        self.pad_right = nn.Parameter(
            torch.randn(
                [
                    1,
                    config.num_frames,
                    3,
                    config.max_img_size - config.visual_prompt_size * 2,
                    config.visual_prompt_size,
                ]
            )
        )

    # Forward method, taking `pixel_values` as input
    def forward(self, pixel_values):
        # Check that `visual_prompter_apply` has a valid value, otherwise raise an exception
        if self.visual_prompter_apply not in ("add", "remove", "replace"):
            raise ValueError(f"Invalid visual_prompter_apply value {self.visual_prompter_apply}")

        # If `visual_prompter_apply` is "replace" or "remove", create an all-ones visual prompt mask
        if self.visual_prompter_apply in ("replace", "remove"):
            visual_prompt_mask = torch.ones(
                [self.max_img_size, self.max_img_size], dtype=pixel_values.dtype, device=pixel_values.device
            )
            # Multiply the input `pixel_values` by the visual prompt mask
            pixel_values *= visual_prompt_mask

        # If `visual_prompter_apply` is "replace" or "add", do the following
        if self.visual_prompter_apply in ("replace", "add"):
            # Create an all-zero base tensor
            base = torch.zeros(1, self.num_frames, 3, self.base_size, self.base_size, device=pixel_values.device)
            # Concatenate the left and right padding onto the base tensor
            prompt = torch.cat([self.pad_left, base, self.pad_right], dim=4)
            # Concatenate the top and bottom padding to obtain the final visual prompt
            prompt = torch.cat([self.pad_up, prompt, self.pad_down], dim=3)
            # Tile the visual prompt across the batch and add it to the input `pixel_values`
            prompt = torch.cat(pixel_values.size(0) * [prompt])
            pixel_values = pixel_values + prompt.to(pixel_values.dtype)

        # Return the processed `pixel_values`
        return pixel_values
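
# Illustrative sketch, not part of the original file: how the four learned borders tile
# back to a full frame (hypothetical sizes max_img_size=16, visual_prompt_size=4, so base_size=8).
#
#     import torch
#
#     pad_up = torch.randn(1, 2, 3, 4, 16)
#     pad_down = torch.randn(1, 2, 3, 4, 16)
#     pad_left = torch.randn(1, 2, 3, 8, 4)
#     pad_right = torch.randn(1, 2, 3, 8, 4)
#     base = torch.zeros(1, 2, 3, 8, 8)
#
#     prompt = torch.cat([pad_left, base, pad_right], dim=4)  # (1, 2, 3, 8, 16)
#     prompt = torch.cat([pad_up, prompt, pad_down], dim=3)   # (1, 2, 3, 16, 16)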
# A mapping from strings to the corresponding TvpFrameDownPadPrompter or TvpFramePadPrompter class
TVP_PROMPTER_CLASSES_MAPPING = {
    "framedownpad": TvpFrameDownPadPrompter,
    "framepad": TvpFramePadPrompter,
}

@add_start_docstrings(
    "The bare Tvp Model transformer outputting BaseModelOutputWithPooling object without any specific head on" " top.",
    TVP_START_DOCSTRING,
)
# The TvpModel class, inheriting from TvpPreTrainedModel
class TvpModel(TvpPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config
        # Instantiate the TvpVisionModel
        self.vision_model = TvpVisionModel(config)
        # Instantiate the TvpTextInputEmbeddings
        self.embeddings = TvpTextInputEmbeddings(config)
        # Instantiate the TvpVisualInputEmbedding
        self.visual_embeddings = TvpVisualInputEmbedding(config)
        # Instantiate the TvpEncoder
        self.encoder = TvpEncoder(config)
        # Instantiate the TvpPooler
        self.pooler = TvpPooler(config)
        # Create an nn.Parameter of shape [1, 10, hidden_size]
        self.text_prompt = nn.Parameter(torch.randn([1, 10, config.hidden_size]))
        # Create an nn.Dropout with the configured hidden_dropout_prob
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Check that config.visual_prompter_type is in TVP_PROMPTER_CLASSES_MAPPING, otherwise raise a ValueError
        if config.visual_prompter_type not in TVP_PROMPTER_CLASSES_MAPPING:
            raise ValueError("`visual_prompter_type` must be in (framedownpad, framepad)")
        # Instantiate the prompter class selected by config.visual_prompter_type
        self.visual_prompter = TVP_PROMPTER_CLASSES_MAPPING[config.visual_prompter_type](config)

        # Run post-initialization processing
        self.post_init()

    def get_input_embeddings(self):
        # Return the word_embeddings attribute of embeddings
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        # Set the word_embeddings attribute of embeddings to the given value
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """Prunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
        """
        # For each entry in heads_to_prune, prune the attention heads of the corresponding layer
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(TVP_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=TvpConfig)
    # The forward method accepts several inputs and returns a BaseModelOutputWithPooling object
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
):
    # What follows belongs to TvpForVideoGrounding; its class declaration and the
    # TvpVideoGroundingHead definition above are elided in this excerpt
    def __init__(self, config):
        # Call the parent class initializer with the configuration
        super().__init__(config)
        # Store the configuration on the instance
        self.config = config
        # Create a TvpModel instance and store it on the instance
        self.model = TvpModel(config)
        # Create a TvpVideoGroundingHead instance and store it on the instance
        self.video_grounding_head = TvpVideoGroundingHead(config)

        # Run custom post-initialization processing
        self.post_init()

    @add_start_docstrings_to_model_forward(TVP_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TvpVideoGroundingOutput, config_class=TvpConfig)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        labels: Tuple[torch.Tensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        r"""
        labels (`torch.FloatTensor` of shape `(batch_size, 3)`, *optional*):
            The labels contain the duration, start time, and end time of the video corresponding to the text.
        Returns:

        Examples:
        ```
        >>> import torch
        >>> from transformers import AutoConfig, AutoTokenizer, TvpForVideoGrounding

        >>> model = TvpForVideoGrounding.from_pretrained("Jiqing/tiny-random-tvp")

        >>> tokenizer = AutoTokenizer.from_pretrained("Jiqing/tiny-random-tvp")

        >>> pixel_values = torch.rand(1, 1, 3, 448, 448)
        >>> text_inputs = tokenizer("This is an example input", return_tensors="pt")
        >>> output = model(text_inputs.input_ids, pixel_values, text_inputs.attention_mask)
        ```"""
        # If return_dict is None, fall back to the value from the configuration
        return_dict = return_dict if return_dict is not None else self.config.return_dict
        # Call the underlying model's forward with the various inputs
        outputs = self.model(
            input_ids,
            pixel_values,
            attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # Take the pooler_output from the model outputs
        pooler_output = outputs[1]

        # Feed pooler_output into the video grounding head to obtain logits
        logits = self.video_grounding_head(pooler_output)

        # Initialize loss to None
        loss = None
        # If labels are provided, compute the loss
        if labels is not None:
            # Create the loss criterion consisting of the iou, distance and duration losses
            criterion = TvpLoss(["iou", "distance", "duration"])
            # Move the criterion to the current device (typically a GPU)
            criterion.to(self.device)
            # Compute the dictionary of individual losses
            loss_dict = criterion(logits, labels)
            # Compute the weighted sum of the losses
            loss = (
                loss_dict["iou"]
                + self.config.distance_loss_weight * loss_dict["distance"]
                + self.config.duration_loss_weight * loss_dict["duration"]
            )

        # If return_dict is False, return a plain tuple
        if not return_dict:
            # Prepend the logits to the remaining outputs
            outputs = (logits,) + outputs[2:]
            # If a loss was computed, prepend it as well
            if loss is not None:
                outputs = (loss,) + outputs
            return outputs

        # If return_dict is True, build and return a TvpVideoGroundingOutput object
        return TvpVideoGroundingOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

.\models\tvp\processing_tvp.py

# coding=utf-8
# Copyright 2023 The Intel AIA Team Authors, and HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for TVP.
"""

from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding

class TvpProcessor(ProcessorMixin):
    """
    Constructs a TVP processor which wraps a TVP image processor and a Bert tokenizer into a single processor.

    [`TvpProcessor`] offers all the functionalities of [`TvpImageProcessor`] and [`BertTokenizerFast`]. See the
    [`~TvpProcessor.__call__`] and [`~TvpProcessor.decode`] for more information.

    Args:
        image_processor ([`TvpImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`BertTokenizerFast`], *optional*):
            The tokenizer is a required input.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "TvpImageProcessor"
    tokenizer_class = ("BertTokenizer", "BertTokenizerFast")

    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        """
        Initialize the TVP processor with an image processor and a tokenizer.

        Args:
            image_processor ([`TvpImageProcessor`], *optional*):
                The image processor is a required input.
            tokenizer ([`BertTokenizerFast`], *optional*):
                The tokenizer is a required input.

        Raises:
            ValueError: If either `image_processor` or `tokenizer` is not provided.
        """
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        super().__init__(image_processor, tokenizer)

    def batch_decode(self, *args, **kwargs):
        """
        Forward all arguments to BertTokenizerFast's [`~PreTrainedTokenizer.batch_decode`] method.

        Returns:
            Decoded outputs corresponding to the input tokens.

        See Also:
            [`~PreTrainedTokenizer.batch_decode`] for more details.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        Forward all arguments to BertTokenizerFast's [`~PreTrainedTokenizer.decode`] method.

        Returns:
            Decoded string corresponding to the input token.

        See Also:
            [`~PreTrainedTokenizer.decode`] for more details.
        """
        return self.tokenizer.decode(*args, **kwargs)
    # Post-processing for video grounding: compute the start and end times of the video
    def post_process_video_grounding(self, logits, video_durations):
        """
        Compute the time of the video.

        Args:
            logits (`torch.Tensor`):
                The logits output of TvpForVideoGrounding.
            video_durations (`float`):
                The video's duration.

        Returns:
            start (`float`):
                The start time of the video.
            end (`float`):
                The end time of the video.
        """
        # Extract the start and end times from the logits, multiply by the video's total duration, and round to one decimal place
        start, end = (
            round(logits.tolist()[0][0] * video_durations, 1),
            round(logits.tolist()[0][1] * video_durations, 1),
        )

        return start, end

    @property
    # Copied from transformers.models.blip.processing_blip.BlipProcessor.model_input_names
    # A property returning the list of model input names, merging and de-duplicating the tokenizer's and image processor's input names
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
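
# Illustrative sketch, not part of the original file: the arithmetic performed by
# post_process_video_grounding for a hypothetical 30-second video whose model predicts
# the normalized interval [0.2, 0.8].
#
#     import torch
#
#     logits = torch.tensor([[0.2, 0.8]])
#     start = round(logits.tolist()[0][0] * 30.0, 1)  # 6.0 seconds
#     end = round(logits.tolist()[0][1] * 30.0, 1)    # 24.0 seconds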

.\models\tvp\__init__.py

# coding=utf-8
# Copyright 2023 The Intel AIA Team Authors, and HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import TYPE_CHECKING

# Import the dependencies from the library utilities
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available

# Define the module import structure
_import_structure = {
    "configuration_tvp": [
        "TVP_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "TvpConfig",
    ],
    "processing_tvp": ["TvpProcessor"],
}

# Try to register the vision modules; skip them if vision support is unavailable
try:
    if not is_vision_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["image_processing_tvp"] = ["TvpImageProcessor"]

# Try to register the Torch modules; skip them if Torch is unavailable
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_tvp"] = [
        "TVP_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TvpModel",
        "TvpPreTrainedModel",
        "TvpForVideoGrounding",
    ]

# In type-checking mode, import the concrete modules
if TYPE_CHECKING:
    from .configuration_tvp import (
        TVP_PRETRAINED_CONFIG_ARCHIVE_MAP,
        TvpConfig,
    )
    from .processing_tvp import TvpProcessor

    # Try to import the vision modules; skip them if vision support is unavailable
    try:
        if not is_vision_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .image_processing_tvp import TvpImageProcessor

    # Try to import the Torch modules; skip them if Torch is unavailable
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_tvp import (
            TVP_PRETRAINED_MODEL_ARCHIVE_LIST,
            TvpForVideoGrounding,
            TvpModel,
            TvpPreTrainedModel,
        )

# Otherwise, replace the current module with a lazy-loading module
else:
    import sys

    # Register the current module as a _LazyModule built from the import structure above
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

.\models\udop\configuration_udop.py

# coding=utf-8
# Copyright 2024 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" UDOP model configuration"""


from ...configuration_utils import PretrainedConfig  # import the pretrained configuration base class
from ...utils import logging  # import the logging utilities


logger = logging.get_logger(__name__)  # get the logger for the current module

UDOP_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "microsoft/udop-large": "https://huggingface.co/microsoft/udop-large/resolve/main/config.json",
}


class UdopConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`UdopForConditionalGeneration`]. It is used to
    instantiate a UDOP model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the UDOP
    [microsoft/udop-large](https://huggingface.co/microsoft/udop-large) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    """

    model_type = "udop"  # 模型类型为 UDOP
    keys_to_ignore_at_inference = ["past_key_values"]  # 推断时忽略的键列表
    attribute_map = {"hidden_size": "d_model", "num_attention_heads": "num_heads", "num_hidden_layers": "num_layers"}  # 属性映射表

    def __init__(
        self,
        vocab_size=33201,  # vocabulary size
        d_model=1024,  # hidden size of the model
        d_kv=64,  # size of the key and value vectors
        d_ff=4096,  # size of the intermediate feed-forward layer
        num_layers=24,  # number of layers
        num_decoder_layers=None,  # number of decoder layers
        num_heads=16,  # number of attention heads
        relative_attention_num_buckets=32,  # number of buckets for relative attention
        relative_attention_max_distance=128,  # maximum distance for relative attention
        relative_bias_args=[{"type": "1d"}, {"type": "horizontal"}, {"type": "vertical"}],  # relative bias arguments
        dropout_rate=0.1,  # dropout rate
        layer_norm_epsilon=1e-6,  # epsilon used by layer normalization
        initializer_factor=1.0,  # initialization factor
        feed_forward_proj="relu",  # activation of the feed-forward layer
        is_encoder_decoder=True,  # whether this is an encoder-decoder architecture
        use_cache=True,  # whether to use the cache
        pad_token_id=0,  # id of the padding token
        eos_token_id=1,  # id of the end-of-sequence token
        max_2d_position_embeddings=1024,  # maximum number of 2D position embeddings
        image_size=224,  # image size
        patch_size=16,  # size of the image patches
        num_channels=3,  # number of image channels
        **kwargs,  # remaining keyword arguments
    ):
        # Initialize the various Transformer model parameters
        self.vocab_size = vocab_size  # vocabulary size
        self.d_model = d_model  # hidden dimension of the model
        self.d_kv = d_kv  # dimension of the key/value projections
        self.d_ff = d_ff  # dimension of the feed-forward layer
        self.num_layers = num_layers  # total number of layers
        self.num_decoder_layers = (
            num_decoder_layers if num_decoder_layers is not None else self.num_layers
        )  # number of decoder layers, symmetric with the encoder by default
        self.num_heads = num_heads  # number of attention heads
        self.relative_attention_num_buckets = relative_attention_num_buckets  # number of buckets for relative attention
        self.relative_attention_max_distance = relative_attention_max_distance  # maximum distance for relative attention
        self.dropout_rate = dropout_rate  # dropout rate
        self.layer_norm_epsilon = layer_norm_epsilon  # epsilon used by layer normalization
        self.initializer_factor = initializer_factor  # initialization factor
        self.feed_forward_proj = feed_forward_proj  # activation of the feed-forward layer
        self.use_cache = use_cache  # whether to use the cache

        # UDOP-specific attributes
        self.max_2d_position_embeddings = max_2d_position_embeddings  # maximum value of the 2D position embeddings
        self.image_size = image_size  # image size
        self.patch_size = patch_size  # patch size
        self.num_channels = num_channels  # number of channels
        if not isinstance(relative_bias_args, list):
            raise ValueError("`relative_bias_args` should be a list of dictionaries.")
        self.relative_bias_args = relative_bias_args  # list of relative bias arguments

        # Parse the feed-forward activation function
        act_info = self.feed_forward_proj.split("-")
        self.dense_act_fn = act_info[-1]  # activation function of the dense layer
        self.is_gated_act = act_info[0] == "gated"  # whether the activation is gated

        # Check that the feed-forward activation string is well formed
        if len(act_info) > 1 and act_info[0] != "gated" or len(act_info) > 2:
            raise ValueError(
                f"`feed_forward_proj`: {feed_forward_proj} is not a valid activation function of the dense layer."
                "Please make sure `feed_forward_proj` is of the format `gated-{ACT_FN}` or `{ACT_FN}`, e.g. "
                "'gated-gelu' or 'relu'"
            )

        # Call the parent constructor to initialize the base parameters
        super().__init__(
            pad_token_id=pad_token_id,  # id of the padding token
            eos_token_id=eos_token_id,  # id of the end-of-sequence token
            is_encoder_decoder=is_encoder_decoder,  # whether this is an encoder-decoder model
            **kwargs,  # remaining keyword arguments
        )
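
# Illustrative sketch, not part of the original file: how `feed_forward_proj` is parsed.
#
#     for proj in ("relu", "gated-gelu"):
#         act_info = proj.split("-")
#         print(proj, "->", act_info[-1], "| gated:", act_info[0] == "gated")
#     # relu -> relu | gated: False
#     # gated-gelu -> gelu | gated: True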

.\models\udop\convert_udop_to_hf.py

    filepath = hf_hub_download(
        repo_id="hf-internal-testing/fixtures_docvqa", filename="document_2.png", repo_type="dataset"
    )
    # Download 'document_2.png' from the given repository on the Hugging Face Hub, returning its local file path
    image = Image.open(filepath).convert("RGB")
    # Open the downloaded image file and convert it to an RGB PIL image

    return image
    # Return the processed image as the function's output
    words = ['7', 'ITC', 'Limited', 'REPORT', 'AND', 'ACCOUNTS', '2013', 'ITC’s', 'Brands:', 'An', 'Asset', 'for', 'the', 'Nation', 'The', 'consumer', 'needs', 'and', 'aspirations', 'they', 'fulfil,', 'the', 'benefit', 'they', 'generate', 'for', 'millions', 'across', 'ITC’s', 'value', 'chains,', 'the', 'future-ready', 'capabilities', 'that', 'support', 'them,', 'and', 'the', 'value', 'that', 'they', 'create', 'for', 'the', 'country,', 'have', 'made', 'ITC’s', 'brands', 'national', 'assets,', 'adding', 'to', 'India’s', 'competitiveness.', 'It', 'is', 'ITC’s', 'aspiration', 'to', 'be', 'the', 'No', '1', 'FMCG', 'player', 'in', 'the', 'country,', 'driven', 'by', 'its', 'new', 'FMCG', 'businesses.', 'A', 'recent', 'Nielsen', 'report', 'has', 'highlighted', 'that', "ITC's", 'new', 'FMCG', 'businesses', 'are', 'the', 'fastest', 'growing', 'among', 'the', 'top', 'consumer', 'goods', 'companies', 'operating', 'in', 'India.', 'ITC', 'takes', 'justifiable', 'pride', 'that,', 'along', 'with', 'generating', 'economic', 'value,', 'these', 'celebrated', 'Indian', 'brands', 'also', 'drive', 'the', 'creation', 'of', 'larger', 'societal', 'capital', 'through', 'the', 'virtuous', 'cycle', 'of', 'sustainable', 'and', 'inclusive', 'growth.', 'DI', 'WILLS', '*', ';', 'LOVE', 'DELIGHTFULLY', 'SOFT', 'SKIN?', 'aia', 'Ans', 'Source:', 'https://www.industrydocuments.ucsf.edu/docs/snbx0223']
    # A list for the tokenized text and an empty list for the bounding boxes
    text_list = []
    bbox_list = []
    # Iterate over each word and its corresponding box
    for text, box in zip(words, boxes):
        # Skip empty text
        if text == "":
            continue
        # Tokenize the text into sub-tokens
        sub_tokens = tokenizer.tokenize(text)
        # Append every sub-token to the text list, and its box to the box list
        for sub_token in sub_tokens:
            text_list.append(sub_token)
            bbox_list.append(box)

    # Convert the token list into input IDs
    input_ids = tokenizer.convert_tokens_to_ids(text_list)

    # Prepend the prompt IDs to the current input IDs
    input_ids = prompt_ids + input_ids
    # Prepend one all-zero box per prompt ID to the box list
    bbox = [[0, 0, 0, 0]] * len(prompt_ids) + bbox_list

    # Process the image with the image processor to obtain pixel values
    pixel_values = image_processor(image, return_tensors="pt").pixel_values
    # Process the image with the original transform and add a batch dimension
    original_pixel_values = original_transform(image, image_size=image_processor.size["height"]).unsqueeze(0)
    # Verify that the two sets of pixel values are close
    assert torch.allclose(original_pixel_values, pixel_values)
    # Confirm that the pixel values are fine
    print("Pixel values are ok!")

    # Return the input IDs, bounding boxes and pixel values as tensors
    return torch.tensor(input_ids).unsqueeze(0), torch.tensor(bbox).unsqueeze(0).float(), pixel_values
# Map a given model name to its corresponding checkpoint path
def convert_udop_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False):
    # Dictionary mapping model names to their checkpoint paths
    name_to_checkpoint_path = {
        "udop-large": "/Users/nielsrogge/Documents/UDOP/udop-unimodel-large-224/pytorch_model.bin",
        "udop-large-512": "/Users/nielsrogge/Documents/UDOP/udop-unimodel-large-512/pytorch_model.bin",
        "udop-large-512-300k": "/Users/nielsrogge/Documents/UDOP/udop-unimodel-large-512-300k-steps/pytorch_model.bin",
    }

    # Look up the checkpoint path for the given model name
    checkpoint_path = name_to_checkpoint_path[model_name]
    # Load the checkpoint with torch, mapping its state dict onto the CPU
    state_dict = torch.load(checkpoint_path, map_location="cpu")

    # Print the loaded checkpoint path
    print("Checkpoint path:", checkpoint_path)

    # Create the HF model
    image_size = 512 if "512" in model_name else 224
    # Create the UDOP model configuration object with the given settings
    config = UdopConfig(decoder_start_token_id=0, image_size=image_size)
    # Create the UDOP model for conditional generation from the configuration
    model = UdopForConditionalGeneration(config)
    # Put the model in evaluation mode
    model.eval()

    # Rename a specific substring in the state dict keys
    state_dict = {k.replace("cell2dembedding", "cell_2d_embedding"): v for k, v in state_dict.items()}

    # Load the weights into the model, ignoring mismatched keys
    missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
    # Print the missing and unexpected keys
    print("Missing keys:", missing_keys)
    print("Unexpected keys:", unexpected_keys)
    # Assert that the missing and unexpected keys match expectations
    assert missing_keys == ["encoder.embed_patches.proj.weight", "encoder.embed_patches.proj.bias"]
    assert unexpected_keys == ["pos_embed"]

    # Prepare dummy inputs
    # Create the UDOP tokenizer from the pretrained "t5-base" model
    tokenizer = UdopTokenizer.from_pretrained("t5-base", legacy=True)
    # Set the image processor size
    size = {"height": image_size, "width": image_size}
    # Create the image processor based on the LayoutLMv3 image processor
    image_processor = LayoutLMv3ImageProcessor(
        image_mean=IMAGENET_DEFAULT_MEAN, image_std=IMAGENET_DEFAULT_STD, size=size
    )
    # Create the UDOP processor
    processor = UdopProcessor(image_processor=image_processor, tokenizer=tokenizer)
    # Prepare dummy input data: tokenized input IDs, bounding boxes and an image
    input_ids, bbox, image = prepare_dummy_inputs(tokenizer, image_processor)
    # The prompt text
    prompt = "Question answering. In which year is the report made?"
    # Encode the image and text inputs with the processor, returning PyTorch tensors
    encoding = processor(images=get_image(), text=prompt, return_tensors="pt")

    # Get the tokenized input IDs
    input_ids = encoding.input_ids
    # The expected input IDs: a 2D tensor representing the model's expected input
    EXPECTED_INPUT_IDS = torch.tensor([[11860, 18243, 5, 86, 84, 215, 19, 8, 934, 263, 58, 1, 489, 27, 3838, 7363, 4083, 14536, 3430, 5686, 5911, 17161, 134, 2038, 27, 3838, 22, 7, 4688, 7, 10, 389, 18202, 21, 8, 11046, 37, 3733, 523, 11, 38, 2388, 1628, 3, 13133, 23334, 6, 8, 1656, 79, 3806, 21, 4040, 640, 27, 3838, 22, 7, 701, 16534, 6, 8, 3, 76, 2693, 18, 23015, 5644, 24, 380, 3, 6015, 6, 11, 8, 701, 24, 79, 482, 21, 3, 88, 684, 6, 43, 263, 27, 3838, 22, 7, 3635, 1157, 4089, 6, 2651, 12, 1547, 22, 7, 3265, 655, 5, 19, 27, 3838, 22, 7, 38, 2388, 257, 12, 36, 8, 465, 209, 13409, 12150, 1959, 16, 8, 684, 6, 6737, 57, 165, 126, 13409, 12150, 1623, 5, 71, 1100, 30298, 934, 65, 12566, 24, 27, 3838, 31, 7, 126, 13409, 12150, 1623, 33, 8, 10391, 1710, 859, 8, 420, 3733, 4968, 688, 2699, 16, 1547, 5, 27, 3838, 1217, 131, 99, 23, 179, 6064, 24, 6, 590, 28, 3, 11600, 1456, 701, 6, 175, 9443, 2557, 3635, 92, 1262, 8, 3409, 13, 2186, 3, 27908, 1784, 190, 8, 3, 5771, 17, 13281, 4005, 13, 5086, 11, 13066, 1170, 5, 10826, 16309, 134, 3, 2, 276, 26, 3, 55, 391, 13570, 5, 10315, 309, 3577, 19114, 371, 4254, 5121, 5055, 6245, 3, 10047, 3162, 58, 3, 9, 61, 1713, 2703, 476, 667, 25158, 301, 6058, 6038, 476, 3765, 9149, 10, 4893, 1303, 1986, 5, 13580, 7, 8224, 28244, 7, 5, 76, 75, 7, 89, 5, 15, 1259, 87, 7171, 7, 87, 7, 29, 115, 226, 4305, 2773, 1]])  # fmt: skip
    # Check that the expected input IDs match the actual input IDs using torch.testing.assert_close
    torch.testing.assert_close(EXPECTED_INPUT_IDS, input_ids)
    # Get the bounding boxes from the encoding and cast them to float
    bbox = encoding.bbox.float()
    # Get the pixel values from the encoding
    pixel_values = encoding.pixel_values

    # On failure, print an error message and fall back to dummy inputs
    # (the matching `try` block is elided in this excerpt)
    except Exception:
        print("Input_ids don't match, preparing dummy inputs")
        # Call the dummy-input helper to obtain input IDs, bounding boxes and pixel values
        input_ids, bbox, pixel_values = prepare_dummy_inputs(tokenizer, image_processor)

    # Verify a single forward pass
    print("Testing single forward pass..")
    # Context manager that disables gradient computation
    with torch.no_grad():
        # Set the decoder input IDs
        decoder_input_ids = torch.tensor([[101]])
        # Run the model forward and collect the outputs
        outputs = model(input_ids=input_ids, bbox=bbox, pixel_values=pixel_values, decoder_input_ids=decoder_input_ids)
        # Print the shape of the logits
        print("Shape of logits:", outputs.logits.shape)
        # Print the first few logit values
        print("First values of logits:", outputs.logits[0, :3, :3])

    # Compare the first logit values against the expected values with a tolerance of 1e-4
    # On Linux: tensor([[-18.5262, 1.5087, -15.7051]])
    # On Mac: tensor([[-19.4976, 0.8515, -17.1873]])
    try:
        assert torch.allclose(outputs.logits[0, :3, :3], torch.tensor([[-18.5262, 1.5087, -15.7051]]), atol=1e-4)
        print("Looks ok!")
    # If the comparison fails, print a message
    except Exception:
        print("logits don't match let's try to generate")

    # Verify autoregressive decoding
    print("Testing generation...")
    # Build the model keyword arguments
    model_kwargs = {"bbox": bbox, "pixel_values": pixel_values}
    # Generate with the model, allowing at most 20 new tokens
    outputs = model.generate(input_ids=input_ids, **model_kwargs, max_new_tokens=20)

    # Print the generated text, decoded with special tokens skipped
    print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))

    # Autoregressive decoding with the original inputs
    print("Testing generation with original inputs...")
    # Download the stored model input file
    filepath = hf_hub_download(repo_id="nielsr/test-image", filename="input_ids_udop.pt", repo_type="dataset")
    # Load the pretrained model's input IDs from the file
    input_ids = torch.load(filepath)
    # Download the bounding boxes with hf_hub_download and update filepath
    filepath = hf_hub_download(repo_id="nielsr/test-image", filename="bbox_udop.pt", repo_type="dataset")
    # Load the bounding box data stored at filepath
    bbox = torch.load(filepath)
    # Pick the pixel values file based on the model name
    pixel_values_filename = "pixel_values_udop_512.pt" if "512" in model_name else "pixel_values_udop_224.pt"
    # Download the pixel values with hf_hub_download and update filepath
    filepath = hf_hub_download(repo_id="nielsr/test-image", filename=pixel_values_filename, repo_type="dataset")
    # Load the pixel values stored at filepath
    pixel_values = torch.load(filepath)

    # Print the decoded input IDs, skipping special tokens
    print("Decoded input ids:", tokenizer.decode(input_ids[0], skip_special_tokens=True))
    # Print the shape of the bounding boxes
    print("Bbox shape:", bbox.shape)

    # Prepare the model keyword arguments, including bbox and pixel_values
    model_kwargs = {"bbox": bbox, "pixel_values": pixel_values}
    # Generate with the model, allowing at most 20 new tokens
    outputs = model.generate(input_ids=input_ids, **model_kwargs, max_new_tokens=20)
    # Decode the generated output, skipping special tokens, and take the first result
    generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    # Print the generated text
    print("Generated:", generated_text)

    # If a PyTorch dump folder path is given, save the pretrained model and tokenizer there
    if pytorch_dump_folder_path is not None:
        model.save_pretrained(pytorch_dump_folder_path)
        tokenizer.save_pretrained(pytorch_dump_folder_path)

    # If the push_to_hub flag is set, push the model and processor to the specified Hub repository
    if push_to_hub:
        model.push_to_hub(f"microsoft/{model_name}")
        processor.push_to_hub(f"microsoft/{model_name}")
        # Important note: to save the fast tokenizer files in the repository on the Hub, do the following:
        # see https://discuss.huggingface.co/t/convert-slow-xlmrobertatokenizer-to-fast-one/20876
if __name__ == "__main__":
    # 如果脚本直接执行而非被导入,则执行以下代码块

    parser = argparse.ArgumentParser()
    # 创建参数解析器对象

    # Required parameters
    parser.add_argument(
        "--model_name",
        default="udop-large",
        type=str,
        choices=["udop-large", "udop-large-512", "udop-large-512-300k"],
        help=("Name of the UDOP model you'd like to convert."),
    )
    # 添加一个必需的参数,用于指定要转换的 UDOP 模型的名称,提供了默认值和可选项

    parser.add_argument(
        "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
    )
    # 添加一个参数,用于指定输出的 PyTorch 模型目录的路径

    parser.add_argument(
        "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
    )
    # 添加一个参数,指定是否将转换后的模型推送到 🤗 hub

    # 解析命令行参数
    args = parser.parse_args()

    # 调用函数,将解析后的参数传递给函数进行 UDOP 模型的转换
    convert_udop_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)

.\models\udop\modeling_udop.py

# coding=utf-8
# Copyright 2024 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch UDOP model."""

import collections  # import the collections module
import logging  # import the logging module
import math  # import the math module
import random  # import the random module
from abc import ABC, abstractmethod  # import the ABC base class and the abstractmethod decorator
from copy import deepcopy  # import the deepcopy function
from dataclasses import dataclass  # import the dataclass decorator
from typing import Any, Dict, Optional, Sequence, Tuple, Union  # import the typing helpers

import torch  # import PyTorch
from torch import Tensor, nn  # import Tensor and the nn module from torch
from torch.nn import CrossEntropyLoss  # import the CrossEntropyLoss class

from transformers import UdopConfig  # import the UdopConfig class
from transformers.modeling_outputs import (  # import the following output classes
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
)

from ...activations import ACT2FN  # import the ACT2FN activation mapping
from ...modeling_utils import PreTrainedModel  # import the PreTrainedModel class
from ...pytorch_utils import (  # import the following helper functions
    find_pruneable_heads_and_indices,
    prune_linear_layer,
)
from ...utils import (  # import the following utilities
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    replace_return_docstrings,
)

logger = logging.getLogger(__name__)  # get the logger for the current module

UDOP_PRETRAINED_MODEL_ARCHIVE_LIST = [  # list of pretrained UDOP models
    "microsoft/udop-large",
    # See all UDOP models at https://huggingface.co/models?filter=udop
]

_CONFIG_FOR_DOC = "UdopConfig"  # configuration name used in the documentation

UDOP_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Args:
        config ([`UdopConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

UDOP_INPUTS_DOCSTRING = r"""
"""

UDOP_ENCODER_INPUTS_DOCSTRING = r"""
"""

@dataclass
class BaseModelOutputWithAttentionMask(ModelOutput):
    """
    Class for the model's outputs that may also contain a past key/values (to speed up sequential decoding). Includes
    an additional attention mask.
"""
    # 最后一层模型的隐藏状态,形状为(batch_size, sequence_length, hidden_size),若使用了past_key_values,只输出形状为(batch_size, 1, hidden_size)的序列的最后隐藏状态。
    last_hidden_state: torch.FloatTensor = None
    
    # 注意力掩码,形状为(batch_size, sequence_length),用于指示模型在计算注意力时要忽略的位置。
    attention_mask: torch.FloatTensor = None
    
    # 过去的键值对,类型为Optional[Tuple[Tuple[torch.FloatTensor]]],当使用use_cache=True或者config.use_cache=True时返回,包含预先计算的隐藏状态(在自注意力块中的键和值),
    # 若config.is_encoder_decoder=True,还包含交叉注意力块中的隐藏状态。
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    
    # 隐藏状态的元组,类型为Optional[Tuple[torch.FloatTensor]],当传递output_hidden_states=True或者config.output_hidden_states=True时返回,
    # 包含模型每一层的隐藏状态(如果模型有嵌入层,则还包括初始嵌入输出)。
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    
    # 注意力权重的元组,类型为Optional[Tuple[torch.FloatTensor]],当传递output_attentions=True或者config.output_attentions=True时返回,
    # 包含每一层的注意力权重,形状为(batch_size, num_heads, sequence_length, sequence_length),用于计算自注意力头中的加权平均值。
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    
    # 交叉注意力权重的元组,类型为Optional[Tuple[torch.FloatTensor]],当传递output_attentions=True和config.add_cross_attention=True时返回,
    # 包含解码器的交叉注意力层的注意力权重,形状为(batch_size, num_heads, sequence_length, sequence_length),用于计算交叉注意力头中的加权平均值。
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
    """
    合并图像和文本嵌入,作为UDOP编码器/解码器的输入。

    首先,通过检查每个视觉补丁是否在标记边界框内,创建图像嵌入。如果是,则将视觉补丁与标记嵌入组合。
    然后,将视觉边界框与文本边界框结合起来。
    最后,将视觉边界框与文本注意力掩码结合起来。
    """

    # 计算序列的长度,即视觉补丁的数量
    sequence_length = num_patches

    # 计算OCR点的x坐标,取值范围在0到sequence_length-1之间
    ocr_points_x = torch.clip(
        torch.floor((bbox[:, :, 0] + bbox[:, :, 2]) / 2.0 * sequence_length).long(), 0, sequence_length - 1
    )

    # 计算OCR点的y坐标,取值范围在0到(sequence_length-1)*sequence_length之间
    ocr_points_y = (
        torch.clip(torch.floor((bbox[:, :, 1] + bbox[:, :, 3]) / 2.0 * sequence_length).long(), 0, sequence_length - 1)
        * sequence_length
    )

    # 组合计算得到OCR点的索引
    ocr_points = ocr_points_x + ocr_points_y

    # 确保边界框的类型为float以计算均值
    bbox = bbox.to(torch.float64)

    # 创建目标分段,判断是否边界框均值为0或1
    target_seg = (bbox.mean(-1) == 0.0) | (bbox.mean(-1) == 1.0)

    # 根据OCR点索引,重复使用视觉嵌入
    repeated_vision_embeds = torch.gather(
        image_embeddings, 1, ocr_points.unsqueeze(-1).repeat(1, 1, image_embeddings.size(-1))
    )

    # 将重复视觉嵌入中属于目标分段的部分置为0
    repeated_vision_embeds[target_seg] = 0.0

    # 将重复视觉嵌入添加到输入嵌入中
    inputs_embeds += repeated_vision_embeds

    # 创建补丁索引,全为True的布尔张量
    patch_inds = torch.full_like(image_embeddings[:, :, 0], True).bool()

    # 构造索引张量ind,用于聚合OCR点
    ind = torch.cat(
        [
            torch.arange(len(ocr_points))[:, None].repeat(1, ocr_points.size(-1))[:, :, None].to(ocr_points),
            ocr_points[:, :, None],
        ],
        dim=-1,
    )

    # 展平ind张量,以便用于后续操作
    ind = ind.flatten(0, 1)
    # Unzip the list of index pairs into separate row and column lists
    rows, cols = zip(*ind)
    # Set the entries of patch_inds at the given row/column indices to False
    patch_inds[rows, cols] = False

    # Select from image_embeddings the patch embeddings that satisfy patch_inds, as a list per sample
    input_vision_patches = [image_embeddings[i][patch_inds[i]] for i in range(len(patch_inds))]

    # If visual_bbox is None, build the visual bounding boxes with get_visual_bbox, then expand and move them
    if visual_bbox is None:
        visual_bbox = get_visual_bbox(image_size=image_size, patch_size=patch_size)
        visual_bbox = visual_bbox.unsqueeze(0).repeat(image_embeddings.size(0), 1, 1)
        visual_bbox = visual_bbox.to(image_embeddings.device)

    # Select the subset of visual_bbox that satisfies patch_inds, as a list per sample
    visual_bbox = [visual_bbox[i][patch_inds[i]] for i in range(len(patch_inds))]

    # If an attention mask is given, create the matching visual attention masks for visual_bbox
    if attention_mask is not None:
        visual_attention_mask = [torch.tensor([1] * len(item)).to(attention_mask) for item in visual_bbox]

    # If max_len is 0, use the first dimension of image_embeddings; otherwise subtract the length of inputs_embeds
    if max_len == 0:
        max_len = image_embeddings.size(1)
    else:
        max_len = max_len - inputs_embeds.size(1)

    # Pad every tensor in input_vision_patches to the same maximum length and stack them
    inputs_vision_patches = torch.stack(
        [pad_sequence(item, max_len, torch.zeros_like(image_embeddings[0, 0])) for item in input_vision_patches]
    )

    # Pad every tensor in visual_bbox to the same maximum length and stack them
    visual_bbox = torch.stack([pad_sequence(item, max_len, torch.zeros_like(bbox[0, 0])) for item in visual_bbox])

    # If an attention mask is given, pad every tensor in visual_attention_mask to the same maximum length and stack them
    if attention_mask is not None:
        visual_attention_mask = torch.stack(
            [pad_sequence(item, max_len, torch.zeros_like(attention_mask[0, 0])) for item in visual_attention_mask]
        )

    # Concatenate inputs_embeds and inputs_vision_patches along the second dimension
    inputs_embeds = torch.cat([inputs_embeds, inputs_vision_patches], 1)

    # Concatenate bbox and visual_bbox along the second dimension
    bbox = torch.cat([bbox, visual_bbox], 1)

    # If an attention mask is given, concatenate attention_mask and visual_attention_mask along the second dimension
    if attention_mask is not None:
        attention_mask = torch.cat([attention_mask, visual_attention_mask], 1)

    # Return the concatenated inputs_embeds, bbox and attention_mask (if any)
    return inputs_embeds, bbox, attention_mask
class UdopPatchEmbeddings(nn.Module):
    """2D Image to Patch Embeddings"""

    def __init__(self, config):
        super().__init__()
        # Read the relevant settings from the configuration
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        # If image_size or patch_size is not iterable, turn it into a tuple
        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)

        # Compute the number of image patches
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])

        # Store the settings on the instance
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches

        # A 2D convolution maps image patches to embedding vectors
        self.proj = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values):
        # Forward pass: take pixel values and return the patch embeddings
        batch_size, num_channels, height, width = pixel_values.shape

        # Check that the input image size matches the expected size
        if height != self.image_size[0] or width != self.image_size[1]:
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model"
                f" ({self.image_size[0]}*{self.image_size[1]})."
            )

        # Compute the embeddings with the convolution
        embeddings = self.proj(pixel_values)

        # Flatten the convolution output and transpose the dimensions for the subsequent processing
        embeddings = embeddings.flatten(2).transpose(1, 2)

        # Return the computed embeddings
        return embeddings
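
# Illustrative sketch, not part of the original file: the shapes involved, assuming the
# default UdopConfig (image_size=224, patch_size=16, hidden_size=1024).
#
#     import torch
#     from torch import nn
#
#     proj = nn.Conv2d(3, 1024, kernel_size=16, stride=16)
#     x = torch.randn(1, 3, 224, 224)
#     emb = proj(x).flatten(2).transpose(1, 2)  # (1, 196, 1024): 14 x 14 = 196 patches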


class UdopPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models. Based on `T5PreTrainedModel`.
    """

    config_class = UdopConfig
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = True
    _no_split_modules = ["UdopBlock"]
    _keep_in_fp32_modules = ["wo"]

    # Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetPreTrainedModel._shift_right with ProphetNet->Udop
    def _shift_right(self, input_ids):
        # Get the decoder start token and the padding token
        decoder_start_token_id = self.config.decoder_start_token_id
        pad_token_id = self.config.pad_token_id

        # Make sure the decoder start token is defined
        assert decoder_start_token_id is not None, (
            "self.model.config.decoder_start_token_id has to be defined. In Udop it is usually set to the"
            " pad_token_id. See Udop docs for more information"
        )

        # Shift the inputs one position to the right
        shifted_input_ids = input_ids.new_zeros(input_ids.shape)
        shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
        shifted_input_ids[..., 0] = decoder_start_token_id

        # Make sure the padding token is defined
        assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined."

        # Replace possible -100 values in the labels by the padding token
        shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

        # Make sure all values in `shifted_input_ids` are non-negative
        assert torch.all(shifted_input_ids >= 0).item(), "Verify that `shifted_input_ids` has only positive values"

        # Return the right-shifted input tokens
        return shifted_input_ids
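
# Illustrative sketch, not part of the original file: the right shift on toy labels,
# assuming decoder_start_token_id=0 and pad_token_id=0 (the usual T5/UDOP convention).
#
#     import torch
#
#     labels = torch.tensor([[5, 6, 7, -100]])
#     shifted = labels.new_zeros(labels.shape)
#     shifted[..., 1:] = labels[..., :-1].clone()
#     shifted[..., 0] = 0                       # decoder_start_token_id
#     shifted.masked_fill_(shifted == -100, 0)  # replace -100 by pad_token_id
#     # shifted == tensor([[0, 5, 6, 7]])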

# Copied from transformers.models.t5.modeling_t5.T5LayerNorm with T5->Udop
# A custom PyTorch module implementing the Udop-style layer normalization
class UdopLayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        Construct a layernorm module in the Udop style. No bias and no subtraction of mean.
        """
        super().__init__()
        # Learnable weight parameter, initialized to all ones
        self.weight = nn.Parameter(torch.ones(hidden_size))
        # Epsilon added to the variance for numerical stability
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        # Udop uses a layer_norm which only scales and doesn't shift, also known as Root Mean Square Layer
        # Normalization (https://arxiv.org/abs/1910.07467). The variance is therefore computed without the mean
        # and without a bias. Additionally, we want to make sure that the accumulation for half-precision inputs
        # is done in fp32.

        # Compute the variance: cast to float32, square, and take the mean over the last dimension, keeping it
        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        # Normalize the hidden states using the variance and epsilon
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)

        # If the weight is half precision (float16 or bfloat16), cast the hidden states to the same dtype
        if self.weight.dtype in [torch.float16, torch.bfloat16]:
            hidden_states = hidden_states.to(self.weight.dtype)

        # Return the weight-scaled hidden states
        return self.weight * hidden_states
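
# Illustrative sketch, not part of the original file: a numerical check of the RMS-norm
# formula y = weight * x / sqrt(mean(x^2) + eps), with the weight at its initial value of ones.
#
#     import torch
#
#     x = torch.randn(2, 4, 8)
#     manual = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-6)
#     torch.testing.assert_close(UdopLayerNorm(8)(x), manual)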


# Copied from transformers.models.t5.modeling_t5.T5DenseActDense with T5->Udop
class UdopDenseActDense(nn.Module):
    def __init__(self, config: UdopConfig):
        super().__init__()
        # Linear layer wi mapping from config.d_model to config.d_ff, without bias
        self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
        # Linear layer wo mapping from config.d_ff back to config.d_model, without bias
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
        # Dropout layer using config.dropout_rate as the drop probability
        self.dropout = nn.Dropout(config.dropout_rate)
        # Select the activation function according to config.dense_act_fn
        self.act = ACT2FN[config.dense_act_fn]

    def forward(self, hidden_states):
        # Apply the wi projection to the input hidden states
        hidden_states = self.wi(hidden_states)
        # Apply the selected activation function
        hidden_states = self.act(hidden_states)
        # Apply dropout
        hidden_states = self.dropout(hidden_states)

        # If wo.weight is a torch.Tensor, the dtypes of hidden_states and wo.weight differ, and wo.weight is not
        # torch.int8, cast hidden_states to the dtype of wo.weight
        if (
            isinstance(self.wo.weight, torch.Tensor)
            and hidden_states.dtype != self.wo.weight.dtype
            and self.wo.weight.dtype != torch.int8
        ):
            hidden_states = hidden_states.to(self.wo.weight.dtype)

        # Apply the output projection wo
        hidden_states = self.wo(hidden_states)

        # Return the final hidden states
        return hidden_states


# Copied from transformers.models.t5.modeling_t5.T5DenseGatedActDense with T5->Udop
class UdopDenseGatedActDense(nn.Module):
    def __init__(self, config: UdopConfig):
        super().__init__()
        # Two linear layers wi_0 and wi_1, both mapping from config.d_model to config.d_ff, without bias
        self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False)
        # Linear layer wo mapping from config.d_ff back to config.d_model, without bias
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
        # Dropout layer using config.dropout_rate as the drop probability
        self.dropout = nn.Dropout(config.dropout_rate)
        # Select the activation function according to config.dense_act_fn
        self.act = ACT2FN[config.dense_act_fn]
    # Forward pass of the gated feed-forward network
    def forward(self, hidden_states):
        # Apply the activation (e.g. gelu) to the wi_0 projection of the hidden states
        hidden_gelu = self.act(self.wi_0(hidden_states))
        # Apply the wi_1 linear projection to the hidden states
        hidden_linear = self.wi_1(hidden_states)
        # Multiply the activated branch with the linear branch to get the new hidden states
        hidden_states = hidden_gelu * hidden_linear
        # Apply dropout to the new hidden states
        hidden_states = self.dropout(hidden_states)

        # To make 8-bit quantization work for google/flan-t5-xxl, self.wo is kept in float32.
        # See https://github.com/huggingface/transformers/issues/20287.
        # Also make sure the weights are not `int8`, in case users force `_keep_in_fp32_modules` to be `None`.
        if (
            isinstance(self.wo.weight, torch.Tensor)
            and hidden_states.dtype != self.wo.weight.dtype
            and self.wo.weight.dtype != torch.int8
        ):
            # If the dtype of hidden_states differs from that of self.wo.weight, cast hidden_states accordingly
            hidden_states = hidden_states.to(self.wo.weight.dtype)

        # Apply the output projection wo
        hidden_states = self.wo(hidden_states)
        # Return the processed hidden states
        return hidden_states
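
# Illustrative sketch, not part of the original file: the gated feed-forward in isolation,
# act(wi_0(x)) * wi_1(x) -> wo, with toy sizes d_model=8 and d_ff=16.
#
#     import torch
#     from torch import nn
#     import torch.nn.functional as F
#
#     x = torch.randn(2, 5, 8)
#     wi_0 = nn.Linear(8, 16, bias=False)
#     wi_1 = nn.Linear(8, 16, bias=False)
#     wo = nn.Linear(16, 8, bias=False)
#     out = wo(F.gelu(wi_0(x)) * wi_1(x))  # (2, 5, 8)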
# Copied from transformers.models.t5.modeling_t5.T5LayerFF with T5->Udop
class UdopLayerFF(nn.Module):
    def __init__(self, config: UdopConfig):
        super().__init__()
        # Choose the feed-forward type based on the config: gated activation or plain activation
        if config.is_gated_act:
            self.DenseReluDense = UdopDenseGatedActDense(config)
        else:
            self.DenseReluDense = UdopDenseActDense(config)

        # Initialize the layer norm and dropout layers
        self.layer_norm = UdopLayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states):
        # Apply layer normalization to the hidden states
        forwarded_states = self.layer_norm(hidden_states)
        # Run the feed-forward block (gated or plain activation)
        forwarded_states = self.DenseReluDense(forwarded_states)
        # Add the dropout-regularized feed-forward output back to the hidden states (residual connection)
        hidden_states = hidden_states + self.dropout(forwarded_states)
        return hidden_states


# Copied from transformers.models.t5.modeling_t5.T5Attention with T5->Udop
class UdopAttention(nn.Module):
    def __init__(self, config: UdopConfig, has_relative_attention_bias=False):
        super().__init__()
        # Store the attention parameters and configuration
        self.is_decoder = config.is_decoder
        self.has_relative_attention_bias = has_relative_attention_bias
        self.relative_attention_num_buckets = config.relative_attention_num_buckets
        self.relative_attention_max_distance = config.relative_attention_max_distance
        self.d_model = config.d_model
        self.key_value_proj_dim = config.d_kv
        self.n_heads = config.num_heads
        self.dropout = config.dropout_rate
        self.inner_dim = self.n_heads * self.key_value_proj_dim

        # Linear projections for queries, keys, values and outputs
        self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)

        # If relative attention bias is used, initialize its embedding
        if self.has_relative_attention_bias:
            self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
        self.pruned_heads = set()
        self.gradient_checkpointing = False

    def prune_heads(self, heads):
        # Prune the given attention heads, if any
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads
        )
        # Prune the linear layers
        self.q = prune_linear_layer(self.q, index)
        self.k = prune_linear_layer(self.k, index)
        self.v = prune_linear_layer(self.v, index)
        self.o = prune_linear_layer(self.o, index, dim=1)
        # Update the hyperparameters
        self.n_heads = self.n_heads - len(heads)
        self.inner_dim = self.key_value_proj_dim * self.n_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    @staticmethod
    def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
        """
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on.

        Args:
            relative_position: an int32 Tensor of relative position offsets
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer - the number of buckets, which bounds the output range [0, num_buckets)
            max_distance: an integer - all relative positions beyond this distance map to the same bucket

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        """
        relative_buckets = 0
        if bidirectional:
            num_buckets //= 2
            relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
            relative_position = torch.abs(relative_position)
        else:
            relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
        # now relative_position is in the range [0, inf)

        # half of the buckets are for exact increments in positions
        max_exact = num_buckets // 2
        is_small = relative_position < max_exact

        # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
        relative_position_if_large = max_exact + (
            torch.log(relative_position.float() / max_exact)
            / math.log(max_distance / max_exact)
            * (num_buckets - max_exact)
        ).to(torch.long)
        relative_position_if_large = torch.min(
            relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
        )

        relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
        return relative_buckets
    # Compute the relative position bias
    def compute_bias(self, query_length, key_length, device=None):
        """Compute binned relative position bias"""
        # If no device is given, default to the device of the relative attention bias weights
        if device is None:
            device = self.relative_attention_bias.weight.device
        # Tensor of query positions, with an extra dimension on axis 1
        context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
        # Tensor of key positions, with an extra dimension on axis 0
        memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
        # Relative position offsets, shape (query_length, key_length)
        relative_position = memory_position - context_position
        # Bucket the relative positions, shape (query_length, key_length)
        relative_position_bucket = self._relative_position_bucket(
            relative_position,  # relative position offsets, shape (query_length, key_length)
            bidirectional=(not self.is_decoder),  # bidirectional unless this is a decoder
            num_buckets=self.relative_attention_num_buckets,  # number of buckets
            max_distance=self.relative_attention_max_distance,  # maximum distance cutoff
        )
        # Look up the bias values, shape (query_length, key_length, num_heads)
        values = self.relative_attention_bias(relative_position_bucket)
        # Permute and add a leading dimension: shape (1, num_heads, query_length, key_length)
        values = values.permute([2, 0, 1]).unsqueeze(0)
        # Return the relative position bias tensor
        return values

    # Forward pass
    def forward(
        self,
        hidden_states,
        mask=None,
        key_value_states=None,
        position_bias=None,
        past_key_value=None,
        layer_head_mask=None,
        query_length=None,
        use_cache=False,
        output_attentions=False,
    ):
        # (method body omitted in this excerpt)
        ...

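To make the bucketing scheme above concrete, here is a small self-contained sketch in plain PyTorch. The helper name `bucketize_relative_position` is ours and simply mirrors the formula in `_relative_position_bucket`; it is illustration, not part of the library:

import math
import torch


def bucketize_relative_position(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
    # Direct re-implementation of the bucketing formula shown above, for illustration only
    relative_buckets = 0
    if bidirectional:
        num_buckets //= 2
        relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
        relative_position = torch.abs(relative_position)
    else:
        relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
    max_exact = num_buckets // 2
    is_small = relative_position < max_exact
    relative_position_if_large = max_exact + (
        torch.log(relative_position.float() / max_exact)
        / math.log(max_distance / max_exact)
        * (num_buckets - max_exact)
    ).to(torch.long)
    relative_position_if_large = torch.min(
        relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
    )
    return relative_buckets + torch.where(is_small, relative_position, relative_position_if_large)


offsets = torch.tensor([-256, -17, -3, 0, 3, 17, 256])
print(bucketize_relative_position(offsets))  # -> tensor([15, 10,  3,  0, 19, 26, 31])

Small offsets (below `max_exact`) keep their own exact buckets, offsets like 17 fall into shared logarithmic buckets, and anything at or beyond `max_distance` saturates in the last bucket of its direction.
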
# Copied from transformers.models.t5.modeling_t5.T5LayerSelfAttention with T5->Udop
class UdopLayerSelfAttention(nn.Module):
    def __init__(self, config, has_relative_attention_bias=False):
        super().__init__()
        # Self-attention layer, using UdopAttention in place of T5's attention
        self.SelfAttention = UdopAttention(config, has_relative_attention_bias=has_relative_attention_bias)
        # Layer normalization, using UdopLayerNorm in place of T5's LayerNorm
        self.layer_norm = UdopLayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        # Dropout layer with the configured dropout rate
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        output_attentions=False,
    ):
        # Apply layer normalization to the input hidden states
        normed_hidden_states = self.layer_norm(hidden_states)
        # Run self-attention on the normalized hidden states
        attention_output = self.SelfAttention(
            normed_hidden_states,
            mask=attention_mask,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            output_attentions=output_attentions,
        )
        # Residual connection: add the dropout-regularized attention output to the input
        hidden_states = hidden_states + self.dropout(attention_output[0])
        # Include the attention weights in the outputs if they were requested
        outputs = (hidden_states,) + attention_output[1:]  # add attentions if we output them
        return outputs


# Copied from transformers.models.t5.modeling_t5.T5LayerCrossAttention with T5->Udop
class UdopLayerCrossAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Cross-attention layer, using UdopAttention in place of T5's attention
        self.EncDecAttention = UdopAttention(config, has_relative_attention_bias=False)
        # Layer normalization, using UdopLayerNorm in place of T5's LayerNorm
        self.layer_norm = UdopLayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        # Dropout layer with the configured dropout rate
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        key_value_states,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        query_length=None,
        output_attentions=False,
    ):
        # Apply layer normalization to the input hidden states
        normed_hidden_states = self.layer_norm(hidden_states)
        # Run cross-attention over the encoder's key/value states
        attention_output = self.EncDecAttention(
            normed_hidden_states,
            mask=attention_mask,
            key_value_states=key_value_states,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            query_length=query_length,
            output_attentions=output_attentions,
        )
        # Residual connection: add the dropout-regularized attention output to the input
        layer_output = hidden_states + self.dropout(attention_output[0])
        # Include the attention weights in the outputs if they were requested
        outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
        return outputs


# Copied from transformers.models.t5.modeling_t5.T5Block with T5->Udop
class UdopBlock(nn.Module):
    # Constructor for a single Udop Transformer block
    def __init__(self, config, has_relative_attention_bias=False):
        # Call the parent constructor
        super().__init__()
        # Whether this block belongs to the decoder, taken from the config
        self.is_decoder = config.is_decoder
        # Module list holding this block's sub-layers
        self.layer = nn.ModuleList()
        # Add a UdopLayerSelfAttention layer
        self.layer.append(UdopLayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias))
        # In the decoder, also add a UdopLayerCrossAttention layer
        if self.is_decoder:
            self.layer.append(UdopLayerCrossAttention(config))

        # Add a UdopLayerFF feed-forward layer
        self.layer.append(UdopLayerFF(config))

    # Forward pass defining the block's computation
    def forward(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        encoder_decoder_position_bias=None,
        layer_head_mask=None,
        cross_attn_layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        output_attentions=False,
        return_dict=True,
    ):
        # (method body omitted in this excerpt)
        ...


# UdopCellEmbeddings: module producing embeddings for cell (bounding box) positions
class UdopCellEmbeddings(nn.Module):
    # Constructor: set the maximum number of 2D positions and the hidden size
    def __init__(self, max_2d_position_embeddings=501, hidden_size=1024):
        super(UdopCellEmbeddings, self).__init__()
        # Maximum number of 2D position embeddings
        self.max_2d_position_embeddings = max_2d_position_embeddings

        # Embedding tables for X-axis and Y-axis positions
        self.x_position_embeddings = nn.Embedding(max_2d_position_embeddings, hidden_size)
        self.y_position_embeddings = nn.Embedding(max_2d_position_embeddings, hidden_size)

    # Forward pass computing the cell embeddings
    def forward(self, bbox):
        # Clip the bbox values to the range [0.0, 1.0]
        bbox = torch.clip(bbox, 0.0, 1.0)
        # Convert the clipped bbox to integer indices by scaling with the embedding table size
        bbox = (bbox * (self.max_2d_position_embeddings - 1)).long()
        # Look up the X and Y embeddings for the top-left and bottom-right corners
        left_position_embeddings = self.x_position_embeddings(bbox[:, :, 0])
        upper_position_embeddings = self.y_position_embeddings(bbox[:, :, 1])
        right_position_embeddings = self.x_position_embeddings(bbox[:, :, 2])
        lower_position_embeddings = self.y_position_embeddings(bbox[:, :, 3])

        # The final embedding is the sum of the four edge embeddings
        embeddings = (
            left_position_embeddings
            + upper_position_embeddings
            + right_position_embeddings
            + lower_position_embeddings
        )

        return embeddings
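As a quick shape check for the cell embeddings, a minimal sketch (it assumes `UdopCellEmbeddings` is importable from `transformers.models.udop.modeling_udop`, where the class above lives; bbox values are normalized `(x0, y0, x1, y1)` coordinates in `[0, 1]`):

import torch
from transformers.models.udop.modeling_udop import UdopCellEmbeddings

emb = UdopCellEmbeddings(max_2d_position_embeddings=501, hidden_size=1024)
bbox = torch.rand(2, 5, 4)  # batch of 2 sequences, 5 boxes each, normalized (x0, y0, x1, y1)
out = emb(bbox)
print(out.shape)  # torch.Size([2, 5, 1024]): one embedding per box, summed over the four edges
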


# Alias for UdopAttention._relative_position_bucket
get_relative_position_bucket = UdopAttention._relative_position_bucket
# Range used to randomly rescale relative distances during augmentation
AUGMENTATION_RANGE = (0.80, 1.25)


# RelativePositionBiasBase: base class for relative position biases
class RelativePositionBiasBase(nn.Module, ABC):
    """
    Base class of relative biases.

    Args:
        num_heads (`int`):
            Number of attention heads in the model; an embedding of size `num_heads` will be created and added to
            the score of each token pair.
        relative_attention_num_buckets (`int`, *optional*, defaults to 32):
            The pair-token metric (distance in the sequence, distance in pixels, etc.) will be bucketed; this
            parameter defines the number of such buckets.
        bidirectional (`bool`, *optional*, defaults to `True`):
            Whether the distance between a pair of tokens should be bidirectional. If `False`, then
            distance(tok1, tok2) == distance(tok2, tok1).
        scaling_factor (`int`, *optional*, defaults to 1):
            Factor used to scale the relative distances.
        max_distance (`int`, *optional*, defaults to 128):
            All distances above this value end up in the same bucket.
        augmentation (`bool`, *optional*, defaults to `False`):
            Whether to multiply relative distances by a random scalar.
        expand (`bool`, *optional*, defaults to `False`):
            Whether to expand an existing pretrained model with the subsequent addition of the prefix_bucket.
    """

    # Constructor: set the base parameters of the relative position bias
    def __init__(
        self,
        num_heads=None,
        relative_attention_num_buckets=32,
        bidirectional=True,
        scaling_factor=1,
        max_distance=128,
        level="tokens",
        augmentation=False,
        prefix_bucket=False,
        expand=False,
    ):
        # Call the parent constructor to initialize the base class attributes
        super(RelativePositionBiasBase, self).__init__()
        # Store the attributes of the relative position bias base class
        self.prefix_bucket = prefix_bucket
        self.augmentation = augmentation
        self.level = level
        self.max_distance = max_distance
        self.scaling_factor = scaling_factor
        self.bidirectional = bidirectional
        self.num_heads = num_heads
        self.expand = expand
        self.relative_attention_num_buckets = relative_attention_num_buckets
        # When using prefix buckets without expansion, reserve two extra bucket rows
        extra_head = 2 if prefix_bucket and not self.expand else 0
        # Embedding table for the relative attention bias, sized by bucket count plus any extra rows
        self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets + extra_head, self.num_heads)

    @abstractmethod
    def prepare_input(
        self,
        attention_mask: Optional[Tensor] = None,
        bbox: Optional[Dict[str, Any]] = None,
    ) -> Tensor:
        # Abstract method preparing the input; takes an optional attention mask and bounding boxes, returns a tensor
        pass

    def get_bucket(self, attention_mask: Optional[Tensor] = None, bbox: Optional[Dict[str, Any]] = None) -> Tensor:
        # Compute the relative positions for this bias type
        relative_position = self.prepare_input(attention_mask, bbox)
        # Bucket the relative positions
        rp_bucket: Tensor = get_relative_position_bucket(
            relative_position,
            bidirectional=self.bidirectional,
            num_buckets=self.relative_attention_num_buckets,
            max_distance=self.max_distance,
        )
        return rp_bucket

    def get_relative_position(self, positions):
        # Compute pairwise relative positions from a tensor of positions
        context_position = positions[:, :, None]
        memory_position = positions[:, None, :]
        relative_position = memory_position - context_position
        # During training with augmentation enabled, randomly rescale the relative positions
        if self.augmentation and self.training:
            relative_position *= random.uniform(*AUGMENTATION_RANGE)
        relative_position *= self.scaling_factor

        return relative_position.to(torch.long)
    def forward(self, attention_mask: Optional[Tensor] = None, bbox: Optional[Dict[str, Any]] = None) -> Tensor:
        # When expanding an existing pretrained model with prefix buckets, grow the
        # embedding table by two extra buckets while carrying over the pretrained weights
        if self.expand and self.prefix_bucket:
            # Create a new embedding table of size relative_attention_num_buckets + 2, one value per head
            new_bias = nn.Embedding(self.relative_attention_num_buckets + 2, self.num_heads)
            # Copy the weights of the old relative attention bias into the new table
            new_bias.weight.data[: self.relative_attention_num_buckets] = self.relative_attention_bias.weight.data
            # Initialize the remaining (prefix) rows to 0.1
            new_bias.weight.data[self.relative_attention_num_buckets :] = 0.1
            # Swap in the new embedding table
            self.relative_attention_bias = new_bias
            # Mark the expansion as done so it is not repeated
            self.expand = False

        rp_bucket = self.get_bucket(attention_mask, bbox)

        if self.prefix_bucket:
            # If rp_bucket has batch size 1 while attention_mask does not, repeat it along the batch dimension
            if rp_bucket.size(0) == 1 and attention_mask.size(0) > 1:
                rp_bucket = rp_bucket.repeat(attention_mask.size(0), 1, 1)
            # Based on the assumption that the bounding boxes of prefix tokens are negative
            is_prefix = bbox[:, :, 1] < 0
            # Count the number of prefix tokens per sample
            num_prefix = is_prefix.sum(-1)
            # Route prefix/non-prefix token pairs to the two dedicated extra buckets
            for idx, num_prefix_row in enumerate(num_prefix.cpu().numpy()):
                rp_bucket[idx, :num_prefix_row, num_prefix_row:] = self.relative_attention_num_buckets
                rp_bucket[idx, num_prefix_row:, :num_prefix_row] = self.relative_attention_num_buckets + 1

        # Look up the relative attention bias values for the bucketed positions
        values: Tensor = self.relative_attention_bias(rp_bucket)
        # The values tensor must be 4-dimensional (batch, query, key, heads)
        if values.dim() != 4:
            raise ValueError("Wrong dimension of values tensor")
        # Reorder the dimensions to (batch, heads, query, key)
        values = values.permute([0, 3, 1, 2])

        # Return the relative attention bias tensor
        return values


class RelativePositionBias1D(RelativePositionBiasBase):
    def __init__(self, scaling_factor=1, max_distance=128, **kwargs):
        """
        Reimplementation of T5 relative position bias. Distance between given tokens is their distance in the sequence.
        Parameters are the same as in base class.
        """
        # Call the parent constructor with the scaling factor, maximum distance, and any extra kwargs
        super().__init__(scaling_factor=scaling_factor, max_distance=max_distance, **kwargs)

    def prepare_input(self, attention_mask: Optional[Tensor] = None, bbox: Optional[Dict[str, Any]] = None) -> Tensor:
        # 1D sequence positions require no scaling
        if self.scaling_factor != 1:
            raise ValueError("No need to scale 1d features")
        # Build a tensor of sequence positions on the same device as the attention mask
        relative_position = self.get_relative_position(
            torch.arange(attention_mask.size(1), dtype=torch.long, device=attention_mask.device)[None, :]
        )

        return relative_position


class RelativePositionBiasHorizontal(RelativePositionBiasBase):
    def __init__(self, scaling_factor=100, max_distance=100, **kwargs):
        """
        Represents in the bucket embeddings horizontal distance between two tokens. Parameters are the same as in base
        class.
        """
        # Call the parent constructor with the scaling factor, maximum distance, and any extra kwargs
        super().__init__(scaling_factor=scaling_factor, max_distance=max_distance, **kwargs)

    def prepare_input(self, attention_mask: Optional[Tensor] = None, bbox: Optional[Dict[str, Any]] = None) -> Tensor:
        # The bbox coordinates live in the small (0, 1) range, so they must be scaled up
        if not self.scaling_factor > 1.0:
            raise ValueError("Need to scale the values of bboxes, as they are in small (0,1) range")
        # Bounding boxes are required for this bias type
        if bbox is None:
            raise ValueError("Bbox is required for horizontal relative position bias")
        # Horizontal center of each bbox: the mean of its left and right x coordinates
        horizontal_position: Tensor = bbox[:, :, [0, 2]].mean(dim=-1)

        return self.get_relative_position(horizontal_position)


class RelativePositionBiasVertical(RelativePositionBiasBase):
    def __init__(self, scaling_factor=100, max_distance=100, **kwargs):
        """
        Represents in the bucket embeddings vertical distance between two tokens. Parameters are the same as in base
        class.
        """
        # Call the parent constructor with the scaling factor, maximum distance, and any extra kwargs
        super().__init__(scaling_factor=scaling_factor, max_distance=max_distance, **kwargs)

    def prepare_input(self, attention_mask: Optional[Tensor] = None, bbox: Optional[Dict[str, Any]] = None) -> Tensor:
        # The bbox coordinates live in the small (0, 1) range, so they must be scaled up
        if not self.scaling_factor > 1.0:
            raise ValueError("Need to scale the values of bboxes, as they are in small (0,1) range")
        # Bounding boxes are required for this bias type
        if bbox is None:
            raise ValueError("Bbox is required for vertical relative position bias")
        # Vertical center of each bbox: the mean of its top and bottom y coordinates
        vertical_position: Tensor = bbox[:, :, [1, 3]].mean(dim=-1)

        return self.get_relative_position(vertical_position)


class RelativePositionBiasAggregated(nn.Module):
    # Constructor for a module that aggregates several relative position biases
    def __init__(self, modules: Sequence[RelativePositionBiasBase]):
        """
        Class which sums up various computed biases.

        Args:
            modules (Sequence[RelativePositionBiasBase]):
                List of relative bias modules.
        """
        # Call the parent constructor
        super().__init__()
        # Wrap the given modules in an nn.ModuleList
        self.biases = nn.ModuleList(modules)

    # Forward pass summing all relative position biases
    def forward(
        self, attention_mask: Optional[Tensor] = None, bbox: Optional[Dict[str, Any]] = None
    ) -> Union[float, Tensor]:
        # Start from a float 0.0 accumulator
        output = 0.0
        # Iterate over the bias modules
        for bias in self.biases:  # type: ignore
            # Add each module's computed bias to the running total
            output = bias(attention_mask, bbox) + output

        # Return the summed biases (a float if the list is empty, otherwise a tensor)
        return output
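A short usage sketch of the aggregation (assuming the bias classes are importable from `transformers.models.udop.modeling_udop`): a 1D bias needs only an attention mask, and the aggregated module simply sums the per-head bias maps:

import torch
from transformers.models.udop.modeling_udop import (
    RelativePositionBias1D,
    RelativePositionBiasAggregated,
)

bias_1d = RelativePositionBias1D(num_heads=8)
aggregated = RelativePositionBiasAggregated([bias_1d])

attention_mask = torch.ones(2, 16, dtype=torch.long)  # batch of 2, sequence length 16
bias = aggregated(attention_mask=attention_mask)
print(bias.shape)  # torch.Size([1, 8, 16, 16]); broadcast over the batch when added to attention scores
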
# Mapping from string names to the corresponding relative position bias classes
BIAS_CLASSES = {
    "1d": RelativePositionBias1D,
    "horizontal": RelativePositionBiasHorizontal,
    "vertical": RelativePositionBiasVertical,
}


def create_relative_bias(config: UdopConfig) -> Sequence[RelativePositionBiasBase]:
    """
    Creates an empty list or one/multiple relative bias modules.

    :param config: the model's configuration object
    :return: the sequence of created bias modules
    """
    bias_list = []
    # Check whether the config carries a 'relative_bias_args' attribute
    if hasattr(config, "relative_bias_args"):
        # Iterate over each relative bias spec in the config
        for bias_kwargs_org in config.relative_bias_args:
            # Deep-copy the spec so the config is not mutated
            bias_kwargs = deepcopy(bias_kwargs_org)
            # Pop the 'type' key to select the bias class
            bias_type = bias_kwargs.pop("type")
            # Number of heads in the model ('num_heads' if present, otherwise 'num_attention_heads')
            model_num_heads = config.num_heads if hasattr(config, "num_heads") else config.num_attention_heads
            # If the spec itself carries a 'num_heads' key
            if "num_heads" in bias_kwargs:
                # It must match the model's number of heads
                if bias_kwargs["num_heads"] != model_num_heads:
                    raise ValueError("Number of heads must match num of heads in the model")
            else:
                # Otherwise, fill in the model's number of heads
                bias_kwargs["num_heads"] = model_num_heads
            # Instantiate the bias class for this type and append it to the list
            bias_list.append(BIAS_CLASSES[bias_type](**bias_kwargs))  # type: ignore

    return bias_list
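For illustration, each entry of `relative_bias_args` is a dict carrying a `type` key that selects one of the `BIAS_CLASSES`, plus any constructor kwargs. A minimal sketch (that the stock `UdopConfig` defaults to the three bias types shown is an assumption here, so the printed list may differ):

from transformers import UdopConfig
from transformers.models.udop.modeling_udop import create_relative_bias

config = UdopConfig()
# assumed default: config.relative_bias_args == [{"type": "1d"}, {"type": "horizontal"}, {"type": "vertical"}]
biases = create_relative_bias(config)
print([type(b).__name__ for b in biases])
# e.g. ['RelativePositionBias1D', 'RelativePositionBiasHorizontal', 'RelativePositionBiasVertical']
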


class UdopStack(UdopPreTrainedModel):
    """
    This class is based on `T5Stack`, but modified to take into account the image modality as well as 2D position
    embeddings.
    """

    def __init__(self, config, embed_tokens=None, embed_patches=None):
        super().__init__(config)

        # Token and patch embedding modules
        self.embed_tokens = embed_tokens
        self.embed_patches = embed_patches
        self.is_decoder = config.is_decoder  # whether this stack is a decoder
        self._max_length = config.max_length  # maximum length
        self.num_layers = config.num_layers  # number of layers

        # Build the list of UdopBlock modules; only the first layer holds the relative attention bias
        self.block = nn.ModuleList(
            [UdopBlock(config, has_relative_attention_bias=bool(i == 0)) for i in range(self.num_layers)]
        )

        # Final layer normalization
        self.final_layer_norm = UdopLayerNorm(config.d_model, eps=config.layer_norm_epsilon)

        # Dropout layer
        self.dropout = nn.Dropout(config.dropout_rate)

        # In the encoder, initialize the 2D cell embeddings
        if not self.is_decoder:
            self.cell_2d_embedding = UdopCellEmbeddings(config.max_2d_position_embeddings, config.hidden_size)

        # Build the stack's relative position bias
        self.relative_bias = self._get_relative_bias(config)

        # Tie the 1D relative position bias weights to the first block's attention bias
        for bias in self.relative_bias.biases:
            if isinstance(bias, RelativePositionBias1D):
                self._tie_or_clone_weights(
                    bias.relative_attention_bias, self.block[0].layer[0].SelfAttention.relative_attention_bias
                )

    @staticmethod
    def _get_relative_bias(config: UdopConfig) -> RelativePositionBiasAggregated:
        # Create the list of relative position bias modules
        relative_bias_list = create_relative_bias(config)
        # Return them wrapped in an aggregating module
        return RelativePositionBiasAggregated(relative_bias_list)

    def get_input_embeddings(self):
        # Return the input token embeddings
        return self.embed_tokens

    def get_output_embeddings(self):
        # Return the output token embeddings
        return self.embed_tokens

    def set_input_embeddings(self, new_embeddings):
        # Set the input token embeddings
        self.embed_tokens = new_embeddings
    # Forward pass; takes the various inputs and configuration flags
    def forward(
        self,
        input_ids=None,  # input token id sequence
        attention_mask=None,  # attention mask marking which tokens the model should attend to
        bbox=None,  # bounding box information for the text tokens
        encoder_hidden_states=None,  # hidden states of the encoder
        encoder_attention_mask=None,  # attention mask of the encoder
        inputs_embeds=None,  # input embeddings, as an alternative to input_ids
        pixel_values=None,  # pixel values of the image
        visual_bbox=None,  # bounding boxes of the visual (patch) tokens
        image_embeddings=None,  # image embeddings
        position_bias=None,  # position bias
        head_mask=None,  # attention head mask
        cross_attn_head_mask=None,  # cross-attention head mask
        past_key_values=None,  # cached past key/value states
        use_cache=None,  # whether to use the cache
        output_attentions=None,  # whether to output attention weights
        output_hidden_states=None,  # whether to output hidden states
        return_dict=None,  # whether to return a dict-style output
    ):
        # (method body omitted in this excerpt)
        ...


@add_start_docstrings(
    "The bare UDOP encoder-decoder Transformer outputting raw hidden-states without any specific head on top.",
    UDOP_START_DOCSTRING,
)
class UdopModel(UdopPreTrainedModel):
    # Keys of weights that are tied (shared) across modules
    _tied_weights_keys = [
        "encoder.embed_tokens.weight",
        "decoder.embed_tokens.weight",
        "encoder.embed_patches.proj.weight",
        "encoder.embed_patches.proj.bias",
        "encoder.relative_bias.biases.0.relative_attention_bias.weight",
        "decoder.relative_bias.biases.0.relative_attention_bias.weight",
    ]

    def __init__(self, config):
        # Initialize the model via the parent constructor
        super(UdopModel, self).__init__(config)

        # Text and image embedding layers
        self.shared = nn.Embedding(config.vocab_size, config.d_model)
        self.patch_embed = UdopPatchEmbeddings(config)

        # Copy the config for the encoder
        encoder_config = deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        # Initialize the encoder stack
        self.encoder = UdopStack(encoder_config, self.shared, self.patch_embed)

        # Copy the config for the decoder
        decoder_config = deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        # Initialize the decoder stack
        self.decoder = UdopStack(decoder_config, self.shared)

        # Initialize weights and apply final processing
        self.post_init()

    # Return the input embeddings
    def get_input_embeddings(self):
        return self.shared

    # Set the input embeddings
    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    # Return the encoder
    def get_encoder(self):
        return self.encoder

    # Return the decoder
    def get_decoder(self):
        return self.decoder

    @add_start_docstrings_to_model_forward(UDOP_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Tensor = None,
        attention_mask: Tensor = None,
        bbox: Dict[str, Any] = None,
        pixel_values: Optional[Tensor] = None,
        visual_bbox: Dict[str, Any] = None,
        decoder_input_ids: Optional[Tensor] = None,
        decoder_attention_mask: Optional[Tensor] = None,
        inputs_embeds: Optional[Tensor] = None,
        encoder_outputs: Optional[Tensor] = None,
        past_key_values: Optional[Tensor] = None,
        head_mask: Optional[Tensor] = None,
        decoder_inputs_embeds: Optional[Tensor] = None,
        decoder_head_mask: Optional[Tensor] = None,
        cross_attn_head_mask: Optional[Tensor] = None,
        use_cache=True,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        # The forward body continues here, but it is not shown in full in this excerpt
        pass


# Note: the code above defines the UDOP model class, an encoder-decoder Transformer that
# shares a single embedding layer between its encoder and decoder stacks.


@add_start_docstrings(
    """This class is based on [`T5ForConditionalGeneration`], extended to deal with images and layout (2D) data.""",
    UDOP_START_DOCSTRING,
)
# UdopForConditionalGeneration: a UdopPreTrainedModel subclass for conditional generation tasks
class UdopForConditionalGeneration(UdopPreTrainedModel):
    # Keys of weights that are tied (shared) across modules
    _tied_weights_keys = [
        "encoder.embed_tokens.weight",
        "decoder.embed_tokens.weight",
        "encoder.embed_patches.proj.weight",
        "encoder.embed_patches.proj.bias",
        "encoder.relative_bias.biases.0.relative_attention_bias.weight",
        "decoder.relative_bias.biases.0.relative_attention_bias.weight",
        "lm_head.weight",
    ]

    # Constructor taking a config object
    def __init__(self, config):
        # Call the parent constructor with the config
        super(UdopForConditionalGeneration, self).__init__(config)

        # Shared text embedding layer of shape (vocab_size, d_model)
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        # Patch embedding module for the image patches
        self.patch_embed = UdopPatchEmbeddings(config)

        # Copy the config for the encoder and set the relevant flags
        encoder_config = deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        # Build the encoder stack from the copied config, shared embeddings, and patch embeddings
        self.encoder = UdopStack(encoder_config, self.shared, self.patch_embed)

        # Copy the config for the decoder, set the flags and the number of decoder layers
        decoder_config = deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        # Build the decoder stack from the copied config and the shared embeddings
        self.decoder = UdopStack(decoder_config, self.shared)

        # Language modeling head: a linear layer from d_model to vocab_size, without bias
        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)

        # Run the final initialization (weight init and post-processing)
        self.post_init()

    # Return the shared input embedding layer
    def get_input_embeddings(self):
        return self.shared

    # Set a new shared input embedding layer and propagate it to encoder and decoder
    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    # Set the language modeling head to new output embeddings
    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    # Return the language modeling head
    def get_output_embeddings(self):
        return self.lm_head

    # Return the encoder
    def get_encoder(self):
        return self.encoder

    # Return the decoder
    def get_decoder(self):
        return self.decoder

    # In the source, the forward method below is decorated with @add_start_docstrings_to_model_forward
    # and @replace_return_docstrings, which attach the input docstring and rewrite the return docstring
    # Forward pass of the model
    def forward(
        self,
        input_ids: Tensor = None,
        attention_mask: Tensor = None,
        bbox: Dict[str, Any] = None,
        pixel_values: Optional[Tensor] = None,
        visual_bbox: Dict[str, Any] = None,
        decoder_input_ids: Optional[Tensor] = None,
        decoder_attention_mask: Optional[Tensor] = None,
        inputs_embeds: Optional[Tensor] = None,
        encoder_outputs: Optional[Tensor] = None,
        past_key_values: Optional[Tensor] = None,
        head_mask: Optional[Tensor] = None,
        decoder_inputs_embeds: Optional[Tensor] = None,
        decoder_head_mask: Optional[Tensor] = None,
        cross_attn_head_mask: Optional[Tensor] = None,
        use_cache=True,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[Tensor] = None,
    ):
        # (forward body omitted in this excerpt)
        pass

    # The dictionary-building body below belongs to `prepare_inputs_for_generation`, which
    # mirrors the T5 implementation; it assembles the inputs consumed by the generation loop
    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        use_cache=None,
        encoder_outputs=None,
        **kwargs,
    ):
        # If `past_key_values` is provided, keep only the last position of `input_ids`
        if past_key_values is not None:
            input_ids = input_ids[:, -1:]

        # Return a dictionary with all inputs needed by the generation loop
        return {
            "decoder_input_ids": input_ids,
            "past_key_values": past_key_values,
            "encoder_outputs": encoder_outputs,
            "attention_mask": attention_mask,
            "head_mask": head_mask,
            "decoder_head_mask": decoder_head_mask,
            "cross_attn_head_mask": cross_attn_head_mask,
            "use_cache": use_cache,
            # Use `kwargs.get` to pick up optional extra inputs
            "bbox": kwargs.get("bbox", None),
            "pixel_values": kwargs.get("pixel_values", None),
            "visual_bbox": kwargs.get("visual_bbox", None),
        }

    # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration._reorder_cache
    # Reorders the cached past key/values to follow the beam indices during beam search
    def _reorder_cache(self, past_key_values, beam_idx):
        # If the decoder past states are not included in the outputs,
        # speedy decoding is disabled and there is nothing to reorder
        if past_key_values is None:
            logger.warning("You might want to consider setting `use_cache=True` to speed up decoding")
            return past_key_values

        # Tuple collecting the reordered decoder past states
        reordered_decoder_past = ()
        # Iterate over the past states of each layer
        for layer_past_states in past_key_values:
            # Tuple collecting the reordered states of this layer
            reordered_layer_past_states = ()
            # Iterate over the individual past states within the layer
            for layer_past_state in layer_past_states:
                # Select the correct batch indices according to the beam index
                reordered_layer_past_states = reordered_layer_past_states + (
                    layer_past_state.index_select(0, beam_idx.to(layer_past_state.device)),
                )

            # The reordered states must keep the original shape
            if reordered_layer_past_states[0].shape != layer_past_states[0].shape:
                raise ValueError(
                    f"reordered_layer_past_states[0] shape {reordered_layer_past_states[0].shape} and layer_past_states[0] shape {layer_past_states[0].shape} mismatched"
                )
            # The reordered tuple must keep the original length
            if len(reordered_layer_past_states) != len(layer_past_states):
                raise ValueError(
                    f"length of reordered_layer_past_states {len(reordered_layer_past_states)} and length of layer_past_states {len(layer_past_states)} mismatched"
                )

            # Append this layer's reordered states to the overall tuple
            reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,)

    # Return the reordered decoder past states
        return reordered_decoder_past
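Putting the pieces together, a minimal generation sketch in the spirit of the model's documentation (it assumes network access, the `microsoft/udop-large` checkpoint, and the `nielsr/funsd-layoutlmv3` dataset for an example page with OCR words and boxes):

from datasets import load_dataset
from transformers import AutoProcessor, UdopForConditionalGeneration

processor = AutoProcessor.from_pretrained("microsoft/udop-large", apply_ocr=False)
model = UdopForConditionalGeneration.from_pretrained("microsoft/udop-large")

# One annotated document page: image, OCR words, and their bounding boxes
dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
example = dataset[0]
question = "Question answering. What is the date on the form?"
encoding = processor(
    images=example["image"], text=question, text_pair=example["tokens"],
    boxes=example["bboxes"], return_tensors="pt",
)
outputs = model.generate(**encoding, max_new_tokens=20)
print(processor.batch_decode(outputs, skip_special_tokens=True)[0])
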
# Encoder-only variant of the UDOP model, inheriting from UdopPreTrainedModel
@add_start_docstrings(
    "The bare UDOP Model transformer outputting encoder's raw hidden-states without any specific head on top.",
    UDOP_START_DOCSTRING,
)
class UdopEncoderModel(UdopPreTrainedModel):
    # Keys of weights that are tied (shared) across modules
    _tied_weights_keys = [
        "encoder.embed_tokens.weight",
        "encoder.embed_patches.proj.weight",
        "encoder.embed_patches.proj.bias",
        "encoder.relative_bias.biases.0.relative_attention_bias.weight",
    ]

    # Constructor taking a UdopConfig
    def __init__(self, config: UdopConfig):
        super().__init__(config)

        # Text embedding layer
        self.shared = nn.Embedding(config.vocab_size, config.d_model)
        # Image patch embedding layer
        self.patch_embed = UdopPatchEmbeddings(config)

        # Deep-copy the config for the encoder
        encoder_config = deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        # Build the UdopStack encoder
        self.encoder = UdopStack(encoder_config, self.shared, self.patch_embed)

        # Initialize weights and apply final processing
        self.post_init()

    # Return the input embedding layer
    def get_input_embeddings(self):
        return self.shared

    # Set a new input embedding layer
    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        # Propagate the new embeddings to the encoder
        self.encoder.set_input_embeddings(new_embeddings)

    # Return the encoder
    def get_encoder(self):
        return self.encoder

    # Prune attention heads, given a dict of {layer_num: [heads to prune]}
    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        # Iterate over each layer and the heads to prune in it
        for layer, heads in heads_to_prune.items():
            # Call the pruning method of that layer's self-attention module
            self.encoder.block[layer].layer[0].SelfAttention.prune_heads(heads)

    # Forward pass; the decorators attach the input and return docstrings
    @add_start_docstrings_to_model_forward(UDOP_ENCODER_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithAttentionMask, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Tensor = None,
        bbox: Dict[str, Any] = None,
        attention_mask: Tensor = None,
        pixel_values: Optional[Tensor] = None,
        visual_bbox: Dict[str, Any] = None,
        head_mask: Optional[Tensor] = None,
        inputs_embeds: Optional[Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        # Use the given output_attentions flag, falling back to the model config
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        # Use the given output_hidden_states flag, falling back to the model config
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        # Use the given return_dict flag, falling back to the model config
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Call the encoder with all inputs and configuration options
        encoder_outputs = self.encoder(
            input_ids=input_ids,                 # input token ids
            bbox=bbox,                           # bounding boxes of the text tokens
            visual_bbox=visual_bbox,             # bounding boxes of the visual tokens
            pixel_values=pixel_values,           # image pixel values
            attention_mask=attention_mask,       # attention mask selecting which tokens attend
            inputs_embeds=inputs_embeds,         # embeddings passed instead of token ids
            head_mask=head_mask,                 # attention head mask
            output_attentions=output_attentions, # whether to output attention weights
            output_hidden_states=output_hidden_states, # whether to output hidden states
            return_dict=return_dict,             # whether to return a dict-style output
        )

        # Return the encoder outputs
        return encoder_outputs
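The encoder-only model can be used on its own to extract contextual hidden states; a brief sketch under the same checkpoint assumption, with a blank dummy page standing in for a real document and boxes following the integer convention of the processor's type hints:

import torch
from PIL import Image
from transformers import AutoProcessor, UdopEncoderModel

processor = AutoProcessor.from_pretrained("microsoft/udop-large", apply_ocr=False)
model = UdopEncoderModel.from_pretrained("microsoft/udop-large")

image = Image.new("RGB", (224, 224), "white")  # dummy page; use a real document image in practice
words = ["hello", "world"]
boxes = [[10, 10, 80, 30], [90, 10, 160, 30]]  # one box per word

encoding = processor(images=image, text=words, boxes=boxes, return_tensors="pt")
with torch.no_grad():
    outputs = model(**encoding)
print(outputs.last_hidden_state.shape)  # (batch, text tokens + image patches, hidden_size)
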

.\models\udop\processing_udop.py

# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for UDOP.
"""

from typing import List, Optional, Union

from ...image_utils import ImageInput
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType

# Import the required modules and classes

class UdopProcessor(ProcessorMixin):
    r"""
    Constructs a UDOP processor which combines a LayoutLMv3 image processor and a UDOP tokenizer into a single processor.

    [`UdopProcessor`] offers all the functionalities you need to prepare data for the model.

    It first uses [`LayoutLMv3ImageProcessor`] to resize, rescale and normalize document images, and optionally applies
    OCR to get words and normalized bounding boxes. These are then provided to [`UdopTokenizer`] or
    [`UdopTokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`, `attention_mask`,
    `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned into token-level
    `labels` for token classification tasks (such as FUNSD, CORD).

    Additionally, it also supports passing `text_target` and `text_pair_target` to the tokenizer, which can be used to
    prepare labels for language modeling tasks.

    Args:
        image_processor (`LayoutLMv3ImageProcessor`):
            An instance of [`LayoutLMv3ImageProcessor`]. The image processor is a required input.
        tokenizer (`UdopTokenizer` or `UdopTokenizerFast`):
            An instance of [`UdopTokenizer`] or [`UdopTokenizerFast`]. The tokenizer is a required input.
    """

    # Class attribute listing the processor's components
    attributes = ["image_processor", "tokenizer"]
    # Class attribute naming the image processor class
    image_processor_class = "LayoutLMv3ImageProcessor"
    # Class attribute naming the accepted tokenizer classes
    tokenizer_class = ("UdopTokenizer", "UdopTokenizerFast")

    def __init__(self, image_processor, tokenizer):
        # Call the parent ProcessorMixin constructor with the image processor and tokenizer
        super().__init__(image_processor, tokenizer)
    # __call__ makes the processor instance callable like a function
    def __call__(
        self,
        images: Optional[ImageInput] = None,  # optional image input
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,  # text input, several types supported
        text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,  # optional second text input
        boxes: Union[List[List[int]], List[List[List[int]]]] = None,  # bounding boxes, batched or not
        word_labels: Optional[Union[List[int], List[List[int]]]] = None,  # optional word-level labels
        text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,  # target text input
        text_pair_target: Optional[
            Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]
        ] = None,  # optional second target text input
        add_special_tokens: bool = True,  # whether to add special tokens
        padding: Union[bool, str, PaddingStrategy] = False,  # padding strategy (bool, string, or strategy object)
        truncation: Union[bool, str, TruncationStrategy] = False,  # truncation strategy (bool, string, or strategy object)
        max_length: Optional[int] = None,  # optional maximum length
        stride: int = 0,  # stride, defaults to 0
        pad_to_multiple_of: Optional[int] = None,  # optionally pad to a multiple of this value
        return_token_type_ids: Optional[bool] = None,  # whether to return token type ids
        return_attention_mask: Optional[bool] = None,  # whether to return the attention mask
        return_overflowing_tokens: bool = False,  # whether to return overflowing tokens
        return_special_tokens_mask: bool = False,  # whether to return the special tokens mask
        return_offsets_mapping: bool = False,  # whether to return the offsets mapping
        return_length: bool = False,  # whether to return lengths
        verbose: bool = True,  # whether to print verbose information
        return_tensors: Optional[Union[str, TensorType]] = None,  # optional tensor type to return
    ):
        """
        Copied from transformers.models.layoutlmv3.processing_layoutlmv3.LayoutLMv3Processor.__call__
        Method defining the behavior of the object when called as a function.
        """
        # (method body omitted in this excerpt)

    # Copied from transformers.models.layoutlmv3.processing_layoutlmv3.LayoutLMv3Processor.get_overflowing_images
    def get_overflowing_images(self, images, overflow_to_sample_mapping):
        """
        This method ensures each `input_ids` sample is mapped to its corresponding image in case of overflow.
        """
        images_with_overflow = []
        for sample_idx in overflow_to_sample_mapping:
            images_with_overflow.append(images[sample_idx])

        if len(images_with_overflow) != len(overflow_to_sample_mapping):
            raise ValueError(
                "Expected length of images to be the same as the length of `overflow_to_sample_mapping`, but got"
                f" {len(images_with_overflow)} and {len(overflow_to_sample_mapping)}"
            )

        return images_with_overflow

    # Copied from transformers.models.layoutlmv3.processing_layoutlmv3.LayoutLMv3Processor.batch_decode
    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to PreTrainedTokenizer's `batch_decode`.
        Please refer to the docstring of that method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    # Copied from transformers.models.layoutlmv3.processing_layoutlmv3.LayoutLMv3Processor.decode
    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to PreTrainedTokenizer's `decode`.
        Please refer to the docstring of that method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    # Copied from transformers.models.layoutlmv3.processing_layoutlmv3.LayoutLMv3Processor.model_input_names
    def model_input_names(self):
        # Return the fixed list of strings naming the model's inputs
        return ["input_ids", "bbox", "attention_mask", "pixel_values"]

.\models\udop\tokenization_udop.py

# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization classes for UDOP model."""

# Standard library imports
import os
import re
import warnings
# copyfile is used when saving the vocabulary
from shutil import copyfile
# Type annotations
from typing import Any, Dict, List, Optional, Tuple, Union

# sentencepiece provides the underlying subword tokenizer
import sentencepiece as spm

# HuggingFace tokenization base classes
from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import (
    AddedToken,
    BatchEncoding,
    EncodedInput,
    PreTokenizedInput,
    TextInput,
    TextInputPair,
    TruncationStrategy,
)
# HuggingFace utility classes and functions
from ...utils import PaddingStrategy, TensorType, add_end_docstrings, logging

# Logger instance for this module
logger = logging.get_logger(__name__)

# Symbol used by SentencePiece to mark the start of a word
SPIECE_UNDERLINE = "▁"

# File names used for the SentencePiece model and the tokenizer
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}

# Map of pretrained checkpoints to the download URLs of their spiece.model and tokenizer.json files
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "microsoft/udop-large": "https://huggingface.co/microsoft/udop-large/resolve/main/spiece.model",
    },
    "tokenizer_file": {
        "microsoft/udop-large": "https://huggingface.co/microsoft/udop-large/resolve/main/tokenizer.json",
    },
}

# UdopTokenizer, derived from PreTrainedTokenizer
class UdopTokenizer(PreTrainedTokenizer):
    """
    Adapted from LayoutXLMTokenizer and T5Tokenizer. Based on SentencePiece.

    This tokenizer inherits from PreTrainedTokenizer, which contains most of the main methods. Users should refer
    to the superclass for more information regarding those methods.

    Attributes:
        sp_model (`SentencePieceProcessor`):
            The SentencePiece processor used for every conversion (string, tokens, and IDs).
    """

    # Map of vocabulary file names
    vocab_files_names = VOCAB_FILES_NAMES

    # Map of pretrained vocabulary files
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP

    # Names of the model inputs
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        eos_token="</s>",
        unk_token="<unk>",
        sep_token="</s>",
        pad_token="<pad>",
        sep_token_box=[1000, 1000, 1000, 1000],
        pad_token_box=[0, 0, 0, 0],
        pad_token_label=-100,
        only_label_first_subword=True,
        additional_special_tokens=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        legacy=True,
        add_prefix_space=True,
        **kwargs,
    ) -> None:
        # Wrap eos_token in a special AddedToken if it is given as a plain string
        eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token
        # Wrap unk_token in a special AddedToken if it is given as a plain string
        unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
        # Wrap sep_token in a special AddedToken if it is given as a plain string
        sep_token = AddedToken(sep_token, special=True) if isinstance(sep_token, str) else sep_token
        # Wrap pad_token in a special AddedToken if it is given as a plain string
        pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token

        # Store the legacy flag
        self.legacy = legacy
        # Store the add_prefix_space flag
        self.add_prefix_space = add_prefix_space
        # Default sp_model_kwargs to an empty dict when not provided
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        # Store the vocabulary file path
        self.vocab_file = vocab_file

        # Create the SentencePieceProcessor with the given kwargs
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        # Load the vocabulary file into the SentencePiece processor
        self.sp_model.Load(vocab_file)

        # Store the extra box/label attributes
        self.sep_token_box = sep_token_box
        self.pad_token_box = pad_token_box
        self.pad_token_label = pad_token_label
        self.only_label_first_subword = only_label_first_subword

        # Call the parent constructor with all parameters
        super().__init__(
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            sep_token_box=sep_token_box,
            pad_token_box=pad_token_box,
            pad_token_label=pad_token_label,
            only_label_first_subword=only_label_first_subword,
            additional_special_tokens=additional_special_tokens,
            sp_model_kwargs=self.sp_model_kwargs,
            legacy=legacy,
            add_prefix_space=add_prefix_space,
            **kwargs,
        )

    @property
    # Size of the SentencePiece vocabulary
    def vocab_size(self):
        return len(self.sp_model)

    # Copied from T5Tokenizer: return the vocabulary as a dict
    def get_vocab(self):
        # Build the token-to-id mapping by inverting the id-to-token conversion
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        # Add the encoder mappings of any additional special tokens
        vocab.update(self.added_tokens_encoder)
        return vocab

    # Copied from T5Tokenizer: build the special tokens mask
    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        # If the token list already has special tokens, delegate to the superclass method
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # Normal case: adding special tokens to token_ids_0 and token_ids_1
        if token_ids_1 is None:
            return ([0] * len(token_ids_0)) + [1]  # Append 1 after token_ids_0 for the special token
        else:
            return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]  # Append 1 after both token_ids_0 and token_ids_1 for their special tokens

    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.get_sentinel_tokens
    def get_sentinel_tokens(self):
        """
        Retrieves sentinel tokens from the list of additional special tokens.

        Returns:
            list: List of sentinel tokens identified by regex pattern "<extra_id_\d+>".
        """
        return list(
            set(filter(lambda x: bool(re.search(r"<extra_id_\d+>", x)), self.additional_special_tokens))
        )

    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.get_sentinel_token_ids
    def get_sentinel_token_ids(self):
        """
        Retrieves token IDs for sentinel tokens using the tokenizer's vocabulary.

        Returns:
            list: List of token IDs corresponding to sentinel tokens.
        """
        return [self.convert_tokens_to_ids(token) for token in self.get_sentinel_tokens()]
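The sentinel machinery mirrors T5's span-corruption setup: `<extra_id_0>`, `<extra_id_1>`, … stand in for masked spans. A quick sketch (assuming the `microsoft/udop-large` tokenizer files are available):

from transformers import UdopTokenizer

tokenizer = UdopTokenizer.from_pretrained("microsoft/udop-large")
sentinels = sorted(tokenizer.get_sentinel_tokens())
print(sentinels[:3])                           # e.g. ['<extra_id_0>', '<extra_id_1>', '<extra_id_10>'] (lexicographic order)
print(tokenizer.get_sentinel_token_ids()[:3])  # the matching vocabulary ids
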

    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._add_eos_if_not_present
    def _add_eos_if_not_present(self, token_ids: List[int]) -> List[int]:
        """
        Adds an end-of-sequence (EOS) token to token_ids if it's not already present.

        Args:
            token_ids (List[int]): List of token IDs.

        Returns:
            List[int]: List of token IDs with EOS appended if not already present.
        """
        if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:
            # Warn if the sequence already ends with EOS
            warnings.warn(
                f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated"
                " eos tokens being added."
            )
            return token_ids
        else:
            # Append EOS token to the end of token_ids
            return token_ids + [self.eos_token_id]

    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.create_token_type_ids_from_sequences
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """
        # EOS token list containing the single end-of-sequence token id
        eos = [self.eos_token_id]

        # With a single sequence, return zeros covering the sequence plus its EOS token
        if token_ids_1 is None:
            return len(token_ids_0 + eos) * [0]

        # With a pair, return zeros covering both sequences and both EOS tokens
        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]

    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.build_inputs_with_special_tokens
    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        # Append EOS to token_ids_0 if needed; with no second sequence, return it directly
        token_ids_0 = self._add_eos_if_not_present(token_ids_0)
        if token_ids_1 is None:
            return token_ids_0
        else:
            # Otherwise, also append EOS to token_ids_1 and return the concatenation
            token_ids_1 = self._add_eos_if_not_present(token_ids_1)
            return token_ids_0 + token_ids_1
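To see the `X </s>` and `A </s> B </s>` formats concretely, a small sketch (same tokenizer assumption as above):

from transformers import UdopTokenizer

tokenizer = UdopTokenizer.from_pretrained("microsoft/udop-large")
a = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("hello"))
b = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("world"))

print(tokenizer.build_inputs_with_special_tokens(a))     # hello </s>  -> ids end with eos_token_id
print(tokenizer.build_inputs_with_special_tokens(a, b))  # hello </s> world </s>
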

    # 从 T5Tokenizer 类的 __getstate__ 方法复制而来
    def __getstate__(self):
        """
        Serialize the T5Tokenizer instance, preparing it for pickling.
        """
        # 复制实例字典并设置 sp_model 为 None,然后返回状态
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    # 从 T5Tokenizer 类的 __setstate__ 方法复制而来
    def __setstate__(self, d):
        """
        Deserialize and restore a previously serialized T5Tokenizer instance.
        """
        # 使用传入的字典 d 恢复实例的状态,然后重新加载 sp_model
        self.__dict__ = d
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(self.vocab_file)

    # 从 T5Tokenizer 类的 tokenize 方法复制而来
    def tokenize(self, text: "TextInput", **kwargs) -> List[str]:
        """
        Converts a string to a list of tokens. If `self.legacy` is set to `False`, a prefix token is added unless the
        first token is special.
        """
        # 如果 legacy 标志为真或者文本长度为零,则调用父类的 tokenize 方法并返回结果
        if self.legacy or len(text) == 0:
            return super().tokenize(text, **kwargs)

        # 替换文本中的 SPIECE_UNDERLINE 为空格
        text = text.replace(SPIECE_UNDERLINE, " ")

        # 如果 add_prefix_space 为真,则在文本前添加 SPIECE_UNDERLINE
        if self.add_prefix_space:
            text = SPIECE_UNDERLINE + text

        # 调用父类的 tokenize 方法获取 token 列表
        tokens = super().tokenize(text, **kwargs)

        # 如果 tokens 长度大于 1 并且第一个 token 是 SPIECE_UNDERLINE 且第二个 token 是特殊 token,则去掉第一个 token
        if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
            tokens = tokens[1:]

        return tokens
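
The final pruning step can be checked on its own. A small sketch with a toy special-token set (the token strings are illustrative):

SPIECE_UNDERLINE = "▁"
ALL_SPECIAL_TOKENS = {"</s>", "<unk>", "<pad>"}  # toy set for the demo

def strip_redundant_prefix(tokens):
    # Mirrors the last step of tokenize(): drop a lone "▁" that precedes a special token
    if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in ALL_SPECIAL_TOKENS:
        return tokens[1:]
    return tokens

assert strip_redundant_prefix(["▁", "</s>", "▁Hi"]) == ["</s>", "▁Hi"]
assert strip_redundant_prefix(["▁Hello", "▁world"]) == ["▁Hello", "▁world"]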
    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._tokenize
    def _tokenize(self, text, **kwargs):
        """
        Returns a tokenized string.

        We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
        SPIECE_UNDERLINE. For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give
        `['H', 'e', 'y']` instead of `['▁He', 'y']`. Thus we always encode `f"{unk_token}text"` and strip the
        `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
        `self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
        """
        # Encode the text with the sentencepiece model, returning string tokens
        tokens = self.sp_model.encode(text, out_type=str)

        # In legacy mode, or when the text does not start with SPIECE_UNDERLINE or a space, return as-is
        if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
            return tokens

        # 1. Encode the string with the unk_token prefixed, e.g. "<unk> Hey"
        tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
        # 2. Strip self.unk_token's pieces from the encoded token list
        return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens
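
The slicing trick from the docstring can be mimicked at the string level. A toy sketch in which the fake encoder just splits on whitespace and `<unk>` is assumed to encode to a single piece (the real model in the docstring example uses four):

def fake_sp_encode(text):
    # Toy stand-in for sp_model.encode(..., out_type=str); a real model returns
    # subword pieces, but whitespace splitting is enough to show the slicing
    return text.split()

unk_token, unk_token_length = "<unk>", 1  # hypothetical piece count for the toy encoder
tokens = fake_sp_encode(unk_token + " Hey")  # ['<unk>', 'Hey']
assert tokens[unk_token_length:] == ["Hey"]  # the unk prefix is sliced away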

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        # Look the token up in the sentencepiece vocab to get its id
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        # Look the index up in the sentencepiece vocab to get its token
        return self.sp_model.IdToPiece(index)

    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.convert_tokens_to_string
    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        # Since we manually added the prefix space, it has to be removed when decoding
        if tokens[0].startswith(SPIECE_UNDERLINE) and self.add_prefix_space:
            tokens[0] = tokens[0][1:]

        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for token in tokens:
            # Make sure that special tokens are not decoded using the sentencepiece model
            if token in self.all_special_tokens:
                if not prev_is_special:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string.strip()
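
The grouping logic above decodes runs of ordinary sub-tokens together and passes special tokens through verbatim. A simplified sketch with a toy detokenizer in place of the sentencepiece model (it omits the extra-space bookkeeping around special tokens):

SPIECE_UNDERLINE = "▁"
SPECIALS = {"</s>"}  # toy special-token set

def toy_decode(pieces):
    # Stand-in for sp_model.decode: join pieces and turn ▁ markers into spaces
    return "".join(pieces).replace(SPIECE_UNDERLINE, " ").strip()

tokens = ["▁Hello", "▁world", "</s>"]
out, current = "", []
for tok in tokens:
    if tok in SPECIALS:
        out += toy_decode(current) + tok  # flush the run, keep the special token verbatim
        current = []
    else:
        current.append(tok)
out += toy_decode(current)
print(out)  # Hello world</s>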

    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.save_vocabulary
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        # If the save directory does not exist, log an error and return
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return

        # Build the output vocab file path, honouring the optional filename prefix
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        # If the current vocab file differs from the target path and exists on disk, copy it over
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        # If the current vocab file does not exist, write the serialized sp_model to the output file
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        # Return the output file path as a one-element tuple
        return (out_vocab_file,)
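
The output path construction is simple enough to check standalone. A sketch with made-up directory names (printed rather than asserted, since path separators are platform-dependent):

import os

VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}  # mirrors the module-level constant

def out_path(save_directory, filename_prefix=None):
    # Same construction as in save_vocabulary above
    return os.path.join(
        save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
    )

print(out_path("/tmp/ckpt"))          # /tmp/ckpt/spiece.model
print(out_path("/tmp/ckpt", "udop"))  # /tmp/ckpt/udop-spiece.model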

    @add_end_docstrings(UDOP_ENCODE_KWARGS_DOCSTRING)
    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
        boxes: Union[List[List[int]], List[List[List[int]]]] = None,
        word_labels: Optional[Union[List[int], List[List[int]]]] = None,
        text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        text_pair_target: Optional[
            Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]
        ] = None,
        **kwargs,
    ) -> BatchEncoding:
        # Require at least one of `text` and `text_target`; raise a ValueError otherwise
        if text is None and text_target is None:
            raise ValueError("You need to specify either `text` or `text_target`.")

        # If `text` is given, switch to input mode (unless already inside the target
        # context manager) and encode it with call_boxes
        if text is not None:
            if not self._in_target_context_manager:
                self._switch_to_input_mode()
            encodings = self.call_boxes(text=text, text_pair=text_pair, boxes=boxes, word_labels=word_labels, **kwargs)

        # If `text_target` is given, switch to target mode and encode it with _call_one
        if text_target is not None:
            self._switch_to_target_mode()
            target_encodings = self._call_one(text=text_target, text_pair=text_pair_target, **kwargs)

        # Leave the target tokenizer and switch back to input mode
        self._switch_to_input_mode()

        # Return the input encodings, the target encodings, or both combined,
        # depending on which inputs were provided
        if text_target is None:
            return encodings
        elif text is None:
            return target_encodings
        else:
            encodings["labels"] = target_encodings["input_ids"]
            return encodings
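
A hedged usage sketch of the dual input/target modes. The checkpoint id, words, and boxes below are illustrative assumptions, not taken from this file:

from transformers import UdopTokenizer

# Hypothetical checkpoint id; substitute any UDOP checkpoint available to you
tok = UdopTokenizer.from_pretrained("microsoft/udop-large")

words = ["hello", "world"]
boxes = [[48, 84, 73, 128], [74, 84, 98, 128]]  # one normalized box per word

# Input mode encodes words + boxes; target mode turns `text_target` into labels
enc = tok(text=words, boxes=boxes, text_target="hello world")
print(sorted(enc.keys()))  # expected to include input_ids, bbox, attention_mask, labels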
    # Method that handles text, text-pair, or sequence inputs together with their bounding boxes
    def call_boxes(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
        text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
        boxes: Union[List[List[int]], List[List[List[int]]]] = None,
        word_labels: Optional[Union[List[int], List[List[int]]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ):
        # Encodes the text and text pair together with the bounding boxes
        # (body omitted...)
    
    # Batch-process a list of texts or text pairs, optionally with bounding-box information
    def batch_encode_plus_boxes(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput],
            List[TextInputPair],
            List[PreTokenizedInput],
        ],
        is_pair: bool = None,
        boxes: Optional[List[List[List[int]]]] = None,
        word_labels: Optional[List[List[int]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        """
        Tokenize and prepare for the model a list of sequences or a list of pairs of sequences.

        Args:
            batch_text_or_text_pairs (`List[str]`, `List[Tuple[str, str]]`, `List[List[str]]`, `List[Tuple[List[str], List[str]]]`, and for not-fast tokenizers, also `List[List[int]]`, `List[Tuple[List[int], List[int]]]`):
                Batch of sequences or pair of sequences to be encoded. This can be a list of
                string/string-sequences/int-sequences or a list of pair of string/string-sequences/int-sequence (see
                details in `encode_plus`).
        """

        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
        # Resolve the padding and truncation strategies along with the remaining kwargs
        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            verbose=verbose,
            **kwargs,
        )

        # Delegate to the internal `_batch_encode_plus_boxes` for the actual batched encoding
        return self._batch_encode_plus_boxes(
            batch_text_or_text_pairs=batch_text_or_text_pairs,
            is_pair=is_pair,
            boxes=boxes,
            word_labels=word_labels,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            is_split_into_words=is_split_into_words,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )

    def encode_boxes(
        self,
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
        boxes: Optional[List[List[int]]] = None,
        word_labels: Optional[List[List[int]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ) -> List[int]:
        """
        Args:
            Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary. Same as doing
            `self.convert_tokens_to_ids(self.tokenize(text))`.
            text (`str`, `List[str]` or `List[int]`):
                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
                `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
                method).
            text_pair (`str`, `List[str]` or `List[int]`, *optional*):
                Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
                the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
                method).
        """
        # Call the `encode_plus_boxes` method to encode text and optional text_pair with additional parameters
        encoded_inputs = self.encode_plus_boxes(
            text,
            text_pair=text_pair,
            boxes=boxes,
            word_labels=word_labels,
            add_special_tokens=add_special_tokens,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            stride=stride,
            return_tensors=return_tensors,
            **kwargs,
        )

        # Return only the 'input_ids' from the encoded inputs
        return encoded_inputs["input_ids"]

    def encode_plus_boxes(
        self,
        text: Union[TextInput, PreTokenizedInput],
        text_pair: Optional[PreTokenizedInput] = None,
        boxes: Optional[List[List[int]]] = None,
        word_labels: Optional[List[List[int]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        """
        Tokenize and prepare for the model a sequence or a pair of sequences.

        <Tip warning={true}>

        This method is deprecated, `__call__` should be used instead.

        </Tip>

        Args:
            text (`str`, `List[str]` or `List[int]` (the latter only for not-fast tokenizers)):
                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
                `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
                method).
            text_pair (`str`, `List[str]` or `List[int]`, *optional*):
                Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
                the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
                method).
        """

        # Resolve the padding and truncation strategies along with the remaining kwargs
        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            verbose=verbose,
            **kwargs,
        )

        # Delegate to the internal `_encode_plus_boxes` for encoding and special-token handling
        return self._encode_plus_boxes(
            text=text,
            text_pair=text_pair,
            boxes=boxes,
            word_labels=word_labels,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            is_split_into_words=is_split_into_words,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )
    def _batch_encode_plus_boxes(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput],          # a list of single texts,
            List[TextInputPair],      # text pairs,
            List[PreTokenizedInput],  # or pre-tokenized inputs
        ],
        is_pair: bool = None,  # whether the inputs are text pairs; may be None
        boxes: Optional[List[List[List[int]]]] = None,  # bounding boxes of the texts (optional)
        word_labels: Optional[List[List[int]]] = None,  # word-level labels (optional)
        add_special_tokens: bool = True,  # whether to add special tokens
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,  # padding strategy, no padding by default
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,  # truncation strategy, none by default
        max_length: Optional[int] = None,  # maximum length limit (optional)
        stride: int = 0,  # stride, 0 by default
        pad_to_multiple_of: Optional[int] = None,  # pad to a multiple of this value (optional)
        return_tensors: Optional[Union[str, TensorType]] = None,  # type of tensors to return (optional)
        return_token_type_ids: Optional[bool] = None,  # whether to return token type ids (optional)
        return_attention_mask: Optional[bool] = None,  # whether to return the attention mask (optional)
        return_overflowing_tokens: bool = False,  # whether to return overflowing tokens
        return_special_tokens_mask: bool = False,  # whether to return the special-tokens mask
        return_offsets_mapping: bool = False,  # whether to return the offsets mapping
        return_length: bool = False,  # whether to return the lengths
        verbose: bool = True,  # whether to print verbose information
        **kwargs,
    ) -> BatchEncoding:
        if return_offsets_mapping:
            raise NotImplementedError(
                "return_offset_mapping is not available when using Python tokenizers. "
                "To use this feature, change your tokenizer to one deriving from "
                "transformers.PreTrainedTokenizerFast."
            )

        # Call _batch_prepare_for_model_boxes to prepare the batched encodings
        batch_outputs = self._batch_prepare_for_model_boxes(
            batch_text_or_text_pairs=batch_text_or_text_pairs,
            is_pair=is_pair,
            boxes=boxes,
            word_labels=word_labels,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
            return_token_type_ids=return_token_type_ids,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_length=return_length,
            return_tensors=return_tensors,
            verbose=verbose,
        )

        # Return the batched results wrapped in a BatchEncoding object
        return BatchEncoding(batch_outputs)

    @add_end_docstrings(UDOP_ENCODE_KWARGS_DOCSTRING)
    # Prepare batched data for the model, handling texts or text pairs
    def _batch_prepare_for_model_boxes(
        self,
        batch_text_or_text_pairs,  # batched texts or text pairs
        is_pair: bool = None,  # whether the inputs are text pairs
        boxes: Optional[List[List[int]]] = None,  # bounding boxes (optional)
        word_labels: Optional[List[List[int]]] = None,  # word-level labels (optional)
        add_special_tokens: bool = True,  # whether to add special tokens
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,  # padding strategy
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,  # truncation strategy
        max_length: Optional[int] = None,  # maximum length (optional)
        stride: int = 0,  # stride
        pad_to_multiple_of: Optional[int] = None,  # pad to a multiple of this value (optional)
        return_tensors: Optional[str] = None,  # type of tensors to return (optional)
        return_token_type_ids: Optional[bool] = None,  # whether to return token type ids (optional)
        return_attention_mask: Optional[bool] = None,  # whether to return the attention mask (optional)
        return_overflowing_tokens: bool = False,  # whether to return overflowing tokens
        return_special_tokens_mask: bool = False,  # whether to return the special-tokens mask
        return_length: bool = False,  # whether to return the lengths
        verbose: bool = True,  # whether to print verbose information
    ) -> BatchEncoding:
        """
        Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It
        adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
        manages a moving window (with user defined stride) for overflowing tokens

        Args:
            batch_ids_pairs: list of tokenized input ids or input ids pairs
        """

        # Initialize an empty dictionary to store batch outputs
        batch_outputs = {}

        # Iterate over each index and example in the zipped batch_text_or_text_pairs and boxes
        for idx, example in enumerate(zip(batch_text_or_text_pairs, boxes)):
            batch_text_or_text_pair, boxes_example = example
            
            # Determine if the example is a pair of texts or a single text
            if is_pair:
                text_or_text_pair = batch_text_or_text_pair[0]
            else:
                text_or_text_pair = batch_text_or_text_pair

            # Prepare inputs for the model, including handling special tokens, padding, truncation, etc.
            outputs = self.prepare_for_model_boxes(
                text_or_text_pair,
                batch_text_or_text_pair[1] if is_pair else None,
                boxes_example,
                word_labels=word_labels[idx] if word_labels is not None else None,
                add_special_tokens=add_special_tokens,
                padding=PaddingStrategy.DO_NOT_PAD.value,  # we pad in batch afterward
                truncation=truncation_strategy.value,
                max_length=max_length,
                stride=stride,
                pad_to_multiple_of=None,  # we pad in batch afterward
                return_attention_mask=False,  # we pad in batch afterward
                return_token_type_ids=return_token_type_ids,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_length=return_length,
                return_tensors=None,  # We convert the whole batch to tensors at the end
                prepend_batch_axis=False,
                verbose=verbose,
            )

            # Aggregate outputs into batch_outputs dictionary
            for key, value in outputs.items():
                if key not in batch_outputs:
                    batch_outputs[key] = []
                batch_outputs[key].append(value)

        # Pad the batch outputs according to specified padding strategy and max length
        batch_outputs = self.pad(
            batch_outputs,
            padding=padding_strategy.value,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )

        # Convert the batch outputs into a BatchEncoding object with specified tensor type
        batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)

        # Return the final batch outputs
        return batch_outputs
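
The method follows a common two-phase batching pattern: every example is first encoded without padding, the per-key lists are accumulated, and the whole batch is padded once at the end. A toy sketch of that pattern, with a hypothetical pad id of 0:

def pad_batch(batch_outputs, pad_id=0):
    # Phase 2: one batch-wide pad to the longest example
    max_len = max(len(ids) for ids in batch_outputs["input_ids"])
    batch_outputs["attention_mask"] = [
        [1] * len(ids) + [0] * (max_len - len(ids)) for ids in batch_outputs["input_ids"]
    ]
    batch_outputs["input_ids"] = [
        ids + [pad_id] * (max_len - len(ids)) for ids in batch_outputs["input_ids"]
    ]
    return batch_outputs

# Phase 1: accumulate unpadded per-example outputs under each key
batch = {"input_ids": []}
for ids in ([5, 6, 7], [8]):
    batch["input_ids"].append(ids)

print(pad_batch(batch))
# {'input_ids': [[5, 6, 7], [8, 0, 0]], 'attention_mask': [[1, 1, 1], [1, 0, 0]]}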
    def _encode_plus_boxes(
        self,
        text: Union[TextInput, PreTokenizedInput],
        text_pair: Optional[PreTokenizedInput] = None,
        boxes: Optional[List[List[int]]] = None,
        word_labels: Optional[List[int]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        # Raise NotImplementedError if return_offsets_mapping is requested, since Python tokenizers do not support it
        if return_offsets_mapping:
            raise NotImplementedError(
                "return_offset_mapping is not available when using Python tokenizers. "
                "To use this feature, change your tokenizer to one deriving from "
                "transformers.PreTrainedTokenizerFast. "
                "More information on available tokenizers at "
                "https://github.com/huggingface/transformers/pull/2674"
            )

        # Call prepare_for_model_boxes to turn the inputs into the format the model expects
        return self.prepare_for_model_boxes(
            text=text,
            text_pair=text_pair,
            boxes=boxes,
            word_labels=word_labels,
            add_special_tokens=add_special_tokens,
            padding=padding_strategy.value,  # pad according to the resolved strategy
            truncation=truncation_strategy.value,  # truncate according to the resolved strategy
            max_length=max_length,  # maximum length limit
            stride=stride,  # stride value
            pad_to_multiple_of=pad_to_multiple_of,  # pad to a multiple of this value
            return_tensors=return_tensors,  # controls the returned tensor type
            prepend_batch_axis=True,  # add a batch axis to the returned tensors
            return_attention_mask=return_attention_mask,  # whether to return the attention mask
            return_token_type_ids=return_token_type_ids,  # whether to return token type ids
            return_overflowing_tokens=return_overflowing_tokens,  # whether to return overflowing tokens
            return_special_tokens_mask=return_special_tokens_mask,  # whether to return the special-tokens mask
            return_length=return_length,  # whether to return the encoded length
            verbose=verbose,  # whether to print verbose information
        )

    @add_end_docstrings(UDOP_ENCODE_KWARGS_DOCSTRING)
    # prepare_for_model_boxes: prepare the inputs so they can be consumed by the model
    def prepare_for_model_boxes(
        self,
        text: Union[TextInput, PreTokenizedInput],
        text_pair: Optional[PreTokenizedInput] = None,
        boxes: Optional[List[List[int]]] = None,
        word_labels: Optional[List[int]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        prepend_batch_axis: bool = False,
        **kwargs,
    ):
        # (body omitted...)

    # Copied from transformers.models.layoutxlm.tokenization_layoutxlm.LayoutXLMTokenizer.truncate_sequences
    def truncate_sequences(
        self,
        ids: List[int],
        token_boxes: List[List[int]],
        pair_ids: Optional[List[int]] = None,
        pair_token_boxes: Optional[List[List[int]]] = None,
        labels: Optional[List[int]] = None,
        num_tokens_to_remove: int = 0,
        truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
        stride: int = 0,
    ):
        # (body omitted...)

    # Copied from transformers.models.layoutxlm.tokenization_layoutxlm.LayoutXLMTokenizer._pad
    def _pad(
        self,
        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
        max_length: Optional[int] = None,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
    ):
        # (body omitted...)

.\models\udop\tokenization_udop_fast.py

"""
定义一个 UdopTokenizerFast 类,继承自 PreTrainedTokenizerFast 类,用于实现快速的 UDOP 分词器,基于 HuggingFace 的 tokenizers 库。

该类提供了从 LayoutXLMTokenizer 和 T5Tokenizer 中适配的功能,并基于 BPE 模型实现。

继承自 PreTrainedTokenizerFast 类,包含了大部分主要方法,用户可以参考其超类以获取更多关于这些方法的信息。
"""
class UdopTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" UDOP tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from
    [`LayoutXLMTokenizer`] and [`T5Tokenizer`]. Based on
    [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.
    """
    # 定义一个类,用于处理特定任务的标记器
    Args:
        vocab_file (`str`, *optional*):
            Path to the vocabulary file. 词汇表文件的路径。

        tokenizer_file (`str`, *optional*):
            Path to the tokenizer file. 标记器文件的路径。

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token. 序列结束标记,默认为 `"</s>"`。

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
            分隔符标记,在构建多个序列时使用,例如用于序列分类或问题回答中的文本和问题。还用作使用特殊标记构建的序列的最后一个标记。

        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
            未知标记,词汇表中不存在的标记会被设置为此标记。

        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
            用于填充的标记,例如在批处理具有不同长度序列时使用。

        sep_token_box (`List[int]`, *optional*, defaults to `[1000, 1000, 1000, 1000]`):
            The bounding box to use for the special [SEP] token.
            用于特殊 [SEP] 标记的边界框。

        pad_token_box (`List[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
            The bounding box to use for the special [PAD] token.
            用于特殊 [PAD] 标记的边界框。

        pad_token_label (`int`, *optional*, defaults to -100):
            The label to use for padding tokens. Defaults to -100, which is the `ignore_index` of PyTorch's
            CrossEntropyLoss.
            用于填充标记的标签。默认为 -100,这是 PyTorch CrossEntropyLoss 的 `ignore_index`。

        only_label_first_subword (`bool`, *optional*, defaults to `True`):
            Whether or not to only label the first subword, in case word labels are provided.
            是否仅标记第一个子词,如果提供了单词标签。

        additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
            Additional special tokens used by the tokenizer.
            标记器使用的额外特殊标记。
    """

    # Constants and classes used when loading pretrained models
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    model_input_names = ["input_ids", "attention_mask"]
    slow_tokenizer_class = UdopTokenizer

    # Initializer that sets up the instance attributes
    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        eos_token="</s>",
        sep_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        sep_token_box=[1000, 1000, 1000, 1000],
        pad_token_box=[0, 0, 0, 0],
        pad_token_label=-100,
        only_label_first_subword=True,
        additional_special_tokens=None,
        **kwargs,
    ):
        # Call the parent initializer, forwarding the required arguments and keyword arguments
        super().__init__(
            vocab_file,
            tokenizer_file=tokenizer_file,
            eos_token=eos_token,
            sep_token=sep_token,
            unk_token=unk_token,
            pad_token=pad_token,
            sep_token_box=sep_token_box,
            pad_token_box=pad_token_box,
            pad_token_label=pad_token_label,
            only_label_first_subword=only_label_first_subword,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )


        self.vocab_file = vocab_file

        # Store the additional attributes
        self.sep_token_box = sep_token_box
        self.pad_token_box = pad_token_box
        self.pad_token_label = pad_token_label
        self.only_label_first_subword = only_label_first_subword


    @property
    def can_save_slow_tokenizer(self) -> bool:
        # A slow tokenizer can only be saved when the vocabulary file exists on disk
        return os.path.isfile(self.vocab_file) if self.vocab_file else False


    @add_end_docstrings(UDOP_ENCODE_KWARGS_DOCSTRING)
    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
        boxes: Union[List[List[int]], List[List[List[int]]]] = None,
        word_labels: Optional[Union[List[int], List[List[int]]]] = None,
        text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        text_pair_target: Optional[
            Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]
        ] = None,
        **kwargs,
    ) -> BatchEncoding:
        # Require at least one of `text` and `text_target`
        if text is None and text_target is None:
            raise ValueError("You need to specify either `text` or `text_target`.")
        if text is not None:
            # Switch to input mode unless we are inside the target context manager
            if not self._in_target_context_manager:
                self._switch_to_input_mode()
            # Encode the text, text pair, boxes, and word labels with call_boxes
            encodings = self.call_boxes(text=text, text_pair=text_pair, boxes=boxes, word_labels=word_labels, **kwargs)
        if text_target is not None:
            # Switch to target mode
            self._switch_to_target_mode()
            # Encode the target text (and optional target text pair) with _call_one
            target_encodings = self._call_one(text=text_target, text_pair=text_pair_target, **kwargs)
        # Switch back to input mode
        self._switch_to_input_mode()

        # Return the input encodings, the target encodings, or both combined
        if text_target is None:
            return encodings
        elif text is None:
            return target_encodings
        else:
            # Store the target's input_ids under the `labels` key of the encodings
            encodings["labels"] = target_encodings["input_ids"]
            return encodings


    @add_end_docstrings(UDOP_ENCODE_KWARGS_DOCSTRING)
    # Method that handles texts, text pairs, lists of texts, or pre-tokenized inputs, together with boxes and word labels
    def call_boxes(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
        text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
        boxes: Union[List[List[int]], List[List[List[int]]]] = None,
        word_labels: Optional[Union[List[int], List[List[int]]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ):
        # (body omitted...)

    # tokenize: encode a single text (or text pair) and return its tokens
    # (the `def` line here is reconstructed; the original header was lost)
    def tokenize(self, text: str, text_pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
        # Build a batched input from the text and, if present, the text pair
        batched_input = [(text, text_pair)] if text_pair else [text]
        # Encode the batched input with the backend tokenizer
        encodings = self._tokenizer.encode_batch(
            batched_input, add_special_tokens=add_special_tokens, is_pretokenized=False, **kwargs
        )
        # Return the token list of the first (and only) sample
        return encodings[0].tokens

    # Batch-encode a list of texts or text pairs together with boxes and word labels
    def batch_encode_plus_boxes(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput],
            List[TextInputPair],
            List[PreTokenizedInput],
        ],
        is_pair: bool = None,
        boxes: Optional[List[List[List[int]]]] = None,
        word_labels: Optional[List[List[int]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        """
        Tokenize and prepare for the model a list of sequences or a list of pairs of sequences.

        <Tip warning={true}>

        This method is deprecated, `__call__` should be used instead.

        </Tip>

        Args:
            batch_text_or_text_pairs (`List[str]`, `List[Tuple[str, str]]`, `List[List[str]]`, `List[Tuple[List[str], List[str]]]`, and for not-fast tokenizers, also `List[List[int]]`, `List[Tuple[List[int], List[int]]]`):
                Batch of sequences or pair of sequences to be encoded. This can be a list of
                string/string-sequences/int-sequences or a list of pair of string/string-sequences/int-sequence (see
                details in `encode_plus`).
        """

        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
        # Resolve the padding and truncation strategies along with the remaining kwargs
        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            verbose=verbose,
            **kwargs,
        )

        # Delegate to the underlying `_batch_encode_plus_boxes` for the batched encoding
        return self._batch_encode_plus_boxes(
            batch_text_or_text_pairs=batch_text_or_text_pairs,  # texts or text pairs to encode
            is_pair=is_pair,  # whether the inputs are text pairs
            boxes=boxes,  # bounding boxes
            word_labels=word_labels,  # word-level labels
            add_special_tokens=add_special_tokens,  # whether to add special tokens
            padding_strategy=padding_strategy,  # padding strategy
            truncation_strategy=truncation_strategy,  # truncation strategy
            max_length=max_length,  # maximum length
            stride=stride,  # stride
            is_split_into_words=is_split_into_words,  # whether the input is already split into words
            pad_to_multiple_of=pad_to_multiple_of,  # pad to a multiple of this value
            return_tensors=return_tensors,  # whether to return tensors
            return_token_type_ids=return_token_type_ids,  # whether to return token type ids
            return_attention_mask=return_attention_mask,  # whether to return the attention mask
            return_overflowing_tokens=return_overflowing_tokens,  # whether to return overflowing tokens
            return_special_tokens_mask=return_special_tokens_mask,  # whether to return the special-tokens mask
            return_offsets_mapping=return_offsets_mapping,  # whether to return the offsets mapping
            return_length=return_length,  # whether to return the lengths
            verbose=verbose,  # whether to print verbose information
            **kwargs,  # remaining keyword arguments
        )
    # Batch-encode texts or text pairs, supporting several input types
    def _batch_encode_plus_boxes(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput],          # single texts,
            List[TextInputPair],      # text pairs,
            List[PreTokenizedInput],  # or pre-tokenized inputs
        ],
        is_pair: bool = None,  # whether the inputs are text pairs
        boxes: Optional[List[List[List[int]]]] = None,  # bounding boxes of the texts
        word_labels: Optional[List[List[int]]] = None,  # word-level labels
        add_special_tokens: bool = True,  # whether to add special tokens
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,  # padding strategy
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,  # truncation strategy
        max_length: Optional[int] = None,  # maximum length limit
        stride: int = 0,  # stride used when truncating with overflow
        pad_to_multiple_of: Optional[int] = None,  # pad to a multiple of this value
        return_tensors: Optional[str] = None,  # type of tensors to return
        return_token_type_ids: Optional[bool] = None,  # whether to return token type ids
        return_attention_mask: Optional[bool] = None,  # whether to return the attention mask
        return_overflowing_tokens: bool = False,  # whether to return tokens beyond max_length
        return_special_tokens_mask: bool = False,  # whether to return the special-tokens mask
        return_offsets_mapping: bool = False,  # whether to return the offsets mapping
        return_length: bool = False,  # whether to return the encoded lengths
        verbose: bool = True,  # whether to print verbose information
        **kwargs,  # remaining keyword arguments
    ):
        # TODO: implement batched encoding of texts and boxes
        pass

    # Encode a single text or text pair, supporting several input types
    def _encode_plus_boxes(
        self,
        text: Union[TextInput, PreTokenizedInput],  # the input text
        text_pair: Optional[PreTokenizedInput] = None,  # optional second text
        boxes: Optional[List[List[int]]] = None,  # bounding boxes of the text
        word_labels: Optional[List[int]] = None,  # word-level labels
        add_special_tokens: bool = True,  # whether to add special tokens
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,  # padding strategy
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,  # truncation strategy
        max_length: Optional[int] = None,  # maximum length limit
        stride: int = 0,  # stride used when truncating with overflow
        pad_to_multiple_of: Optional[int] = None,  # pad to a multiple of this value
        return_tensors: Optional[bool] = None,  # type of tensors to return
        return_token_type_ids: Optional[bool] = None,  # whether to return token type ids
        return_attention_mask: Optional[bool] = None,  # whether to return the attention mask
        return_overflowing_tokens: bool = False,  # whether to return tokens beyond max_length
        return_special_tokens_mask: bool = False,  # whether to return the special-tokens mask
        return_offsets_mapping: bool = False,  # whether to return the offsets mapping
        return_length: bool = False,  # whether to return the encoded lengths
        verbose: bool = True,  # whether to print verbose information
        **kwargs,  # remaining keyword arguments
    ) -> BatchEncoding:
        # Assemble the inputs into a batch of one
        # Two options:
        # 1) only text, in which case text must be a list of strings
        # 2) text + text_pair, in which case text is a string and text_pair a list of strings
        batched_input = [(text, text_pair)] if text_pair else [text]
        batched_boxes = [boxes]  # wrap the boxes in a batch list
        batched_word_labels = [word_labels] if word_labels is not None else None  # wrap the word labels, or keep None
        batched_output = self._batch_encode_plus_boxes(
            batched_input,
            is_pair=bool(text_pair is not None),  # True when a text pair is present
            boxes=batched_boxes,
            word_labels=batched_word_labels,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )

        # If no tensors are requested and overflowing tokens are not returned, strip the leading batch axis
        # (when overflowing tokens are returned, they stay as a batch of outputs)
        if return_tensors is None and not return_overflowing_tokens:
            batched_output = BatchEncoding(
                {
                    key: value[0] if len(value) > 0 and isinstance(value[0], list) else value
                    for key, value in batched_output.items()
                },
                batched_output.encodings,  # keep the backend encodings on the output
            )

        # Warn if the encoded sequence is too long for the model, when applicable
        self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)

        return batched_output
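
The wrap-then-unwrap trick above encodes a single example as a batch of one and strips the leading batch axis from the list outputs afterwards. A toy sketch of the unwrapping step:

def strip_batch_axis(batched):
    # Same comprehension as above: unwrap list-valued entries of a batch of one
    return {
        key: value[0] if len(value) > 0 and isinstance(value[0], list) else value
        for key, value in batched.items()
    }

batched_output = {"input_ids": [[5, 6, 7]], "length": [3]}
print(strip_batch_axis(batched_output))  # {'input_ids': [5, 6, 7], 'length': [3]}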
    # encode_boxes: the `def` line below is reconstructed; it appears to mirror the
    # slow tokenizer's `encode_boxes` signature shown earlier in this walkthrough
    def encode_boxes(
        self,
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
        boxes: Optional[List[List[int]]] = None,
        word_labels: Optional[List[List[int]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ) -> List[int]:
        """
        Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary. Same as doing
        `self.convert_tokens_to_ids(self.tokenize(text))`.

        Args:
            text (`str`, `List[str]` or `List[int]`):
                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
                `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
                method).
            text_pair (`str`, `List[str]` or `List[int]`, *optional*):
                Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
                the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
                method).
        """
        # Encode the text and optional text pair with `encode_plus_boxes`, forwarding the remaining parameters
        encoded_inputs = self.encode_plus_boxes(
            text,
            text_pair=text_pair,
            boxes=boxes,
            word_labels=word_labels,
            add_special_tokens=add_special_tokens,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            stride=stride,
            return_tensors=return_tensors,
            **kwargs,
        )

        # Return only the `input_ids` of the encoded inputs
        return encoded_inputs["input_ids"]

    def encode_plus_boxes(
        self,
        text: Union[TextInput, PreTokenizedInput],
        text_pair: Optional[PreTokenizedInput] = None,
        boxes: Optional[List[List[int]]] = None,
        word_labels: Optional[List[List[int]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        """
        Tokenize and prepare for the model a sequence or a pair of sequences.

        <Tip warning={true}>

        This method is deprecated, `__call__` should be used instead.

        </Tip>

        Args:
            text (`str`, `List[str]` or `List[int]` (the latter only for not-fast tokenizers)):
                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
                `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
                method).
            text_pair (`str`, `List[str]` or `List[int]`, *optional*):
                Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
                the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
                method).
        """

        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
        # Resolve the padding and truncation strategies along with the remaining kwargs
        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            verbose=verbose,
            **kwargs,
        )

        # Delegate to the internal `_encode_plus_boxes` and return its result
        return self._encode_plus_boxes(
            text=text,
            text_pair=text_pair,
            boxes=boxes,
            word_labels=word_labels,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            is_split_into_words=is_split_into_words,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )

    # Copied from transformers.models.layoutxlm.tokenization_layoutxlm_fast.LayoutXLMTokenizerFast._pad
    def _pad(
        self,
        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
        max_length: Optional[int] = None,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
    ):
        # _pad pads the encoded inputs according to the given strategy and parameters
        pass

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A UDOP sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """

        # If only one sequence is provided, append the separator token to the end of token_ids_0
        if token_ids_1 is None:
            return token_ids_0 + [self.sep_token_id]
        
        # Define the separator token as a list
        sep = [self.sep_token_id]
        
        # Concatenate token_ids_0, separator, token_ids_1, and another separator
        return token_ids_0 + sep + token_ids_1 + sep
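
A standalone check of these layouts, with a hypothetical `sep_token_id` of 1 and made-up ids:

sep_id = 1
ids_a, ids_b = [37, 423], [99, 17]

assert ids_a + [sep_id] == [37, 423, 1]                                # X </s>
assert ids_a + [sep_id] + ids_b + [sep_id] == [37, 423, 1, 99, 17, 1]  # A </s> B </s>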

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. UDOP does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.

        """

        # Define the separator token as a list
        sep = [self.sep_token_id]

        # If only one sequence is provided, return a list of zeros of length equal to token_ids_0 + separator
        if token_ids_1 is None:
            return len(token_ids_0 + sep) * [0]
        
        # If two sequences are provided, return a list of zeros of length equal to token_ids_0 + separator + token_ids_1 + separator
        return len(token_ids_0 + sep + token_ids_1 + sep) * [0]

    # Copied from transformers.models.layoutxlm.tokenization_layoutxlm_fast.LayoutXLMTokenizerFast.save_vocabulary
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the vocabulary to a directory. This method is adapted from the LayoutXLMTokenizerFast class.

        Args:
            save_directory (`str`):
                Directory where the vocabulary will be saved.
            filename_prefix (`str`, *optional*):
                Optional prefix to prepend to the vocabulary filename.

        Returns:
            `Tuple[str]`: Tuple containing the path to the saved vocabulary file.

        Raises:
            ValueError: If the fast tokenizer cannot save the vocabulary.
        """

        # Check if the fast tokenizer has the capability to save the vocabulary
        if not self.can_save_slow_tokenizer:
            raise ValueError(
                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
                "tokenizer."
            )

        # Ensure save_directory exists and is a directory; log an error if not
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
            return
        
        # Define the output vocabulary file path
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        # If the current vocabulary file path is different from the desired output path, copy the vocabulary file
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        # Return the path to the saved vocabulary file
        return (out_vocab_file,)