Transformers 源码解析(二十二)

.\models\bloom\modeling_flax_bloom.py

# 导入所需的模块和库
import math
from functools import partial
from typing import Optional, Tuple

# 导入 Flax 相关模块
import flax.linen as nn
import jax
import jax.numpy as jnp
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
from flax.linen import combine_masks, dot_product_attention_weights, make_causal_mask
from flax.linen.activation import tanh
from flax.traverse_util import flatten_dict, unflatten_dict
from jax import lax

# 导入自定义的模型输出类和工具函数
from ...modeling_flax_outputs import (
    FlaxBaseModelOutput,
    FlaxBaseModelOutputWithPastAndCrossAttentions,
    FlaxCausalLMOutput,
)
from ...modeling_flax_utils import FlaxPreTrainedModel, append_call_sample_docstring
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_bloom import BloomConfig

# 设置日志记录器
logger = logging.get_logger(__name__)

# 模型的预训练检查点和配置信息用于文档
_CHECKPOINT_FOR_DOC = "bigscience/bloom"
_CONFIG_FOR_DOC = "BloomConfig"

# 模型起始文档字符串,包含对模型的描述和相关链接
BLOOM_START_DOCSTRING = r"""

    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads,
    etc.)

    This model is also a Flax Linen
    [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
    regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior.

    Finally, this model supports inherent JAX features such as:

    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)

    Parameters:
        config ([`BloomConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
            `jax.numpy.bfloat16` (on TPUs).

            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
            specified all the computation will be performed with the given `dtype`.

            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
            parameters.**

            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
            [`~FlaxPreTrainedModel.to_bf16`].
"""
BLOOM_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length`. Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`BloomTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        past_key_values (`Dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
            Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
            auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


def build_alibi_tensor(attention_mask: jnp.ndarray, num_heads: int, dtype: Optional[jnp.dtype] = jnp.float32):
    """
    Flax implementation of the BLOOM Alibi tensor. BLOOM Alibi tensor is not causal as the original paper mentions, it
    relies on a translation invariance of softmax for quick implementation: with l being a tensor, and a fixed value
    `softmax(l+a) = softmax(l)`. Based on
    https://github.com/ofirpress/attention_with_linear_biases/blob/a35aaca144e0eb6b789dfcb46784c4b8e31b7983/fairseq/models/transformer.py#L742
    Link to paper: https://arxiv.org/abs/2108.12409

    Args:
        attention_mask (`jnp.ndarray`):
            Token-wise attention mask, this should be of shape `(batch_size, max_seq_len)`.
        num_heads (`int`):
            Number of attention heads.
        dtype (`jnp.dtype`, *optional*, defaults to `jnp.float32`):
            The data type (dtype) of the output tensor.

    Returns: Alibi tensor of shape `(batch_size * num_heads, 1, max_seq_len)`.
    """
    # 获取注意力掩码的形状,batch_size 是批量大小,seq_length 是序列长度
    batch_size, seq_length = attention_mask.shape
    # 计算最接近 num_heads 的 2 的幂次方
    closest_power_of_2 = 2 ** math.floor(math.log2(num_heads))
    # 计算基础值,用于调整 softmax 函数的实现
    base = jnp.array(2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))), dtype=jnp.float32)
    # 生成一个包含最接近2的幂的整数的浮点数数组,从1到closest_power_of_2(包括)
    powers = jnp.arange(1, 1 + closest_power_of_2, dtype=jnp.float32)
    # 计算基数 base 的 powers 次幂,得到斜率数组
    slopes = jax.lax.pow(base, powers)

    # 如果 closest_power_of_2 不等于 num_heads
    if closest_power_of_2 != num_heads:
        # 计算额外的基数,用于增加的头部数量
        extra_base = jnp.array(2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))), dtype=jnp.float32)
        # 计算剩余头部的数量
        num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2)
        # 生成额外的幂,从1到2 * num_remaining_heads,步长为2的浮点数数组
        extra_powers = jnp.arange(1, 1 + 2 * num_remaining_heads, 2, dtype=jnp.float32)
        # 将额外计算得到的幂添加到斜率数组中
        slopes = jnp.concatenate([slopes, jax.lax.pow(extra_base, extra_powers)], axis=0)

    # Build the position-index tensor: the cumulative sum of the attention mask gives each token's position
    # (padding positions become 0); the resulting shape is (batch_size, 1, seq_length)
    arange_tensor = ((attention_mask.cumsum(axis=-1) - 1) * attention_mask)[:, None, :]
    # 计算 Alibi 张量,将斜率数组乘以 arange_tensor
    alibi = slopes[..., None] * arange_tensor
    # 在第三个维度上扩展 Alibi 张量
    alibi = jnp.expand_dims(alibi, axis=2)
    # 返回 Alibi 张量的 numpy 数组表示,以指定的数据类型
    return jnp.asarray(alibi, dtype)
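A short usage sketch (not part of the original file) makes the shapes concrete; note that, as written, the function keeps separate batch and head axes. The numbers below assume 4 attention heads and the `build_alibi_tensor` defined above.

```
import jax.numpy as jnp

# Toy batch: two sequences of length 5, the second one left-padded by two positions.
attention_mask = jnp.array([[1, 1, 1, 1, 1],
                            [0, 0, 1, 1, 1]], dtype=jnp.int32)

alibi = build_alibi_tensor(attention_mask, num_heads=4)
print(alibi.shape)     # (2, 4, 1, 5): one bias row per head, broadcast over query positions
print(alibi[0, 0, 0])  # head-0 slope is 0.25, so positions 0..4 give [0., 0.25, 0.5, 0.75, 1.]
print(alibi[1, 0, 0])  # padded positions contribute 0: [0., 0., 0., 0.25, 0.5]
```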
# 定义一个名为 `FlaxBloomAttention` 的神经网络模块
class FlaxBloomAttention(nn.Module):
    # 类变量:用于存储 BloomConfig 的配置信息
    config: BloomConfig
    # 默认数据类型为 jnp.float32
    dtype: jnp.dtype = jnp.float32

    # 初始化函数
    def setup(self):
        # 从配置中获取隐藏层大小
        self.hidden_size = self.config.hidden_size
        # 从配置中获取注意力头的数量
        self.num_heads = self.config.n_head
        # 计算每个注意力头的维度
        self.head_dim = self.hidden_size // self.num_heads
        # Compute the attention softmax in float32 when the module runs in a lower-precision dtype, for numerical stability
        self.attention_softmax_in_fp32 = self.dtype is not jnp.float32

        # 如果隐藏层大小不能被注意力头的数量整除,抛出数值错误异常
        if self.head_dim * self.num_heads != self.hidden_size:
            raise ValueError(
                f"`hidden_size` must be divisible by `num_heads` (got `hidden_size`: {self.hidden_size} and "
                f"`num_heads`: {self.num_heads})."
            )

        # 部分函数定义:Dense 层,设置数据类型和权重初始化方式
        dense = partial(
            nn.Dense,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
        )

        # Fused query/key/value projection: one Dense layer that outputs 3 * hidden_size features
        self.query_key_value = dense(self.hidden_size * 3)
        # 初始化输出 Dense 层
        self.dense = dense(self.hidden_size)
        # 初始化残差 Dropout 层
        self.resid_dropout = nn.Dropout(rate=self.config.hidden_dropout)

    # 将隐藏状态分割成多个注意力头
    def _split_heads(self, hidden_states):
        return hidden_states.reshape(hidden_states.shape[:-1] + (self.num_heads, self.head_dim * 3))

    # 合并多个注意力头为一个隐藏状态
    def _merge_heads(self, hidden_states):
        return hidden_states.reshape(hidden_states.shape[:2] + (self.hidden_size,))

    # 神经网络结构的定义,装饰器标记为 nn.compact
    @nn.compact
    # 从 transformers.models.gptj.modeling_flax_gptj.FlaxGPTJAttention._concatenate_to_cache 复制的方法
    def _concatenate_to_cache(self, key, value, query, attention_mask):
        """
        This function takes projected key, value states from a single input token and concatenates the states to cached
        states from previous steps. This function is slighly adapted from the official Flax repository:
        https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
        """
        # 检测是否通过缺少现有缓存数据来进行初始化。
        is_initialized = self.has_variable("cache", "cached_key")
        # 获取缓存的键值,如果不存在则创建全零数组
        cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
        # 获取缓存的值,如果不存在则创建全零数组
        cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
        # 获取缓存索引,如果不存在则创建一个值为0的整数数组
        cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))

        if is_initialized:
            # 解构缓存键的形状以获取批次维度、最大长度、注意力头数和每个头部的深度
            *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
            # 使用新的一维空间切片更新键、值缓存
            cur_index = cache_index.value
            indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
            key = lax.dynamic_update_slice(cached_key.value, key, indices)
            value = lax.dynamic_update_slice(cached_value.value, value, indices)
            # 更新缓存中的键值
            cached_key.value = key
            cached_value.value = value
            # 更新缓存索引以反映已更新的缓存向量数量
            num_updated_cache_vectors = query.shape[1]
            cache_index.value = cache_index.value + num_updated_cache_vectors
            # 生成用于缓存解码器自注意力的因果掩码:我们的单个查询位置应仅关注已生成和缓存的键位置,而不是剩余的零元素。
            pad_mask = jnp.broadcast_to(
                jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
                tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
            )
            # 结合前面计算的掩码和输入的注意力掩码
            attention_mask = combine_masks(pad_mask, attention_mask)
        
        # 返回更新后的键、值和注意力掩码
        return key, value, attention_mask
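To see the caching pattern in isolation, here is a minimal, self-contained sketch of the same `lax.dynamic_update_slice` plus pad-mask logic; the buffer sizes below are made up for illustration and are not tied to the model classes.

```
import jax.numpy as jnp
from jax import lax

# Pre-allocated key cache for a max decoding length of 8 (batch=1, 2 heads, head_dim=4).
max_length, num_heads, head_dim = 8, 2, 4
cached_key = jnp.zeros((1, max_length, num_heads, head_dim))
cache_index = 3                                   # three tokens have already been cached
new_key = jnp.ones((1, 1, num_heads, head_dim))   # projected key of the newly generated token

# Write the new key into the cache at the current index (the same call used in the method above).
cached_key = lax.dynamic_update_slice(cached_key, new_key, (0, cache_index, 0, 0))

# The causal pad mask only allows attention to positions that are already filled.
pad_mask = jnp.arange(max_length) < cache_index + 1
print(pad_mask)  # [ True  True  True  True False False False False]
```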
# 定义一个基于 nn.Module 的类 BloomGELU
class BloomGELU(nn.Module):
    # 初始化函数,设置数据类型为 jnp.float32
    def setup(self):
        self.dtype = jnp.float32

    # 对象被调用时执行的函数,实现了 GELU 激活函数
    def __call__(self, x):
        # 计算 GELU 函数的输出:x * 0.5 * (1.0 + tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
        return x * 0.5 * (1.0 + tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
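The constant 0.79788456 is (to float precision) sqrt(2/pi), so this is the standard tanh approximation of GELU. A small sanity check against `jax.nn.gelu` (the equivalence to `approximate=True` is an assumption worth verifying on your own install):

```
import jax
import jax.numpy as jnp

x = jnp.linspace(-3.0, 3.0, 7)
bloom_gelu = x * 0.5 * (1.0 + jnp.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
# jax.nn.gelu with approximate=True uses the same tanh formulation.
print(jnp.allclose(bloom_gelu, jax.nn.gelu(x, approximate=True), atol=1e-5))  # True
```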


# 定义一个基于 nn.Module 的类 FlaxBloomMLP
class FlaxBloomMLP(nn.Module):
    # 类型定义为 BloomConfig 类
    config: BloomConfig
    # 数据类型设置为 jnp.float32
    dtype: jnp.dtype = jnp.float32

    # 初始化函数,设置模型结构
    def setup(self):
        # 从配置中获取隐藏层大小
        hidden_size = self.config.hidden_size

        # 初始化权重的方式为正态分布,标准差为配置中的 initializer_range
        kernel_init = jax.nn.initializers.normal(self.config.initializer_range)

        # 创建全连接层对象,将输入维度为 hidden_size * 4,输出维度为 hidden_size
        self.dense_h_to_4h = nn.Dense(4 * hidden_size, dtype=self.dtype, kernel_init=kernel_init)
        # 创建全连接层对象,将输入维度为 hidden_size,输出维度为 hidden_size
        self.dense_4h_to_h = nn.Dense(hidden_size, dtype=self.dtype, kernel_init=kernel_init)
        # 创建 Dropout 层对象,丢弃率为配置中的 hidden_dropout
        self.hidden_dropout = nn.Dropout(self.config.hidden_dropout)
        # 创建 GELU 激活函数层对象
        self.act = BloomGELU()

    # 对象被调用时执行的函数,实现了多层感知机(MLP)的前向传播逻辑
    def __call__(self, hidden_states, residual, deterministic: bool = True):
        # 输入经过全连接层 dense_h_to_4h 和 GELU 激活函数 act
        hidden_states = self.dense_h_to_4h(hidden_states)
        hidden_states = self.act(hidden_states)

        # 经过全连接层 dense_4h_to_h 得到中间输出
        intermediate_output = self.dense_4h_to_h(hidden_states)

        # 将中间输出与残差相加
        intermediate_output = intermediate_output + residual
        # 应用 Dropout 操作
        hidden_states = self.hidden_dropout(intermediate_output, deterministic=deterministic)

        # 返回处理后的 hidden_states
        return hidden_states
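The shape flow of this MLP is hidden_size -> 4*hidden_size -> hidden_size, with the residual added before dropout. A tiny standalone module (hypothetical, just for shape checking; not the class above) mirrors that pattern:

```
import jax
import jax.numpy as jnp
import flax.linen as nn


class TinyBloomMLP(nn.Module):
    hidden_size: int = 8

    @nn.compact
    def __call__(self, x, residual):
        h = nn.Dense(4 * self.hidden_size)(x)   # h -> 4h
        h = h * 0.5 * (1.0 + jnp.tanh(0.79788456 * h * (1 + 0.044715 * h * h)))  # tanh GELU
        h = nn.Dense(self.hidden_size)(h)       # 4h -> h
        return h + residual                     # residual added before (deterministic) dropout


x = jnp.ones((2, 5, 8))
params = TinyBloomMLP().init(jax.random.PRNGKey(0), x, x)
print(TinyBloomMLP().apply(params, x, x).shape)  # (2, 5, 8)
```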


# 定义一个基于 nn.Module 的类 FlaxBloomBlock
class FlaxBloomBlock(nn.Module):
    # 类型定义为 BloomConfig 类
    config: BloomConfig
    # 数据类型设置为 jnp.float32
    dtype: jnp.dtype = jnp.float32

    # 初始化函数,设置模型结构
    def setup(self):
        # 输入层的 LayerNorm 操作,epsilon 为配置中的 layer_norm_epsilon
        self.input_layernorm = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)

        # 自注意力机制层对象 FlaxBloomAttention
        self.self_attention = FlaxBloomAttention(self.config, dtype=self.dtype)
        # 后自注意力层的 LayerNorm 操作,epsilon 为配置中的 layer_norm_epsilon
        self.post_attention_layernorm = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)

        # 多层感知机(MLP)对象 FlaxBloomMLP
        self.mlp = FlaxBloomMLP(self.config, dtype=self.dtype)

        # 是否在 LayerNorm 后应用残差连接的标志,从配置中获取
        self.apply_residual_connection_post_layernorm = self.config.apply_residual_connection_post_layernorm
        # 隐藏层 Dropout 概率,从配置中获取
        self.hidden_dropout = self.config.hidden_dropout

    # 对象被调用时执行的函数,实现了 Bloom Transformer 中的一个 Block 的前向传播逻辑
    def __call__(
        self,
        hidden_states,
        alibi,
        attention_mask=None,
        deterministic: bool = True,
        init_cache: bool = False,
        output_attentions: bool = False,
        ):
            # 对输入进行 layer normalization 处理
            layernorm_output = self.input_layernorm(hidden_states)

            # 如果配置要求在保存残差之前进行 layer normalization
            if self.apply_residual_connection_post_layernorm:
                # 将 layer normalization 后的结果作为残差
                residual = layernorm_output
            else:
                # 否则将未处理的隐藏状态作为残差
                residual = hidden_states

            # 进行自注意力机制
            attn_outputs = self.self_attention(
                layernorm_output,
                residual=residual,
                alibi=alibi,
                attention_mask=attention_mask,
                deterministic=deterministic,
                init_cache=init_cache,
                output_attentions=output_attentions,
            )

            # 获取自注意力机制的输出
            attention_output = attn_outputs[0]

            # 获取额外的输出
            outputs = attn_outputs[1:]

            # 在自注意力输出后进行 layer normalization
            post_layernorm = self.post_attention_layernorm(attention_output)

            # 根据配置设置残差
            if self.apply_residual_connection_post_layernorm:
                # 如果配置要求在后置 layer normalization 后使用残差
                residual = post_layernorm
            else:
                # 否则使用注意力输出作为残差
                residual = attention_output

            # 将 post-layernorm 结果和残差输入到 MLP 中进行处理
            output = self.mlp(post_layernorm, residual, deterministic=deterministic)

            # 将 MLP 的输出与其他输出合并
            outputs = (output,) + outputs

            # 返回所有输出
            return outputs
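Stripped of the Flax plumbing, the block's control flow depends only on `apply_residual_connection_post_layernorm`: it decides whether the residual branch carries the pre- or post-layernorm activations. A plain-Python sketch with placeholder sublayers (the lambdas are stand-ins, not the real modules; the residual additions that the real submodules perform internally are written out inline here):

```
import jax.numpy as jnp

ln = lambda x: (x - x.mean(-1, keepdims=True)) / (x.std(-1, keepdims=True) + 1e-5)  # stand-in layer norm
attn = lambda x: 0.5 * x   # stand-in for self-attention + output projection
mlp = lambda x: 2.0 * x    # stand-in for dense_h_to_4h -> GELU -> dense_4h_to_h


def bloom_block(hidden_states, post_layernorm_residual: bool):
    ln_out = ln(hidden_states)
    residual = ln_out if post_layernorm_residual else hidden_states
    attn_out = attn(ln_out) + residual              # attention sublayer + chosen residual
    post_ln = ln(attn_out)
    residual = post_ln if post_layernorm_residual else attn_out
    return mlp(post_ln) + residual                  # MLP sublayer + chosen residual


x = jnp.ones((1, 3, 4))
print(bloom_block(x, post_layernorm_residual=False).shape)  # (1, 3, 4)
```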
class FlaxBloomPreTrainedModel(FlaxPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # 使用 BloomConfig 类作为配置类
    config_class = BloomConfig
    # 基础模型的前缀名称
    base_model_prefix = "transformer"
    # 模块类,初始化为 None
    module_class: nn.Module = None

    def __init__(
        self,
        config: BloomConfig,
        input_shape: Tuple = (1, 1),
        seed: int = 0,
        dtype: jnp.dtype = jnp.float32,
        _do_init: bool = True,
        **kwargs,
    ):
        # 使用给定的配置和参数初始化模块对象
        module = self.module_class(config=config, dtype=dtype, **kwargs)
        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)

    def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
        # 初始化输入张量
        input_ids = jnp.zeros(input_shape, dtype="i4")
        # 创建一个和 input_ids 相同形状的全1张量作为 attention_mask
        attention_mask = jnp.ones_like(input_ids)
        # 拆分随机数生成器为 params_rng 和 dropout_rng
        params_rng, dropout_rng = jax.random.split(rng)
        rngs = {"params": params_rng, "dropout": dropout_rng}

        # 使用模块的初始化方法生成随机参数
        random_params = self.module.init(rngs, input_ids, attention_mask, return_dict=False)["params"]

        if params is not None:
            # 如果提供了初始参数,则用随机生成的参数填充缺失的键
            random_params = flatten_dict(unfreeze(random_params))
            params = flatten_dict(unfreeze(params))
            for missing_key in self._missing_keys:
                params[missing_key] = random_params[missing_key]
            self._missing_keys = set()
            return freeze(unflatten_dict(params))
        else:
            # 如果没有提供初始参数,则直接返回随机生成的参数
            return random_params

    def init_cache(self, batch_size, max_length):
        """
        Args:
            batch_size (`int`):
                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
            max_length (`int`):
                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
                cache.
        """
        # 初始化用于检索缓存的输入变量
        input_ids = jnp.ones((batch_size, max_length), dtype="i4")
        # 创建一个和 input_ids 相同形状的全1张量作为 attention_mask
        attention_mask = jnp.ones_like(input_ids)

        # 使用模块的初始化方法,设置 init_cache=True 来初始化缓存
        init_variables = self.module.init(
            jax.random.PRNGKey(0), input_ids, attention_mask, return_dict=False, init_cache=True
        )
        return unfreeze(init_variables["cache"])

    @add_start_docstrings_to_model_forward(BLOOM_INPUTS_DOCSTRING)
    def __call__(
        self,
        input_ids,
        attention_mask=None,
        past_key_values: dict = None,
        params: dict = None,
        dropout_rng: jax.random.PRNGKey = None,
        train: bool = False,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        ):
            # 如果 output_attentions 不为 None,则使用指定的 output_attentions;否则使用配置中的默认值
            output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
            # 如果 output_hidden_states 不为 None,则使用指定的 output_hidden_states;否则使用配置中的默认值
            output_hidden_states = (
                output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
            )
            # 如果 return_dict 不为 None,则使用指定的 return_dict;否则使用配置中的默认值
            return_dict = return_dict if return_dict is not None else self.config.use_return_dict

            # 获取输入张量的批量大小和序列长度
            batch_size, sequence_length = input_ids.shape

            # 如果 attention_mask 为 None,则创建一个全为 1 的注意力掩码张量
            if attention_mask is None:
                attention_mask = jnp.ones((batch_size, sequence_length))

            # 如果 dropout_rng 不为 None,则将其作为随机数生成器加入到 rngs 字典中
            rngs = {}
            if dropout_rng is not None:
                rngs["dropout"] = dropout_rng

            # 准备输入参数字典,包括模型参数或者当前实例的参数
            inputs = {"params": params or self.params}

            # 如果传入了 past_key_values,则将其作为缓存传递给模型
            if past_key_values:
                inputs["cache"] = past_key_values
                # 设置 mutable 变量以确保缓存可以被修改
                mutable = ["cache"]
            else:
                mutable = False

            # 调用模型的 apply 方法,执行前向推断
            outputs = self.module.apply(
                inputs,
                jnp.array(input_ids, dtype="i4"),
                jnp.array(attention_mask, dtype="i4"),
                not train,
                False,
                output_attentions,
                output_hidden_states,
                return_dict,
                rngs=rngs,
                mutable=mutable,
            )

            # 如果 past_key_values 不为 None 且 return_dict 为 True,则将更新后的缓存添加到模型输出中
            if past_key_values is not None and return_dict:
                outputs, past_key_values = outputs
                outputs["past_key_values"] = unfreeze(past_key_values["cache"])
                return outputs
            # 如果 past_key_values 不为 None 且 return_dict 为 False,则将更新后的缓存插入到模型输出的适当位置
            elif past_key_values is not None and not return_dict:
                outputs, past_key_values = outputs
                outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:]

            # 返回模型的输出
            return outputs
class FlaxBloomBlockCollection(nn.Module):
    config: BloomConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        # 初始化模块,创建包含多个 FlaxBloomBlock 的层列表
        self.layers = [
            FlaxBloomBlock(self.config, name=str(layer_number), dtype=self.dtype)
            for layer_number in range(self.config.num_hidden_layers)
        ]

    def __call__(
        self,
        hidden_states,
        alibi,
        attention_mask=None,
        deterministic: bool = True,
        init_cache: bool = False,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
    ):
        # 根据是否输出注意力和隐藏状态,初始化空元组或者 None
        all_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None

        # 遍历每一层并执行前向传播
        for layer_number in range(self.config.num_hidden_layers):
            if output_hidden_states:
                # 如果输出隐藏状态,将当前隐藏状态添加到 all_hidden_states 中
                all_hidden_states += (hidden_states,)

            # 调用当前层的前向传播,并更新 hidden_states
            layer_outputs = self.layers[layer_number](
                hidden_states,
                alibi=alibi,
                attention_mask=attention_mask,
                deterministic=deterministic,
                init_cache=init_cache,
                output_attentions=output_attentions,
            )
            hidden_states = layer_outputs[0]

            if output_attentions:
                # 如果输出注意力,将当前层的注意力添加到 all_attentions 中
                all_attentions += (layer_outputs[1],)

        # 输出包含可能为 None 的元组,由 FlaxBloomModule 进一步处理
        outputs = (hidden_states, all_hidden_states, all_attentions)

        return outputs


class FlaxBloomModule(nn.Module):
    config: BloomConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        # 初始化模块,设置词嵌入维度和初始化 word embeddings 和 layernorm
        self.embed_dim = self.config.hidden_size

        self.word_embeddings = nn.Embed(
            self.config.vocab_size,
            self.embed_dim,
            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
            dtype=self.dtype,
        )

        self.word_embeddings_layernorm = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)

        # 初始化 transformer 层集合
        self.h = FlaxBloomBlockCollection(self.config, dtype=self.dtype)

        # 初始化最终 layernorm
        self.ln_f = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)

    def __call__(
        self,
        input_ids=None,
        attention_mask=None,
        deterministic=True,
        init_cache: bool = False,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # 使用输入的词嵌入层,将输入的词索引转换为词嵌入向量
        inputs_embeds = self.word_embeddings(input_ids)
        # 执行词嵌入向量后的层归一化操作
        hidden_states = self.word_embeddings_layernorm(inputs_embeds)

        # 根据注意力掩码构建alibi(假设),其形状和数据类型与隐藏状态相匹配
        alibi = build_alibi_tensor(attention_mask, self.config.n_head, dtype=hidden_states.dtype)

        # 将隐藏状态、alibi、注意力掩码以及其他参数传递给self.h函数进行处理
        outputs = self.h(
            hidden_states,
            alibi=alibi,
            attention_mask=attention_mask,
            deterministic=deterministic,
            init_cache=init_cache,
            output_hidden_states=output_hidden_states,
            output_attentions=output_attentions,
        )

        # 获取self.h函数的输出中的隐藏状态
        hidden_states = outputs[0]
        # 对最终的隐藏状态再进行一次层归一化
        hidden_states = self.ln_f(hidden_states)

        # 如果需要输出所有隐藏状态,则将其存储在all_hidden_states中
        if output_hidden_states:
            all_hidden_states = outputs[1] + (hidden_states,)
            outputs = (hidden_states, all_hidden_states) + outputs[2:]
        else:
            outputs = (hidden_states,) + outputs[1:]

        # 如果不需要使用字典形式返回结果,则将输出转换为元组并去除None值
        if not return_dict:
            return tuple(v for v in [outputs[0], outputs[-1]] if v is not None)

        # 使用自定义的输出类生成包含特定属性的输出对象
        return FlaxBaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            hidden_states=outputs[1],
            attentions=outputs[-1],
        )
@add_start_docstrings(
    "The bare Bloom Model transformer outputting raw hidden-states without any specific head on top.",
    BLOOM_START_DOCSTRING,
)
# 从transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoModel复制而来,将GPTNeo替换为Bloom
class FlaxBloomModel(FlaxBloomPreTrainedModel):
    module_class = FlaxBloomModule


append_call_sample_docstring(FlaxBloomModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutput, _CONFIG_FOR_DOC)
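For completeness, a minimal forward pass through the bare model (again with a made-up tiny config so it runs without downloading any weights):

```
import jax.numpy as jnp
from transformers import BloomConfig, FlaxBloomModel

config = BloomConfig(vocab_size=256, hidden_size=64, n_layer=2, n_head=4)
model = FlaxBloomModel(config)                      # randomly initialized
input_ids = jnp.array([[5, 6, 7, 8]], dtype="i4")
outputs = model(input_ids)
print(outputs.last_hidden_state.shape)              # (1, 4, 64)
```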


class FlaxBloomForCausalLMModule(nn.Module):
    config: BloomConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self):
        # 使用给定的配置创建Bloom模块
        self.transformer = FlaxBloomModule(self.config, dtype=self.dtype)
        # 创建语言模型头部,连接到Bloom模块的输出
        self.lm_head = nn.Dense(
            self.config.vocab_size,
            use_bias=False,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
        )

    def __call__(
        self,
        input_ids,
        attention_mask,
        deterministic: bool = True,
        init_cache: bool = False,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # 调用Bloom模块进行前向传播
        outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            deterministic=deterministic,
            init_cache=init_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]

        if self.config.tie_word_embeddings:
            # 如果配置要求共享词嵌入矩阵,则获取共享的权重矩阵并应用于语言模型头部
            shared_kernel = self.transformer.variables["params"]["word_embeddings"]["embedding"].T
            lm_logits = self.lm_head.apply({"params": {"kernel": shared_kernel}}, hidden_states)
        else:
            # 否则直接将隐藏状态传递给语言模型头部
            lm_logits = self.lm_head(hidden_states)

        if not return_dict:
            # 如果不要求返回字典,则返回元组形式的结果
            return (lm_logits,) + outputs[1:]

        # 否则将结果封装成FlaxCausalLMOutput类型并返回
        return FlaxCausalLMOutput(logits=lm_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)


@add_start_docstrings(
    """
    The Bloom Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """,
    BLOOM_START_DOCSTRING,
)
# FlaxBloomForCausalLM的子类,添加了语言建模头部
class FlaxBloomForCausalLM(FlaxBloomPreTrainedModel):
    module_class = FlaxBloomForCausalLMModule
    # 为生成准备输入的函数,接受输入的ID,最大长度和可选的注意力掩码
    def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None):
        # 初始化缓存,获取输入的批次大小和序列长度
        batch_size, seq_length = input_ids.shape

        # 使用self.init_cache方法初始化过去键值
        past_key_values = self.init_cache(batch_size, max_length)

        # 注意:通常需要在attention_mask中将超出input_ids.shape[-1]和cache_length之间的位置设置为0。
        # 但由于Bloom使用因果掩码,这些位置已经被屏蔽。因此,我们可以在这里创建一个静态的attention_mask,
        # 这样更有效地进行编译。
        extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
        if attention_mask is not None:
            # 使用lax.dynamic_update_slice将attention_mask动态更新到extended_attention_mask中的指定位置
            extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0))

        # 返回一个包含过去键值和扩展的注意力掩码的字典
        return {
            "past_key_values": past_key_values,
            "attention_mask": extended_attention_mask,
        }

    # 更新用于生成的输入,将模型输出中的过去键值添加到模型参数中
    def update_inputs_for_generation(self, model_outputs, model_kwargs):
        model_kwargs["past_key_values"] = model_outputs.past_key_values
        return model_kwargs
# 调用一个函数来添加一个样例文档字符串
append_call_sample_docstring(FlaxBloomForCausalLM, _CHECKPOINT_FOR_DOC, FlaxCausalLMOutput, _CONFIG_FOR_DOC)
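Putting the pieces together, `generate()` drives `prepare_inputs_for_generation` / `update_inputs_for_generation` internally. A sketch with a tiny random model (the sizes and the explicit `pad_token_id` are arbitrary choices for the example):

```
import jax.numpy as jnp
from transformers import BloomConfig, FlaxBloomForCausalLM

config = BloomConfig(vocab_size=256, hidden_size=64, n_layer=2, n_head=4)
model = FlaxBloomForCausalLM(config)
input_ids = jnp.array([[1, 2, 3]], dtype="i4")

# Greedy decoding for a few steps; the static attention mask and cache discussed above are built internally.
out = model.generate(input_ids, max_length=8, do_sample=False, pad_token_id=0)
print(out.sequences.shape)  # (1, 8)
```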

.\models\bloom\tokenization_bloom_fast.py

# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for Bloom."""


import pickle
from typing import Optional, Tuple

from ...tokenization_utils_base import BatchEncoding
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging


logger = logging.get_logger(__name__)

# 定义用于存储tokenizer文件名的常量
VOCAB_FILES_NAMES = {"tokenizer_file": "tokenizer.json"}

# 定义预训练模型到tokenizer文件映射的常量
PRETRAINED_VOCAB_FILES_MAP = {
    "tokenizer_file": {
        "bigscience/tokenizer": "https://huggingface.co/bigscience/tokenizer/blob/main/tokenizer.json",
        "bigscience/bloom-560m": "https://huggingface.co/bigscience/bloom-560m/blob/main/tokenizer.json",
        "bigscience/bloom-1b1": "https://huggingface.co/bigscience/bloom-1b1/blob/main/tokenizer.json",
        "bigscience/bloom-1b7": "https://huggingface.co/bigscience/bloom-1b7/blob/main/tokenizer.json",
        "bigscience/bloom-3b": "https://huggingface.co/bigscience/bloom-3b/blob/main/tokenizer.json",
        "bigscience/bloom-7b1": "https://huggingface.co/bigscience/bloom-7b1/blob/main/tokenizer.json",
        "bigscience/bloom": "https://huggingface.co/bigscience/bloom/blob/main/tokenizer.json",
    },
}


class BloomTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" Bloom tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
    Byte-Pair-Encoding.

    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
    be encoded differently whether it is at the beginning of the sentence (without space) or not:

    ```
    >>> from transformers import BloomTokenizerFast

    >>> tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom")
    >>> tokenizer("Hello world")["input_ids"]
    [59414, 8876]

    >>> tokenizer(" Hello world")["input_ids"]
    [86153, 8876]
    ```

    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since
    the model was not pretrained this way, it might yield a decrease in performance.

    <Tip>

    When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.

    </Tip>

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.
    """
    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        errors (`str`, *optional*, defaults to `"replace"`):
            Paradigm to follow when decoding bytes to UTF-8. See
            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
        unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str`, *optional*, defaults to `<|endoftext|>`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
            The end of sequence token.
        add_prefix_space (`bool`, *optional*, defaults to `False`):
            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
            other word. (Bloom tokenizer detect beginning of words by the preceding space).
        trim_offsets (`bool`, *optional*, defaults to `True`):
            Whether or not the post-processing step should trim offsets to avoid including whitespaces.
    """
    # 定义预训练模型所需的文件名称
    vocab_files_names = VOCAB_FILES_NAMES
    # 预训练模型的文件映射
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    # 模型输入的名称列表
    model_input_names = ["input_ids", "attention_mask"]
    # 慢速分词器类,默认为 None
    slow_tokenizer_class = None
    # 没有 `max_model_input_sizes`,因为 BLOOM 使用 ALiBi 位置嵌入

    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token="<pad>",
        add_prefix_space=False,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
        # 调用父类的初始化方法,传递必要的参数和可选参数
        super().__init__(
            vocab_file,
            merges_file,
            tokenizer_file=tokenizer_file,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            add_prefix_space=add_prefix_space,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )
        # 序列化后的预分词器和解码器状态
        pre_tok_state = pickle.dumps(self.backend_tokenizer.pre_tokenizer)
        decoder_state = pickle.dumps(self.backend_tokenizer.decoder)

        # 如果需要添加前缀空格,则更新序列化状态以匹配配置
        if add_prefix_space:
            pre_tok_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
            decoder_state = decoder_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
        # 反序列化并更新后端分词器的预分词器和解码器
        self.backend_tokenizer.pre_tokenizer = pickle.loads(pre_tok_state)
        self.backend_tokenizer.decoder = pickle.loads(decoder_state)

        # 设置类属性,记录是否添加前缀空格
        self.add_prefix_space = add_prefix_space
    # 定义一个方法 `_batch_encode_plus`,接受任意位置参数和关键字参数,并返回 `BatchEncoding` 对象
    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
        # 从关键字参数中获取 `is_split_into_words`,默认为 False
        is_split_into_words = kwargs.get("is_split_into_words", False)
        # 如果 `add_prefix_space` 为 False 并且 `is_split_into_words` 也为 False,则抛出异常
        if not (self.add_prefix_space or not is_split_into_words):
            raise Exception(
                f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with"
                " pretokenized inputs."
            )

        # 调用父类的 `_batch_encode_plus` 方法,并传递所有位置参数和关键字参数
        return super()._batch_encode_plus(*args, **kwargs)

    # 定义一个方法 `_encode_plus`,接受任意位置参数和关键字参数,并返回 `BatchEncoding` 对象
    def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
        # 从关键字参数中获取 `is_split_into_words`,默认为 False
        is_split_into_words = kwargs.get("is_split_into_words", False)

        # 如果 `add_prefix_space` 为 False 并且 `is_split_into_words` 也为 False,则抛出异常
        if not (self.add_prefix_space or not is_split_into_words):
            raise Exception(
                f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with"
                " pretokenized inputs."
            )

        # 调用父类的 `_encode_plus` 方法,并传递所有位置参数和关键字参数
        return super()._encode_plus(*args, **kwargs)
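A hedged usage sketch of the check above (the checkpoint name is illustrative and requires a download; the exact token ids may differ):

```
from transformers import BloomTokenizerFast

tok = BloomTokenizerFast.from_pretrained("bigscience/bloom-560m")
try:
    tok(["Hello", "world"], is_split_into_words=True)
except Exception as err:
    print(err)  # asks you to instantiate the tokenizer with add_prefix_space=True

tok_ws = BloomTokenizerFast.from_pretrained("bigscience/bloom-560m", add_prefix_space=True)
print(tok_ws(["Hello", "world"], is_split_into_words=True)["input_ids"])
```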

    # 定义一个方法 `save_vocabulary`,接受一个保存目录路径 `save_directory` 和一个可选的文件名前缀 `filename_prefix`,返回一个包含文件名的元组
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        # 调用 `_tokenizer` 对象的 `model.save` 方法,将模型保存到指定的 `save_directory` 中,并指定文件名前缀 `filename_prefix`
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        # 返回保存的文件名构成的元组
        return tuple(files)

    @property
    # 定义一个属性 `default_chat_template`,返回一个简单的聊天模板字符串,该模板忽略角色信息,并用 EOS 标记连接消息
    def default_chat_template(self):
        """
        A simple chat template that ignores role information and just concatenates messages with EOS tokens.
        """
        # 发出警告日志,提示用户未定义聊天模板,使用默认模板
        logger.warning_once(
            "\nNo chat template is defined for this tokenizer - using the default template "
            f"for the {self.__class__.__name__} class. If the default is not appropriate for "
            "your model, please set `tokenizer.chat_template` to an appropriate template. "
            "See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
        )
        # 返回默认的聊天模板字符串,用于处理消息
        return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}"

.\models\bloom\__init__.py

# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import TYPE_CHECKING

from ...utils import (
    # 捕获未安装的依赖包的异常
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_flax_available,
    is_tokenizers_available,
    is_torch_available,
)

_import_structure = {
    # 导入BLOOM配置相关结构
    "configuration_bloom": ["BLOOM_PRETRAINED_CONFIG_ARCHIVE_MAP", "BloomConfig", "BloomOnnxConfig"],
}

# Check whether the optional `tokenizers` dependency is available
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    # tokenizers is not installed: skip registering the fast tokenizer
    pass
else:
    _import_structure["tokenization_bloom_fast"] = ["BloomTokenizerFast"]

# Check whether PyTorch is available
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    # PyTorch is not installed: skip registering the PyTorch model classes
    pass
else:
    _import_structure["modeling_bloom"] = [
        "BLOOM_PRETRAINED_MODEL_ARCHIVE_LIST",
        "BloomForCausalLM",
        "BloomModel",
        "BloomPreTrainedModel",
        "BloomForSequenceClassification",
        "BloomForTokenClassification",
        "BloomForQuestionAnswering",
    ]

# Check whether Flax is available
try:
    if not is_flax_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    # Flax is not installed: skip registering the Flax model classes
    pass
else:
    _import_structure["modeling_flax_bloom"] = [
        "FlaxBloomForCausalLM",
        "FlaxBloomModel",
        "FlaxBloomPreTrainedModel",
    ]

# During type checking (static analysis), import everything eagerly so type checkers see the real symbols
if TYPE_CHECKING:
    from .configuration_bloom import BLOOM_PRETRAINED_CONFIG_ARCHIVE_MAP, BloomConfig, BloomOnnxConfig

    # Import the tokenizers-backed classes if the dependency is available
    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .tokenization_bloom_fast import BloomTokenizerFast

    # Import the PyTorch model classes if the dependency is available
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_bloom import (
            BLOOM_PRETRAINED_MODEL_ARCHIVE_LIST,
            BloomForCausalLM,
            BloomForQuestionAnswering,
            BloomForSequenceClassification,
            BloomForTokenClassification,
            BloomModel,
            BloomPreTrainedModel,
        )

    # Import the Flax model classes if the dependency is available
    try:
        if not is_flax_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_flax_bloom import (
            FlaxBloomForCausalLM,
            FlaxBloomModel,
            FlaxBloomPreTrainedModel,
        )

else:
    import sys

    # At runtime, register a _LazyModule under this module's name in sys.modules so that the submodules
    # listed in _import_structure are only imported on first attribute access
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
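The practical effect of the `_LazyModule` registration is that importing the package is cheap and the submodules are only loaded when an attribute is first accessed. A small sketch:

```
import transformers.models.bloom as bloom_package

print(type(bloom_package).__name__)   # _LazyModule: the placeholder registered in sys.modules
config = bloom_package.BloomConfig()  # first attribute access imports configuration_bloom
print(config.model_type)              # "bloom"
```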

.\models\bridgetower\configuration_bridgetower.py

# coding=utf-8
# 设置模块的版权声明和许可信息

""" BridgeTower model configuration"""
# 引入必要的库和模块
import os
from typing import Union

# 从相对路径引入配置工具和日志模块
from ...configuration_utils import PretrainedConfig
from ...utils import logging

# 获取日志记录器
logger = logging.get_logger(__name__)

# 预训练模型配置映射,将模型名称映射到其配置文件的下载链接
BRIDGETOWER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "BridgeTower/bridgetower-base": "https://huggingface.co/BridgeTower/bridgetower-base/blob/main/config.json",
    "BridgeTower/bridgetower-base-itm-mlm": (
        "https://huggingface.co/BridgeTower/bridgetower-base-itm-mlm/blob/main/config.json"
    ),
}

# 定义一个配置类 BridgeTowerVisionConfig,用于存储视觉编码器的配置信息
class BridgeTowerVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the vision configuration of a [`BridgeTowerModel`]. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the bridgetower-base
    [BridgeTower/bridgetower-base](https://huggingface.co/BridgeTower/bridgetower-base/) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in visual encoder model.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        image_size (`int`, *optional*, defaults to 288):
            The size (resolution) of each image.
        initializer_factor (`float`, *optional*, defaults to 1):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        stop_gradient (`bool`, *optional*, defaults to `False`):
            Whether to stop gradient for training.
        share_layernorm (`bool`, *optional*, defaults to `True`):
            Whether LayerNorm layers are shared.
        remove_last_layer (`bool`, *optional*, defaults to `False`):
            Whether to remove the last layer from the vision encoder.
    """

    # 初始化函数,设置各种可选参数的默认值
    def __init__(
        self,
        hidden_size=768,
        num_hidden_layers=12,
        patch_size=16,
        image_size=288,
        initializer_factor=1.0,
        layer_norm_eps=1e-05,
        stop_gradient=False,
        share_layernorm=True,
        remove_last_layer=False,
        **kwargs
    ):
        # 调用父类的初始化函数,传递配置参数
        super().__init__(**kwargs)
        # 设置实例变量,存储每个参数的值
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.patch_size = patch_size
        self.image_size = image_size
        self.initializer_factor = initializer_factor
        self.layer_norm_eps = layer_norm_eps
        self.stop_gradient = stop_gradient
        self.share_layernorm = share_layernorm
        self.remove_last_layer = remove_last_layer
    >>> from transformers import BridgeTowerVisionConfig

    # 导入 BridgeTowerVisionConfig 类

    >>> # Initializing a BridgeTower BridgeTower/bridgetower-base style configuration for the vision model
    # 初始化一个 BridgeTower 风格的视觉模型配置,使用 BridgeTower/bridgetower-base 风格

    >>> configuration = BridgeTowerVisionConfig()

    # 创建一个 BridgeTowerVisionConfig 的实例,用于配置视觉模型

    >>> # Accessing the configuration
    # 访问配置实例
    >>> configuration
    ```"""

    model_type = "bridgetower_vision_model"

    # 设置模型类型为 "bridgetower_vision_model"

    def __init__(
        self,
        hidden_size=768,
        num_hidden_layers=12,
        num_channels=3,
        patch_size=16,
        image_size=288,
        initializer_factor=1,
        layer_norm_eps=1e-05,
        stop_gradient=False,
        share_layernorm=True,
        remove_last_layer=False,
        **kwargs,
    ):
        # 初始化方法,接受多个参数用于配置模型的各个属性
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.initializer_factor = initializer_factor
        self.layer_norm_eps = layer_norm_eps
        self.stop_gradient = stop_gradient
        self.share_layernorm = share_layernorm
        self.remove_last_layer = remove_last_layer

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        # 类方法,从预训练模型加载配置
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        if config_dict.get("model_type") == "bridgetower":
            config_dict = config_dict["text_config"]

        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)
# BridgeTowerTextConfig 类继承自 PretrainedConfig,用于存储文本模型的配置信息
class BridgeTowerTextConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the text configuration of a [`BridgeTowerModel`]. The default values here
    are copied from RoBERTa. Instantiating a configuration with the defaults will yield a similar configuration to that
    of the bridgetower-base [BridgeTower/bridgetower-base](https://huggingface.co/BridgeTower/bridgetower-base/)
    architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Example:

    ```
    >>> from transformers import BridgeTowerTextConfig

    >>> # Initializing a BridgeTower BridgeTower/bridgetower-base style configuration for the text model
    >>> configuration = BridgeTowerTextConfig()

    >>> # Accessing the configuration
    >>> configuration
    ```"""

    # 模型类型为 "bridgetower_text_model"
    model_type = "bridgetower_text_model"

    # 初始化方法,设置各种模型参数
    def __init__(
        self,
        vocab_size=50265,  # 词汇表大小,默认为 50265
        hidden_size=768,   # 隐藏层大小,默认为 768
        num_hidden_layers=12,  # 隐藏层数,默认为 12
        num_attention_heads=12,  # 注意力头数,默认为 12
        initializer_factor=1,    # 初始化因子,默认为 1
        intermediate_size=3072,  # 中间层大小,默认为 3072
        hidden_act="gelu",        # 隐藏层激活函数,默认为 "gelu"
        hidden_dropout_prob=0.1,  # 隐藏层 dropout 概率,默认为 0.1
        attention_probs_dropout_prob=0.1,  # 注意力 dropout 概率,默认为 0.1
        max_position_embeddings=514,       # 最大位置嵌入数,默认为 514
        type_vocab_size=1,                 # 类型词汇表大小,默认为 1
        layer_norm_eps=1e-05,              # 层归一化 epsilon,默认为 1e-05
        pad_token_id=1,                    # 填充 token 的 id,默认为 1
        bos_token_id=0,                    # 开始 token 的 id,默认为 0
        eos_token_id=2,                    # 结束 token 的 id,默认为 2
        position_embedding_type="absolute",  # 位置嵌入类型,默认为 "absolute"
        use_cache=True,                     # 是否使用缓存,默认为 True
        **kwargs,
    ):
        super().__init__(**kwargs)  # 调用父类 PretrainedConfig 的初始化方法

        # 设置各个参数
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.initializer_factor = initializer_factor
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.layer_norm_eps = layer_norm_eps
        self.position_embedding_type = position_embedding_type
        self.use_cache = use_cache
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        # Get the configuration dictionary and any remaining keyword arguments from the pretrained model name or path
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # If the loaded configuration is a full "bridgetower" config, keep only its "text_config" section
        if config_dict.get("model_type") == "bridgetower":
            config_dict = config_dict["text_config"]

        # Warn when instantiating a config of a different model type, which is not supported for every model
        # and can yield errors
        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        # Build and return the configuration object from the dictionary
        return cls.from_dict(config_dict, **kwargs)
# BridgeTowerConfig 类,用于存储 BridgeTowerModel 的配置信息
class BridgeTowerConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`BridgeTowerModel`]. It is used to instantiate a
    BridgeTower model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the bridgetower-base
    [BridgeTower/bridgetower-base](https://huggingface.co/BridgeTower/bridgetower-base/) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        share_cross_modal_transformer_layers (`bool`, *optional*, defaults to `True`):
            Whether cross modal transformer layers are shared.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler.
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        initializer_factor (`float`, *optional*, defaults to 1):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        share_link_tower_layers (`bool`, *optional*, defaults to `False`):
            Whether the bride/link tower layers are shared.
        link_tower_type (`str`, *optional*, defaults to `"add"`):
            Type of the bridge/link layer.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 6):
            Number of hidden layers in the Transformer encoder.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie input and output embeddings.
        init_layernorm_from_vision_encoder (`bool`, *optional*, defaults to `False`):
            Whether to init LayerNorm from the vision encoder.
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`BridgeTowerTextConfig`].
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`BridgeTowerVisionConfig`].

    Example:

    ```
    >>> from transformers import BridgeTowerModel, BridgeTowerConfig

    >>> # Initializing a BridgeTower BridgeTower/bridgetower-base style configuration
    >>> configuration = BridgeTowerConfig()

    >>> # Initializing a model from the BridgeTower/bridgetower-base style configuration
    >>> model = BridgeTowerModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
    # 设置模型类型为 "bridgetower"
    model_type = "bridgetower"
    
    # Constructor: sets the various parameters of a BridgeTowerConfig instance
    def __init__(
            self,
            share_cross_modal_transformer_layers=True,  # 是否共享跨模态 Transformer 层
            hidden_act="gelu",  # 隐藏层激活函数类型,默认为 gelu
            hidden_size=768,  # 隐藏层大小,默认为 768
            initializer_factor=1,  # 初始化因子,默认为 1
            layer_norm_eps=1e-05,  # LayerNormalization 中的 epsilon,默认为 1e-05
            share_link_tower_layers=False,  # 是否共享链接塔层
            link_tower_type="add",  # 链接塔类型,默认为 add
            num_attention_heads=12,  # 注意力头数目,默认为 12
            num_hidden_layers=6,  # 隐藏层层数,默认为 6
            tie_word_embeddings=False,  # 是否绑定单词嵌入
            init_layernorm_from_vision_encoder=False,  # 是否从视觉编码器初始化层归一化
            text_config=None,  # 文本配置,如果为 None 则使用默认值
            vision_config=None,  # 视觉配置,如果为 None 则使用默认值
            **kwargs,  # 其他参数
        ):
            # TODO: remove this once the Hub files are updated.
            _ = kwargs.pop("text_config_dict", None)  # 从 kwargs 中移除 "text_config_dict" 键的值
            _ = kwargs.pop("vision_config_dict", None)  # 从 kwargs 中移除 "vision_config_dict" 键的值
    
            super().__init__(**kwargs)  # 调用父类的初始化方法,传入剩余的关键字参数
    
            # 设置类的实例变量
            self.share_cross_modal_transformer_layers = share_cross_modal_transformer_layers
            self.hidden_act = hidden_act
            self.hidden_size = hidden_size
            self.initializer_factor = initializer_factor
            self.layer_norm_eps = layer_norm_eps
            self.share_link_tower_layers = share_link_tower_layers
            self.link_tower_type = link_tower_type
            self.num_attention_heads = num_attention_heads
            self.num_hidden_layers = num_hidden_layers
            self.tie_word_embeddings = tie_word_embeddings
            self.init_layernorm_from_vision_encoder = init_layernorm_from_vision_encoder
    
            # 如果 text_config 为 None,则使用默认空字典,并记录日志消息
            if text_config is None:
                text_config = {}
                logger.info("`text_config` is `None`. Initializing the `BridgeTowerTextConfig` with default values.")
    
            # 如果 vision_config 为 None,则使用默认空字典,并记录日志消息
            if vision_config is None:
                vision_config = {}
                logger.info("`vision_config` is `None`. Initializing the `BridgeTowerVisionConfig` with default values.")
    
            # 根据给定的 text_config 创建 BridgeTowerTextConfig 的实例,并赋值给 self.text_config
            self.text_config = BridgeTowerTextConfig(**text_config)
    
            # 根据给定的 vision_config 创建 BridgeTowerVisionConfig 的实例,并赋值给 self.vision_config
            self.vision_config = BridgeTowerVisionConfig(**vision_config)
    
        @classmethod
        # 类方法,从 text_config 和 vision_config 创建 BridgeTowerConfig 的实例
        def from_text_vision_configs(
            cls, text_config: BridgeTowerTextConfig, vision_config: BridgeTowerVisionConfig, **kwargs
        ):
            r"""
            从 BridgeTower 文本模型配置实例化一个 [`BridgeTowerConfig`](或其派生类)。返回:
                [`BridgeTowerConfig`]: 配置对象的一个实例
            """
    
            # 调用类的构造函数,传入 text_config 和 vision_config 的字典表示,以及其他关键字参数
            return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
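
To see how the pieces above fit together, here is a minimal usage sketch (assuming the public `BridgeTowerConfig`, `BridgeTowerTextConfig` and `BridgeTowerVisionConfig` exports of `transformers`; the parameter values are illustrative only) that builds the nested configurations separately and combines them via `from_text_vision_configs`:

```python
from transformers import BridgeTowerConfig, BridgeTowerTextConfig, BridgeTowerVisionConfig

# Build the nested configurations explicitly, then combine them with the helper classmethod
text_config = BridgeTowerTextConfig(hidden_size=768, num_hidden_layers=12)
vision_config = BridgeTowerVisionConfig(hidden_size=768, num_hidden_layers=12)

config = BridgeTowerConfig.from_text_vision_configs(
    text_config, vision_config, link_tower_type="scaled_add", num_hidden_layers=6
)

print(config.link_tower_type)          # "scaled_add"
print(config.text_config.hidden_size)  # 768
```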

.\models\bridgetower\image_processing_bridgetower.py

# 定义脚本的编码格式为 UTF-8
# 版权声明,指明版权归属和保留的权利

"""BridgeTower 的图像处理器类。"""

from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

import numpy as np  # 导入 NumPy 库

# 导入图像处理相关的工具和函数
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import PaddingMode, center_crop, pad, resize, to_channel_dimension_format
from ...image_utils import (
    OPENAI_CLIP_MEAN,
    OPENAI_CLIP_STD,
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    get_image_size,
    infer_channel_dimension_format,
    is_batched,
    is_scaled_image,
    to_numpy_array,
    valid_images,
    validate_kwargs,
    validate_preprocess_arguments,
)
# 导入通用的工具函数
from ...utils import TensorType, is_vision_available, logging

# 如果视觉功能可用,则导入 PIL 库
if is_vision_available():
    import PIL

# 获取 logger 对象用于记录日志信息
logger = logging.get_logger(__name__)


# 从 transformers 模块中复制的函数定义,计算可迭代值中每个索引的最大值并返回列表
def max_across_indices(values: Iterable[Any]) -> List[Any]:
    """
    Return the maximum value across all indices of an iterable of values.
    """
    return [max(values_i) for values_i in zip(*values)]


# 从 transformers 模块中复制的函数定义,为图像创建像素掩码,其中 1 表示有效像素,0 表示填充像素
def make_pixel_mask(
    image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> np.ndarray:
    """
    Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.

    Args:
        image (`np.ndarray`):
            Image to make the pixel mask for.
        output_size (`Tuple[int, int]`):
            Output size of the mask.
    """
    # 获取图像的高度和宽度
    input_height, input_width = get_image_size(image, channel_dim=input_data_format)
    # 创建一个与输出大小相同的像素掩码数组,初始值为 0
    mask = np.zeros(output_size, dtype=np.int64)
    # 将有效图像区域标记为 1
    mask[:input_height, :input_width] = 1
    return mask
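
To illustrate the mask semantics (1 marks valid pixels, 0 marks padding), a tiny stand-alone NumPy sketch of what the helper above produces for a 2x3 image placed in a 4x4 canvas:

```python
import numpy as np

# A 2x3 image placed in a 4x4 padded canvas: the top-left 2x3 block is valid, the rest is padding
mask = np.zeros((4, 4), dtype=np.int64)
mask[:2, :3] = 1
print(mask)
# [[1 1 1 0]
#  [1 1 1 0]
#  [0 0 0 0]
#  [0 0 0 0]]
```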


# 从 transformers 模块中复制的函数定义,获取批处理中所有图像的最大高度和宽度
def get_max_height_width(
    images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> List[int]:
    """
    Get the maximum height and width across all images in a batch.
    """
    # 如果未指定数据格式,则推断第一个图像的通道格式
    if input_data_format is None:
        input_data_format = infer_channel_dimension_format(images[0])
    # 如果输入数据格式为首先通道维度
    if input_data_format == ChannelDimension.FIRST:
        # 获取所有图像的形状,并取得最大的高度和宽度
        _, max_height, max_width = max_across_indices([img.shape for img in images])
    # 如果输入数据格式为最后通道维度
    elif input_data_format == ChannelDimension.LAST:
        # 获取所有图像的形状,并取得最大的高度和宽度
        max_height, max_width, _ = max_across_indices([img.shape for img in images])
    else:
        # 如果输入数据格式既不是首先也不是最后通道维度,则引发值错误异常
        raise ValueError(f"Invalid channel dimension format: {input_data_format}")
    # 返回最大的高度和宽度作为元组
    return (max_height, max_width)
# 从transformers.models.vilt.image_processing_vilt.get_resize_output_image_size复制而来的函数
def get_resize_output_image_size(
    input_image: np.ndarray,
    shorter: int = 800,
    longer: int = 1333,
    size_divisor: int = 32,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Tuple[int, int]:
    # 获取输入图像的高度和宽度
    input_height, input_width = get_image_size(input_image, input_data_format)
    
    # 定义最小和最大的尺寸
    min_size, max_size = shorter, longer

    # 计算缩放比例
    scale = min_size / min(input_height, input_width)

    # 根据图像高度与宽度的比较来调整新的高度和宽度
    if input_height < input_width:
        new_height = min_size
        new_width = scale * input_width
    else:
        new_height = scale * input_height
        new_width = min_size

    # 如果新的高度或宽度超过最大尺寸,则再次调整缩放比例
    if max(new_height, new_width) > max_size:
        scale = max_size / max(new_height, new_width)
        new_height = scale * new_height
        new_width = scale * new_width

    # 四舍五入并确保高度和宽度是size_divisor的倍数
    new_height, new_width = int(new_height + 0.5), int(new_width + 0.5)
    new_height = new_height // size_divisor * size_divisor
    new_width = new_width // size_divisor * size_divisor

    # 返回新的高度和宽度作为元组
    return new_height, new_width
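
To make the `shorter`/`longer`/`size_divisor` interaction concrete, the following self-contained sketch re-derives the same arithmetic for one example input (a 480x640 image with `shorter=288`, the value BridgeTower uses; the numbers are illustrative only):

```python
# Standalone re-derivation of the resize arithmetic for a 480x640 (H x W) image with
# shorter=288 and longer=int(1333 / 800 * 288) = 479, as used by the BridgeTower processor.
def sketch_resize(height, width, shorter=288, longer=479, size_divisor=32):
    scale = shorter / min(height, width)
    if height < width:
        new_h, new_w = shorter, scale * width
    else:
        new_h, new_w = scale * height, shorter
    if max(new_h, new_w) > longer:
        rescale = longer / max(new_h, new_w)
        new_h, new_w = rescale * new_h, rescale * new_w
    new_h, new_w = int(new_h + 0.5), int(new_w + 0.5)
    return new_h // size_divisor * size_divisor, new_w // size_divisor * size_divisor

print(sketch_resize(480, 640))  # (288, 384): the 480-pixel short side maps to 288, aspect ratio is kept
```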


class BridgeTowerImageProcessor(BaseImageProcessor):
    r"""
    构建一个BridgeTower图像处理器。

    """

    # 模型输入的名称列表
    model_input_names = ["pixel_values"]

    def __init__(
        self,
        do_resize: bool = True,
        size: Dict[str, int] = None,
        size_divisor: int = 32,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_center_crop: bool = True,
        crop_size: Dict[str, int] = None,
        do_pad: bool = True,
        **kwargs,
    ) -> None:
        if "pad_and_return_pixel_mask" in kwargs:
            # 如果传入参数中包含"pad_and_return_pixel_mask",则获取并移除这个参数
            do_pad = kwargs.pop("pad_and_return_pixel_mask")
    
        super().__init__(**kwargs)  # 调用父类初始化方法,传递其他参数给父类
    
        size = size if size is not None else {"shortest_edge": 288}  # 如果未指定size参数,则设定短边至少为288像素
        size = get_size_dict(size, default_to_square=False)  # 调用函数获取处理后的size字典
    
        # 将初始化参数赋值给对象的属性
        self.do_resize = do_resize
        self.size = size
        self.size_divisor = size_divisor
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
        self.do_pad = do_pad
        self.do_center_crop = do_center_crop
        self.crop_size = crop_size
    
        # 验证处理器参数的有效性,列出所有可能的有效键
        self._valid_processor_keys = [
            "images",
            "do_resize",
            "size",
            "size_divisor",
            "resample",
            "do_rescale",
            "rescale_factor",
            "do_normalize",
            "image_mean",
            "image_std",
            "do_pad",
            "do_center_crop",
            "crop_size",
            "return_tensors",
            "data_format",
            "input_data_format",
        ]
    def resize(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        size_divisor: int = 32,
        resample: Optional[PILImageResampling] = PILImageResampling.BICUBIC,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Resize an image.

        Resizes the shorter side of the image to `size["shortest_edge"]` while preserving the aspect ratio. If the
        longer side is larger than the max size `(int(size["shortest_edge"] * 1333 / 800))`, the longer side is then
        resized to the max size while preserving the aspect ratio.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`Dict[str, int]`):
                Controls the size of the output image. Should be of the form `{"shortest_edge": int}`.
            size_divisor (`int`, defaults to 32):
                The image is resized to a size that is a multiple of this value.
            resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.BICUBIC`):
                Resampling filter to use when resizing the image.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
        
        Returns:
            np.ndarray: Resized image.

        Raises:
            ValueError: If `size` dictionary does not contain the key `"shortest_edge"`.

        """
        # Ensure the `size` dictionary is properly formatted for resizing
        size = get_size_dict(size, default_to_square=False)
        
        # Check if the required key "shortest_edge" exists in the size dictionary
        if "shortest_edge" not in size:
            raise ValueError(f"The `size` dictionary must contain the key `shortest_edge`. Got {size.keys()}")
        
        # Retrieve the value of the shortest edge size from the `size` dictionary
        shorter = size["shortest_edge"]
        
        # Calculate the longer side size based on the aspect ratio constraint
        longer = int(1333 / 800 * shorter)
        
        # Compute the final output size for resizing the image
        output_size = get_resize_output_image_size(
            image, shorter=shorter, longer=longer, size_divisor=size_divisor, input_data_format=input_data_format
        )
        
        # Perform the actual resizing operation using the specified parameters
        return resize(
            image,
            size=output_size,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )

    def center_crop(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Center crop an image to `(size["height"], size["width"])`. If the input size is smaller than `crop_size` along
        any edge, the image is padded with 0's and then center cropped.

        Args:
            image (`np.ndarray`):
                Image to center crop.
            size (`Dict[str, int]`):
                Size of the output image in the form `{"height": h, "width": w}`.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred from the input
                image.
        """
        output_size = size["shortest_edge"]
        return center_crop(
            image,
            size=(output_size, output_size),
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )


    # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor._pad_image
    def _pad_image(
        self,
        image: np.ndarray,
        output_size: Tuple[int, int],
        constant_values: Union[float, Iterable[float]] = 0,
        data_format: Optional[ChannelDimension] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.ndarray:
        """
        Pad an image with zeros to the given size.

        Args:
            image (`np.ndarray`):
                Input image to be padded.
            output_size (`Tuple[int, int]`):
                Desired output size of the image in format `(height, width)`.
            constant_values (`Union[float, Iterable[float]]`, *optional*):
                Value or sequence of values to pad the image with. Default is 0.
            data_format (`ChannelDimension`, *optional*):
                Format of the output image channel dimension. If not specified, defaults to `None`.
            input_data_format (`Union[str, ChannelDimension]`, *optional*):
                Format of the input image channel dimension. If not specified, defaults to `None`.

        Returns:
            np.ndarray:
                Padded image of shape `(output_size[0], output_size[1], channels)`.
        """
        input_height, input_width = get_image_size(image, channel_dim=input_data_format)
        output_height, output_width = output_size

        pad_bottom = output_height - input_height
        pad_right = output_width - input_width
        padding = ((0, pad_bottom), (0, pad_right))
        
        # Perform padding operation with constant values
        padded_image = pad(
            image,
            padding,
            mode=PaddingMode.CONSTANT,
            constant_values=constant_values,
            data_format=data_format,
            input_data_format=input_data_format,
        )
        return padded_image


    # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.pad
    def pad(
        self,
        images: List[np.ndarray],
        constant_values: Union[float, Iterable[float]] = 0,
        return_pixel_mask: bool = True,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> BatchFeature:
        """
        Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
        in the batch and optionally returns their corresponding pixel mask.

        Args:
            images (`List[np.ndarray]`):
                Batch of images to pad.
            constant_values (`float` or `Iterable[float]`, *optional*):
                The value to use for the padding if `mode` is `"constant"`.
            return_pixel_mask (`bool`, *optional*, defaults to `True`):
                Whether to return a pixel mask.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                    - Unset: Return a list of `np.ndarray`.
                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
        """
        # 获取批量图像中最大高度和宽度,并返回作为填充大小
        pad_size = get_max_height_width(images, input_data_format=input_data_format)

        # 对每张图像进行填充处理,保证它们达到批量中最大高度和宽度的大小,使用指定的常量值进行填充
        padded_images = [
            self._pad_image(
                image,
                pad_size,
                constant_values=constant_values,
                data_format=data_format,
                input_data_format=input_data_format,
            )
            for image in images
        ]
        # 构建返回的数据字典,包含填充后的图像数组
        data = {"pixel_values": padded_images}

        # 如果需要返回像素掩码
        if return_pixel_mask:
            # 对每张图像生成相应的像素掩码,并加入数据字典中
            masks = [
                make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format)
                for image in images
            ]
            data["pixel_mask"] = masks

        # 返回一个 BatchFeature 对象,其中包含填充后的数据和指定类型的张量
        return BatchFeature(data=data, tensor_type=return_tensors)
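
The effect of batching differently sized images can be seen with plain NumPy. This is a simplified re-implementation of the same bottom/right zero-padding plus pixel-mask logic, not a call to the method above:

```python
import numpy as np

# Two channels-last images of different sizes
images = [np.ones((4, 6, 3)), np.ones((5, 3, 3))]

# Pad every image at the bottom/right up to the batch-wide maximum height and width
max_h = max(img.shape[0] for img in images)
max_w = max(img.shape[1] for img in images)

padded, masks = [], []
for img in images:
    pad_h, pad_w = max_h - img.shape[0], max_w - img.shape[1]
    padded.append(np.pad(img, ((0, pad_h), (0, pad_w), (0, 0))))
    mask = np.zeros((max_h, max_w), dtype=np.int64)
    mask[: img.shape[0], : img.shape[1]] = 1
    masks.append(mask)

print(padded[0].shape, padded[1].shape)  # (5, 6, 3) (5, 6, 3)
print(masks[1].sum())                    # 15 valid pixels out of 30
```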
    # 定义图像预处理方法,接受多个参数来控制不同的预处理步骤和参数
    def preprocess(
        self,
        images: ImageInput,  # 输入的图像数据,可以是单张图像或图像列表
        do_resize: Optional[bool] = None,  # 是否进行图像尺寸调整的标志
        size: Optional[Dict[str, int]] = None,  # 调整后的图像尺寸,以字典形式表示
        size_divisor: Optional[int] = None,  # 尺寸调整时的除数,用于确保尺寸是某个数的倍数
        resample: PILImageResampling = None,  # 图像调整大小时使用的重采样方法
        do_rescale: Optional[bool] = None,  # 是否进行图像尺度调整的标志
        rescale_factor: Optional[float] = None,  # 图像尺度调整的比例因子
        do_normalize: Optional[bool] = None,  # 是否进行图像标准化的标志
        image_mean: Optional[Union[float, List[float]]] = None,  # 图像标准化时的均值
        image_std: Optional[Union[float, List[float]]] = None,  # 图像标准化时的标准差
        do_pad: Optional[bool] = None,  # 是否进行图像填充的标志
        do_center_crop: Optional[bool] = None,  # 是否进行图像中心裁剪的标志
        crop_size: Dict[str, int] = None,  # 图像裁剪后的尺寸,以字典形式表示
        return_tensors: Optional[Union[str, TensorType]] = None,  # 返回数据的张量类型,如numpy数组或torch张量
        data_format: ChannelDimension = ChannelDimension.FIRST,  # 图像数据的通道顺序,FIRST表示通道在前
        input_data_format: Optional[Union[str, ChannelDimension]] = None,  # 输入数据的通道顺序
        **kwargs,  # 其他未明确定义的参数,以字典形式接收
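
The `preprocess` entry point above is what `__call__` dispatches to. A minimal end-to-end usage sketch (assuming the published `BridgeTower/bridgetower-base` checkpoint and a local RGB image `example.jpg`; the printed shapes are indicative only):

```python
from PIL import Image
from transformers import BridgeTowerImageProcessor

# Load the processor configuration shipped with the base checkpoint
image_processor = BridgeTowerImageProcessor.from_pretrained("BridgeTower/bridgetower-base")

image = Image.open("example.jpg").convert("RGB")  # any RGB image

# __call__ runs preprocess(): resize, rescale, normalize, center crop and pad, then batch
inputs = image_processor(images=image, return_tensors="pt")
print(inputs["pixel_values"].shape)  # e.g. torch.Size([1, 3, 288, 288])
print(inputs["pixel_mask"].shape)    # e.g. torch.Size([1, 288, 288])
```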

.\models\bridgetower\modeling_bridgetower.py

# coding=utf-8
# 版权声明及许可证信息

"""PyTorch BridgeTower Model"""

import math
from collections import OrderedDict
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss

# 导入自定义模块
from ...activations import ACT2FN, QuickGELUActivation
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    MaskedLMOutput,
    ModelOutput,
    SequenceClassifierOutput,
)
from ...modeling_utils import PreTrainedModel, apply_chunking_to_forward
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from .configuration_bridgetower import BridgeTowerConfig, BridgeTowerTextConfig, BridgeTowerVisionConfig

# 获取日志记录器
logger = logging.get_logger(__name__)

# 提供给文档的配置、检查点和分词器
_CONFIG_FOR_DOC = "BridgeTowerConfig"
_CHECKPOINT_FOR_DOC = "BridgeTower/bridgetower-base"
_TOKENIZER_FOR_DOC = "RobertaTokenizer"

# 预训练模型存档列表
BRIDGETOWER_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "BridgeTower/bridgetower-base",
    "BridgeTower/bridgetower-base-itm-mlm",
    # 查看所有的 BridgeTower 模型:https://huggingface.co/BridgeTower
]

# BridgeTower 模型的起始文档字符串
BRIDGETOWER_START_DOCSTRING = r"""
    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ subclass. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`BridgeTowerConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# BridgeTower 模型输入文档字符串
BRIDGETOWER_INPUTS_DOCSTRING = r"""
"""

@dataclass
class BridgeTowerModelOutput(ModelOutput):
    """
    Output type of [`BridgeTowerModel`].
    Represents the output of the BridgeTowerModel.
    Inherits from ModelOutput defined in the modeling_outputs module.
    """
    # 定义函数参数:文本特征的隐藏状态,形状为 `(batch_size, text_sequence_length, hidden_size)`
    text_features: torch.FloatTensor = None
    
    # 定义函数参数:图像特征的隐藏状态,形状为 `(batch_size, image_sequence_length, hidden_size)`
    image_features: torch.FloatTensor = None
    
    # 定义函数参数:池化器输出,形状为 `(batch_size, hidden_size x 2)`
    # 这是文本序列和图像序列最后一层隐藏状态的分类标记(第一个标记)的连接,经过用于辅助预训练任务的进一步处理层处理后的结果
    pooler_output: torch.FloatTensor = None
    
    # 定义函数参数(可选):隐藏状态,是一个元组 `tuple(torch.FloatTensor)`
    # 当 `output_hidden_states=True` 或者 `config.output_hidden_states=True` 时返回
    # 包含模型每层输出的 `torch.FloatTensor`,形状为 `(batch_size, sequence_length, hidden_size)`,
    # 包括模型输出每一层的隐藏状态以及可选的初始嵌入输出
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    
    # 定义函数参数(可选):注意力权重,是一个元组 `tuple(torch.FloatTensor)`
    # 当 `output_attentions=True` 或者 `config.output_attentions=True` 时返回
    # 包含每一层注意力权重的 `torch.FloatTensor`,形状为 `(batch_size, num_heads, sequence_length, sequence_length)`
    # 这些是经过注意力 softmax 后的注意力权重,用于计算自注意力头中的加权平均值
    attentions: Optional[Tuple[torch.FloatTensor]] = None
# 使用 dataclass 装饰器定义一个数据类,表示桥塔对比学习任务的模型输出
@dataclass
class BridgeTowerContrastiveOutput(ModelOutput):
    """
    Output type of ['BridgeTowerForContrastiveLearning']

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`:
            Image-text contrastive loss. 图像与文本的对比损失值(当 `return_loss` 为 `True` 时返回)。
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
            语言建模头部的预测分数(SoftMax 前每个词汇标记的分数)。
        text_embeds (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`):
            The text embeddings obtained by applying the projection layer to the pooler_output.
            应用投影层到池化输出后得到的文本嵌入。
        image_embeds (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
            应用投影层到池化输出后得到的图像嵌入。
        cross_embeds  (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`):
            The text-image cross-modal embeddings obtained by applying the projection layer to the pooler_output.
            应用投影层到池化输出后得到的文本-图像跨模态嵌入。
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
            the model at the output of each layer plus the optional initial embedding outputs.
            如果模型有嵌入层,输出嵌入和每一层的输出形成的元组,形状为 `(batch_size, sequence_length, hidden_size)`。
            模型每层的隐藏状态及可选的初始嵌入输出。
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            如果传递了 `output_attentions=True` 或 `config.output_attentions=True`,返回每层的注意力分布,
            形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
    """

    loss: Optional[torch.FloatTensor] = None  # 图像与文本的对比损失值(可选)
    logits: torch.FloatTensor = None  # 语言建模头部的预测分数
    text_embeds: Optional[Tuple[torch.FloatTensor]] = None  # 文本嵌入(可选)
    image_embeds: Optional[Tuple[torch.FloatTensor]] = None  # 图像嵌入(可选)
    cross_embeds: Optional[Tuple[torch.FloatTensor]] = None  # 文本-图像跨模态嵌入(可选)
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None  # 模型每层的隐藏状态及可选的初始嵌入输出(可选)
    attentions: Optional[Tuple[torch.FloatTensor]] = None  # 每层的注意力分布(可选)

class BridgeTowerResidualAttention(nn.Module):
    # 初始化函数,接受配置对象 config
    def __init__(self, config):
        # 调用父类的初始化方法
        super().__init__()

        # 创建多头注意力机制对象,配置隐藏大小和头数
        self.attn = nn.MultiheadAttention(config.hidden_size, config.hidden_size // 64)
        
        # 创建第一个 LayerNorm 层,配置隐藏大小和层归一化的 epsilon 值
        self.ln_1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        
        # 创建包含线性层和激活函数的模块字典
        self.mlp = nn.ModuleDict(
            OrderedDict(
                [
                    ("c_fc", nn.Linear(config.hidden_size, config.hidden_size * 4)),  # 输入到隐藏大小乘以4的线性层
                    ("gelu", QuickGELUActivation()),  # GELU 激活函数
                    ("c_proj", nn.Linear(config.hidden_size * 4, config.hidden_size)),  # 将隐藏大小乘以4的结果线性映射回隐藏大小
                ]
            )
        )
        
        # 创建第二个 LayerNorm 层,配置隐藏大小和层归一化的 epsilon 值
        self.ln_2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        
        # 初始化注意力掩码为 None
        self.attn_mask = None

    # 注意力计算函数,接受隐藏状态和注意力掩码
    def attention(self, hidden_state: torch.Tensor, attention_mask: torch.Tensor):
        # 如果注意力掩码不为 None,则将其转换为布尔类型,并置于与 hidden_state 相同的设备上
        if attention_mask is not None:
            attention_mask = attention_mask.to(dtype=torch.bool, device=hidden_state.device)
        
        # 如果 self.attn_mask 不为 None,则将其转换为 hidden_state 的数据类型,并置于与 hidden_state 相同的设备上
        self.attn_mask = (
            self.attn_mask.to(dtype=hidden_state.dtype, device=hidden_state.device)
            if self.attn_mask is not None
            else None
        )
        
        # 调用多头注意力机制,传入 hidden_state 作为查询、键和值,返回注意力计算结果
        return self.attn(
            hidden_state,
            hidden_state,
            hidden_state,
            need_weights=False,
            attn_mask=self.attn_mask,
            key_padding_mask=attention_mask,
        )[0]

    # 前向传播函数,接受隐藏状态和注意力掩码,默认注意力掩码为 None
    def forward(self, hidden_state: torch.Tensor, attention_mask: torch.Tensor = None):
        # 计算残差连接的隐藏状态
        residual_state = hidden_state + self.attention(self.ln_1(hidden_state), attention_mask)
        
        # 对残差状态进行 LayerNorm
        hidden_state = self.ln_2(residual_state)
        
        # 遍历 MLP 模块字典中的每个层,并对隐藏状态进行处理
        for _, layer in self.mlp.items():
            hidden_state = layer(hidden_state)
        
        # 最终的隐藏状态是残差状态和经过 MLP 处理后的状态的和
        hidden_state = residual_state + hidden_state
        
        # 返回最终的隐藏状态
        return hidden_state
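
The pre-LayerNorm residual pattern of this block can be reproduced with a few lines of standalone PyTorch (toy sizes, a single attention head, and the `(seq_len, batch, hidden)` layout that `nn.MultiheadAttention` expects by default):

```python
import torch
import torch.nn as nn

# Minimal pre-LayerNorm residual block mirroring the structure above (toy sizes, single head)
hidden_size = 8
ln_1, ln_2 = nn.LayerNorm(hidden_size), nn.LayerNorm(hidden_size)
attn = nn.MultiheadAttention(hidden_size, num_heads=1)
mlp = nn.Sequential(nn.Linear(hidden_size, 4 * hidden_size), nn.GELU(), nn.Linear(4 * hidden_size, hidden_size))

x = torch.randn(5, 2, hidden_size)  # (seq_len, batch, hidden): nn.MultiheadAttention default layout
residual = x + attn(ln_1(x), ln_1(x), ln_1(x), need_weights=False)[0]
out = residual + mlp(ln_2(residual))
print(out.shape)  # torch.Size([5, 2, 8])
```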
# Vision embeddings module BridgeTowerVisionEmbeddings: turns pixel values into patch + class-token embeddings
class BridgeTowerVisionEmbeddings(nn.Module):
    def __init__(self, config: BridgeTowerVisionConfig):
        super().__init__()
        # 初始化模型配置
        self.config = config
        # 设定嵌入维度为隐藏大小
        self.embed_dim = config.hidden_size

        # 图像尺寸和补丁大小从配置中获取
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        # 类别嵌入为一个可学习的参数
        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))

        # 补丁嵌入为一个2D卷积层,将输入通道数转换为隐藏大小,不使用偏置
        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        # 计算图像中的补丁数量和位置嵌入的数量
        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1

        # 位置嵌入为一个Embedding层,其索引从0到num_positions-1,维度为embed_dim
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)

        # 注册一个缓冲区,存储位置ID张量,形状为[1, num_positions]
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        # 获取批量大小
        batch_size = pixel_values.shape[0]

        # 目标数据类型为补丁嵌入的权重类型
        target_dtype = self.patch_embedding.weight.dtype

        # 对输入的像素值进行补丁嵌入,输出形状为[*, embed_dim, width//patch_size, grid//patch_size]
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))

        # 将补丁嵌入展平并转置以适应Transformer输入的形状,形状变为[*, num_patches, embed_dim]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        # 类别嵌入扩展为(batch_size, 1, embed_dim)
        class_embeds = self.class_embedding.expand(batch_size, 1, -1)

        # 将类别嵌入和补丁嵌入连接在一起,形状为[batch_size, num_patches+1, embed_dim]
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)

        # 加上位置嵌入,使用先前注册的位置ID张量,形状为[batch_size, num_patches+1, embed_dim]
        embeddings = embeddings + self.position_embedding(self.position_ids)

        # 返回嵌入张量
        return embeddings
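
For intuition about the resulting sequence length, a short sketch of the patch arithmetic with assumed example values (`image_size=288`, `patch_size=16`; the actual values come from the vision config):

```python
# Patch/position arithmetic of the embedding module, with example numbers
image_size, patch_size, embed_dim = 288, 16, 768  # example values, not fixed by the model

num_patches = (image_size // patch_size) ** 2  # 18 * 18 = 324 patch tokens
num_positions = num_patches + 1                # +1 for the prepended class embedding -> 325

# pixel_values: (batch, 3, 288, 288) -> patch_embedding -> (batch, 768, 18, 18)
# flatten + transpose -> (batch, 324, 768); concat class token -> (batch, 325, 768)
print(num_patches, num_positions)  # 324 325
```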


# Vision Transformer module that wraps the embeddings, pre-/post-LayerNorm layers and the Transformer encoder
class BridgeTowerVisionTransformer(nn.Module):
    # Constructor: takes the configuration object and builds the sub-modules
    def __init__(self, config):
        # 调用父类的初始化方法
        super().__init__()

        # 创建视觉嵌入层对象,并传入配置参数
        self.embeddings = BridgeTowerVisionEmbeddings(config)
        
        # 创建 LayerNorm 层,用于在 Transformer 前后对隐藏状态进行归一化
        self.ln_pre = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        
        # 创建 Transformer 层对象,并传入配置参数
        self.transformer = BridgeTowerTransformer(config)
        
        # 创建另一个 LayerNorm 层,用于 Transformer 结束后对隐藏状态再次归一化
        self.ln_post = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        
        # 根据配置参数决定是否共享 LayerNorm 层
        self.share_layernorm = config.share_layernorm
        
        # 如果不共享 LayerNorm 层,则创建独立的 LayerNorm 层列表,数量与 Transformer 层数相同
        if not config.share_layernorm:
            self.ln_separate = nn.ModuleList(
                [nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) for _ in range(config.num_hidden_layers)]
            )

    # 前向传播函数,接收像素值张量和注意力掩码作为输入,返回处理后的隐藏状态张量
    def forward(self, pixel_values: torch.Tensor, attention_mask):
        # 将像素值张量传入视觉嵌入层进行处理
        hidden_states = self.embeddings(pixel_values)
        
        # 对视觉嵌入后的隐藏状态进行 LayerNorm 归一化
        hidden_states = self.ln_pre(hidden_states)
        
        # 将维度顺序从 [batch_size, seq_length, hidden_size] 调整为 [seq_length, batch_size, hidden_size]
        hidden_states = hidden_states.permute(1, 0, 2)

        # 将调整后的隐藏状态输入 Transformer 进行处理
        hidden_states = self.transformer(hidden_states, attention_mask)
        
        # Stack the per-layer outputs returned by the Transformer; shape becomes
        # [num_hidden_layers, seq_length, batch_size, hidden_size]
        hidden_states = torch.stack(hidden_states, dim=0)
        
        # Permute to [num_hidden_layers, batch_size, seq_length, hidden_size]
        hidden_states = hidden_states.permute(0, 2, 1, 3)
        
        # 如果配置中指定共享 LayerNorm 层,则对输出的隐藏状态进行最终的 LayerNorm 归一化
        if self.share_layernorm:
            hidden_states = self.ln_post(hidden_states)
        else:
            # 如果不共享 LayerNorm 层,则分别对每层的隐藏状态进行独立的 LayerNorm 归一化
            hidden_states_stack = []
            for hidden_states, ln in zip(hidden_states, self.ln_separate):
                hidden_states = ln(hidden_states)
                hidden_states_stack.append(hidden_states)
            
            # 将独立归一化后的隐藏状态堆叠起来,形状为 [num_hidden_layers, batch_size, seq_length, hidden_size]
            hidden_states = torch.stack(hidden_states_stack, dim=0)
        
        # 返回最终处理后的隐藏状态张量
        return hidden_states

    # 前向传播函数的预处理部分,只包括视觉嵌入和初始 LayerNorm 归一化,返回处理后的隐藏状态张量
    def forward_pre(self, pixel_values: torch.Tensor):
        hidden_states = self.embeddings(pixel_values)
        hidden_states = self.ln_pre(hidden_states)
        hidden_states = hidden_states.permute(1, 0, 2)
        return hidden_states

    # 前向传播函数的后处理部分,接收隐藏状态张量作为输入,对其进行 LayerNorm 归一化,并返回处理后的输出张量
    def forward_post(self, hidden_state: torch.Tensor):
        # 将输入的隐藏状态张量维度从 [batch_size, seq_length, hidden_size] 调整为 [seq_length, batch_size, hidden_size]
        visual_output_post = hidden_state.permute(1, 0, 2)
        
        # 对调整后的隐藏状态进行最终的 LayerNorm 归一化处理
        visual_output_post = self.ln_post(visual_output_post)
        
        # 返回最终处理后的输出张量
        return visual_output_post
# 定义 BridgeTowerLinkTower 类,继承自 nn.Module
class BridgeTowerLinkTower(nn.Module):
    # 初始化方法,接收一个 config 对象作为参数
    def __init__(self, config):
        super().__init__()
        # 设置 link_tower_type 属性为传入 config 对象的 link_tower_type
        self.link_tower_type = config.link_tower_type
        # 设置 hidden_size 属性为传入 config 对象的 hidden_size
        self.hidden_size = config.hidden_size
        # 如果 link_tower_type 在 ["add", "scaled_add", "interpolate"] 中
        if config.link_tower_type in ["add", "scaled_add", "interpolate"]:
            # 如果 link_tower_type 是 "scaled_add"
            if config.link_tower_type == "scaled_add":
                # 创建一个可训练参数 scaled_factor,初始值为 1.0
                self.scaled_factor = nn.Parameter(torch.tensor(1.0))
            # 如果 link_tower_type 是 "interpolate"
            elif config.link_tower_type == "interpolate":
                # 创建一个可训练参数 beta,初始值为 0.5
                self.beta = nn.Parameter(torch.tensor(0.5))
            # 创建一个 LayerNorm 层,用于对 hidden_size 维度进行归一化,epsilon 值由 config 提供
            self.LayerNorm = nn.LayerNorm(self.hidden_size, eps=config.layer_norm_eps)
        else:
            # 如果 link_tower_type 不在支持的类型中,则抛出未实现异常
            raise NotImplementedError(f"link_tower_type {config.link_tower_type} is not implemented")

    # 前向传播方法,接收 hidden_states, cross_modal_hidden_states 和 attention_mask 作为参数
    def forward(self, hidden_states, cross_modal_hidden_states, attention_mask):
        # 根据 link_tower_type 执行不同的链接操作
        if self.link_tower_type == "add":
            # 返回 LayerNorm 应用于 hidden_states 与 cross_modal_hidden_states 相加的结果
            return self.LayerNorm(hidden_states + cross_modal_hidden_states)
        elif self.link_tower_type == "scaled_add":
            # 返回 LayerNorm 应用于 hidden_states 乘以 scaled_factor 加上 cross_modal_hidden_states 的结果
            return self.LayerNorm(hidden_states * self.scaled_factor + cross_modal_hidden_states)
        elif self.link_tower_type == "interpolate":
            # 返回 LayerNorm 应用于 hidden_states 与 (1 - beta) 相乘加上 cross_modal_hidden_states 与 beta 相乘的结果
            return self.LayerNorm(hidden_states * (1 - self.beta) + cross_modal_hidden_states * self.beta)
        else:
            # 如果 link_tower_type 不在支持的类型中,则抛出未实现异常
            raise NotImplementedError(f"link_tower_type {self.link_tower_type} is not implemented")
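
The three fusion modes can be demonstrated with plain tensors. This minimal sketch mirrors the formulas above with toy values and omits the final LayerNorm so the raw combination stays visible:

```python
import torch

hidden = torch.tensor([1.0, 2.0, 3.0, 4.0])  # tower hidden state (toy values)
cross = torch.tensor([0.5, 0.5, 0.5, 0.5])   # cross-modal hidden state (toy values)
scaled_factor, beta = torch.tensor(1.0), torch.tensor(0.5)

add = hidden + cross                              # "add"
scaled_add = hidden * scaled_factor + cross       # "scaled_add"
interpolate = hidden * (1 - beta) + cross * beta  # "interpolate"

print(add, scaled_add, interpolate, sep="\n")
```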


# 从 transformers.models.bert.modeling_bert.BertSelfOutput 复制并修改为 BridgeTowerSelfOutput
# 定义 BridgeTowerSelfOutput 类,继承自 nn.Module
class BridgeTowerSelfOutput(nn.Module):
    # 初始化方法,接收一个 config 对象作为参数
    def __init__(self, config):
        super().__init__()
        # 创建一个全连接层 dense,输入输出维度都是 config.hidden_size
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # 创建一个 LayerNorm 层,用于对 config.hidden_size 维度进行归一化,epsilon 值由 config 提供
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # 创建一个 Dropout 层,丢弃概率为 config.hidden_dropout_prob
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    # 前向传播方法,接收 hidden_states 和 input_tensor 作为参数
    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # 将 hidden_states 输入到全连接层 dense 中
        hidden_states = self.dense(hidden_states)
        # 对 hidden_states 进行 Dropout 处理
        hidden_states = self.dropout(hidden_states)
        # 返回 LayerNorm 应用于 hidden_states 加上 input_tensor 的结果
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


# 从 transformers.models.bert.modeling_bert.BertIntermediate 复制并修改为 BridgeTowerIntermediate
# 定义 BridgeTowerIntermediate 类,继承自 nn.Module
class BridgeTowerIntermediate(nn.Module):
    # 初始化方法,接收一个 config 对象作为参数
    def __init__(self, config):
        super().__init__()
        # 创建一个全连接层 dense,输入维度为 config.hidden_size,输出维度为 config.intermediate_size
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # 如果 config.hidden_act 是字符串类型
        if isinstance(config.hidden_act, str):
            # 根据 config.hidden_act 的值选择相应的激活函数,并赋值给 intermediate_act_fn
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            # 否则直接使用 config.hidden_act 作为激活函数
            self.intermediate_act_fn = config.hidden_act

    # 前向传播方法,接收 hidden_states 作为参数
    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 将 hidden_states 输入到全连接层 dense 中
        hidden_states = self.dense(hidden_states)
        # 将全连接层的输出应用 intermediate_act_fn 激活函数后返回
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


# 从 transformers.models.bert.modeling_bert.BertOutput 复制并修改为 BridgeTowerOutput
# 定义 BridgeTowerOutput 类,继承自 nn.Module
class BridgeTowerOutput(nn.Module):
    # 初始化函数,用于初始化对象
    def __init__(self, config):
        # 调用父类(nn.Module)的初始化方法
        super().__init__()
        # 创建一个全连接层,输入尺寸为config.intermediate_size,输出尺寸为config.hidden_size
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        # 创建一个 LayerNorm 层,对输入进行归一化处理,归一化维度为config.hidden_size,eps为归一化过程中的小数值偏移量
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # 创建一个 Dropout 层,以config.hidden_dropout_prob的概率随机将输入置零,用于防止过拟合
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    # 前向传播函数,定义了数据流向和处理逻辑
    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # 将输入的 hidden_states 经过全连接层 dense,得到新的隐藏状态
        hidden_states = self.dense(hidden_states)
        # 对新的隐藏状态进行 Dropout 处理,以防止过拟合
        hidden_states = self.dropout(hidden_states)
        # 将经过 Dropout 处理后的隐藏状态与输入的 input_tensor 相加,并经过 LayerNorm 处理,得到最终的隐藏状态
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        # 返回最终的隐藏状态作为输出
        return hidden_states
# 从 transformers.models.bert.modeling_bert.BertPooler 复制代码,将 Bert 改为 BridgeTower
class BridgeTowerPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 定义一个全连接层,将输入维度为 config.hidden_size 的向量映射到相同维度
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # 定义激活函数为 Tanh
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 取出每个样本的第一个 token 对应的隐藏状态作为池化输出
        first_token_tensor = hidden_states[:, 0]
        # 将池化输出输入到全连接层中进行线性变换
        pooled_output = self.dense(first_token_tensor)
        # 使用 Tanh 激活函数处理线性变换的结果
        pooled_output = self.activation(pooled_output)
        # 返回池化后的输出
        return pooled_output


# 从 transformers.models.roberta.modeling_roberta.RobertaSelfAttention 复制代码,将 Roberta 改为 BridgeTower
class BridgeTowerSelfAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        # 检查 hidden_size 是否能被 num_attention_heads 整除,若不能则抛出 ValueError
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        # 设置注意力头数和每个头的大小
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # 定义 Query、Key、Value 的线性变换层
        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        # 定义 Dropout 层,用于在计算注意力分布时进行随机置零
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

        # 设置位置嵌入类型,默认为 absolute
        self.position_embedding_type = position_embedding_type or getattr(
            config, "position_embedding_type", "absolute"
        )

        # 若位置嵌入类型为 relative_key 或 relative_key_query,则使用距离嵌入
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

        # 判断是否为解码器
        self.is_decoder = config.is_decoder

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        # 调整张量形状,以便进行多头注意力计算
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,


# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->BridgeTower
class BridgeTowerAttention(nn.Module):
    # 初始化函数,定义注意力模块的结构
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        # 创建自注意力层对象
        self.self = BridgeTowerSelfAttention(config, position_embedding_type=position_embedding_type)
        # 创建输出层对象
        self.output = BridgeTowerSelfOutput(config)
        # 初始化用于记录剪枝头部的集合
        self.pruned_heads = set()

    # 剪枝头部的方法
    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        # 找到可剪枝头部的索引
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # 剪枝线性层
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # 更新超参数并记录剪枝头部
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    # 前向传播函数
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        # 调用自注意力层进行前向传播
        self_outputs = self.self(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        # 将自注意力层的输出传递给输出层进行处理
        attention_output = self.output(self_outputs[0], hidden_states)
        outputs = (attention_output,) + self_outputs[1:]  # 如果需要输出注意力信息,则添加到输出中
        return outputs
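
The head split performed by `transpose_for_scores` in the self-attention module above can be checked with a quick shape sketch (example sizes only):

```python
import torch

batch, seq_len, num_heads, head_dim = 2, 5, 12, 64     # example sizes; hidden_size = 12 * 64 = 768
x = torch.randn(batch, seq_len, num_heads * head_dim)  # (2, 5, 768)

# Same reshape as transpose_for_scores: split the hidden dim into heads, move heads before the sequence
x = x.view(batch, seq_len, num_heads, head_dim).permute(0, 2, 1, 3)
print(x.shape)  # torch.Size([2, 12, 5, 64])
```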


class BridgeTowerBertCrossLayer(nn.Module):
    # 初始化函数,定义BERT跨层连接模块的结构
    def __init__(self, config):
        super().__init__()
        # 设置前向传播中的分块大小
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        # 序列长度维度
        self.seq_len_dim = 1
        # 创建注意力对象
        self.attention = BridgeTowerAttention(config)
        # 是否为解码器
        self.is_decoder = config.is_decoder
        # 是否添加交叉注意力
        self.add_cross_attention = config.add_cross_attention
        # 创建交叉注意力对象
        self.crossattention = BridgeTowerAttention(config)
        # 创建中间层对象
        self.intermediate = BridgeTowerIntermediate(config)
        # 创建输出层对象
        self.output = BridgeTowerOutput(config)

    # 前向传播函数
    def forward(
        self,
        hidden_states,
        encoder_hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        # 如果是 decoder,decoder uni-directional self-attention 缓存的键/值元组在位置 1 和 2
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask=attention_mask,
            head_mask=None,
            output_attentions=output_attentions,
            past_key_value=None,
        )
        # 获取自注意力机制的输出
        attention_output = self_attention_outputs[0]

        # 如果是 decoder,在最后一个输出中包含了自注意力机制的缓存元组
        # 如果需要输出注意力权重,则添加自注意力机制的输出
        outputs = self_attention_outputs[1:]

        # 执行跨注意力机制
        cross_attention_outputs = self.crossattention(
            attention_output,
            attention_mask=attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
        )
        # 获取跨注意力机制的输出
        attention_output = cross_attention_outputs[0]
        # 如果需要输出注意力权重,则添加跨注意力机制的输出(排除最后一个元素)
        outputs = outputs + cross_attention_outputs[1:-1]

        # 对注意力输出应用分块处理
        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        # 将处理后的输出与先前的输出合并
        outputs = (layer_output,) + outputs

        # 返回最终的输出结果
        return outputs

    # 定义前向传播的分块处理函数
    def feed_forward_chunk(self, attention_output):
        # 通过中间层处理注意力输出
        intermediate_output = self.intermediate(attention_output)
        # 使用输出层处理中间输出和注意力输出
        layer_output = self.output(intermediate_output, attention_output)
        # 返回处理后的层输出
        return layer_output
# 定义一个名为 BridgeTowerTextLayer 的神经网络模块,继承自 nn.Module
class BridgeTowerTextLayer(nn.Module):
    # 初始化函数,接受一个 config 参数
    def __init__(self, config):
        # 调用父类 nn.Module 的初始化函数
        super().__init__()
        # 设置类的属性 chunk_size_feed_forward,从 config 中获取前馈传递的块大小
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        # 设置类的属性 seq_len_dim,指定序列长度的维度为 1
        self.seq_len_dim = 1
        # 创建一个 BridgeTowerAttention 的实例,并赋给类的属性 attention
        self.attention = BridgeTowerAttention(config)
        # 从 config 中获取是否是解码器,并赋给类的属性 is_decoder
        self.is_decoder = config.is_decoder
        # Read from the config whether cross attention is added; if so, create a separate
        # BridgeTowerAttention instance and assign it to the crossattention attribute
        self.add_cross_attention = config.add_cross_attention
        if self.add_cross_attention:
            if not self.is_decoder:
                # 如果不是解码器但添加了交叉注意力,抛出 ValueError 异常
                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
            self.crossattention = BridgeTowerAttention(config, position_embedding_type="absolute")
        # 创建一个 BridgeTowerIntermediate 的实例,并赋给类的属性 intermediate
        self.intermediate = BridgeTowerIntermediate(config)
        # 创建一个 BridgeTowerOutput 的实例,并赋给类的属性 output
        self.output = BridgeTowerOutput(config)

    # 前向传播函数,接受多个输入参数
    def forward(
        self,
        hidden_states: torch.Tensor,  # 隐藏状态张量
        attention_mask: Optional[torch.FloatTensor] = None,  # 注意力掩码张量(可选)
        head_mask: Optional[torch.FloatTensor] = None,  # 头部掩码张量(可选)
        encoder_hidden_states: Optional[torch.FloatTensor] = None,  # 编码器隐藏状态张量(可选)
        encoder_attention_mask: Optional[torch.FloatTensor] = None,  # 编码器注意力掩码张量(可选)
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,  # 过去键值元组的张量(可选)
        output_attentions: Optional[bool] = False,  # 输出注意力张量的标志(可选,默认为 False)
    ) -> Tuple[torch.Tensor]:
        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        # Perform self-attention using the cached key/values if available
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            past_key_value=self_attn_past_key_value,
        )
        attention_output = self_attention_outputs[0]

        # if decoder, the last output is tuple of self-attn cache
        if self.is_decoder:
            # Extract all outputs except the last one as intermediate outputs
            outputs = self_attention_outputs[1:-1]
            # Retrieve the present key/value tuple for self-attention
            present_key_value = self_attention_outputs[-1]
        else:
            # Include self-attentions in outputs if we output attention weights
            outputs = self_attention_outputs[1:]

        cross_attn_present_key_value = None
        if self.is_decoder and encoder_hidden_states is not None:
            if not hasattr(self, "crossattention"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
                    " by setting `config.add_cross_attention=True`"
                )

            # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
            # Perform cross-attention between decoder's self-attention output and encoder's hidden states
            cross_attention_outputs = self.crossattention(
                attention_output,
                attention_mask,
                head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                cross_attn_past_key_value,
                output_attentions,
            )
            attention_output = cross_attention_outputs[0]
            # Include cross-attentions in outputs if we output attention weights
            outputs = outputs + cross_attention_outputs[1:-1]

            # Append cross-attn cache to present_key_value tuple
            cross_attn_present_key_value = cross_attention_outputs[-1]
            present_key_value = present_key_value + cross_attn_present_key_value

        # Apply chunking mechanism for feed-forward layer processing
        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        outputs = (layer_output,) + outputs

        # if decoder, return the attn key/values as the last output
        if self.is_decoder:
            outputs = outputs + (present_key_value,)

        return outputs

    def feed_forward_chunk(self, attention_output):
        # Pass attention output through intermediate layer
        intermediate_output = self.intermediate(attention_output)
        # Apply feed-forward layer to get final layer output
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output
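
`apply_chunking_to_forward` slices the chosen dimension into `chunk_size_feed_forward`-sized pieces, applies `feed_forward_chunk` to each piece and concatenates the results, trading peak memory for extra calls. A simplified stand-alone re-implementation of that idea (not the library helper itself):

```python
import torch

def chunked_feed_forward(forward_fn, chunk_size, dim, x):
    # Apply forward_fn to chunks of x along `dim` and concatenate; position-wise layers give identical results
    if chunk_size == 0:
        return forward_fn(x)
    chunks = x.split(chunk_size, dim=dim)
    return torch.cat([forward_fn(c) for c in chunks], dim=dim)

x = torch.randn(2, 10, 8)   # (batch, seq_len, hidden)
ff = torch.nn.Linear(8, 8)  # stand-in for the intermediate + output sub-modules
full = ff(x)
chunked = chunked_feed_forward(ff, chunk_size=4, dim=1, x=x)
print(torch.allclose(full, chunked, atol=1e-6))  # True: chunking does not change the result
```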
# 从transformers.models.roberta.modeling_roberta.RobertaEncoder复制过来,将Roberta替换为BridgeTowerText
class BridgeTowerTextEncoder(nn.Module):
    # 初始化函数,设置模型配置和层列表
    def __init__(self, config):
        super().__init__()
        # 保存配置信息
        self.config = config
        # 创建包含多个BridgeTowerTextLayer的模块列表,数量为config.num_hidden_layers
        self.layer = nn.ModuleList([BridgeTowerTextLayer(config) for _ in range(config.num_hidden_layers)])
        # 是否启用梯度检查点,默认为False
        self.gradient_checkpointing = False

    # 前向传播函数,接收多个输入参数并返回多个输出
    def forward(
        self,
        hidden_states: torch.Tensor,  # 输入的隐藏状态张量
        attention_mask: Optional[torch.FloatTensor] = None,  # 可选的注意力掩码张量
        head_mask: Optional[torch.FloatTensor] = None,  # 可选的头部掩码张量
        encoder_hidden_states: Optional[torch.FloatTensor] = None,  # 可选的编码器隐藏状态张量
        encoder_attention_mask: Optional[torch.FloatTensor] = None,  # 可选的编码器注意力掩码张量
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,  # 可选的过去的键值元组
        use_cache: Optional[bool] = None,  # 可选的使用缓存标志
        output_attentions: Optional[bool] = False,  # 是否输出注意力权重,默认为False
        output_hidden_states: Optional[bool] = False,  # 是否输出隐藏状态,默认为False
        return_dict: Optional[bool] = True,  # 是否返回字典格式的输出,默认为True
        ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
        # 如果不需要输出隐藏状态,则初始化为空元组;否则为 None
        all_hidden_states = () if output_hidden_states else None
        # 如果不需要输出注意力权重,则初始化为空元组;否则为 None
        all_self_attentions = () if output_attentions else None
        # 如果不需要输出跨层注意力权重或配置不支持,则初始化为空元组;否则为 None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

        # 如果开启了梯度检查点且处于训练模式下
        if self.gradient_checkpointing and self.training:
            # 如果设置了 use_cache=True,则警告并强制设置为 False
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        # Accumulate the decoder key/value cache only when use_cache is enabled: empty tuple, otherwise None
        next_decoder_cache = () if use_cache else None
        # 遍历所有的 Transformer 层
        for i, layer_module in enumerate(self.layer):
            # 如果需要输出隐藏状态,则添加当前层的隐藏状态到 all_hidden_states
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # 如果存在头部掩码,则使用对应的掩码;否则为 None
            layer_head_mask = head_mask[i] if head_mask is not None else None
            # 如果存在历史键值,则使用对应的键值;否则为 None
            past_key_value = past_key_values[i] if past_key_values is not None else None

            # 如果开启了梯度检查点且处于训练模式下
            if self.gradient_checkpointing and self.training:
                # 使用梯度检查点函数计算层的输出
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )
            else:
                # 否则直接调用层模块计算层的输出
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )

            # 更新隐藏状态为当前层输出的隐藏状态
            hidden_states = layer_outputs[0]
            # 如果使用缓存,则将当前层的输出添加到下一个解码器缓存中
            if use_cache:
                next_decoder_cache += (layer_outputs[-1],)
            # 如果需要输出注意力权重,则将当前层的自注意力权重添加到 all_self_attentions
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                # 如果配置支持添加跨层注意力权重,则将当前层的跨层注意力权重添加到 all_cross_attentions
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        # 如果需要输出隐藏状态,则添加最后一个层的隐藏状态到 all_hidden_states
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        # 如果不返回字典形式的结果,则返回元组
        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        # 否则返回包含详细输出的 BaseModelOutputWithPastAndCrossAttentions 对象
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )
# Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->BridgeTowerText
class BridgeTowerTextEmbeddings(nn.Module):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    # 从 transformers.models.bert.modeling_bert.BertEmbeddings.__init__ 复制而来
    def __init__(self, config):
        super().__init__()
        # 词嵌入层,用于将输入的词汇 ID 转换为对应的隐藏表示
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        # 位置嵌入层,用于表示单词在句子中的位置信息
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        # 标记类型嵌入层,用于区分句子中不同类型的标记(如句子 A 和句子 B)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # self.LayerNorm 名称没有改为 snake-case,以保持与 TensorFlow 模型变量名称一致,以便能够加载任何 TensorFlow 检查点文件
        # LayerNorm 层,用于归一化隐藏表示,增加训练稳定性
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # Dropout 层,用于随机丢弃部分神经元的输出,防止过拟合
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # position_ids (1,len position emb)在内存中是连续的,并在序列化时导出
        # 位置嵌入类型,默认为绝对位置编码
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        # 注册位置 ID 张量,用于嵌入层的位置编码
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        # 注册标记类型 ID 张量,用于嵌入层的标记类型编码,默认全为零
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

        # End copy
        # 填充标记 ID,用于在输入序列中表示填充位置
        self.padding_idx = config.pad_token_id
        # 重新定义位置嵌入层,指定填充位置 ID
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )

    def forward(
        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
        ):
            # 如果未提供位置信息,但提供了输入标记信息,则根据输入标记信息创建位置信息,
            # 所有填充标记保持填充状态。
            if position_ids is None:
                if input_ids is not None:
                    position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
                else:
                    position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

            # 如果提供了输入标记信息,则获取其形状
            if input_ids is not None:
                input_shape = input_ids.size()
            else:
                input_shape = inputs_embeds.size()[:-1]

            seq_length = input_shape[1]

            # 将 token_type_ids 设置为构造函数中注册的缓冲区,通常为全零,
            # 当其自动生成时,注册的缓冲区有助于在跟踪模型时不传递 token_type_ids,解决问题 #5664
            if token_type_ids is None:
                if hasattr(self, "token_type_ids"):
                    buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                    buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                    token_type_ids = buffered_token_type_ids_expanded
                else:
                    token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

            # 如果未提供输入嵌入信息,则使用输入标记信息获取嵌入
            if inputs_embeds is None:
                inputs_embeds = self.word_embeddings(input_ids)
            token_type_embeddings = self.token_type_embeddings(token_type_ids)

            # 计算嵌入向量
            embeddings = inputs_embeds + token_type_embeddings

            # 如果位置嵌入类型为 "absolute",则添加位置嵌入
            if self.position_embedding_type == "absolute":
                position_embeddings = self.position_embeddings(position_ids)
                embeddings += position_embeddings

            # 应用 LayerNorm
            embeddings = self.LayerNorm(embeddings)
            # 应用 dropout
            embeddings = self.dropout(embeddings)
            # 返回嵌入向量
            return embeddings

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """
        直接提供嵌入向量,无法推断填充标记,因此只生成顺序位置 id。

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        # 创建顺序位置 id
        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape)
# Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        input_ids: torch.Tensor, input tensor containing symbol indices
        padding_idx: int, padding symbol index
        past_key_values_length: int, optional, length of past key values

    Returns:
        torch.Tensor, tensor containing position indices
    """
    # Create a mask where non-padding elements are marked as 1, padding elements as 0
    mask = input_ids.ne(padding_idx).int()
    # Calculate cumulative sum of the mask along the second dimension, type-cast to mask's type, and adjust by past_key_values_length
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    # Add padding_idx to obtain final position indices tensor
    return incremental_indices.long() + padding_idx
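A quick sanity check of the function above: real tokens are numbered from `padding_idx + 1` onward while padding slots keep `padding_idx`. The token ids below are illustrative only:

```python
import torch

# Assumes create_position_ids_from_input_ids (defined above) is in scope and
# that 1 is the padding index, as in RoBERTa-style vocabularies.
input_ids = torch.tensor([[0, 31414, 232, 2, 1, 1]])
print(create_position_ids_from_input_ids(input_ids, padding_idx=1))
# tensor([[2, 3, 4, 5, 1, 1]]) -> non-padding tokens count up from 2, padding stays at 1
```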


class BridgeTowerPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = BridgeTowerConfig  # Specify the configuration class for this model
    base_model_prefix = "bridgetower"  # Prefix used for the base model's attribute names
    supports_gradient_checkpointing = False  # Indicates if gradient checkpointing is supported
    _no_split_modules = ["BridgeTowerSelfAttention", "BridgeTowerResidualAttention"]  # List of modules not to split
    _skip_keys_device_placement = "past_key_values"  # Key for skipping device placement

    def _init_weights(self, module):
        """
        Initialize weights of the given module based on its type.

        Args:
            module: nn.Module, module to initialize weights for
        """
        if isinstance(module, BridgeTowerVisionModel):
            # Initialization for vision model's transformer components
            proj_std = (module.visual.transformer.hidden_size**-0.5) * (
                (2 * module.visual.transformer.num_hidden_layers) ** -0.5
            )
            attn_std = module.visual.transformer.hidden_size**-0.5
            fc_std = (2 * module.visual.transformer.hidden_size) ** -0.5
            # Initialize weights for attention, projection, and MLP layers in transformer blocks
            for block in module.visual.transformer.resblocks:
                nn.init.normal_(block.attn.in_proj_weight, std=attn_std * self.config.initializer_factor)
                nn.init.normal_(block.attn.out_proj.weight, std=proj_std * self.config.initializer_factor)
                nn.init.normal_(block.mlp.c_fc.weight, std=fc_std * self.config.initializer_factor)
                nn.init.normal_(block.mlp.c_proj.weight, std=proj_std * self.config.initializer_factor)

            # Initialize weights for class and position embeddings
            nn.init.normal_(module.visual.embeddings.class_embedding, std=attn_std * self.config.initializer_factor)
            nn.init.normal_(
                module.visual.embeddings.position_embedding.weight, std=attn_std * self.config.initializer_factor
            )
        elif isinstance(module, (nn.Linear, nn.Conv2d, nn.Embedding)):
            # Initialize weights for linear, convolutional, and embedding layers
            module.weight.data.normal_(mean=0.0, std=0.05 * self.config.initializer_factor)
        elif isinstance(module, nn.LayerNorm):
            # Initialize weights for LayerNorm modules
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

        if isinstance(module, nn.Linear) and module.bias is not None:
            # Set biases to zero for Linear modules if they exist
            module.bias.data.zero_()


class BridgeTowerVisionModel(BridgeTowerPreTrainedModel):
    """
    Vision model class inheriting from BridgeTowerPreTrainedModel.

    Attributes:
        config_class: Class attribute specifying the configuration class for this model.
    """

    config_class = BridgeTowerVisionConfig

    def __init__(self, config):
        """
        Initialize the vision model with the given configuration.

        Args:
            config: BridgeTowerVisionConfig, configuration instance for the model
        """
        super().__init__(config)
        self.visual = BridgeTowerVisionTransformer(config)  # Initialize vision transformer
    # 定义属性访问器,返回 self.visual.embeddings.patch_embedding.weight 的数据类型
    @property
    def dtype(self):
        return self.visual.embeddings.patch_embedding.weight.dtype
    
    # 定义前向传播方法,接收图像数据和可选的图像掩码,使用 self.dtype 设置图像数据类型后调用 self.visual 进行处理
    def forward(self, image, image_mask=None):
        return self.visual(image.type(self.dtype), image_mask)
class BridgeTowerTextModel(BridgeTowerPreTrainedModel):
    """
    
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762

    """

    config_class = BridgeTowerTextConfig  # 设置配置类为 BridgeTowerTextConfig

    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)  # 调用父类构造函数初始化模型
        self.config = config  # 设置模型的配置信息

        self.embeddings = BridgeTowerTextEmbeddings(config)  # 初始化文本嵌入层
        self.encoder = BridgeTowerTextEncoder(config)  # 初始化文本编码器

        self.pooler = BridgeTowerPooler(config) if add_pooling_layer else None  # 初始化池化层,如果 add_pooling_layer 为 True

        # Initialize weights and apply final processing
        self.post_init()  # 调用后处理函数,用于初始化权重和应用最终处理

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings  # 获取输入嵌入层的词嵌入向量

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value  # 设置输入嵌入层的词嵌入向量为指定值

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)  # 剪枝模型中的注意力头部,根据给定的 heads_to_prune 字典

    # Copied from transformers.models.roberta.modeling_roberta.RobertaModel.forward
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """
        This method defines the forward pass for the BridgeTowerTextModel.

        Args:
            input_ids (Optional[torch.Tensor]): Indices of input tokens in the vocabulary.
            attention_mask (Optional[torch.Tensor]): Mask to avoid performing attention on padding tokens.
            token_type_ids (Optional[torch.Tensor]): Segment token indices to differentiate sentences.
            position_ids (Optional[torch.Tensor]): Indices of positions of each input token in the sequence.
            head_mask (Optional[torch.Tensor]): Mask to nullify selected heads of the self-attention modules.
            inputs_embeds (Optional[torch.Tensor]): Optional tensor of embeddings to be used as input instead of
                                                    input_ids.
            encoder_hidden_states (Optional[torch.Tensor]): Sequence of hidden states of the encoder.
            encoder_attention_mask (Optional[torch.Tensor]): Mask to avoid performing attention on encoder padding tokens.
            past_key_values (Optional[List[torch.FloatTensor]]): Cached outputs of the model to speed up sequential
                                                                decoding.
            use_cache (Optional[bool]): Whether or not to use past_key_values to speed up decoding.
            output_attentions (Optional[bool]): Whether to return attentions weights.
            output_hidden_states (Optional[bool]): Whether to return hidden states.
            return_dict (Optional[bool]): Whether to return a dict instead of a tuple.

        Returns:
            Various outputs depending on the configuration (return_dict or not).
        """
        # The concrete implementation mirrors RobertaModel.forward; its body is omitted in this excerpt.
        pass


# The bare BridgeTower model, combining the text encoder, the vision encoder and the cross-modal layers
class BridgeTowerModel(BridgeTowerPreTrainedModel):
    # Initialization method, takes a configuration object as its argument
    def __init__(self, config):
        # Call the parent class constructor with the configuration object
        super().__init__(config)
        # Store the configuration object as an instance attribute
        self.config = config
        # 从配置对象中获取视觉配置和文本配置
        vision_config = config.vision_config
        text_config = config.text_config

        # 根据配置决定是否共享跨模态变换层
        if config.share_cross_modal_transformer_layers:
            # 如果共享,创建一个线性变换层,将文本隐藏状态映射到全局隐藏状态
            self.cross_modal_text_transform = nn.Linear(text_config.hidden_size, config.hidden_size)
            # 创建一个线性变换层,将视觉隐藏状态映射到全局隐藏状态
            self.cross_modal_image_transform = nn.Linear(vision_config.hidden_size, config.hidden_size)
        else:
            # 如果不共享,创建一个模块列表,每个元素是一个线性变换层,用于每个隐藏层
            self.cross_modal_text_transform = nn.ModuleList(
                [nn.Linear(text_config.hidden_size, config.hidden_size) for _ in range(config.num_hidden_layers)]
            )
            self.cross_modal_image_transform = nn.ModuleList(
                [nn.Linear(vision_config.hidden_size, config.hidden_size) for _ in range(config.num_hidden_layers)]
            )

        # 创建一个大小为2的嵌入层,用于区分不同类型的标记(如类标记等)
        self.token_type_embeddings = nn.Embedding(2, config.hidden_size)

        # 创建视觉模型对象,使用给定的视觉配置
        self.vision_model = BridgeTowerVisionModel(vision_config)

        # 创建文本模型对象,使用给定的文本配置
        self.text_model = BridgeTowerTextModel(text_config)

        # If the vision config does not share layernorm and the cross-modal layernorms should be
        # initialized from the vision encoder
        if not vision_config.share_layernorm and config.init_layernorm_from_vision_encoder:
            # Copy the weights and biases of the vision model's post-layernorm into the separate
            # cross-modal layernorms
            for ln in self.vision_model.visual.cross_modal_ln_separate:
                ln.weight.data = self.vision_model.visual.ln_post.weight.data
                ln.bias.data = self.vision_model.visual.ln_post.bias.data

        # Cross-modal layers on the image side; note that they are built from the text config,
        # since BridgeTowerBertCrossLayer is a BERT-style cross-attention block
        self.cross_modal_image_layers = nn.ModuleList(
            [BridgeTowerBertCrossLayer(text_config) for _ in range(config.num_hidden_layers)]
        )
        # Cross-modal layers on the text side, also built from the text config
        self.cross_modal_text_layers = nn.ModuleList(
            [BridgeTowerBertCrossLayer(text_config) for _ in range(config.num_hidden_layers)]
        )

        # 创建跨模态文本池化器对象,使用给定的配置
        self.cross_modal_text_pooler = BridgeTowerPooler(config)
        # 创建跨模态视觉池化器对象,使用给定的配置
        self.cross_modal_image_pooler = BridgeTowerPooler(config)

        # 创建跨模态文本层归一化对象,使用给定的隐藏大小和层归一化的 epsilon 值
        self.cross_modal_text_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # 创建跨模态视觉层归一化对象,使用给定的隐藏大小和层归一化的 epsilon 值
        self.cross_modal_image_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        # 根据配置决定是否共享连接塔层
        if config.share_link_tower_layers:
            # 如果共享,创建一个连接塔对象,用于文本跨模态连接
            self.cross_modal_text_link_tower = BridgeTowerLinkTower(config)
            # 创建一个连接塔对象,用于视觉跨模态连接
            self.cross_modal_image_link_tower = BridgeTowerLinkTower(config)
        else:
            # 如果不共享,创建一个模块列表,每个元素是一个连接塔对象,用于每个隐藏层的连接
            self.cross_modal_text_link_tower = nn.ModuleList(
                [BridgeTowerLinkTower(config) for _ in range(config.num_hidden_layers - 1)]
            )
            self.cross_modal_image_link_tower = nn.ModuleList(
                [BridgeTowerLinkTower(config) for _ in range(config.num_hidden_layers - 1)]
            )

        # 调用初始化后的方法,用于额外的初始化步骤
        self.post_init()

    # 获取输入嵌入层的方法,委托给文本模型的获取输入嵌入层方法
    def get_input_embeddings(self):
        return self.text_model.get_input_embeddings()

    # 设置输入嵌入层的方法,委托给文本模型的设置输入嵌入层方法
    def set_input_embeddings(self, value):
        self.text_model.set_input_embeddings(value)

    # 添加模型正向传播的文档字符串注释,使用指定的输入文档字符串模板
    @add_start_docstrings_to_model_forward(BRIDGETOWER_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BridgeTowerModelOutput, config_class=_CONFIG_FOR_DOC)
    # 使用装饰器,替换该方法的返回文档字符串,指定输出类型为BridgeTowerModelOutput,配置类为_CONFIG_FOR_DOC
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        image_embeds: Optional[torch.FloatTensor] = None,
        image_token_type_idx: Optional[int] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
    ):
        # The forward pass consumes the text/image inputs above and returns a BridgeTowerModelOutput
        # (as declared by the decorators); its body is omitted in this excerpt.
        ...

    def get_cls_features(self, text_features, image_features):
        # Pass the text features through the cross-modal text pooler to get the text CLS features
        cls_features_text = self.cross_modal_text_pooler(text_features)
        # Pass the image features through the cross-modal image pooler to get the image CLS features
        cls_features_image = self.cross_modal_image_pooler(image_features)
        # Concatenate the text and image CLS features along the last dimension
        return torch.cat([cls_features_text, cls_features_image], dim=-1)
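For intuition: each pooler returns a `(batch_size, hidden_size)` tensor, so the concatenated CLS feature is `(batch_size, 2 * hidden_size)`, which is why the ITM head further below is built with `config.hidden_size * 2`. A shape-only sketch with dummy tensors (not the real poolers):

```python
import torch

batch_size, hidden_size = 2, 768                   # illustrative sizes
cls_features_text = torch.randn(batch_size, hidden_size)
cls_features_image = torch.randn(batch_size, hidden_size)
cls_features = torch.cat([cls_features_text, cls_features_image], dim=-1)
assert cls_features.shape == (batch_size, 2 * hidden_size)
```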
# Copied from transformers.models.vilt.modeling_vilt.ViltPredictionHeadTransform with Vilt->BridgeTower
class BridgeTowerPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 创建一个全连接层,输入和输出维度都为 config.hidden_size
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # 根据 config.hidden_act 类型选择激活函数 ACT2FN 中的对应项或直接使用给定的激活函数
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            self.transform_act_fn = config.hidden_act
        # LayerNorm 层,对隐藏状态的每个元素进行归一化,输入维度为 config.hidden_size
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        # 全连接层的前向传播
        hidden_states = self.dense(hidden_states)
        # 应用选定的激活函数
        hidden_states = self.transform_act_fn(hidden_states)
        # LayerNorm 的前向传播
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


# Head module for MLM (masked language modeling)
class BridgeTowerMLMHead(nn.Module):
    def __init__(self, config, weight=None):
        super().__init__()
        self.config = config
        # BridgeTowerPredictionHeadTransform 用于处理输入特征
        self.transform = BridgeTowerPredictionHeadTransform(config)
        # 全连接层用于预测文本的词汇量大小
        self.decoder = nn.Linear(config.hidden_size, config.text_config.vocab_size, bias=False)
        # 偏置项,用于加到 decoder 输出上
        self.bias = nn.Parameter(torch.zeros(config.text_config.vocab_size))
        if weight is not None:
            # 如果提供了预训练权重,则使用这些权重
            self.decoder.weight = weight

    def forward(self, x):
        # 使用头部变换处理输入数据
        mlm_score = self.transform(x)
        # 对处理后的数据进行解码和偏置处理
        mlm_score = self.decoder(mlm_score) + self.bias
        return mlm_score


# Head module for ITM (image-text matching)
class BridgeTowerITMHead(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        # 全连接层,输入维度为 hidden_size,输出为 2(用于二分类任务)
        self.fc = nn.Linear(hidden_size, 2)

    def forward(self, x):
        # 全连接层的前向传播
        itm_score = self.fc(x)
        return itm_score


# BridgeTowerForMaskedLM is a subclass of BridgeTowerPreTrainedModel used for masked language modeling
@add_start_docstrings(
    """
    BridgeTower Model with a language modeling head on top as done during pretraining.
    """,
    BRIDGETOWER_START_DOCSTRING,
)
class BridgeTowerForMaskedLM(BridgeTowerPreTrainedModel):
    _tied_weights_keys = ["mlm_score.decoder.weight"]

    def __init__(self, config):
        super().__init__(config)

        # 创建 BridgeTowerModel 对象
        self.bridgetower = BridgeTowerModel(config)
        # 创建 BridgeTowerMLMHead 对象
        self.mlm_score = BridgeTowerMLMHead(config)

        # 初始化权重并应用最终处理
        self.post_init()

    def get_output_embeddings(self):
        # Return the decoder layer of the MLM head
        return self.mlm_score.decoder

    def set_output_embeddings(self, new_embeddings):
        # Replace the decoder layer of the MLM head with the new embeddings
        self.mlm_score.decoder = new_embeddings

    @add_start_docstrings_to_model_forward(BRIDGETOWER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
    # Forward pass of the model; all inputs are optional
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,  # token ids of the text input
        attention_mask: Optional[torch.FloatTensor] = None,  # attention mask over the text tokens
        token_type_ids: Optional[torch.LongTensor] = None,  # token type (segment) ids
        pixel_values: Optional[torch.FloatTensor] = None,  # pixel values of the input image
        pixel_mask: Optional[torch.LongTensor] = None,  # mask over the image pixels/patches
        head_mask: Optional[torch.FloatTensor] = None,  # mask to disable selected attention heads
        inputs_embeds: Optional[torch.FloatTensor] = None,  # precomputed text embeddings
        image_embeds: Optional[torch.FloatTensor] = None,  # precomputed image embeddings
        output_attentions: Optional[bool] = None,  # whether to return attention weights
        output_hidden_states: Optional[bool] = None,  # whether to return hidden states
        return_dict: Optional[bool] = None,  # whether to return a ModelOutput instead of a tuple
        labels: Optional[torch.LongTensor] = None,  # labels for masked language modeling
    ) -> Union[MaskedLMOutput, Tuple[torch.FloatTensor]]:
        # Use the given return_dict if provided, otherwise fall back to the config default
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bridgetower(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            pixel_values=pixel_values,
            pixel_mask=pixel_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            image_embeds=image_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # Run the BridgeTower backbone on the text and image inputs

        mlm_logits = self.mlm_score(outputs.text_features if return_dict else outputs[0])
        # Compute the MLM (masked language modeling) logits from the text features of the backbone output

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # cross-entropy loss for the MLM objective

            labels = labels.to(mlm_logits.device)
            # Move the labels to the same device as the logits

            masked_lm_loss = loss_fct(mlm_logits.view(-1, self.config.text_config.vocab_size), labels.view(-1))
            # Flatten logits and labels before computing the loss

        if not return_dict:
            output = tuple(mlm_logits)
            # When not returning a dict, expose the MLM logits as a tuple

            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
            # Prepend the loss to the output when it was computed, otherwise return the output alone

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=mlm_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
        # Return a MaskedLMOutput containing the loss, logits, hidden states and attentions
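A hedged usage sketch for the masked-language-modeling head; the checkpoint name `BridgeTower/bridgetower-base-itm-mlm` and the blank placeholder image are assumptions for illustration only:

```python
from PIL import Image
from transformers import BridgeTowerForMaskedLM, BridgeTowerProcessor

ckpt = "BridgeTower/bridgetower-base-itm-mlm"        # assumed checkpoint name
processor = BridgeTowerProcessor.from_pretrained(ckpt)
model = BridgeTowerForMaskedLM.from_pretrained(ckpt)

image = Image.new("RGB", (384, 384))                 # placeholder image
text = "a <mask> looking out of the window"

encoding = processor(image, text, return_tensors="pt")
outputs = model(**encoding)                          # MaskedLMOutput when return_dict=True
predicted_ids = outputs.logits.argmax(dim=-1).squeeze(0)
print(processor.decode(predicted_ids.tolist()))
```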
# The decorator below documents this class: a BridgeTower variant for image-to-text matching,
# with a classifier head (a linear layer on top of the final hidden state of the [CLS] token).

@add_start_docstrings(
    """
    BridgeTower Model transformer with a classifier head on top (a linear layer on top of the final hidden state of the
    [CLS] token) for image-to-text matching.
    """,
    BRIDGETOWER_START_DOCSTRING,
)
class BridgeTowerForImageAndTextRetrieval(BridgeTowerPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        # 初始化 BridgeTower 模型
        self.bridgetower = BridgeTowerModel(config)

        # 初始化 BridgeTowerITMHead 作为图像到文本匹配任务的得分头部
        self.itm_score = BridgeTowerITMHead(config.hidden_size * 2)

        # 初始化权重并应用最终处理
        self.post_init()

    @add_start_docstrings_to_model_forward(BRIDGETOWER_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        image_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
    ) -> Union[SequenceClassifierOutput, Tuple[torch.FloatTensor]]:
        # Use the given return_dict if provided, otherwise fall back to the config default
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bridgetower(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            pixel_values=pixel_values,
            pixel_mask=pixel_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            image_embeds=image_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # 调用 self.bridgetower 方法,传入各种输入参数,返回模型输出结果 outputs

        pooler_output = outputs.pooler_output if return_dict else outputs[2]
        # 如果 return_dict 为 True,则使用 outputs.pooler_output;否则使用 outputs 的第三个元素作为 pooler_output

        logits = self.itm_score(pooler_output)
        # 将 pooler_output 作为输入,计算模型的 logits

        itm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # 定义交叉熵损失函数对象

            labels = labels.to(logits.device)
            # 将 labels 移动到与 logits 相同的设备上

            itm_loss = loss_fct(logits, labels)
            # 计算模型预测的 logits 与实际 labels 之间的交叉熵损失

        if not return_dict:
            output = tuple(logits)
            # 如果 return_dict 为 False,则将 logits 转换为元组形式作为 output

            return ((itm_loss,) + output) if itm_loss is not None else output
            # 如果 itm_loss 不为 None,则返回包含 itm_loss 和 output 的元组;否则只返回 output

        return SequenceClassifierOutput(
            loss=itm_loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
        # 如果 return_dict 为 True,则返回一个 SequenceClassifierOutput 对象,包含 itm_loss、logits、hidden_states 和 attentions
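A hedged usage sketch for image-text matching; the checkpoint name and placeholder image are assumptions:

```python
from PIL import Image
from transformers import BridgeTowerForImageAndTextRetrieval, BridgeTowerProcessor

ckpt = "BridgeTower/bridgetower-base-itm-mlm"        # assumed checkpoint name
processor = BridgeTowerProcessor.from_pretrained(ckpt)
model = BridgeTowerForImageAndTextRetrieval.from_pretrained(ckpt)

image = Image.new("RGB", (384, 384))                 # placeholder image
encoding = processor(image, "two cats sleeping on a couch", return_tensors="pt")
outputs = model(**encoding)
# logits has shape (batch_size, 2); index 1 is the "image and text match" class
match_probability = outputs.logits.softmax(dim=-1)[0, 1].item()
print(match_probability)
```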
# Projection head used for contrastive learning: a single linear layer
class BridgeTowerContrastiveHead(nn.Module):
    def __init__(self, hidden_size, embed_size):
        super().__init__()
        # Linear layer projecting features from hidden_size to embed_size
        self.fc = nn.Linear(hidden_size, embed_size)

    # Forward pass: apply the linear projection to the input and return it
    def forward(self, x):
        x = self.fc(x)
        return x


# 使用装饰器 @add_start_docstrings 和指定的文档字符串,为 BridgeTowerForContrastiveLearning 类添加说明
@add_start_docstrings(
    """
    BridgeTower Model with a image-text contrastive head on top computing image-text contrastive loss.
    """,
    BRIDGETOWER_START_DOCSTRING,
)
# 定义一个自定义的 PyTorch 模型类 BridgeTowerForContrastiveLearning,继承自 BridgeTowerPreTrainedModel
class BridgeTowerForContrastiveLearning(BridgeTowerPreTrainedModel):
    # 初始化函数,接收一个配置参数 config
    def __init__(self, config):
        # 调用父类的初始化函数,传入配置参数 config
        super().__init__(config)

        # 创建 BridgeTowerModel 类的实例,并保存在 self.bridgetower 属性中
        self.bridgetower = BridgeTowerModel(config)

        # 创建用于文本和图像对比学习的头部模块实例
        # 使用 BridgeTowerContrastiveHead 类创建 itc_text_head 和 itc_image_head 实例,
        # 分别使用配置中的 hidden_size 和 contrastive_hidden_size 参数作为输入和输出维度
        self.itc_text_head = BridgeTowerContrastiveHead(config.hidden_size, config.contrastive_hidden_size)
        self.itc_image_head = BridgeTowerContrastiveHead(config.hidden_size, config.contrastive_hidden_size)

        # 创建用于跨模态对比学习的头部模块实例
        # 使用 BridgeTowerContrastiveHead 类创建 itc_cross_modal_head 实例,
        # 使用配置中的 hidden_size * 2 和 contrastive_hidden_size 参数作为输入和输出维度
        self.itc_cross_modal_head = BridgeTowerContrastiveHead(config.hidden_size * 2, config.contrastive_hidden_size)

        # 创建一个可学习的标量参数 logit_scale,初始化值来自于配置参数 self.config.logit_scale_init_value
        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))

        # 调用模型初始化函数
        # 在此处执行额外的初始化任务,例如权重初始化和后处理步骤
        self.post_init()

    # 前向传播函数,接收多个输入参数,根据模型需要进行计算并返回结果
    @add_start_docstrings_to_model_forward(BRIDGETOWER_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BridgeTowerContrastiveOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        image_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = True,
        return_dict: Optional[bool] = None,
        return_loss: Optional[bool] = None,

.\models\bridgetower\processing_bridgetower.py

# coding=utf-8
# Copyright 2023 The Intel Labs Team Authors, The Microsoft Research Team Authors and HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for BridgeTower.
"""

from typing import List, Optional, Union

from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType


class BridgeTowerProcessor(ProcessorMixin):
    r"""
    Constructs a BridgeTower processor which wraps a Roberta tokenizer and BridgeTower image processor into a single
    processor.

    [`BridgeTowerProcessor`] offers all the functionalities of [`BridgeTowerImageProcessor`] and
    [`RobertaTokenizerFast`]. See the docstring of [`~BridgeTowerProcessor.__call__`] and
    [`~BridgeTowerProcessor.decode`] for more information.

    Args:
        image_processor (`BridgeTowerImageProcessor`):
            An instance of [`BridgeTowerImageProcessor`]. The image processor is a required input.
        tokenizer (`RobertaTokenizerFast`):
            An instance of [`RobertaTokenizerFast`]. The tokenizer is a required input.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "BridgeTowerImageProcessor"
    tokenizer_class = ("RobertaTokenizer", "RobertaTokenizerFast")

    def __init__(self, image_processor, tokenizer):
        super().__init__(image_processor, tokenizer)

    def __call__(
        self,
        images,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ) -> BatchEncoding:
        """
        Prepare the image(s) with [`BridgeTowerImageProcessor.__call__`] and the text with
        [`RobertaTokenizerFast.__call__`] so that both can be fed to the model.

        Args:
            images: Input images to be processed.
            text: Optional text input, either raw strings or pre-tokenized input.
            add_special_tokens: Whether to add special tokens (like [CLS], [SEP]) to the inputs.
            padding: Padding strategy; a bool, str, or PaddingStrategy value.
            truncation: Truncation strategy; a bool, str, or TruncationStrategy value.
            max_length: Maximum length of the returned sequences.
            stride: Stride to use for overflowing tokens.
            pad_to_multiple_of: Pad sequence lengths to a multiple of this value.
            return_token_type_ids: Whether to return token type ids.
            return_attention_mask: Whether to return the attention mask.
            return_overflowing_tokens: Whether to return overflowing tokens.
            return_special_tokens_mask: Whether to return the special tokens mask.
            return_offsets_mapping: Whether to return character offsets.
            return_length: Whether to return the lengths of the processed inputs.
            verbose: Whether to log details during processing.
            return_tensors: Tensor format of the returned batch (e.g. "pt" for PyTorch tensors).
            **kwargs: Additional keyword arguments forwarded to the tokenizer and image processor.

        Returns:
            BatchEncoding: The combined text and image encodings.

        Please refer to the docstrings of the two methods above for more information.
        """
        # 使用指定参数调用 tokenizer 方法,生成编码结果
        encoding = self.tokenizer(
            text=text,
            add_special_tokens=add_special_tokens,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            return_tensors=return_tensors,
            **kwargs,
        )
        # 使用 image_processor 方法处理图像,获取处理后的编码结果
        encoding_image_processor = self.image_processor(
            images, return_tensors=return_tensors, do_normalize=True, do_center_crop=True, **kwargs
        )
        # 将图像处理的编码结果更新到文本处理的编码结果中
        encoding.update(encoding_image_processor)

        # 返回合并了文本和图像编码结果的最终编码结果
        return encoding

    def batch_decode(self, *args, **kwargs):
        """
        将所有参数转发给 RobertaTokenizerFast 的 [`~PreTrainedTokenizer.batch_decode`] 方法。
        更多信息请参考该方法的文档字符串。
        """
        # 调用 tokenizer 的 batch_decode 方法,并返回结果
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        将所有参数转发给 RobertaTokenizerFast 的 [`~PreTrainedTokenizer.decode`] 方法。
        更多信息请参考该方法的文档字符串。
        """
        # 调用 tokenizer 的 decode 方法,并返回结果
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        # 获取 tokenizer 和 image_processor 的模型输入名称列表
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        # 合并去重后的模型输入名称列表,并返回
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
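A short end-to-end sketch of the processor; the checkpoint name is an assumption, and the exact set of returned keys depends on the tokenizer and image-processor settings:

```python
from PIL import Image
from transformers import BridgeTowerProcessor

processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")  # assumed checkpoint
image = Image.new("RGB", (384, 384))                 # placeholder image
batch = processor(image, "an example caption", padding=True, return_tensors="pt")
# Typically includes "input_ids", "attention_mask", "pixel_values" and,
# depending on the image processor configuration, "pixel_mask".
print(sorted(batch.keys()))
```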

.\models\bridgetower\__init__.py

# 从 typing 模块中导入 TYPE_CHECKING 类型检查器
from typing import TYPE_CHECKING

# 从当前目录的 utils 模块中导入必要的异常和工具函数
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available

# 定义一个字典结构 _import_structure,用于组织模块和对应的导入项
_import_structure = {
    "configuration_bridgetower": [
        "BRIDGETOWER_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "BridgeTowerConfig",
        "BridgeTowerTextConfig",
        "BridgeTowerVisionConfig",
    ],
    "processing_bridgetower": ["BridgeTowerProcessor"],
}

# 尝试导入图像处理模块,如果 is_vision_available 返回 False,则抛出 OptionalDependencyNotAvailable 异常
try:
    if not is_vision_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果成功导入,将图像处理模块添加到 _import_structure 中
    _import_structure["image_processing_bridgetower"] = ["BridgeTowerImageProcessor"]

# 尝试导入 Torch 模块,如果 is_torch_available 返回 False,则抛出 OptionalDependencyNotAvailable 异常
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果成功导入,将模型处理模块添加到 _import_structure 中
    _import_structure["modeling_bridgetower"] = [
        "BRIDGETOWER_PRETRAINED_MODEL_ARCHIVE_LIST",
        "BridgeTowerForContrastiveLearning",
        "BridgeTowerForImageAndTextRetrieval",
        "BridgeTowerForMaskedLM",
        "BridgeTowerModel",
        "BridgeTowerPreTrainedModel",
    ]

# 如果是类型检查环境
if TYPE_CHECKING:
    # 从相关模块中导入配置和处理类
    from .configuration_bridgetower import (
        BRIDGETOWER_PRETRAINED_CONFIG_ARCHIVE_MAP,
        BridgeTowerConfig,
        BridgeTowerTextConfig,
        BridgeTowerVisionConfig,
    )
    from .processing_bridgetower import BridgeTowerProcessor

    # 尝试导入图像处理模块,如果 is_vision_available 返回 False,则跳过导入
    try:
        if not is_vision_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .image_processing_bridgetower import BridgeTowerImageProcessor

    # 尝试导入 Torch 模块,如果 is_torch_available 返回 False,则跳过导入
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_bridgetower import (
            BRIDGETOWER_PRETRAINED_MODEL_ARCHIVE_LIST,
            BridgeTowerForContrastiveLearning,
            BridgeTowerForImageAndTextRetrieval,
            BridgeTowerForMaskedLM,
            BridgeTowerModel,
            BridgeTowerPreTrainedModel,
        )

# 如果不是类型检查环境,则将当前模块设置为 LazyModule,用于延迟导入模块
else:
    import sys

    # 将当前模块替换为 LazyModule 对象,支持延迟加载
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
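From a user's point of view the lazy module is transparent: the torch-dependent classes are only materialized on first access, so the usual top-level imports keep working:

```python
from transformers import BridgeTowerConfig, BridgeTowerModel

# The modeling module is imported lazily the first time BridgeTowerModel is resolved.
model = BridgeTowerModel(BridgeTowerConfig())
print(type(model).__name__)  # BridgeTowerModel
```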

.\models\bros\configuration_bros.py

# 导入所需模块和类
from ...configuration_utils import PretrainedConfig
from ...utils import logging

# 获取 logger 对象用于记录日志
logger = logging.get_logger(__name__)

# 预训练配置与 URL 映射表,用于不同的 Bros 模型
BROS_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "jinho8345/bros-base-uncased": "https://huggingface.co/jinho8345/bros-base-uncased/blob/main/config.json",
    "jinho8345/bros-large-uncased": "https://huggingface.co/jinho8345/bros-large-uncased/blob/main/config.json",
}

# BrosConfig 类,继承自 PretrainedConfig,用于存储 Bros 模型的配置信息
class BrosConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`BrosModel`] or a [`TFBrosModel`]. It is used to
    instantiate a Bros model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the Bros
    [jinho8345/bros-base-uncased](https://huggingface.co/jinho8345/bros-base-uncased) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    """
    # 定义 Bros 模型的配置类 BrosConfig,用于设置模型参数
    Args:
        vocab_size (`int`, *optional*, defaults to 30522):
            Bros 模型的词汇表大小,定义了在调用 `BrosModel` 或 `TFBrosModel` 时可以表示的不同 token 数量。
        hidden_size (`int`, *optional*, defaults to 768):
            编码器层和池化层的维度大小。
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Transformer 编码器中的隐藏层数量。
        num_attention_heads (`int`, *optional*, defaults to 12):
            Transformer 编码器中每个注意力层的注意力头数量。
        intermediate_size (`int`, *optional*, defaults to 3072):
            Transformer 编码器中“中间层”(通常称为前馈层)的维度大小。
        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
            编码器和池化器中的非线性激活函数(函数或字符串)。支持的字符串有 `"gelu"`, `"relu"`, `"silu"` 和 `"gelu_new"`。
        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
            嵌入层、编码器和池化器中所有全连接层的 dropout 概率。
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
            注意力概率的 dropout 比率。
        max_position_embeddings (`int`, *optional*, defaults to 512):
            此模型可能使用的最大序列长度。通常设置为较大的值(例如 512、1024 或 2048)以防万一。
        type_vocab_size (`int`, *optional*, defaults to 2):
            在调用 `BrosModel` 或 `TFBrosModel` 时传递的 `token_type_ids` 的词汇表大小。
        initializer_range (`float`, *optional*, defaults to 0.02):
            用于初始化所有权重矩阵的截断正态初始化器的标准差。
        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
            层归一化层使用的 epsilon 值。
        pad_token_id (`int`, *optional*, defaults to 0):
            词汇表中填充 token 的索引。
        dim_bbox (`int`, *optional*, defaults to 8):
            边界框坐标的维度大小。 (x0, y1, x1, y0, x1, y1, x0, y1)
        bbox_scale (`float`, *optional*, defaults to 100.0):
            边界框坐标的缩放因子。
        n_relations (`int`, *optional*, defaults to 1):
            SpadeEE(实体提取)、SpadeEL(实体链接)头部的关系数量。
        classifier_dropout_prob (`float`, *optional*, defaults to 0.1):
            分类器头部的 dropout 比率。
    
    Examples:

    ```
    >>> from transformers import BrosConfig, BrosModel

    >>> # Initializing a BROS jinho8345/bros-base-uncased style configuration
    >>> configuration = BrosConfig()

    >>> # Initializing a model from the jinho8345/bros-base-uncased style configuration
    >>> model = BrosModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```

    # Model type identifier used by the auto classes
    model_type = "bros"

    # Constructor for BrosConfig instances
    def __init__(
        self,
        vocab_size=30522,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        pad_token_id=0,
        dim_bbox=8,
        bbox_scale=100.0,
        n_relations=1,
        classifier_dropout_prob=0.1,
        **kwargs,
    ):
        # Initialize the shared text-model parameters via the parent class constructor
        super().__init__(
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            intermediate_size=intermediate_size,
            hidden_act=hidden_act,
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_position_embeddings=max_position_embeddings,
            type_vocab_size=type_vocab_size,
            initializer_range=initializer_range,
            layer_norm_eps=layer_norm_eps,
            pad_token_id=pad_token_id,
            **kwargs,
        )

        # Bros-specific attributes and derived bounding-box embedding sizes
        self.dim_bbox = dim_bbox
        self.bbox_scale = bbox_scale
        self.n_relations = n_relations
        self.dim_bbox_sinusoid_emb_2d = self.hidden_size // 4
        self.dim_bbox_sinusoid_emb_1d = self.dim_bbox_sinusoid_emb_2d // self.dim_bbox
        self.dim_bbox_projection = self.hidden_size // self.num_attention_heads
        self.classifier_dropout_prob = classifier_dropout_prob
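With the default values above (`hidden_size=768`, `num_attention_heads=12`, `dim_bbox=8`), the derived bounding-box embedding sizes work out to 192, 24 and 64, which can be verified directly:

```python
from transformers import BrosConfig

config = BrosConfig()
assert config.dim_bbox_sinusoid_emb_2d == 768 // 4        # 192
assert config.dim_bbox_sinusoid_emb_1d == 192 // 8        # 24
assert config.dim_bbox_projection == 768 // 12            # 64
```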

.\models\bros\convert_bros_to_pytorch.py

# coding=utf-8
# Copyright The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert Bros checkpoints to the HuggingFace model format."""

import argparse  # 导入命令行参数解析模块

import bros  # 原始仓库
import torch  # 导入 PyTorch 模块

from transformers import BrosConfig, BrosModel, BrosProcessor  # 导入转换所需的模块和类
from transformers.utils import logging  # 导入日志记录模块


logging.set_verbosity_info()  # 设置日志记录的详细级别为 info
logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器


def get_configs(model_name):
    """获取指定模型的配置信息"""
    bros_config = BrosConfig.from_pretrained(model_name)
    return bros_config


def remove_ignore_keys_(state_dict):
    """移除指定的忽略键(如果存在)"""
    ignore_keys = [
        "embeddings.bbox_sinusoid_emb.inv_freq",
    ]
    for k in ignore_keys:
        state_dict.pop(k, None)


def rename_key(name):
    """根据约定重命名给定的键"""
    if name == "embeddings.bbox_projection.weight":
        name = "bbox_embeddings.bbox_projection.weight"

    if name == "embeddings.bbox_sinusoid_emb.x_pos_emb.inv_freq":
        name = "bbox_embeddings.bbox_sinusoid_emb.x_pos_emb.inv_freq"

    if name == "embeddings.bbox_sinusoid_emb.y_pos_emb.inv_freq":
        name = "bbox_embeddings.bbox_sinusoid_emb.y_pos_emb.inv_freq"

    return name


def convert_state_dict(orig_state_dict, model):
    """将原始模型状态字典转换为适用于 HuggingFace 模型的格式"""
    # 重命名键
    for key in orig_state_dict.copy().keys():
        val = orig_state_dict.pop(key)
        orig_state_dict[rename_key(key)] = val

    # 移除忽略的键
    remove_ignore_keys_(orig_state_dict)

    return orig_state_dict


def convert_bros_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False):
    """将 Bros 模型检查点转换为 HuggingFace 模型格式"""
    # 加载原始的 Bros 模型
    original_model = bros.BrosModel.from_pretrained(model_name).eval()

    # 加载 HuggingFace 模型
    bros_config = get_configs(model_name)
    model = BrosModel.from_pretrained(model_name, config=bros_config)
    model.eval()

    state_dict = original_model.state_dict()
    new_state_dict = convert_state_dict(state_dict, model)
    model.load_state_dict(new_state_dict)

    # 验证结果

    # 原始的 BROS 模型需要每个边界框 4 个点(8 个浮点数),准备形状为 [batch_size, seq_len, 8] 的边界框
    # 创建一个包含边界框信息的张量,用于定义对象的位置和大小
    bbox = torch.tensor(
        [
            [
                [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
                [0.4396, 0.6720, 0.4659, 0.6720, 0.4659, 0.6850, 0.4396, 0.6850],
                [0.4698, 0.6720, 0.4843, 0.6720, 0.4843, 0.6850, 0.4698, 0.6850],
                [0.4698, 0.6720, 0.4843, 0.6720, 0.4843, 0.6850, 0.4698, 0.6850],
                [0.2047, 0.6870, 0.2730, 0.6870, 0.2730, 0.7000, 0.2047, 0.7000],
                [0.2047, 0.6870, 0.2730, 0.6870, 0.2730, 0.7000, 0.2047, 0.7000],
                [1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
            ]
        ]
    )

    # 从预训练模型加载 BrosProcessor 对象,用于处理文本输入
    processor = BrosProcessor.from_pretrained(model_name)

    # 使用 processor 对象处理输入文本,将边界框信息添加到编码结果中
    encoding = processor("His name is Rocco.", return_tensors="pt")
    encoding["bbox"] = bbox

    # 使用原始模型生成输入编码的最后隐藏状态
    original_hidden_states = original_model(**encoding).last_hidden_state
    # pixel_values = processor(image, return_tensors="pt").pixel_values

    # 使用微调后的模型生成输入编码的最后隐藏状态
    last_hidden_states = model(**encoding).last_hidden_state

    # 断言原始模型和微调后模型的最后隐藏状态在一定误差范围内相等
    assert torch.allclose(original_hidden_states, last_hidden_states, atol=1e-4)

    # 如果指定了 PyTorch 模型保存路径,则保存微调后的模型和 processor 对象
    if pytorch_dump_folder_path is not None:
        print(f"Saving model and processor to {pytorch_dump_folder_path}")
        model.save_pretrained(pytorch_dump_folder_path)
        processor.save_pretrained(pytorch_dump_folder_path)

    # 如果需要将模型和 processor 推送到 Hub 上,则执行推送操作
    if push_to_hub:
        model.push_to_hub("jinho8345/" + model_name.split("/")[-1], commit_message="Update model")
        processor.push_to_hub("jinho8345/" + model_name.split("/")[-1], commit_message="Update model")
if __name__ == "__main__":
    # 如果当前脚本作为主程序运行,则执行以下代码块

    # 创建参数解析器
    parser = argparse.ArgumentParser()

    # 添加必需的参数
    parser.add_argument(
        "--model_name",
        default="jinho8345/bros-base-uncased",
        required=False,
        type=str,
        help="Name of the original model you'd like to convert.",
    )
    # 添加参数:输出 PyTorch 模型目录的路径
    parser.add_argument(
        "--pytorch_dump_folder_path",
        default=None,
        required=False,
        type=str,
        help="Path to the output PyTorch model directory.",
    )
    # 添加参数:是否推送转换后的模型和处理器到 🤗 hub
    parser.add_argument(
        "--push_to_hub",
        action="store_true",
        help="Whether or not to push the converted model and processor to the 🤗 hub.",
    )

    # 解析命令行参数
    args = parser.parse_args()

    # 调用函数 convert_bros_checkpoint,传入解析后的参数
    convert_bros_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
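The conversion can also be driven programmatically instead of via the CLI; the import path and output directory below are hypothetical and shown for illustration only:

```python
# Assumes the original `bros` package and access to the checkpoint are available.
from convert_bros_to_pytorch import convert_bros_checkpoint  # hypothetical module path

convert_bros_checkpoint(
    model_name="jinho8345/bros-base-uncased",
    pytorch_dump_folder_path="./bros-base-uncased-converted",  # assumed output directory
    push_to_hub=False,
)
```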

.\models\bros\modeling_bros.py

# coding=utf-8
# Copyright 2023-present NAVER Corp, The Microsoft Research Asia LayoutLM Team Authors and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch Bros model."""

# Import necessary libraries
import math
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss

# Importing specific components from Hugging Face's library
from ...activations import ACT2FN
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
# Import Bros configuration from local file
from .configuration_bros import BrosConfig

# Get logger for logging messages
logger = logging.get_logger(__name__)

# Constant variables for documentation and model checkpoints
_CHECKPOINT_FOR_DOC = "jinho8345/bros-base-uncased"
_CONFIG_FOR_DOC = "BrosConfig"

# List of pretrained model archives
BROS_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "jinho8345/bros-base-uncased",
    "jinho8345/bros-large-uncased",
    # See all Bros models at https://huggingface.co/models?filter=bros
]

# Start documentation string for Bros model
BROS_START_DOCSTRING = r"""
    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`BrosConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Input documentation string placeholder
BROS_INPUTS_DOCSTRING = r"""
"""


@dataclass
class BrosSpadeOutput(ModelOutput):
    """
    Base class for outputs of token classification models.
    
    This class inherits from `ModelOutput` in Hugging Face's library and serves as a base for outputs
    from token classification models specific to the Bros model.
    
    Attributes:
        Inherits attributes from `ModelOutput`.
    """
    # 定义函数的参数及其类型注释,loss 是一个可选的浮点张量,表示分类损失
    # initial_token_logits 是一个张量,形状为 (batch_size, sequence_length, config.num_labels),表示实体初始标记的分类分数(SoftMax 之前)
    # subsequent_token_logits 是一个张量,形状为 (batch_size, sequence_length, sequence_length+1),表示实体序列标记的分类分数(SoftMax 之前)
    # hidden_states 是一个可选的张量元组,当传入参数 output_hidden_states=True 或者配置参数 config.output_hidden_states=True 时返回,包含每层模型输出的隐藏状态
    # attentions 是一个可选的张量元组,当传入参数 output_attentions=True 或者配置参数 config.output_attentions=True 时返回,包含每层模型输出的注意力权重

    # loss 表示分类损失,默认为 None
    loss: Optional[torch.FloatTensor] = None
    # initial_token_logits 表示实体初始标记的分类分数,默认为 None
    initial_token_logits: torch.FloatTensor = None
    # subsequent_token_logits 表示实体序列标记的分类分数,默认为 None
    subsequent_token_logits: torch.FloatTensor = None
    # hidden_states 表示模型每层的隐藏状态的元组,默认为 None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    # attentions 表示模型每层的注意力权重的元组,默认为 None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


class BrosPositionalEmbedding1D(nn.Module):
    # 引用:https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/mem_transformer.py#L15
    # 一维位置编码的模块定义

    def __init__(self, config):
        super(BrosPositionalEmbedding1D, self).__init__()
        # 初始化函数,接收配置参数 config

        self.dim_bbox_sinusoid_emb_1d = config.dim_bbox_sinusoid_emb_1d
        # 从配置中获取一维位置编码的维度大小

        # 计算正弦函数的频率逆数,用于位置编码
        inv_freq = 1 / (
            10000 ** (torch.arange(0.0, self.dim_bbox_sinusoid_emb_1d, 2.0) / self.dim_bbox_sinusoid_emb_1d)
        )
        # 将频率逆数作为缓冲区注册到模块中
        self.register_buffer("inv_freq", inv_freq)

    def forward(self, pos_seq: torch.Tensor) -> torch.Tensor:
        # 前向传播函数,输入位置序列,返回位置编码张量

        seq_size = pos_seq.size()
        b1, b2, b3 = seq_size
        # 获取位置序列的大小

        sinusoid_inp = pos_seq.view(b1, b2, b3, 1) * self.inv_freq.view(1, 1, 1, self.dim_bbox_sinusoid_emb_1d // 2)
        # 计算正弦输入,使用位置序列乘以频率逆数的张量,并广播到合适的形状

        pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1)
        # 将正弦和余弦结果连接在一起,得到最终的位置编码张量

        return pos_emb
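
下面是一个独立的小示例(非源码,维度取值均为假设),演示上述一维正弦位置编码的形状变化:

```
# 形状演示(假设 dim_bbox_sinusoid_emb_1d = 8)
import torch

dim = 8
inv_freq = 1 / (10000 ** (torch.arange(0.0, dim, 2.0) / dim))        # (4,)
pos_seq = torch.rand(2, 3, 5)                                        # (b1, b2, b3)
sinusoid_inp = pos_seq.view(2, 3, 5, 1) * inv_freq.view(1, 1, 1, 4)  # (2, 3, 5, 4)
pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1)
print(pos_emb.shape)                                                 # torch.Size([2, 3, 5, 8])
```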


class BrosPositionalEmbedding2D(nn.Module):
    # 二维位置编码的模块定义

    def __init__(self, config):
        super(BrosPositionalEmbedding2D, self).__init__()
        # 初始化函数,接收配置参数 config

        self.dim_bbox = config.dim_bbox
        # 从配置中获取边界框维度的大小

        # 创建一维位置编码模块实例,用于X和Y方向
        self.x_pos_emb = BrosPositionalEmbedding1D(config)
        self.y_pos_emb = BrosPositionalEmbedding1D(config)

    def forward(self, bbox: torch.Tensor) -> torch.Tensor:
        # 前向传播函数,输入边界框张量,返回位置编码后的张量

        stack = []
        # 初始化一个空列表,用于存储位置编码的结果

        for i in range(self.dim_bbox):
            # 遍历边界框维度

            if i % 2 == 0:
                stack.append(self.x_pos_emb(bbox[..., i]))
                # 如果是偶数索引,使用X方向的位置编码模块
            else:
                stack.append(self.y_pos_emb(bbox[..., i]))
                # 如果是奇数索引,使用Y方向的位置编码模块

        bbox_pos_emb = torch.cat(stack, dim=-1)
        # 将所有位置编码结果连接在一起,形成最终的边界框位置编码张量

        return bbox_pos_emb


class BrosBboxEmbeddings(nn.Module):
    # 边界框嵌入的模块定义

    def __init__(self, config):
        super(BrosBboxEmbeddings, self).__init__()
        # 初始化函数,接收配置参数 config

        self.bbox_sinusoid_emb = BrosPositionalEmbedding2D(config)
        # 创建二维位置编码模块实例

        self.bbox_projection = nn.Linear(config.dim_bbox_sinusoid_emb_2d, config.dim_bbox_projection, bias=False)
        # 创建线性层,用于将二维位置编码映射到边界框投影维度

    def forward(self, bbox: torch.Tensor):
        # 前向传播函数,输入边界框张量,返回映射后的边界框嵌入张量

        bbox_t = bbox.transpose(0, 1)
        # 转置边界框张量,使得第一维度和第二维度交换

        bbox_pos = bbox_t[None, :, :, :] - bbox_t[:, None, :, :]
        # 计算边界框的位置关系张量,使用广播来扩展维度

        bbox_pos_emb = self.bbox_sinusoid_emb(bbox_pos)
        # 使用二维位置编码模块对位置关系张量进行编码

        bbox_pos_emb = self.bbox_projection(bbox_pos_emb)
        # 使用线性层对位置编码结果进行投影映射

        return bbox_pos_emb
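
为了直观理解上面的转置与广播相减,下面给出一个独立的形状演示(非源码,batch、seq_len、dim_bbox 均为假设值):

```
# bbox: (batch, seq_len, dim_bbox) -> bbox_pos: (seq_len, seq_len, batch, dim_bbox)
# bbox_pos[i, j] 表示第 j 个 token 相对第 i 个 token 的坐标差
import torch

batch, seq_len, dim_bbox = 2, 5, 8
bbox = torch.rand(batch, seq_len, dim_bbox)
bbox_t = bbox.transpose(0, 1)
bbox_pos = bbox_t[None, :, :, :] - bbox_t[:, None, :, :]
print(bbox_pos.shape)   # torch.Size([5, 5, 2, 8])
```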


class BrosTextEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""
    # 文本嵌入的模块定义
    # 初始化函数,接受一个配置参数 config
    def __init__(self, config):
        # 调用父类的初始化方法
        super().__init__()

        # 创建词嵌入层,用于将词的索引映射成词向量
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        # 创建位置嵌入层,用于将位置索引映射成位置向量
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        # 创建token类型嵌入层,用于将token类型索引映射成token类型向量
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # 创建LayerNorm层,用于对隐藏状态的归一化处理
        # 参数名不符合 snake-case 命名规范,是为了兼容 TensorFlow 的模型变量名
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # 创建dropout层,用于在训练时进行随机失活处理
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # 位置id (1, len position emb) 在序列化时是连续存储的,并且会被导出
        # 根据配置添加绝对或相对的位置嵌入
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        # 注册一个持久的缓冲区 position_ids ,存储连续的位置id
        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
        # 注册一个非持久的缓冲区 token_type_ids ,存储所有位置的token类型id是0
        self.register_buffer(
            "token_type_ids",
            torch.zeros(
                self.position_ids.size(),
                dtype=torch.long,
                device=self.position_ids.device,
            ),
            persistent=False,
        )

    # 前向传播函数
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,  # 输入的词的索引
        token_type_ids: Optional[torch.Tensor] = None,  # token的类型id
        position_ids: Optional[torch.Tensor] = None,  # 位置id
        inputs_embeds: Optional[torch.Tensor] = None,  # 输入的词的向量
        past_key_values_length: int = 0,  # 之前的键值对的长度
    ) -> torch.Tensor:  # 返回值是张量
        # 如果有输入的词的索引,获取其形状,否则获取输入的词向量的形状
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]  # 序列长度

        # 如果没有指定位置id,将位置id设置为连续的一段
        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]

        # 如果没有指定token类型id,根据情况获取token类型id的值
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        # 如果没有指定输入的词向量,获取输入词的索引对应的词向量
        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        # 根据token类型id获取token类型的嵌入向量
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        # 计算总的嵌入向量,包括词向量、token类型嵌入、位置嵌入
        embeddings = inputs_embeds + token_type_embeddings
        # 如果使用绝对位置嵌入,计算并加上位置嵌入向量
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        # 对总的嵌入向量进行LayerNorm处理
        embeddings = self.LayerNorm(embeddings)
        # 对处理后的嵌入向量进行随机失活处理
        embeddings = self.dropout(embeddings)
        # 返回处理后的嵌入向量
        return embeddings
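
下面用默认配置演示该文本嵌入层的输入输出形状(示例非源码,token id 为任意假设值,需在已定义或已导入 BrosTextEmbeddings 的环境中运行):

```
import torch
from transformers import BrosConfig

config = BrosConfig()                                 # 默认 hidden_size = 768
emb = BrosTextEmbeddings(config)
input_ids = torch.tensor([[101, 7592, 2026, 102]])    # (batch, seq_len)
out = emb(input_ids=input_ids)
print(out.shape)                                      # torch.Size([1, 4, 768])
```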
class BrosSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 检查隐藏大小是否能被注意力头数整除,同时没有嵌入大小属性
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # 创建查询、键、值线性层
        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        # Dropout 层
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        # 如果位置嵌入类型是相对键或相对键查询,则创建距离嵌入层
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

        self.is_decoder = config.is_decoder

    # 调整形状以便计算注意力分数
    def transpose_for_scores(self, x: torch.Tensor):
        new_x_shape = x.size()[:-1] + (
            self.num_attention_heads,
            self.attention_head_size,
        )
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)
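
transpose_for_scores 只做形状变换:把最后一维拆成 (num_heads, head_size),再把 head 维提前。下面是一个独立的形状演示(非源码,数值为假设):

```
import torch

batch, seq, num_heads, head_size = 2, 4, 12, 64
x = torch.rand(batch, seq, num_heads * head_size)                 # (2, 4, 768)
x = x.view(batch, seq, num_heads, head_size).permute(0, 2, 1, 3)
print(x.shape)                                                    # torch.Size([2, 12, 4, 64])
```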

    def forward(
        self,
        hidden_states: torch.Tensor,
        bbox_pos_emb: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ):
        # 前向传播的完整实现较长,此处省略:核心步骤是计算 Q/K/V 投影,
        # 结合 bbox_pos_emb 计算注意力分数,经 softmax 与 dropout 后得到上下文表示
        pass


# 从 transformers.models.bert.modeling_bert.BertSelfOutput 复制,将 Bert 改为 Bros
class BrosSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 全连接层
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # LayerNorm 层
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # Dropout 层
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    # 前向传播函数
    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # 线性层
        hidden_states = self.dense(hidden_states)
        # Dropout
        hidden_states = self.dropout(hidden_states)
        # LayerNorm
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class BrosAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 创建自注意力和输出对象
        self.self = BrosSelfAttention(config)
        self.output = BrosSelfOutput(config)
        self.pruned_heads = set()  # 用于存储被修剪的注意力头部集合
    # 对 self 对象的 heads 进行修剪操作
    def prune_heads(self, heads):
        # 如果 heads 列表为空,则直接返回,不进行操作
        if len(heads) == 0:
            return
        
        # 调用 find_pruneable_heads_and_indices 函数查找可修剪的 heads 和对应的索引
        heads, index = find_pruneable_heads_and_indices(
            heads,
            self.self.num_attention_heads,
            self.self.attention_head_size,
            self.pruned_heads,
        )

        # 修剪 self.query 线性层
        self.self.query = prune_linear_layer(self.self.query, index)
        # 修剪 self.key 线性层
        self.self.key = prune_linear_layer(self.self.key, index)
        # 修剪 self.value 线性层
        self.self.value = prune_linear_layer(self.self.value, index)
        # 修剪 self.output.dense 线性层,dim=1 表示在第一个维度上进行修剪
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # 更新超参数并记录被修剪的 heads
        # 减去被修剪的 heads 的数量,更新注意力头的数量
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        # 计算所有注意力头的新尺寸
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        # 将被修剪的 heads 添加到 pruned_heads 集合中
        self.pruned_heads = self.pruned_heads.union(heads)

    # 定义 forward 方法,实现模型的前向传播
    def forward(
        self,
        hidden_states: torch.Tensor,
        bbox_pos_emb: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        # 调用 self.self 方法进行自注意力机制计算
        self_outputs = self.self(
            hidden_states=hidden_states,
            bbox_pos_emb=bbox_pos_emb,
            attention_mask=attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
        )
        # 将 self_outputs[0] 与 hidden_states 作为输入,调用 self.output 方法
        attention_output = self.output(self_outputs[0], hidden_states)
        # 如果需要输出注意力信息,则将 attentions 一并添加到 outputs 中
        outputs = (attention_output,) + self_outputs[1:]
        return outputs
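
prune_heads 通常通过 PreTrainedModel 提供的公开接口 prune_heads 间接调用。下面是一个示意用法(非源码,层号与头编号为假设,使用随机初始化的默认配置以避免下载权重,需在已定义或已导入 BrosModel 的环境中运行):

```
from transformers import BrosConfig

model = BrosModel(BrosConfig())        # 默认每层 12 个注意力头(随机初始化)
model.prune_heads({0: [0, 1]})         # 修剪第 0 层的第 0、1 个注意力头
print(model.encoder.layer[0].attention.self.num_attention_heads)   # 10
```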
# 从 transformers.models.bert.modeling_bert.BertIntermediate 复制而来,修改为 BrosIntermediate
class BrosIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 创建一个线性层,将输入特征维度 config.hidden_size 转换为 config.intermediate_size
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # 根据配置选择激活函数 ACT2FN[config.hidden_act] 或者直接使用给定的激活函数 config.hidden_act
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 输入 hidden_states 经过线性层变换
        hidden_states = self.dense(hidden_states)
        # 经过中间激活函数变换
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


# 定义 BrosOutput 类,继承自 nn.Module
class BrosOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 创建一个线性层,将输入特征维度 config.intermediate_size 转换为 config.hidden_size
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        # LayerNorm 归一化层,对隐藏状态进行归一化,eps 是归一化过程中的小数值稳定项
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # Dropout 层,以 config.hidden_dropout_prob 概率丢弃隐藏状态
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # 输入 hidden_states 经过线性层变换
        hidden_states = self.dense(hidden_states)
        # 经过 Dropout 层处理
        hidden_states = self.dropout(hidden_states)
        # 将输入张量 input_tensor 和处理后的 hidden_states 相加,并经过 LayerNorm 归一化
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


# 定义 BrosLayer 类,继承自 nn.Module
class BrosLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 设置用于 feed forward 的块大小
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        # 序列长度维度,用于注意力计算
        self.seq_len_dim = 1
        # BrosAttention 类的实例,用于处理注意力
        self.attention = BrosAttention(config)
        # 是否为解码器
        self.is_decoder = config.is_decoder
        # 是否添加交叉注意力
        self.add_cross_attention = config.add_cross_attention
        # 如果添加交叉注意力但不是解码器,则抛出异常
        if self.add_cross_attention:
            if not self.is_decoder:
                raise Exception(f"{self} should be used as a decoder model if cross attention is added")
            # 否则,创建 BrosAttention 类的实例,用于交叉注意力
            self.crossattention = BrosAttention(config)
        # BrosIntermediate 类的实例,用于处理中间层
        self.intermediate = BrosIntermediate(config)
        # BrosOutput 类的实例,用于处理输出层
        self.output = BrosOutput(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        bbox_pos_emb: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,

        # 前向传播函数定义,接受多个输入参数,返回处理后的隐藏状态张量
        # hidden_states: 输入的隐藏状态张量
        # bbox_pos_emb: 边界框位置嵌入张量
        # attention_mask: 注意力掩码张量,可选
        # head_mask: 头部掩码张量,可选
        # encoder_hidden_states: 编码器隐藏状态张量,可选
        # encoder_attention_mask: 编码器注意力掩码张量,可选
        # past_key_value: 过去的键值对元组,可选
        # output_attentions: 是否输出注意力张量,默认为 False
    ) -> Tuple[torch.Tensor]:
        # 如果有缓存的过去的键/值对,则取前两个(用于自注意力)
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        # 执行自注意力计算,传入隐藏状态、边界框位置嵌入、注意力掩码、头部掩码等参数
        self_attention_outputs = self.attention(
            hidden_states,
            bbox_pos_emb=bbox_pos_emb,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            past_key_value=self_attn_past_key_value,
        )
        # 获取自注意力输出
        attention_output = self_attention_outputs[0]

        # 如果是解码器,则最后一个输出是自注意力缓存的元组
        if self.is_decoder:
            # 输出中除了最后一个元素(自注意力缓存),其余都作为输出
            outputs = self_attention_outputs[1:-1]
            # 当前的键/值对是最后一个元素
            present_key_value = self_attention_outputs[-1]
        else:
            # 输出中包括自注意力的权重
            outputs = self_attention_outputs[1:]

        # 跨注意力的当前键/值对初始化为None
        cross_attn_present_key_value = None
        # 如果是解码器且有编码器的隐藏状态
        if self.is_decoder and encoder_hidden_states is not None:
            # 如果 self 对象没有 crossattention 属性,说明未启用交叉注意力层,抛出异常
            if not hasattr(self, "crossattention"):
                raise Exception(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`"
                )

            # 如果有缓存的过去的键/值对,则取后两个(用于跨注意力)
            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
            # 执行跨注意力计算,传入自注意力输出、注意力掩码、头部掩码、编码器隐藏状态等参数
            cross_attention_outputs = self.crossattention(
                attention_output,
                attention_mask,
                head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                cross_attn_past_key_value,
                output_attentions,
            )
            # 获取跨注意力的输出
            attention_output = cross_attention_outputs[0]
            # 将跨注意力的权重添加到输出中
            outputs = outputs + cross_attention_outputs[1:-1]

            # 将跨注意力的当前键/值对添加到现有的键/值对中
            cross_attn_present_key_value = cross_attention_outputs[-1]
            present_key_value = present_key_value + cross_attn_present_key_value

        # 应用分块机制到前向传播的输出
        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk,
            self.chunk_size_feed_forward,
            self.seq_len_dim,
            attention_output,
        )
        # 将层输出添加到输出元组中
        outputs = (layer_output,) + outputs

        # 如果是解码器,将注意力的键/值对作为最后一个输出返回
        if self.is_decoder:
            outputs = outputs + (present_key_value,)

        # 返回所有输出
        return outputs

    # 定义前馈网络的分块函数,接收注意力输出并返回层输出
    def feed_forward_chunk(self, attention_output):
        # 执行中间层计算
        intermediate_output = self.intermediate(attention_output)
        # 执行输出层计算,传入中间输出和注意力输出
        layer_output = self.output(intermediate_output, attention_output)
        # 返回层输出
        return layer_output
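
apply_chunking_to_forward 会把输入沿 seq_len 维切块后逐块执行前馈函数再拼接,对逐位置的前馈层而言结果与一次性计算完全一致。下面是一个独立的小验证(非源码,维度为假设):

```
import torch
from transformers.pytorch_utils import apply_chunking_to_forward

linear = torch.nn.Linear(8, 8)

def feed_forward(x):
    return linear(x)

x = torch.rand(2, 6, 8)                                       # (batch, seq_len, hidden)
full = feed_forward(x)
chunked = apply_chunking_to_forward(feed_forward, 3, 1, x)    # chunk_size=3, seq_len_dim=1
print(torch.allclose(full, chunked))                          # True
```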
# 定义一个用于编码的自定义 PyTorch 模块,继承自 nn.Module
class BrosEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 初始化模块的配置参数
        self.config = config
        # 创建多个 BrosLayer 模块组成的列表,数量由配置参数决定
        self.layer = nn.ModuleList([BrosLayer(config) for _ in range(config.num_hidden_layers)])

    def forward(
        self,
        hidden_states: torch.Tensor,
        bbox_pos_emb: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
# 以下是 BrosPooler 类定义,用于池化模型隐藏状态
class BrosPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 使用线性层将隐藏状态的大小转换为配置参数中的隐藏大小
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # 使用双曲正切函数作为激活函数
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 简单地使用第一个标记对应的隐藏状态来“池化”模型
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


class BrosRelationExtractor(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 初始化关系抽取器模块的配置参数
        self.n_relations = config.n_relations
        self.backbone_hidden_size = config.hidden_size
        self.head_hidden_size = config.hidden_size
        self.classifier_dropout_prob = config.classifier_dropout_prob

        # 使用指定的 dropout 概率创建一个 dropout 层
        self.drop = nn.Dropout(self.classifier_dropout_prob)
        # 使用线性层定义查询(query)操作,将骨干隐藏状态大小映射到关系头大小的多个关系
        self.query = nn.Linear(self.backbone_hidden_size, self.n_relations * self.head_hidden_size)

        # 使用线性层定义键(key)操作,将骨干隐藏状态大小映射到关系头大小的多个关系
        self.key = nn.Linear(self.backbone_hidden_size, self.n_relations * self.head_hidden_size)

        # 定义一个虚拟节点,通过 nn.Parameter 创建,值为全零向量
        self.dummy_node = nn.Parameter(torch.zeros(1, self.backbone_hidden_size))

    def forward(self, query_layer: torch.Tensor, key_layer: torch.Tensor):
        # 对查询层进行查询操作,并应用 dropout
        query_layer = self.query(self.drop(query_layer))

        # 创建一个虚拟向量,将其添加到键层中
        dummy_vec = self.dummy_node.unsqueeze(0).repeat(1, key_layer.size(1), 1)
        key_layer = torch.cat([key_layer, dummy_vec], axis=0)
        
        # 对键层进行键操作,并应用 dropout
        key_layer = self.key(self.drop(key_layer))

        # 重新调整查询层和键层的形状以适应多头关系的表示
        query_layer = query_layer.view(
            query_layer.size(0), query_layer.size(1), self.n_relations, self.head_hidden_size
        )
        key_layer = key_layer.view(key_layer.size(0), key_layer.size(1), self.n_relations, self.head_hidden_size)

        # 计算查询层和键层之间的关系分数,采用矩阵乘法进行计算
        relation_score = torch.matmul(
            query_layer.permute(2, 1, 0, 3), key_layer.permute(2, 1, 3, 0)
        )  # 相当于 torch.einsum("ibnd,jbnd->nbij", (query_layer, key_layer))

        return relation_score
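
上面的 permute + matmul 与注释中提到的 einsum 写法等价,可以用下面的独立小例子验证(非源码,形状为假设;key 侧比 query 侧多出的一行对应虚拟节点):

```
import torch

seq, batch, n_relations, head = 5, 2, 3, 4
q = torch.rand(seq, batch, n_relations, head)
k = torch.rand(seq + 1, batch, n_relations, head)
score_matmul = torch.matmul(q.permute(2, 1, 0, 3), k.permute(2, 1, 3, 0))
score_einsum = torch.einsum("ibnd,jbnd->nbij", q, k)
print(score_matmul.shape)                          # torch.Size([3, 2, 5, 6])
print(torch.allclose(score_matmul, score_einsum))  # True
```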
class BrosPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # 使用 BrosConfig 作为配置类
    config_class = BrosConfig
    # 基础模型的名称前缀
    base_model_prefix = "bros"

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            # 如果是线性层,使用正态分布初始化权重
            # 与 TF 版本稍有不同,TF 使用截断正态分布进行初始化
            # 参考 https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            # 如果存在偏置项,则初始化为零向量
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            # 如果是嵌入层,使用正态分布初始化权重
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            # 如果指定了 padding_idx,则将对应位置的权重初始化为零向量
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            # 如果是 LayerNorm 层,初始化偏置为零向量,初始化权重为全1向量
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


@add_start_docstrings(
    "The bare Bros Model transformer outputting raw hidden-states without any specific head on top.",
    BROS_START_DOCSTRING,
)
class BrosModel(BrosPreTrainedModel):
    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        # 初始化 BrosModel 类
        self.config = config

        # 初始化文本嵌入层、边界框嵌入层和编码器
        self.embeddings = BrosTextEmbeddings(config)
        self.bbox_embeddings = BrosBboxEmbeddings(config)
        self.encoder = BrosEncoder(config)

        # 如果需要添加池化层,则初始化池化层
        self.pooler = BrosPooler(config) if add_pooling_layer else None

        # 初始化模型权重
        self.init_weights()

    def get_input_embeddings(self):
        # 返回文本嵌入层的权重
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        # 设置文本嵌入层的权重
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        # 对模型的注意力头进行剪枝
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(BROS_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=BaseModelOutputWithPoolingAndCrossAttentions, config_class=_CONFIG_FOR_DOC)
    # 定义一个方法 `forward`,用于执行模型的前向传播操作,通常在神经网络模型中使用
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,  # 输入的 token IDs,可以是一个 PyTorch Tensor,默认为 None
        bbox: Optional[torch.Tensor] = None,  # bounding box 数据,用于图像处理或对象识别任务,默认为 None
        attention_mask: Optional[torch.Tensor] = None,  # 注意力掩码,指定模型注意力的作用范围,默认为 None
        token_type_ids: Optional[torch.Tensor] = None,  # token 类型 IDs,用于处理多句子任务时区分不同句子,默认为 None
        position_ids: Optional[torch.Tensor] = None,  # 位置 IDs,指定输入 token 的位置信息,默认为 None
        head_mask: Optional[torch.Tensor] = None,  # 头部掩码,用于屏蔽某些注意力头,默认为 None
        inputs_embeds: Optional[torch.Tensor] = None,  # 输入的嵌入向量,用于直接输入嵌入向量而不是 token IDs,默认为 None
        encoder_hidden_states: Optional[torch.Tensor] = None,  # 编码器的隐藏状态,默认为 None
        encoder_attention_mask: Optional[torch.Tensor] = None,  # 编码器的注意力掩码,默认为 None
        past_key_values: Optional[List[torch.FloatTensor]] = None,  # 过去的键值对,用于存储过去的注意力信息,默认为 None
        use_cache: Optional[bool] = None,  # 是否使用缓存,用于存储中间计算结果以加速反向传播,默认为 None
        output_attentions: Optional[bool] = None,  # 是否输出注意力权重,默认为 None
        output_hidden_states: Optional[bool] = None,  # 是否输出隐藏状态,默认为 None
        return_dict: Optional[bool] = None,  # 是否以字典形式返回输出,默认为 None
# 为 BrosForTokenClassification 类添加文档字符串,描述其作为 Bros 模型的一个带有标记分类头的子类,用于命名实体识别(NER)等任务
@add_start_docstrings(
    """
    Bros Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    BROS_START_DOCSTRING,
)
class BrosForTokenClassification(BrosPreTrainedModel):
    # 在加载时忽略的键列表,遇到未预期的 "pooler" 键时不加载
    _keys_to_ignore_on_load_unexpected = [r"pooler"]

    def __init__(self, config):
        # 调用父类的初始化方法,传入配置对象 config
        super().__init__(config)
        # 初始化模型的标签数量
        self.num_labels = config.num_labels

        # 初始化 BrosModel,传入配置对象 config
        self.bros = BrosModel(config)
        
        # 根据配置设置分类器的 dropout 概率,若配置对象中存在 "classifier_dropout" 属性则使用其值,否则使用隐藏层 dropout 的概率
        classifier_dropout = (
            config.classifier_dropout if hasattr(config, "classifier_dropout") else config.hidden_dropout_prob
        )
        # 定义一个 dropout 层,用于分类器
        self.dropout = nn.Dropout(classifier_dropout)
        
        # 定义一个线性层,将隐藏状态映射到标签数量的输出空间
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # 初始化模型权重
        self.init_weights()

    # 为 forward 方法添加文档字符串,描述输入参数和输出类型,参照 BROS_INPUTS_DOCSTRING 的格式
    @add_start_docstrings_to_model_forward(BROS_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    # 替换返回值的文档字符串,指定输出类型为 TokenClassifierOutput,配置类为 _CONFIG_FOR_DOC
    @replace_return_docstrings(output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        bbox: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        bbox_first_token_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        Token classification model's forward method.
        
        Args:
            input_ids (torch.Tensor): Input token IDs.
            bbox (torch.Tensor): Bounding box coordinates for tokens.
            attention_mask (torch.Tensor, optional): Mask for attention mechanism.
            bbox_first_token_mask (torch.Tensor, optional): Mask selecting the first token of each bounding box for the loss.
            token_type_ids (torch.Tensor, optional): Type IDs for tokens.
            position_ids (torch.Tensor, optional): Positional embeddings.
            head_mask (torch.Tensor, optional): Mask for attention heads.
            inputs_embeds (torch.Tensor, optional): Embedded inputs.
            labels (torch.Tensor, optional): Labels for computing the token classification loss.
            output_attentions (bool, optional): Whether to output attentions.
            output_hidden_states (bool, optional): Whether to output hidden states.
            return_dict (bool, optional): Whether to return as a dictionary.

        Returns:
            Union[Tuple[torch.Tensor], TokenClassifierOutput]: Model outputs.

        Examples:

        ```
        >>> import torch
        >>> from transformers import BrosProcessor, BrosForTokenClassification

        >>> processor = BrosProcessor.from_pretrained("jinho8345/bros-base-uncased")

        >>> model = BrosForTokenClassification.from_pretrained("jinho8345/bros-base-uncased")

        >>> encoding = processor("Hello, my dog is cute", add_special_tokens=False, return_tensors="pt")
        >>> bbox = torch.tensor([[[0, 0, 1, 1]]]).repeat(1, encoding["input_ids"].shape[-1], 1)
        >>> encoding["bbox"] = bbox

        >>> outputs = model(**encoding)
        ```
        """

        # Determine whether to use the return dictionary format or not
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Pass inputs to the model's token classification method
        outputs = self.bros(
            input_ids,
            bbox=bbox,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Extract the sequence output from the model's outputs
        sequence_output = outputs[0]

        # Apply dropout to the sequence output
        sequence_output = self.dropout(sequence_output)

        # Pass the sequence output through the classifier
        logits = self.classifier(sequence_output)

        # Calculate loss if labels are provided
        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            if bbox_first_token_mask is not None:
                bbox_first_token_mask = bbox_first_token_mask.view(-1)
                loss = loss_fct(
                    logits.view(-1, self.num_labels)[bbox_first_token_mask], labels.view(-1)[bbox_first_token_mask]
                )
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        # If return_dict is False, prepare the output as a tuple
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        # If return_dict is True, prepare the output as a TokenClassifierOutput object
        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
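
在上面文档字符串示例的基础上再传入 labels,即可得到交叉熵损失。下面是一个示意(非源码,标签取值为假设,首次运行需要下载权重,分类头为随机初始化):

```
import torch
from transformers import BrosProcessor, BrosForTokenClassification

processor = BrosProcessor.from_pretrained("jinho8345/bros-base-uncased")
model = BrosForTokenClassification.from_pretrained("jinho8345/bros-base-uncased")

encoding = processor("Hello, my dog is cute", add_special_tokens=False, return_tensors="pt")
encoding["bbox"] = torch.tensor([[[0, 0, 1, 1]]]).repeat(1, encoding["input_ids"].shape[-1], 1)
labels = torch.zeros_like(encoding["input_ids"])   # 假设全部标注为类别 0,仅作演示

outputs = model(**encoding, labels=labels)
print(outputs.loss, outputs.logits.shape)          # logits: (1, seq_len, num_labels)
```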
# 用于标注任务的 Bros 模型,其在隐藏状态输出之上添加了一个标记分类头部。
# initial_token_classifier 用于预测每个实体的第一个标记,subsequent_token_classifier 用于预测实体内部的后续标记。
# 与 BrosForTokenClassification 相比,这个模型对序列化错误更加健壮,因为它从一个标记预测下一个标记。

@add_start_docstrings(
    """
    Bros Model with a token classification head on top (initial_token_layers and subsequent_token_layer on top of the
    hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. The initial_token_classifier is used to
    predict the first token of each entity, and the subsequent_token_classifier is used to predict the subsequent
    tokens within an entity. Compared to BrosForTokenClassification, this model is more robust to serialization errors
    since it predicts next token from one token.
    """,
    BROS_START_DOCSTRING,
)
class BrosSpadeEEForTokenClassification(BrosPreTrainedModel):
    _keys_to_ignore_on_load_unexpected = [r"pooler"]

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.num_labels = config.num_labels
        self.n_relations = config.n_relations
        self.backbone_hidden_size = config.hidden_size

        self.bros = BrosModel(config)
        classifier_dropout = (
            config.classifier_dropout if hasattr(config, "classifier_dropout") else config.hidden_dropout_prob
        )

        # Initial token classification for Entity Extraction (NER)
        self.initial_token_classifier = nn.Sequential(
            nn.Dropout(classifier_dropout),
            nn.Linear(config.hidden_size, config.hidden_size),
            nn.Dropout(classifier_dropout),
            nn.Linear(config.hidden_size, config.num_labels),
        )

        # Subsequent token classification for Entity Extraction (NER)
        self.subsequent_token_classifier = BrosRelationExtractor(config)

        self.init_weights()

    @add_start_docstrings_to_model_forward(BROS_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=BrosSpadeOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        bbox: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        bbox_first_token_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        initial_token_labels: Optional[torch.Tensor] = None,
        subsequent_token_labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,


@add_start_docstrings(
    """
    Bros Model with a token classification head on top (a entity_linker layer on top of the hidden-states output) e.g.
    for Entity-Linking. The entity_linker is used to predict intra-entity links (one entity to another entity).
    """,
    BROS_START_DOCSTRING,
)
class BrosSpadeELForTokenClassification(BrosPreTrainedModel):
    _keys_to_ignore_on_load_unexpected = [r"pooler"]

    # 初始化函数,接受一个配置对象作为参数
    def __init__(self, config):
        # 调用父类的初始化方法,将配置对象传递给父类
        super().__init__(config)
        # 将配置对象保存在实例中
        self.config = config
        # 从配置对象中获取标签数目并保存在实例中
        self.num_labels = config.num_labels
        # 从配置对象中获取关系数目并保存在实例中
        self.n_relations = config.n_relations
        # 从配置对象中获取隐藏层大小并保存在实例中
        self.backbone_hidden_size = config.hidden_size

        # 创建 BrosModel 的实例并保存在实例变量中
        self.bros = BrosModel(config)
        
        # 上游源码中的这一行只计算了表达式而没有把结果赋给任何变量,实际上是一个无效语句,此处按原样保留
        (config.classifier_dropout if hasattr(config, "classifier_dropout") else config.hidden_dropout_prob)

        # 创建 BrosRelationExtractor 的实例并保存在实例变量中
        self.entity_linker = BrosRelationExtractor(config)

        # 调用模型的初始化权重方法
        self.init_weights()

    # 前向传播方法,接受多个输入参数,并按照给定的格式返回结果
    @add_start_docstrings_to_model_forward(BROS_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        bbox: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        bbox_first_token_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        返回值的类型标注,可以是一个包含 torch.Tensor 的元组或者 TokenClassifierOutput 对象。

        Returns:
            返回模型预测的输出结果。

        Examples:
        示例代码展示了如何使用该方法进行预测和处理输出结果。

        ```
        >>> import torch
        >>> from transformers import BrosProcessor, BrosSpadeELForTokenClassification

        >>> processor = BrosProcessor.from_pretrained("jinho8345/bros-base-uncased")

        >>> model = BrosSpadeELForTokenClassification.from_pretrained("jinho8345/bros-base-uncased")

        >>> encoding = processor("Hello, my dog is cute", add_special_tokens=False, return_tensors="pt")
        >>> bbox = torch.tensor([[[0, 0, 1, 1]]]).repeat(1, encoding["input_ids"].shape[-1], 1)
        >>> encoding["bbox"] = bbox

        >>> outputs = model(**encoding)
        ```"""

        # 检查是否使用用户指定的 return_dict,若未指定则使用模型配置中的默认设置
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 调用模型的主要推断方法,传入各种输入参数
        outputs = self.bros(
            input_ids=input_ids,
            bbox=bbox,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 提取模型输出中的最后一个隐藏状态,并转置以便进行后续处理
        last_hidden_states = outputs[0]
        last_hidden_states = last_hidden_states.transpose(0, 1).contiguous()

        # 使用实体链接器对最后一个隐藏状态进行实体链接,得到最终的预测 logits
        logits = self.entity_linker(last_hidden_states, last_hidden_states).squeeze(0)

        # 初始化损失值为 None
        loss = None
        # 如果提供了标签,则计算交叉熵损失
        if labels is not None:
            loss_fct = CrossEntropyLoss()

            # 获取批处理大小和最大序列长度
            batch_size, max_seq_length = attention_mask.shape
            device = attention_mask.device

            # 创建自链接掩码:对角线为 True,用于屏蔽 token 链接到自身的位置
            self_token_mask = torch.eye(max_seq_length, max_seq_length + 1).to(device).bool()

            # 先保存展平后的首 token 掩码,损失只在每个 bbox 的首 token 位置上计算
            mask = bbox_first_token_mask.view(-1)
            # 将 bbox_first_token_mask 取反并在末尾拼接一列 False(对应虚拟节点),
            # 得到需要在 logits 中被屏蔽的非首 token 位置
            bbox_first_token_mask = torch.cat(
                [
                    ~bbox_first_token_mask,
                    torch.zeros([batch_size, 1], dtype=torch.bool).to(device),
                ],
                axis=1,
            )
            # 使用最小的浮点数填充 logits,以便在损失计算中忽略这些位置
            logits = logits.masked_fill(bbox_first_token_mask[:, None, :], torch.finfo(logits.dtype).min)
            logits = logits.masked_fill(self_token_mask[None, :, :], torch.finfo(logits.dtype).min)

            # 计算损失值
            loss = loss_fct(logits.view(-1, max_seq_length + 1)[mask], labels.view(-1)[mask])

        # 如果不需要返回字典形式的输出,则返回一个元组
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        # 否则返回 TokenClassifierOutput 对象,其中包含损失值、预测 logits、隐藏状态和注意力值
        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
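
上面损失计算中 self_token_mask 的作用可以用下面的独立小例子说明(非源码,序列长度为假设):对角线位置为 True,对应 "token 链接到自身" 的 logits 会被填充为极小值,从而在损失中被忽略。

```
import torch

max_seq_length = 4
self_token_mask = torch.eye(max_seq_length, max_seq_length + 1).bool()
print(self_token_mask)
# tensor([[ True, False, False, False, False],
#         [False,  True, False, False, False],
#         [False, False,  True, False, False],
#         [False, False, False,  True, False]])
```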