
Transformers 源码解析(二十四)

.\models\canine\tokenization_canine.py

# coding=utf-8
# Copyright Google AI and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Tokenization classes for CANINE.
"""

from typing import Dict, List, Optional

from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)

# 预训练模型位置嵌入的大小,这里仅有一个预训练模型 'nielsr/canine-s',其位置嵌入大小为 2048
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "nielsr/canine-s": 2048,
}

# Unicode 定义了总共 1,114,112 个“代码点”
UNICODE_VOCAB_SIZE = 1114112

# 下面是定义特殊伪字符的规范代码点的常量
# 从 https://github.com/google-research/language/blob/master/language/canine/special_codepoints.py 复制而来
PAD = 0           # 填充字符
CLS = 0xE000      # 序列的开始标记
SEP = 0xE001      # 序列的分隔符
BOS = 0xE002      # 句子的开始标记
MASK = 0xE003     # 掩码标记
RESERVED = 0xE004 # 保留标记

# 将特殊代码点映射到人类可读的名称
SPECIAL_CODEPOINTS: Dict[int, str] = {
    CLS: "[CLS]",
    SEP: "[SEP]",
    BOS: "[BOS]",
    MASK: "[MASK]",
    PAD: "[PAD]",
    RESERVED: "[RESERVED]",
}

# 将特殊代码点的人类可读名称映射回其代码点值
SPECIAL_CODEPOINTS_BY_NAME: Dict[str, int] = {name: codepoint for codepoint, name in SPECIAL_CODEPOINTS.items()}


class CanineTokenizer(PreTrainedTokenizer):
    """
    构建 CANINE 分词器(即字符分割器)。它将文本转换为字符序列,然后将每个字符转换为其 Unicode 代码点。

    [`CanineTokenizer`] 继承自 [`PreTrainedTokenizer`]。

    有关参数使用示例和文档,请参阅超类 [`PreTrainedTokenizer`]。

    Args:
        model_max_length (`int`, *optional*, 默认为 2048):
                模型接受的最大句子长度。
    """

    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(
        self,
        bos_token=chr(CLS),
        eos_token=chr(SEP),
        sep_token=chr(SEP),
        cls_token=chr(CLS),
        pad_token=chr(PAD),
        mask_token=chr(MASK),
        add_prefix_space=False,
        model_max_length=2048,
        **kwargs,
    ):
        # 如果提供的特殊符号是字符串,则将其封装为 AddedToken 对象,确保左右两侧的空格不会被去除
        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
        cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token

        # 如果提供的 mask_token 是字符串,则创建 AddedToken 对象,并确保去除左侧空格
        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

        # 创建一个字典,用于查找特殊符号的 ID
        self._special_codepoints: Dict[str, int] = {}
        for codepoint, name in SPECIAL_CODEPOINTS.items():
            self._special_codepoints[name] = codepoint

        # 创建一个字典,用于查找特殊符号 ID 对应的字符串形式
        self._special_codepoint_strings: Dict[int, str] = {
            codepoint: name for name, codepoint in self._special_codepoints.items()
        }

        # 设置 Unicode 词汇表大小
        self._unicode_vocab_size = UNICODE_VOCAB_SIZE
        # 计算特殊符号的数量
        self._num_special_tokens = len(self._special_codepoints)

        # 调用父类的构造函数,初始化对象
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            add_prefix_space=add_prefix_space,
            model_max_length=model_max_length,
            **kwargs,
        )

    @property
    def vocab_size(self) -> int:
        # 返回 Unicode 词汇表的大小
        return self._unicode_vocab_size

    def get_vocab(self):
        # 创建并返回一个词汇表,包括所有 Unicode 字符和额外添加的 tokens
        vocab = {chr(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text: str) -> List[str]:
        """Tokenize a string (i.e. perform character splitting)."""
        # 将字符串拆分为单个字符,并返回列表形式
        return list(text)

    def _convert_token_to_id(self, token: str) -> int:
        """Converts a token (i.e. a Unicode character) in an id (i.e. its integer Unicode code point value)."""
        try:
            # 将 Unicode 字符转换为其整数 Unicode 码点值
            return ord(token)
        except TypeError:
            raise ValueError(f"invalid token: '{token}'")

    def _convert_id_to_token(self, index: int) -> str:
        """
        Converts a Unicode code point (integer) in a token (str). In case it's a special code point, convert to
        human-readable format.
        """
        try:
            # 如果索引是特殊代码点,则转换为人类可读的格式
            if index in SPECIAL_CODEPOINTS:
                return SPECIAL_CODEPOINTS[index]
            # 否则,将整数转换为 Unicode 字符
            return chr(index)
        except TypeError:
            raise ValueError(f"invalid id: {index}")

    def convert_tokens_to_string(self, tokens):
        # 将 token 列表连接成一个字符串并返回
        return "".join(tokens)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens. A CANINE sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        # Define special tokens
        sep = [self.sep_token_id]  # SEP token ID
        cls = [self.cls_token_id]  # CLS token ID

        # Construct input with special tokens
        result = cls + token_ids_0 + sep
        if token_ids_1 is not None:
            result += token_ids_1 + sep
        return result

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        # Check if special tokens are already present
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # Initialize mask with special tokens
        result = [1] + ([0] * len(token_ids_0)) + [1]
        if token_ids_1 is not None:
            result += ([0] * len(token_ids_1)) + [1]
        return result

    # 定义方法,用于从序列(对)生成 token type IDs,即序列对分类任务使用的掩码。CANINE 序列对掩码的格式如下:
    #
    # ```
    # 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
    # | 第一个序列       | 第二个序列       |
    # ```
    #
    # 如果 `token_ids_1` 是 `None`,则此方法只返回掩码的第一个部分(全为0)。
    #
    # Args:
    #     token_ids_0 (`List[int]`):
    #         第一个序列的ID列表。
    #     token_ids_1 (`List[int]`, *optional*):
    #         第二个序列的ID列表,用于序列对。
    #
    # Returns:
    #     `List[int]`: 根据给定序列生成的 [token type IDs](../glossary#token-type-ids) 列表。
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        sep = [self.sep_token_id]  # 分隔符 [SEP] 的 ID 列表
        cls = [self.cls_token_id]  # 序列开始 [CLS] 的 ID 列表

        # 第一个序列部分(包含开头的 [CLS] 与结尾的 [SEP])全部标记为 0
        result = len(cls + token_ids_0 + sep) * [0]
        if token_ids_1 is not None:
            # 若存在第二个序列,则其(包含结尾的 [SEP])全部标记为 1
            result += len(token_ids_1 + sep) * [1]
        return result  # 返回生成的 token type IDs 掩码列表

    # CanineTokenizer没有词汇文件
    # 定义函数,用于保存词汇表(空操作)
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None):
        return ()  # 返回空元组,表示保存操作无需实际执行
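
下面补充一个最小的用法示例,演示上述分词器如何按字符拆分文本并映射为 Unicode 码点。示例直接实例化 `CanineTokenizer`(无需词汇文件),输出注释是依据上文代码推断的预期结果,仅供参考:

```
from transformers import CanineTokenizer

tokenizer = CanineTokenizer()

# 编码时会自动在首尾添加 [CLS]/[SEP] 伪字符(码点 0xE000 / 0xE001)
encoding = tokenizer("hello")
print(encoding["input_ids"])
# 预期为 [57344, 104, 101, 108, 108, 111, 57345]

# 分词即逐字符拆分,id 即各字符的 Unicode 码点
print(tokenizer.tokenize("你好"))                     # ['你', '好']
print(tokenizer.convert_tokens_to_ids(list("你好")))  # [20320, 22909]
```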

.\models\canine\__init__.py

# 导入类型检查工具
from typing import TYPE_CHECKING

# 导入自定义异常类和模块惰性加载工具
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available

# 定义模块的导入结构字典
_import_structure = {
    "configuration_canine": ["CANINE_PRETRAINED_CONFIG_ARCHIVE_MAP", "CanineConfig"],  # 配置相关模块导入列表
    "tokenization_canine": ["CanineTokenizer"],  # 分词器模块导入列表
}

# 检查是否存在 Torch 库,若不存在则抛出异常
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 若 Torch 可用,则添加模型相关模块到导入结构字典中
    _import_structure["modeling_canine"] = [
        "CANINE_PRETRAINED_MODEL_ARCHIVE_LIST",
        "CanineForMultipleChoice",
        "CanineForQuestionAnswering",
        "CanineForSequenceClassification",
        "CanineForTokenClassification",
        "CanineLayer",
        "CanineModel",
        "CaninePreTrainedModel",
        "load_tf_weights_in_canine",
    ]

# 如果类型检查开启
if TYPE_CHECKING:
    # 从相应模块导入特定的配置和分词器类
    from .configuration_canine import CANINE_PRETRAINED_CONFIG_ARCHIVE_MAP, CanineConfig
    from .tokenization_canine import CanineTokenizer

    # 再次检查 Torch 是否可用
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 从模型相关模块导入特定的模型类
        from .modeling_canine import (
            CANINE_PRETRAINED_MODEL_ARCHIVE_LIST,
            CanineForMultipleChoice,
            CanineForQuestionAnswering,
            CanineForSequenceClassification,
            CanineForTokenClassification,
            CanineLayer,
            CanineModel,
            CaninePreTrainedModel,
            load_tf_weights_in_canine,
        )

# 如果不是类型检查状态,则配置惰性加载模块
else:
    import sys

    # 将当前模块替换为惰性加载模块
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
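
`_LazyModule` 的作用是把真正的子模块导入推迟到首次访问属性时,从而避免在 `import transformers` 时就加载依赖 PyTorch 的建模代码。下面是一个思路相同的简化示意(`_DemoLazyModule` 为本文虚构的演示类,并非 transformers 中 `_LazyModule` 的真实实现):

```
import importlib
import types


class _DemoLazyModule(types.ModuleType):
    """仅用于演示的简化版惰性模块:首次访问某个名字时才导入对应子模块。"""

    def __init__(self, name, import_structure):
        super().__init__(name)
        # 记录 "导出对象名 -> 所在子模块名" 的映射
        self._name_to_module = {obj: mod for mod, objs in import_structure.items() for obj in objs}

    def __getattr__(self, item):
        if item not in self._name_to_module:
            raise AttributeError(f"module {self.__name__} has no attribute {item}")
        # 到这里才真正触发 import,与 _LazyModule 的思路一致
        submodule = importlib.import_module(f"{self.__name__}.{self._name_to_module[item]}")
        value = getattr(submodule, item)
        setattr(self, item, value)  # 缓存结果,避免重复导入
        return value


# 用法示意:访问 CanineTokenizer 时才导入 tokenization_canine 子模块
lazy = _DemoLazyModule("transformers.models.canine", {"tokenization_canine": ["CanineTokenizer"]})
print(lazy.CanineTokenizer)
```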

.\models\chinese_clip\configuration_chinese_clip.py

# coding=utf-8
# Copyright 2022 The OFA-Sys Team Authors and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

""" Chinese-CLIP model configuration"""

import os
from collections import OrderedDict
from typing import TYPE_CHECKING, Any, Mapping, Optional, Union

# 如果 TYPE_CHECKING 为 True,则导入以下模块
if TYPE_CHECKING:
    from ...processing_utils import ProcessorMixin
    from ...utils import TensorType

# 导入 Transformers 库中的预训练配置类和 ONNX 配置类
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig
from ...utils import logging

# 获取模块专用的日志记录器
logger = logging.get_logger(__name__)

# 定义预训练模型名称到配置文件链接的映射字典
CHINESE_CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "OFA-Sys/chinese-clip-vit-base-patch16": (
        "https://huggingface.co/OFA-Sys/chinese-clip-vit-base-patch16/resolve/main/config.json"
    ),
}

# 定义 ChineseCLIPTextConfig 类,继承自 PretrainedConfig 类
class ChineseCLIPTextConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`ChineseCLIPModel`]. It is used to instantiate a
    Chinese CLIP model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the Chinese CLIP
    [OFA-Sys/chinese-clip-vit-base-patch16](https://huggingface.co/OFA-Sys/chinese-clip-vit-base-patch16) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Example:

    ```
    >>> from transformers import ChineseCLIPTextConfig, ChineseCLIPTextModel

    >>> # Initializing a ChineseCLIPTextConfig with OFA-Sys/chinese-clip-vit-base-patch16 style configuration
    >>> configuration = ChineseCLIPTextConfig()

    >>> # Initializing a ChineseCLIPTextModel (with random weights) from the OFA-Sys/chinese-clip-vit-base-patch16 style configuration
    >>> model = ChineseCLIPTextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    # 模型类型标识符
    model_type = "chinese_clip_text_model"
    # 初始化函数,设置模型配置参数
    def __init__(
        self,
        vocab_size=30522,  # 词汇表大小,默认为30522
        hidden_size=768,   # 隐藏层大小,默认为768
        num_hidden_layers=12,  # 隐藏层层数,默认为12
        num_attention_heads=12,  # 注意力头数,默认为12
        intermediate_size=3072,  # 中间层大小,默认为3072
        hidden_act="gelu",  # 隐藏层激活函数,默认为GELU
        hidden_dropout_prob=0.1,  # 隐藏层dropout概率,默认为0.1
        attention_probs_dropout_prob=0.1,  # 注意力机制的dropout概率,默认为0.1
        max_position_embeddings=512,  # 最大位置嵌入数,默认为512
        type_vocab_size=2,  # 类型词汇表大小,默认为2
        initializer_range=0.02,  # 初始化范围,默认为0.02
        initializer_factor=1.0,  # 初始化因子,默认为1.0
        layer_norm_eps=1e-12,  # 层归一化的epsilon值,默认为1e-12
        pad_token_id=0,  # 填充标记ID,默认为0
        position_embedding_type="absolute",  # 位置嵌入类型,默认为绝对位置嵌入
        use_cache=True,  # 是否使用缓存,默认为True
        **kwargs,  # 其他关键字参数
    ):
        # 调用父类的初始化方法,设置填充标记ID和其他关键字参数
        super().__init__(pad_token_id=pad_token_id, **kwargs)

        # 初始化各个模型配置参数
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.layer_norm_eps = layer_norm_eps
        self.position_embedding_type = position_embedding_type
        self.use_cache = use_cache

    @classmethod
    # 从预训练模型加载配置参数
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        # 设置kwargs中的token
        cls._set_token_in_kwargs(kwargs)

        # 获取配置字典和更新后的kwargs
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # 如果配置字典的模型类型为"chinese_clip",则获取文本配置字典
        if config_dict.get("model_type") == "chinese_clip":
            config_dict = config_dict["text_config"]

        # 如果配置字典中包含模型类型,并且类具有model_type属性,并且配置字典的模型类型与类的模型类型不同,发出警告
        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        # 从配置字典和kwargs创建类的实例
        return cls.from_dict(config_dict, **kwargs)
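
结合上面的 `from_pretrained` 逻辑可以看到:如果传入的是整个 Chinese-CLIP 模型(`model_type == "chinese_clip"`)的仓库,该方法会自动取出其中的 `text_config` 子字典。下面是一个简短示例(需要联网下载配置文件,仓库名取自上文的映射表,输出值以实际运行为准):

```
from transformers import ChineseCLIPTextConfig

# 传入整模型仓库名,from_pretrained 会自动提取其中的 text_config
text_config = ChineseCLIPTextConfig.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16")
print(text_config.model_type)                            # "chinese_clip_text_model"
print(text_config.vocab_size, text_config.hidden_size)
```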
# 定义一个配置类 ChineseCLIPVisionConfig,继承自 PretrainedConfig 类
class ChineseCLIPVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`ChineseCLIPModel`]. It is used to instantiate an
    ChineseCLIP model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the ChineseCLIP
    [OFA-Sys/chinese-clip-vit-base-patch16](https://huggingface.co/OFA-Sys/chinese-clip-vit-base-patch16) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    
    # 以下是 ChineseCLIPVisionConfig 构造函数的参数说明
    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        projection_dim (`int`, *optional*, defaults to 512):
            Dimensionality of text and vision projection layers.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 32):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).
    Example:
    ```
    >>> from transformers import ChineseCLIPVisionConfig, ChineseCLIPVisionModel

    >>> # Initializing a ChineseCLIPVisionConfig with OFA-Sys/chinese-clip-vit-base-patch16 style configuration
    >>> configuration = ChineseCLIPVisionConfig()

    >>> # Initializing a ChineseCLIPVisionModel (with random weights) from the OFA-Sys/chinese-clip-vit-base-patch16 style configuration
    >>> model = ChineseCLIPVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    # 设置模型类型为 "chinese_clip_vision_model"
    model_type = "chinese_clip_vision_model"

    # 初始化函数,设置视觉模型配置的各项参数
    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        projection_dim=512,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        image_size=224,
        patch_size=32,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        **kwargs,
    ):
        # 调用父类构造函数
        super().__init__(**kwargs)

        # 初始化模型的各种参数
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        # 从预训练模型名称或路径中获取配置信息
        cls._set_token_in_kwargs(kwargs)

        # 获取配置字典和额外的关键字参数
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # 如果配置字典的模型类型为 "chinese_clip",则获取视觉配置字典
        if config_dict.get("model_type") == "chinese_clip":
            config_dict = config_dict["vision_config"]

        # 如果配置字典中包含模型类型,并且模型类型不等于当前类的模型类型,则发出警告
        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        # 从配置字典和关键字参数中创建类的实例
        return cls.from_dict(config_dict, **kwargs)
# 定义一个用于存储 ChineseCLIPModel 配置的类,继承自 PretrainedConfig 类
class ChineseCLIPConfig(PretrainedConfig):
    r"""
    [`ChineseCLIPConfig`] 是用来存储 [`ChineseCLIPModel`] 的配置信息的类。它用于根据指定的参数实例化
    Chinese-CLIP 模型,定义了文本模型和视觉模型的配置。使用默认参数实例化配置将生成与
    Chinese-CLIP [OFA-Sys/chinese-clip-vit-base-patch16](https://huggingface.co/OFA-Sys/chinese-clip-vit-base-patch16)
    架构类似的配置。

    配置对象继承自 [`PretrainedConfig`],可用于控制模型输出。阅读 [`PretrainedConfig`] 的文档获取更多信息。

    Args:
        text_config (`dict`, *optional*):
            用于初始化 [`ChineseCLIPTextConfig`] 的配置选项字典。
        vision_config (`dict`, *optional*):
            用于初始化 [`ChineseCLIPVisionConfig`] 的配置选项字典。
        projection_dim (`int`, *optional*, 默认为 512):
            文本和视觉投影层的维度。
        logit_scale_init_value (`float`, *optional*, 默认为 2.6592):
            *logit_scale* 参数的初始值。根据原始的 ChineseCLIP 实现使用默认值。
        kwargs (*optional*):
            关键字参数的字典。

    Example:

    ```
    >>> from transformers import ChineseCLIPConfig, ChineseCLIPModel

    >>> # 使用 OFA-Sys/chinese-clip-vit-base-patch16 风格的配置初始化 ChineseCLIPConfig
    >>> configuration = ChineseCLIPConfig()

    >>> # 使用 OFA-Sys/chinese-clip-vit-base-patch16 风格的配置初始化一个具有随机权重的 ChineseCLIPModel
    >>> model = ChineseCLIPModel(configuration)

    >>> # 访问模型配置
    >>> configuration = model.config

    >>> # 也可以从 ChineseCLIPTextConfig 和 ChineseCLIPVisionConfig 初始化 ChineseCLIPConfig

    >>> # 初始化 ChineseCLIPTextConfig 和 ChineseCLIPVisionConfig 配置
    >>> config_text = ChineseCLIPTextConfig()
    >>> config_vision = ChineseCLIPVisionConfig()

    >>> config = ChineseCLIPConfig.from_text_vision_configs(config_text, config_vision)
    ```"""
    
    # 类变量,指定模型类型为 "chinese_clip"
    model_type = "chinese_clip"

    # 构造方法,初始化配置(原始源码中还包含将 `text_config_dict`/`vision_config_dict` 合并进子配置的逻辑,此处从略)
    def __init__(
        self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
    ):
        # 父类构造方法,使用传入的关键字参数初始化配置
        super().__init__(**kwargs)
        # 利用传入的字典(缺省为空字典)构建文本/视觉子配置,并保存投影维度与 logit_scale 初始值
        self.text_config = ChineseCLIPTextConfig(**(text_config or {}))
        self.vision_config = ChineseCLIPVisionConfig(**(vision_config or {}))
        self.projection_dim = projection_dim
        self.logit_scale_init_value = logit_scale_init_value

    @classmethod
    def from_text_vision_configs(
        cls, text_config: ChineseCLIPTextConfig, vision_config: ChineseCLIPVisionConfig, **kwargs
    ):
        r"""
        Instantiate a [`ChineseCLIPConfig`] (or a derived class) from Chinese-CLIP text model configuration and
        Chinese-CLIP vision model configuration. Returns:
            [`ChineseCLIPConfig`]: An instance of a configuration object
        """
        # 类方法:从给定的中文 CLIP 文本模型配置和视觉模型配置实例化一个 ChineseCLIPConfig(或其派生类)对象并返回
        return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)

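
关于 `logit_scale_init_value` 的默认值 2.6592,可以补充一点背景:它近似等于 ln(1/0.07),对应 CLIP 原始实现中温度系数 0.07 的倒数取对数。下面用一小段代码核对这一数值关系(仅为说明,并非源码内容):

```
import math

temperature = 0.07                 # CLIP 原始实现使用的温度系数
print(math.log(1 / temperature))   # ≈ 2.6593,与默认的 logit_scale_init_value=2.6592 基本一致
print(math.exp(2.6592))            # ≈ 14.28,即相似度分数会被放大约 1/0.07 倍
```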
# 定义一个名为 ChineseCLIPOnnxConfig 的类,继承自 OnnxConfig 类
class ChineseCLIPOnnxConfig(OnnxConfig):
    
    # 返回一个有序字典,描述模型的输入规格
    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        return OrderedDict(
            [
                ("input_ids", {0: "batch", 1: "sequence"}),  # 输入的文本序列
                ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),  # 输入的图像像素值
                ("attention_mask", {0: "batch", 1: "sequence"}),  # 输入的注意力掩码
            ]
        )
    
    # 返回一个有序字典,描述模型的输出规格
    @property
    def outputs(self) -> Mapping[str, Mapping[int, str]]:
        return OrderedDict(
            [
                ("logits_per_image", {0: "batch"}),  # 每张图像对各文本的相似度得分(logits)
                ("logits_per_text", {0: "batch"}),   # 每条文本对各图像的相似度得分(logits)
                ("text_embeds", {0: "batch"}),       # 文本的嵌入表示
                ("image_embeds", {0: "batch"}),      # 图像的嵌入表示
            ]
        )
    
    # 返回用于验证时的绝对误差容限
    @property
    def atol_for_validation(self) -> float:
        return 1e-4
    
    # 生成模型的虚拟输入,包括文本和图像输入
    def generate_dummy_inputs(
        self,
        processor: "ProcessorMixin",
        batch_size: int = -1,
        seq_length: int = -1,
        framework: Optional["TensorType"] = None,
    ) -> Mapping[str, Any]:
        # 调用父类的方法生成文本输入
        text_input_dict = super().generate_dummy_inputs(
            processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework
        )
        # 调用父类的方法生成图像输入
        image_input_dict = super().generate_dummy_inputs(
            processor.image_processor, batch_size=batch_size, framework=framework
        )
        # 合并文本和图像输入字典并返回
        return {**text_input_dict, **image_input_dict}
    
    # 返回默认的 ONNX 操作集版本
    @property
    def default_onnx_opset(self) -> int:
        return 14
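
作为补充,下面演示如何实例化这个 ONNX 配置并查看它声明的输入/输出规格。这里假设沿用基类 `OnnxConfig` 的构造方式(传入一个模型配置对象),仅作示意:

```
from transformers import ChineseCLIPConfig
from transformers.models.chinese_clip.configuration_chinese_clip import ChineseCLIPOnnxConfig

onnx_config = ChineseCLIPOnnxConfig(ChineseCLIPConfig())

print(dict(onnx_config.inputs))    # input_ids / pixel_values / attention_mask 及其动态维度
print(dict(onnx_config.outputs))   # logits_per_image / logits_per_text / text_embeds / image_embeds
print(onnx_config.default_onnx_opset, onnx_config.atol_for_validation)   # 14 0.0001
```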

.\models\chinese_clip\convert_chinese_clip_original_pytorch_to_hf.py

# 设置文件编码为 UTF-8,确保代码中的中文字符可以正确处理
# 版权声明和许可条款,指明代码的版权归属和使用许可
# 导入命令行参数解析模块
import argparse

# 导入 PyTorch 深度学习框架
import torch

# 导入 Transformers 库中的中文 CLIP 模型配置和模型类
from transformers import ChineseCLIPConfig, ChineseCLIPModel

# 定义函数:复制注意力层参数
def copy_attn_layer(hf_attn_layer, pt_weights, prefix):
    # 将 PyTorch 权重参数按照注意力头进行分块
    q_proj, k_proj, v_proj = pt_weights[f"{prefix}.in_proj_weight"].chunk(3, dim=0)
    # 分离偏置项
    q_proj_bias, k_proj_bias, v_proj_bias = pt_weights[f"{prefix}.in_proj_bias"].chunk(3, dim=0)

    # 复制权重和偏置到 HF 注意力层对象
    hf_attn_layer.q_proj.weight.data = q_proj
    hf_attn_layer.q_proj.bias.data = q_proj_bias

    hf_attn_layer.k_proj.weight.data = k_proj
    hf_attn_layer.k_proj.bias.data = k_proj_bias

    hf_attn_layer.v_proj.weight.data = v_proj
    hf_attn_layer.v_proj.bias.data = v_proj_bias

    # 复制输出投影层的权重和偏置
    out_proj_weights = pt_weights[f"{prefix}.out_proj.weight"]
    out_proj_bias = pt_weights[f"{prefix}.out_proj.bias"]
    hf_attn_layer.out_proj.weight.data = out_proj_weights
    hf_attn_layer.out_proj.bias.data = out_proj_bias

# 定义函数:复制 MLP 层参数
def copy_mlp(hf_mlp, pt_weights, prefix):
    # 复制线性变换层的权重和偏置
    copy_linear(hf_mlp.fc1, pt_weights, f"{prefix}.c_fc")
    copy_linear(hf_mlp.fc2, pt_weights, f"{prefix}.c_proj")

# 定义函数:复制线性变换层参数
def copy_linear(hf_linear, pt_weights, prefix):
    # 复制权重和偏置到 HF 线性层对象
    hf_linear.weight.data = pt_weights[f"{prefix}.weight"].data
    hf_linear.bias.data = pt_weights[f"{prefix}.bias"].data

# 定义函数:复制整个层的参数
def copy_layer(hf_layer, pt_weights, prefix):
    # 复制层归一化层的参数
    copy_linear(hf_layer.layer_norm1, pt_weights, f"{prefix}.ln_1")
    copy_linear(hf_layer.layer_norm2, pt_weights, f"{prefix}.ln_2")

    # 复制 MLP 层参数
    copy_mlp(hf_layer.mlp, pt_weights, f"{prefix}.mlp")

    # 复制注意力层参数
    copy_attn_layer(hf_layer.self_attn, pt_weights, f"{prefix}.attn")

# 定义函数:复制多层的参数
def copy_layers(hf_layers, pt_weights, prefix):
    # 遍历 HF 模型的每一层并复制参数
    for layer_id, hf_layer in enumerate(hf_layers):
        copy_layer(hf_layer, pt_weights, f"{prefix}.{layer_id}")

# 定义函数:复制文本模型和投影参数
def copy_text_model_and_projection(hf_model, pt_weights):
    # 复制文本投影层的权重,并转置数据
    hf_model.text_projection.weight.data = pt_weights["text_projection"].data.T

    # 遍历 HF 文本模型的每个参数并复制对应的 PyTorch 参数
    for name, param in hf_model.text_model.named_parameters():
        param.data = pt_weights[f"bert.{name}"].data

# 定义函数:复制视觉模型和投影参数
def copy_vision_model_and_projection(hf_model, pt_weights):
    # 复制视觉投影层的权重,并转置数据
    hf_model.visual_projection.weight.data = pt_weights["visual.proj"].data.T

    # 复制视觉模型的层归一化层参数
    copy_linear(hf_model.vision_model.pre_layrnorm, pt_weights, "visual.ln_pre")
    # 将 PyTorch 模型的后层归一化权重复制到 HF 模型的视觉模型的后层归一化
    copy_linear(hf_model.vision_model.post_layernorm, pt_weights, "visual.ln_post")

    # 复制嵌入层权重
    hf_model.vision_model.embeddings.patch_embedding.weight.data = pt_weights["visual.conv1.weight"].data
    hf_model.vision_model.embeddings.class_embedding.data = pt_weights["visual.class_embedding"].data
    hf_model.vision_model.embeddings.position_embedding.weight.data = pt_weights["visual.positional_embedding"].data

    # 复制编码器层
    copy_layers(hf_model.vision_model.encoder.layers, pt_weights, "visual.transformer.resblocks")
@torch.no_grad()
def convert_chinese_clip_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None):
    """
    Copy/paste/tweak model's weights to transformers design.
    """

    # 确保传入的配置路径不为空,用于加载对应模型大小的 ChineseCLIP 配置
    assert config_path is not None, "Please specify the ChineseCLIP model config of the corresponding model size."
    # 从预训练的配置文件中加载 ChineseCLIPConfig
    config = ChineseCLIPConfig.from_pretrained(config_path)

    # 创建 ChineseCLIPModel 实例并设置为评估模式
    hf_model = ChineseCLIPModel(config).eval()

    # 使用 torch.load 加载模型权重,指定在 CPU 上加载
    pt_weights = torch.load(checkpoint_path, map_location="cpu")["state_dict"]
    # 将模型权重字典中的键名处理为非多 GPU 情况下的模型名称格式
    pt_weights = {(name[7:] if name.startswith("module.") else name): value for name, value in pt_weights.items()}

    # 复制文本模型和投影层的权重到 hf_model
    copy_text_model_and_projection(hf_model, pt_weights)
    # 复制视觉模型和投影层的权重到 hf_model
    copy_vision_model_and_projection(hf_model, pt_weights)
    # 设置 hf_model 的 logit_scale 数据为 pt_weights 中的 logit_scale 数据
    hf_model.logit_scale.data = pt_weights["logit_scale"].data

    # 将转换后的模型保存到指定的 PyTorch 转储文件夹路径
    hf_model.save_pretrained(pytorch_dump_folder_path)


if __name__ == "__main__":
    # 创建命令行参数解析器
    parser = argparse.ArgumentParser()
    # 添加输出的 PyTorch 模型文件夹路径参数
    parser.add_argument(
        "--pytorch_dump_folder_path",
        default=None,
        type=str,
        help="Path to the output folder storing converted hf PyTorch model.",
    )
    # 添加原始 GitHub 格式 ChineseCLIP 检查点路径参数
    parser.add_argument(
        "--checkpoint_path", default=None, type=str, help="Path to original github format ChineseCLIP checkpoint."
    )
    # 添加必需的 hf 配置文件路径参数,用于模型转换
    parser.add_argument(
        "--config_path", default=None, required=True, type=str, help="Path to hf config.json of model to convert."
    )
    # 解析命令行参数
    args = parser.parse_args()

    # 调用函数进行 ChineseCLIP 检查点的转换
    convert_chinese_clip_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path)
    # 打印转换完成的提示信息
    print("The conversion is finished!")
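
上面 `copy_attn_layer` 中最关键的一步,是把原始 checkpoint 里按行拼接存放的 `in_proj_weight` 沿第 0 维切成 Q/K/V 三份。下面用一个小张量演示这种切分方式(与具体模型无关,仅为示意):

```
import torch

hidden = 4
# 原始权重中 Q/K/V 按行拼在一起,形状为 (3*hidden, hidden)
in_proj_weight = torch.arange(3 * hidden * hidden, dtype=torch.float32).reshape(3 * hidden, hidden)

q_proj, k_proj, v_proj = in_proj_weight.chunk(3, dim=0)
print(q_proj.shape, k_proj.shape, v_proj.shape)       # 均为 torch.Size([4, 4])
print(torch.equal(q_proj, in_proj_weight[:hidden]))   # True,第一块正是前 hidden 行
```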

.\models\chinese_clip\feature_extraction_chinese_clip.py

# coding=utf-8
# 版权所有 2021 年 OFA-Sys 团队作者和 HuggingFace 团队。保留所有权利。
#
# 根据 Apache 许可证 2.0 版本(“许可证”)许可;
# 除非符合许可证的要求,否则您不能使用此文件。
# 您可以在以下网址获取许可证的副本:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意,否则根据“许可证”分发的软件是基于“原样”提供的,
# 不提供任何形式的明示或暗示的担保或条件。
# 有关详细信息,请参阅许可证。
"""Chinese-CLIP 的特征提取器类。"""

import warnings

from ...utils import logging
from .image_processing_chinese_clip import ChineseCLIPImageProcessor

# 获取 logger 对象
logger = logging.get_logger(__name__)

class ChineseCLIPFeatureExtractor(ChineseCLIPImageProcessor):
    def __init__(self, *args, **kwargs) -> None:
        # 发出未来警告,表明 ChineseCLIPFeatureExtractor 类将在 Transformers 的第五个版本中移除
        warnings.warn(
            "The class ChineseCLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers."
            " Please use ChineseCLIPImageProcessor instead.",
            FutureWarning,
        )
        # 调用父类的初始化方法
        super().__init__(*args, **kwargs)

.\models\chinese_clip\image_processing_chinese_clip.py

# 设置文件编码为 UTF-8
# 版权声明,声明代码版权归 OFA-Sys 团队作者和 HuggingFace 团队所有
#
# 根据 Apache 许可证 2.0 版本使用本文件
# 除非符合许可证的条款,否则不得使用本文件
# 可以在以下网址获取许可证的副本:http://www.apache.org/licenses/LICENSE-2.0
    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
            `do_resize` in the `preprocess` method.
        size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`):
            Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with
            the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess`
            method.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
        do_center_crop (`bool`, *optional*, defaults to `True`):
            Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the
            `preprocess` method.
        crop_size (`Dict[str, int]` *optional*, defaults to 224):
            Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess`
            method.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
            the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
            method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method.
        image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
    """
    # 定义模型输入名称列表,用于模型输入的像素值
    model_input_names = ["pixel_values"]
    # 初始化函数,用于设置图像处理器的各种参数和属性
    def __init__(
        self,
        do_resize: bool = True,  # 是否进行图像尺寸调整的标志
        size: Dict[str, int] = None,  # 图像调整后的尺寸字典,包含宽和高
        resample: PILImageResampling = PILImageResampling.BICUBIC,  # 图像调整时的重采样方法,默认为双三次插值
        do_center_crop: bool = True,  # 是否进行中心裁剪的标志
        crop_size: Dict[str, int] = None,  # 裁剪后的尺寸字典,包含裁剪后的宽和高
        do_rescale: bool = True,  # 是否进行图像像素值缩放的标志
        rescale_factor: Union[int, float] = 1 / 255,  # 像素值缩放的因子,默认为1/255
        do_normalize: bool = True,  # 是否进行图像归一化的标志
        image_mean: Optional[Union[float, List[float]]] = None,  # 图像归一化的均值,默认使用OpenAI的均值
        image_std: Optional[Union[float, List[float]]] = None,  # 图像归一化的标准差,默认使用OpenAI的标准差
        do_convert_rgb: bool = True,  # 是否进行RGB格式转换的标志
        **kwargs,  # 其他可能的参数
    ) -> None:
        # 调用父类的初始化方法,传递可能的其他参数
        super().__init__(**kwargs)
        # 如果未提供图像调整后的尺寸字典,则设置默认最短边为224像素
        size = size if size is not None else {"shortest_edge": 224}
        # 获取标准化后的图像尺寸字典,确保不强制为正方形
        size = get_size_dict(size, default_to_square=False)
        # 如果未提供裁剪后的尺寸字典,则设置默认裁剪尺寸为224x224像素
        crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
        # 获取标准化后的裁剪尺寸字典
        crop_size = get_size_dict(crop_size)

        # 将参数赋值给对象的属性
        self.do_resize = do_resize
        self.size = size
        self.resample = resample
        self.do_center_crop = do_center_crop
        self.crop_size = crop_size
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
        self.do_convert_rgb = do_convert_rgb
        # 定义有效的图像处理器键列表,用于后续验证
        self._valid_processor_keys = [
            "images",
            "do_resize",
            "size",
            "resample",
            "do_center_crop",
            "crop_size",
            "do_rescale",
            "rescale_factor",
            "do_normalize",
            "image_mean",
            "image_std",
            "do_convert_rgb",
            "return_tensors",
            "data_format",
            "input_data_format",
        ]
    def resize_image(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Resize an image maintaining aspect ratio based on the shortest edge.

        Args:
            image (`np.ndarray`):
                The input image to be resized.
            size (`Dict[str, int]`):
                Dictionary containing target height and width.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
                Resampling filter used during resizing.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the output image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image.

        Returns:
            np.ndarray:
                Resized image as a NumPy array.
        """
        # Obtain the resized size dictionary ensuring aspect ratio preservation
        size = get_size_dict(size, default_to_square=False)
        
        # Calculate the output size based on the input image and target size
        output_size = get_resize_output_image_size(
            image, size=(size["height"], size["width"]), default_to_square=False, input_data_format=input_data_format
        )
        
        # Perform the resizing operation
        return resize(
            image,
            size=output_size,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )

    def preprocess(
        self,
        images: ImageInput,
        do_resize: bool = None,
        size: Dict[str, int] = None,
        resample: PILImageResampling = None,
        do_center_crop: bool = None,
        crop_size: int = None,
        do_rescale: bool = None,
        rescale_factor: float = None,
        do_normalize: bool = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ):
        """
        Preprocesses a batch of images based on specified parameters.

        Args:
            images (`ImageInput`):
                Input images to be preprocessed.
            do_resize (`bool`, *optional*):
                Whether to resize the images.
            size (`Dict[str, int]`, *optional*):
                Target size for resizing the images.
            resample (`PILImageResampling`, *optional*):
                Resampling filter used during image resizing.
            do_center_crop (`bool`, *optional*):
                Whether to perform center cropping.
            crop_size (`int`, *optional*):
                Size of the center crop.
            do_rescale (`bool`, *optional*):
                Whether to rescale the images.
            rescale_factor (`float`, *optional*):
                Factor by which to rescale the images.
            do_normalize (`bool`, *optional*):
                Whether to normalize the images.
            image_mean (`Optional[Union[float, List[float]]]`, *optional*):
                Mean value(s) for image normalization.
            image_std (`Optional[Union[float, List[float]]]`, *optional*):
                Standard deviation value(s) for image normalization.
            do_convert_rgb (`bool`, *optional*):
                Whether to convert images to RGB format.
            return_tensors (`Optional[Union[str, TensorType]]`, *optional*):
                Format of output tensors.
            data_format (`Optional[ChannelDimension]`, *optional*):
                Channel dimension format of the images.
            input_data_format (`Optional[Union[str, ChannelDimension]]`, *optional*):
                Channel dimension format of the input images.

        Returns:
            Processed images according to specified preprocessing steps.
        """

.\models\chinese_clip\modeling_chinese_clip.py

# 定义了编码为 UTF-8 的文件头声明
# 版权声明及许可信息,使用 Apache License, Version 2.0 许可协议
# 导入所需的库和模块
import math
from dataclasses import dataclass
from typing import Any, List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

# 导入特定的自定义模块和类
from ...activations import ACT2FN
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPooling,
    BaseModelOutputWithPoolingAndCrossAttentions,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_chinese_clip import ChineseCLIPConfig, ChineseCLIPTextConfig, ChineseCLIPVisionConfig

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)

# 用于文档化的模型检查点信息
_CHECKPOINT_FOR_DOC = "OFA-Sys/chinese-clip-vit-base-patch16"
# 用于文档化的配置信息
_CONFIG_FOR_DOC = "ChineseCLIPConfig"

# 可用的预训练模型列表
CHINESE_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "OFA-Sys/chinese-clip-vit-base-patch16",
    # 可在 https://huggingface.co/models?filter=chinese_clip 查看所有 Chinese-CLIP 模型
]


# 定义对比损失函数,来自 transformers.models.clip.modeling_clip.contrastive_loss
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    """
    对比损失函数,计算交叉熵损失

    Args:
        logits (torch.Tensor): 模型预测的对比结果

    Returns:
        torch.Tensor: 计算的对比损失
    """
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))


# 定义 Chinese-CLIP 的损失函数,包括文本和图像对比损失的平均值
def chinese_clip_loss(similarity: torch.Tensor) -> torch.Tensor:
    """
    Chinese-CLIP 损失函数,包括文本和图像对比损失的平均值

    Args:
        similarity (torch.Tensor): 模型预测的相似性分数

    Returns:
        torch.Tensor: 计算的 Chinese-CLIP 总损失
    """
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(similarity.t())  # 转置后计算图像损失
    return (caption_loss + image_loss) / 2.0
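
为了直观理解上面两个损失函数:对一个 batch 内的 N 对图文,相似度矩阵的对角线元素就是正样本,因此交叉熵的标签就是 `arange(N)`;文本→图像、图像→文本两个方向各算一次再取平均。下面用一个 3x3 的示例验证(数值仅为演示):

```
import torch
import torch.nn.functional as F

# 假设 batch 中有 3 对图文,similarity[i][j] 表示第 i 条文本与第 j 张图像的相似度
similarity = torch.tensor([[5.0, 1.0, 0.0],
                           [0.5, 4.0, 0.2],
                           [0.1, 0.3, 3.0]])

labels = torch.arange(similarity.size(0))             # 正样本位于对角线
caption_loss = F.cross_entropy(similarity, labels)    # 文本 -> 图像方向
image_loss = F.cross_entropy(similarity.t(), labels)  # 图像 -> 文本方向
print(((caption_loss + image_loss) / 2.0).item())     # 与 chinese_clip_loss(similarity) 的结果一致
```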


@dataclass
class ChineseCLIPOutput(ModelOutput):
    """
    Chinese-CLIP 模型输出类,继承自 ModelOutput 类。

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image: (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text: (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds: (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of
            [`ChineseCLIPTextModel`].
        image_embeds: (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The image embeddings obtained by applying the projection layer to the pooled output of
            [`ChineseCLIPVisionModel`].
        text_model_output: (`BaseModelOutputWithPoolingAndCrossAttentions`):
            The output of the [`ChineseCLIPTextModel`].
        vision_model_output: (`BaseModelOutputWithPoolingAndCrossAttentions`):
            The output of the [`ChineseCLIPVisionModel`].
    """

    # Optional attribute: Loss value representing contrastive loss for image-text similarity
    loss: Optional[torch.FloatTensor] = None
    # Attribute: Scores representing similarity between image and text embeddings
    logits_per_image: torch.FloatTensor = None
    # Attribute: Scores representing similarity between text and image embeddings
    logits_per_text: torch.FloatTensor = None
    # Attribute: Embeddings of text data after projection from ChineseCLIPTextModel
    text_embeds: torch.FloatTensor = None
    # Attribute: Embeddings of image data after projection from ChineseCLIPVisionModel
    image_embeds: torch.FloatTensor = None
    # Attribute: Output object from ChineseCLIPTextModel, including pooling and cross-attentions
    text_model_output: BaseModelOutputWithPoolingAndCrossAttentions = None
    # Attribute: Output object from ChineseCLIPVisionModel, including pooling and cross-attentions
    vision_model_output: BaseModelOutputWithPoolingAndCrossAttentions = None

    def to_tuple(self) -> Tuple[Any]:
        # Method: Converts all attributes to a tuple; certain attributes are converted to tuples recursively
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )
# 从 transformers.models.bert.modeling_bert.BertEmbeddings 复制并修改为 ChineseCLIPTextEmbeddings 类
class ChineseCLIPTextEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config):
        super().__init__()
        # 定义词嵌入层,将词汇 ID 映射到隐藏表示大小的向量空间
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        # 定义位置嵌入层,将位置 ID 映射到隐藏表示大小的向量空间
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        # 定义类型嵌入层,将类型 ID(如句子 A 或句子 B)映射到隐藏表示大小的向量空间
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # 使用 TensorFlow 模型变量名,保持与 TensorFlow 模型兼容,方便加载 TensorFlow 检查点文件
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # 定义 dropout 层,用于随机失活以防止过拟合
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # 位置编码类型,通常是绝对位置编码
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        # 注册缓冲区,保存位置 ID 的张量,用于序列化时持久化保存
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        # 注册缓冲区,保存类型 ID 的张量,初始为全零张量,用于序列化时持久化保存
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

    # 定义 forward 方法:接收 input_ids / token_type_ids / position_ids / inputs_embeds 以及 past_key_values_length,返回嵌入张量 torch.Tensor
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values_length: int = 0,
    ) -> torch.Tensor:
        # 如果输入参数input_ids不为None,则获取其形状作为input_shape
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            # 否则,获取inputs_embeds的形状除了最后一个维度的所有维度作为input_shape
            input_shape = inputs_embeds.size()[:-1]

        # 获取序列长度seq_length,从input_shape的第二个维度获取
        seq_length = input_shape[1]

        # 如果position_ids为None,则从self.position_ids中获取一部分切片,其范围从past_key_values_length到seq_length+past_key_values_length
        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]

        # 若未传入 token_type_ids,则使用构造函数中注册的缓冲区(通常为全零)。这样在模型跟踪(tracing)时,即使调用方没有传入 token_type_ids 也能正常工作,用于解决 issue #5664
        if token_type_ids is None:
            # 如果self对象具有属性"token_type_ids"
            if hasattr(self, "token_type_ids"):
                # 则从self.token_type_ids中获取一部分切片,其范围从第二维度的第一个元素到seq_length
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                # 将buffered_token_type_ids在第一维度复制input_shape[0]次,在第二维度复制seq_length次
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                # 否则,将token_type_ids设置为全零的torch.Tensor对象,形状为input_shape,数据类型为torch.long,设备为self.position_ids所在的设备
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        # 如果inputs_embeds为None,则使用self.word_embeddings对input_ids进行嵌入处理得到inputs_embeds
        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        
        # 使用self.token_type_embeddings对token_type_ids进行嵌入处理得到token_type_embeddings
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        # embeddings为inputs_embeds和token_type_embeddings的加和
        embeddings = inputs_embeds + token_type_embeddings

        # 如果位置嵌入类型为"absolute",则使用self.position_embeddings对position_ids进行嵌入处理并加到embeddings上
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings

        # 对embeddings进行LayerNorm归一化处理
        embeddings = self.LayerNorm(embeddings)

        # 对embeddings进行dropout处理
        embeddings = self.dropout(embeddings)

        # 返回处理后的embeddings作为方法的输出
        return embeddings
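
下面用一个很小的配置实例化 `ChineseCLIPTextEmbeddings`,核对其输出形状为 (batch_size, seq_len, hidden_size)。这里直接从模块内部导入该类,仅作演示:

```
import torch
from transformers import ChineseCLIPTextConfig
from transformers.models.chinese_clip.modeling_chinese_clip import ChineseCLIPTextEmbeddings

config = ChineseCLIPTextConfig(vocab_size=100, hidden_size=32, max_position_embeddings=16)
embeddings = ChineseCLIPTextEmbeddings(config)

input_ids = torch.randint(0, 100, (2, 8))   # batch_size=2, seq_len=8
out = embeddings(input_ids=input_ids)
print(out.shape)                            # torch.Size([2, 8, 32])
```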
# 从 transformers.models.clip.modeling_clip.CLIPVisionEmbeddings 复制而来,将 CLIP 模型改为了 ChineseCLIP
class ChineseCLIPVisionEmbeddings(nn.Module):
    def __init__(self, config: ChineseCLIPVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size  # 设置嵌入维度为隐藏大小
        self.image_size = config.image_size  # 图像大小
        self.patch_size = config.patch_size  # 补丁大小

        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))  # 类别嵌入作为可学习参数

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )  # 创建卷积层用于图像补丁嵌入

        self.num_patches = (self.image_size // self.patch_size) ** 2  # 计算图像中补丁的数量
        self.num_positions = self.num_patches + 1  # 位置嵌入的数量为补丁数加一
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)  # 创建位置嵌入层
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)  # 注册位置 ID 缓冲区,非持久性

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        batch_size = pixel_values.shape[0]  # 获取批次大小
        target_dtype = self.patch_embedding.weight.dtype  # 目标数据类型为补丁嵌入权重的数据类型
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # 将像素值通过卷积层得到补丁嵌入

        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)  # 对补丁嵌入进行展平和转置

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)  # 扩展类别嵌入以匹配批次大小
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)  # 拼接类别嵌入和补丁嵌入

        embeddings = embeddings + self.position_embedding(self.position_ids)  # 添加位置嵌入到嵌入张量中
        return embeddings
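
对上面的视觉嵌入做一个简单的数值核对:以 OFA-Sys/chinese-clip-vit-base-patch16 对应的 image_size=224、patch_size=16 为例,补丁数与位置数如下(仅为算术演示):

```
image_size, patch_size = 224, 16

num_patches = (image_size // patch_size) ** 2   # 14 * 14 = 196 个补丁
num_positions = num_patches + 1                 # 再加一个类别嵌入,共 197 个位置
print(num_patches, num_positions)               # 196 197
# 因此 forward 输出的形状为 (batch_size, 197, hidden_size)
```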


# 从 transformers.models.bert.modeling_bert.BertSelfAttention 复制而来,将 Bert 自注意力机制改为了 ChineseCLIPText
class ChineseCLIPTextSelfAttention(nn.Module):
    # 初始化函数,接受配置和位置嵌入类型作为参数
    def __init__(self, config, position_embedding_type=None):
        # 调用父类的初始化方法
        super().__init__()
        # 检查隐藏大小是否能被注意力头的数量整除,如果不是则引发错误
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        # 设置注意力头的数量和每个头的大小
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # 初始化查询、键、值的线性变换层
        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        # 初始化 dropout 层
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        # 设置位置嵌入类型,默认为绝对位置编码
        self.position_embedding_type = position_embedding_type or getattr(
            config, "position_embedding_type", "absolute"
        )
        # 如果位置嵌入类型是相对位置编码,则初始化距离嵌入层
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

        # 设置是否为解码器
        self.is_decoder = config.is_decoder
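
结合上面的初始化逻辑,可以用默认的文本配置(hidden_size=768、num_attention_heads=12、max_position_embeddings=512)核对各个维度(仅为算术演示):

```
hidden_size, num_attention_heads, max_position_embeddings = 768, 12, 512

attention_head_size = hidden_size // num_attention_heads        # 64,每个注意力头的维度
all_head_size = num_attention_heads * attention_head_size       # 768,与 hidden_size 相同
relative_positions = 2 * max_position_embeddings - 1            # 1023,相对位置嵌入表的行数
print(attention_head_size, all_head_size, relative_positions)   # 64 768 1023
```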
# 从 transformers.models.bert.modeling_bert.BertSelfOutput 复制并修改为 ChineseCLIPTextSelfOutput 类
class ChineseCLIPTextSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 线性变换层,输入和输出维度都是 config.hidden_size
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # Layer normalization 层,归一化操作,eps 是归一化过程中的稳定性参数
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # Dropout 层,以 config.hidden_dropout_prob 的概率随机置零输入张量
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # 线性变换操作,将 hidden_states 映射到相同维度空间
        hidden_states = self.dense(hidden_states)
        # Dropout 操作,防止过拟合
        hidden_states = self.dropout(hidden_states)
        # Layer normalization,通过归一化操作来稳定和加速训练
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


# 从 transformers.models.bert.modeling_bert.BertAttention 复制并修改为 ChineseCLIPTextAttention 类
class ChineseCLIPTextAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        # ChineseCLIPTextSelfAttention 对象,用于计算自注意力机制
        self.self = ChineseCLIPTextSelfAttention(config, position_embedding_type=position_embedding_type)
        # ChineseCLIPTextSelfOutput 对象,将自注意力机制的输出进行线性变换、归一化和 dropout 处理
        self.output = ChineseCLIPTextSelfOutput(config)
        # 存储需要剪枝的注意力头的集合
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        # 找到可以剪枝的注意力头并获取对应的索引
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # 剪枝线性层
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # 更新超参数并存储剪枝的头
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)
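
剪枝后的维度变化可以用简单算术说明(假设 hidden_size=768、num_attention_heads=12、每头 64 维,剪掉 2 个头,仅为演示):

```
num_attention_heads, attention_head_size = 12, 64
heads_to_prune = {0, 5}                                        # 待剪枝的头
num_attention_heads -= len(heads_to_prune)                     # 10
all_head_size = num_attention_heads * attention_head_size      # 640
# 相应地,query/key/value 的输出维度和 output.dense 的输入维度都由 768 缩减为 640
print(all_head_size)
```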

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        # 执行自注意力机制,计算输出
        self_outputs = self.self(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        # 对自注意力机制的输出进行处理,再次线性变换和归一化
        attention_output = self.output(self_outputs[0], hidden_states)
        # 如果需要输出注意力权重,则将其包含在输出中
        outputs = (attention_output,) + self_outputs[1:]  # 如果需要,添加注意力权重到输出中
        return outputs


class ChineseCLIPVisionAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    # 初始化方法,接受一个配置对象作为参数
    def __init__(self, config):
        super().__init__()  # 调用父类的初始化方法
        self.config = config  # 保存配置对象
        self.embed_dim = config.hidden_size  # 获取隐藏大小作为嵌入维度
        self.num_heads = config.num_attention_heads  # 获取注意力头的数量
        self.head_dim = self.embed_dim // self.num_heads  # 计算每个头的维度
        # 检查嵌入维度是否可以被注意力头的数量整除
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim ** -0.5  # 缩放因子,按照论文设定
        self.dropout = config.attention_dropout  # 注意力机制中的丢弃率

        # 初始化线性层,用于投影键、值、查询和输出
        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    # 将张量重塑为适合多头注意力的形状
    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    # 前向传播方法,接受隐藏状态张量和是否输出注意力权重的选项
    def forward(
        self,
        hidden_states: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""
        # 获取隐藏状态张量的形状信息:批量大小、时间步长、嵌入维度
        bsz, tgt_len, embed_dim = hidden_states.size()

        # 获取查询投影
        query_states = self.q_proj(hidden_states) * self.scale
        # 获取键投影,并调整形状以匹配多头注意力的计算需求
        key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
        # 获取值投影,并调整形状以匹配多头注意力的计算需求
        value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        # 定义投影后张量的形状
        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        # 调整查询状态的形状,以便进行多头注意力的计算
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        # 调整键状态的形状,以便进行多头注意力的计算
        key_states = key_states.view(*proj_shape)
        # 调整值状态的形状,以便进行多头注意力的计算
        value_states = value_states.view(*proj_shape)

        # 获取源序列的长度
        src_len = key_states.size(1)
        # 计算注意力权重
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        # 检查注意力权重的尺寸是否符合预期
        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        # 对注意力权重进行 softmax 操作
        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        # 如果需要输出注意力权重,则进行形状调整以保留梯度
        if output_attentions:
            # 这个操作有些笨拙,但是必须执行,以确保 attn_weights 保持其梯度
            # 为了实现这一点,需要两次重塑,并在接下来的操作中重复使用 attn_weights
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        # 对注意力权重应用 dropout
        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        # 计算注意力输出
        attn_output = torch.bmm(attn_probs, value_states)

        # 检查注意力输出的尺寸是否符合预期
        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        # 将注意力输出的形状调整为预期形状
        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)

        # 对输出进行最终投影
        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped
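
上面 forward 中的多头重排与 bmm 计算可以用下面的形状示意复现(假设 bsz=2、seq_len=50、embed_dim=768、num_heads=12,仅为演示,并非库中代码):

```
import torch

bsz, seq_len, embed_dim, num_heads = 2, 50, 768, 12
head_dim = embed_dim // num_heads                               # 64
scale = head_dim ** -0.5

x = torch.randn(bsz, seq_len, embed_dim)

def shape(t):
    # 对应 _shape:拆出头的维度,并把头并入 batch 维
    return t.view(bsz, seq_len, num_heads, head_dim).transpose(1, 2).reshape(bsz * num_heads, seq_len, head_dim)

q, k, v = shape(x * scale), shape(x), shape(x)                  # 各为 (24, 50, 64)
attn = torch.bmm(q, k.transpose(1, 2)).softmax(dim=-1)          # (24, 50, 50)
out = torch.bmm(attn, v)                                        # (24, 50, 64)
out = out.view(bsz, num_heads, seq_len, head_dim).transpose(1, 2).reshape(bsz, seq_len, embed_dim)
print(out.shape)                                                # torch.Size([2, 50, 768])
```
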
# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->ChineseCLIPText
class ChineseCLIPTextIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 初始化一个全连接层,将输入的 hidden_size 维度转换为 intermediate_size 维度
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # 根据配置文件中的激活函数名称选择相应的激活函数
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 将输入的 hidden_states 应用全连接层
        hidden_states = self.dense(hidden_states)
        # 应用选定的激活函数
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->ChineseCLIPText
class ChineseCLIPTextOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 初始化一个全连接层,将 intermediate_size 维度转换为 hidden_size 维度
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        # 初始化 LayerNorm 层,对 hidden_size 维度进行归一化
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # 初始化 Dropout 层,应用概率为 hidden_dropout_prob 的 dropout
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # 将输入的 hidden_states 应用全连接层
        hidden_states = self.dense(hidden_states)
        # 应用 dropout
        hidden_states = self.dropout(hidden_states)
        # 应用 LayerNorm,并将结果与输入的 input_tensor 相加
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->ChineseCLIPVision
class ChineseCLIPVisionMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        # 根据配置文件中的激活函数名称选择相应的激活函数
        self.activation_fn = ACT2FN[config.hidden_act]
        # 初始化两个全连接层,分别将 hidden_size 维度转换为 intermediate_size 维度和 intermediate_size 维度转换为 hidden_size 维度
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 将输入的 hidden_states 应用第一个全连接层
        hidden_states = self.fc1(hidden_states)
        # 应用选定的激活函数
        hidden_states = self.activation_fn(hidden_states)
        # 将结果再次应用到第二个全连接层
        hidden_states = self.fc2(hidden_states)
        return hidden_states
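
其中 ACT2FN 是一个“激活函数名 -> 激活模块”的查表,下面是一个用法示意(这里以 "quick_gelu" 为例,实际名称取决于配置):

```
import torch
from transformers.activations import ACT2FN

act = ACT2FN["quick_gelu"]                      # 也可以换成 "gelu"、"relu" 等配置中出现的名字
print(act(torch.tensor([-1.0, 0.0, 1.0])))
```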


# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->ChineseCLIPText
class ChineseCLIPTextLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 设置 feed forward 操作的分块大小
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        # 序列长度的维度,通常为 1
        self.seq_len_dim = 1
        # 初始化 self.attention 为 ChineseCLIPTextAttention 实例,使用给定的配置
        self.attention = ChineseCLIPTextAttention(config)
        # 是否为解码器模型
        self.is_decoder = config.is_decoder
        # 是否添加跨注意力机制
        self.add_cross_attention = config.add_cross_attention
        # 如果添加跨注意力机制但不是解码器模型,则引发异常
        if self.add_cross_attention:
            if not self.is_decoder:
                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
            # 初始化 self.crossattention 为 ChineseCLIPTextAttention 实例,使用给定的配置和绝对位置嵌入
            self.crossattention = ChineseCLIPTextAttention(config, position_embedding_type="absolute")
        # 初始化 self.intermediate 为 ChineseCLIPTextIntermediate 实例,使用给定的配置
        self.intermediate = ChineseCLIPTextIntermediate(config)
        # 初始化 self.output 为 ChineseCLIPTextOutput 实例,使用给定的配置
        self.output = ChineseCLIPTextOutput(config)
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        # 获取自注意力的缓存的键/值对,位置在 past_key_value 的第1和第2个位置
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        # 执行自注意力计算
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            past_key_value=self_attn_past_key_value,
        )
        # 获取自注意力输出
        attention_output = self_attention_outputs[0]

        # 如果是解码器,最后一个输出是自注意力缓存的元组
        if self.is_decoder:
            outputs = self_attention_outputs[1:-1]  # 排除最后一个元素,因为它是自注意力缓存
            present_key_value = self_attention_outputs[-1]
        else:
            outputs = self_attention_outputs[1:]  # 如果输出注意力权重,则添加自注意力

        cross_attn_present_key_value = None
        # 如果是解码器且存在编码器隐藏状态
        if self.is_decoder and encoder_hidden_states is not None:
            # 检查是否具有交叉注意力层,若没有则抛出异常
            if not hasattr(self, "crossattention"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
                    " by setting `config.add_cross_attention=True`"
                )

            # 获取交叉注意力的缓存的键/值对,位置在 past_key_value 的倒数第2和倒数第1个位置
            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
            # 执行交叉注意力计算
            cross_attention_outputs = self.crossattention(
                attention_output,
                attention_mask,
                head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                cross_attn_past_key_value,
                output_attentions,
            )
            # 获取交叉注意力的输出
            attention_output = cross_attention_outputs[0]
            outputs = outputs + cross_attention_outputs[1:-1]  # 如果输出注意力权重,则添加交叉注意力

            # 将交叉注意力缓存添加到 present_key_value 的第3、第4个位置
            cross_attn_present_key_value = cross_attention_outputs[-1]
            present_key_value = present_key_value + cross_attn_present_key_value

        # 对 attention_output 应用前向传播分块处理
        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        outputs = (layer_output,) + outputs

        # 如果是解码器,将注意力的键/值对作为最后一个输出返回
        if self.is_decoder:
            outputs = outputs + (present_key_value,)

        return outputs
    # 定义一个方法,用于执行前向传播的一个步骤,接收注意力输出作为输入参数
    def feed_forward_chunk(self, attention_output):
        # 调用模型的 intermediate 方法处理注意力输出,得到中间层的输出
        intermediate_output = self.intermediate(attention_output)
        # 调用模型的 output 方法处理中间层输出和注意力输出,得到最终层的输出
        layer_output = self.output(intermediate_output, attention_output)
        # 返回最终层的输出作为该方法的结果
        return layer_output
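
apply_chunking_to_forward 的作用可以用一个小例子验证:分块只是把前馈计算沿序列维切成小块以节省显存,结果与一次性计算完全相同(假设使用较新版本的 transformers,该函数位于 transformers.pytorch_utils;仅为演示):

```
import torch
from transformers.pytorch_utils import apply_chunking_to_forward

def feed_forward(x):
    return x * 2                                                     # 用逐元素操作代替真实的 intermediate + output

hidden = torch.randn(2, 64, 8)                                       # (batch, seq_len, hidden)
chunked = apply_chunking_to_forward(feed_forward, 16, 1, hidden)     # 沿 seq_len 维每 16 个位置计算一块
print(torch.allclose(chunked, feed_forward(hidden)))                 # True
```
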
# 定义一个用于处理视觉输入的自定义层,继承自 `nn.Module`
class ChineseCLIPVisionLayer(nn.Module):
    def __init__(self, config: ChineseCLIPConfig):
        super().__init__()
        # 设置嵌入维度为配置中的隐藏大小
        self.embed_dim = config.hidden_size
        # 初始化自注意力层,使用给定配置
        self.self_attn = ChineseCLIPVisionAttention(config)
        # 初始化第一个层归一化层,对隐藏状态进行归一化,使用给定的 epsilon
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        # 初始化MLP(多层感知机)层,用于处理视觉输入
        self.mlp = ChineseCLIPVisionMLP(config)
        # 初始化第二个层归一化层,对隐藏状态进行归一化,使用给定的 epsilon
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        # 保存残差连接
        residual = hidden_states

        # 应用第一个层归一化层到隐藏状态
        hidden_states = self.layer_norm1(hidden_states)
        # 使用自注意力层处理归一化后的隐藏状态,根据需要输出注意力权重
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            output_attentions=output_attentions,
        )
        # 添加残差连接
        hidden_states = residual + hidden_states

        # 保存当前隐藏状态作为下一步的残差连接输入
        residual = hidden_states
        # 应用第二个层归一化层到隐藏状态
        hidden_states = self.layer_norm2(hidden_states)
        # 使用MLP处理归一化后的隐藏状态
        hidden_states = self.mlp(hidden_states)
        # 添加残差连接
        hidden_states = residual + hidden_states

        # 输出包含处理后的隐藏状态
        outputs = (hidden_states,)

        # 如果需要输出注意力权重,将注意力权重添加到输出中
        if output_attentions:
            outputs += (attn_weights,)

        return outputs


# 从 `transformers.models.bert.modeling_bert.BertPooler` 复制代码,并替换 Bert 为 ChineseCLIPText
class ChineseCLIPTextPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 使用线性层将隐藏状态转换为与隐藏大小相同的输出
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # 使用双曲正切激活函数
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 通过简单地取第一个标记对应的隐藏状态来“汇集”模型
        first_token_tensor = hidden_states[:, 0]
        # 将第一个标记对应的隐藏状态通过线性层
        pooled_output = self.dense(first_token_tensor)
        # 应用双曲正切激活函数
        pooled_output = self.activation(pooled_output)
        # 返回汇集后的输出
        return pooled_output


class ChineseCLIPPreTrainedModel(PreTrainedModel):
    """
    一个处理权重初始化和预训练模型下载加载的抽象类。
    """

    # 指定配置类为 ChineseCLIPConfig
    config_class = ChineseCLIPConfig
    # 设置基础模型前缀为 "chinese_clip"
    base_model_prefix = "chinese_clip"
    # 支持梯度检查点
    supports_gradient_checkpointing = True
    # 初始化模型的权重
    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor
        # 如果是视觉嵌入模块
        if isinstance(module, ChineseCLIPVisionEmbeddings):
            factor = self.config.initializer_factor
            # 初始化类别嵌入的权重
            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            # 初始化补丁嵌入的权重
            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
            # 初始化位置嵌入的权重
            nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
        # 如果是文本嵌入模块
        elif isinstance(module, ChineseCLIPTextEmbeddings):
            # 初始化词嵌入的权重
            nn.init.normal_(module.word_embeddings.weight, mean=0.0, std=self.config.initializer_range)
            # 初始化位置嵌入的权重
            nn.init.normal_(module.position_embeddings.weight, mean=0.0, std=self.config.initializer_range)
            # 初始化标记类型嵌入的权重
            nn.init.normal_(module.token_type_embeddings.weight, mean=0.0, std=self.config.initializer_range)
            # 如果有填充索引,则将填充索引对应的权重置零
            for embedding in [module.word_embeddings, module.position_embeddings, module.token_type_embeddings]:
                if embedding.padding_idx is not None:
                    embedding.weight.data[embedding.padding_idx].zero_()
        # 如果是视觉注意力模块
        elif isinstance(module, ChineseCLIPVisionAttention):
            factor = self.config.initializer_factor
            # 输入投影的标准差
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            # 输出投影的标准差
            out_proj_std = (module.embed_dim**-0.5) * factor
            # 初始化查询投影的权重
            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
            # 初始化键投影的权重
            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
            # 初始化值投影的权重
            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
            # 初始化输出投影的权重
            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
        # 如果是视觉MLP模块
        elif isinstance(module, ChineseCLIPVisionMLP):
            factor = self.config.initializer_factor
            # 输入投影的标准差
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            # 全连接层的标准差
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            # 初始化第一个全连接层的权重
            nn.init.normal_(module.fc1.weight, std=fc_std)
            # 初始化第二个全连接层的权重
            nn.init.normal_(module.fc2.weight, std=in_proj_std)
        # 如果是整体模型
        elif isinstance(module, ChineseCLIPModel):
            # 初始化文本投影层的权重
            nn.init.normal_(
                module.text_projection.weight,
                std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
            )
            # 初始化视觉投影层的权重
            nn.init.normal_(
                module.visual_projection.weight,
                std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
            )

        # 如果是 LayerNorm 层
        if isinstance(module, nn.LayerNorm):
            # 将偏置项置零
            module.bias.data.zero_()
            # 将权重初始化为1
            module.weight.data.fill_(1.0)
        # 如果是线性层
        if isinstance(module, nn.Linear):
            # 初始化线性层的权重
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            # 如果有偏置项,则将偏置项置零
            if module.bias is not None:
                module.bias.data.zero_()
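
上面几个初始化标准差的量级可以代入典型超参数直观感受一下(假设 hidden_size=768、num_hidden_layers=12、initializer_factor=1.0,仅为演示):

```
embed_dim, num_hidden_layers, factor = 768, 12, 1.0
in_proj_std = (embed_dim ** -0.5) * ((2 * num_hidden_layers) ** -0.5) * factor   # ≈ 0.0074
out_proj_std = (embed_dim ** -0.5) * factor                                      # ≈ 0.0361
fc_std = (2 * embed_dim) ** -0.5 * factor                                        # ≈ 0.0255
print(in_proj_std, out_proj_std, fc_std)
```
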
# 中文 CLIP 模型的起始文档字符串,描述了该模型是 PyTorch 中的一个子类,应当像常规的 PyTorch Module 一样使用。
# 模型的具体使用和行为相关的事项应参考 PyTorch 文档。
CHINESE_CLIP_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`ChineseCLIPConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# 中文 CLIP 模型输入文本的文档字符串部分,暂未填写具体内容。
CHINESE_CLIP_TEXT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            # 输入序列的标记索引,用于词汇表中的词汇。
            Indices of input sequence tokens in the vocabulary.

            # 可以使用 `AutoTokenizer` 获取这些索引。参见 `PreTrainedTokenizer.encode` 和 `PreTrainedTokenizer.__call__` 获取详细信息。

            [What are input IDs?](../glossary#input-ids)

        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            # 注意力遮罩,用于避免对填充标记索引执行注意力操作。遮罩值选择在 `[0, 1]` 之间:

            - 1 表示**未遮罩**的标记,
            - 0 表示**遮罩**的标记。

            [What are attention masks?](../glossary#attention-mask)

        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            # 段落标记索引,指示输入的第一部分和第二部分。索引在 `[0, 1]` 之间选择:

            - 0 对应于 *句子 A* 的标记,
            - 1 对应于 *句子 B* 的标记。

            [What are token type IDs?](../glossary#token-type-ids)

        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            # 输入序列标记在位置嵌入中的位置索引。在范围 `[0, config.max_position_embeddings - 1]` 中选择。

            [What are position IDs?](../glossary#position-ids)

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            # 用于将自注意力模块中的选定头部置零的遮罩。遮罩值选择在 `[0, 1]` 之间:

            - 1 表示**未遮罩**的头部,
            - 0 表示**遮罩**的头部。

        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            # 可选项,可以直接传递嵌入表示而不是 `input_ids`。如果需要对如何将 `input_ids` 索引转换为相关向量进行更多控制,这是很有用的。

        output_attentions (`bool`, *optional*):
            # 是否返回所有注意力层的注意力张量。详细信息请参见返回张量中的 `attentions`。

        output_hidden_states (`bool`, *optional*):
            # 是否返回所有层的隐藏状态。详细信息请参见返回张量中的 `hidden_states`。

        return_dict (`bool`, *optional*):
            # 是否返回 `~utils.ModelOutput` 而不是普通元组。
"""
CHINESE_CLIP_VISION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`ChineseCLIPImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

CHINESE_CLIP_INPUTS_DOCSTRING = r"""
"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            # 输入序列标记的索引,用于表示词汇表中的标记。默认情况下会忽略填充部分。
            # 可以使用 `AutoTokenizer` 获得这些索引。详情请见 `PreTrainedTokenizer.encode` 和 `PreTrainedTokenizer.__call__`。

            [What are input IDs?](../glossary#input-ids)

        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            # 遮罩,用于在填充的标记索引上避免执行注意力操作。遮罩值在 `[0, 1]` 之间:

            - 1 表示**未被遮罩**的标记,
            - 0 表示**被遮罩**的标记。

            [What are attention masks?](../glossary#attention-mask)

        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            # 段标记索引,指示输入的第一部分和第二部分。索引值在 `[0, 1]` 之间:

            - 0 对应*句子 A* 的标记,
            - 1 对应*句子 B* 的标记。

            [What are token type IDs?](../glossary#token-type-ids)

        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            # 每个输入序列标记在位置嵌入中的位置索引。选取范围为 `[0, config.max_position_embeddings - 1]`。

            [What are position IDs?](../glossary#position-ids)

        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            # 像素值。默认情况下会忽略填充部分。像素值可以使用 `AutoImageProcessor` 获得。
            # 详情请见 `ChineseCLIPImageProcessor.__call__`。

        return_loss (`bool`, *optional*):
            # 是否返回对比损失。

        output_attentions (`bool`, *optional*):
            # 是否返回所有注意力层的注意力张量。详情请见返回的张量中的 `attentions` 部分。

        output_hidden_states (`bool`, *optional*):
            # 是否返回所有层的隐藏状态。详情请见返回的张量中的 `hidden_states` 部分。

        return_dict (`bool`, *optional*):
            # 是否返回 [`~utils.ModelOutput`] 而非普通元组。
"""
# 从 transformers.models.bert.modeling_bert.BertEncoder 复制并修改为 ChineseCLIPTextEncoder

class ChineseCLIPTextEncoder(nn.Module):
    """
    ChineseCLIPTextEncoder 类,继承自 nn.Module,用于处理中文文本的编码器。

    Args:
        config (object): 配置对象,包含编码器的参数设置。

    Attributes:
        config (object): 编码器的配置参数。
        layer (nn.ModuleList): 包含多个 ChineseCLIPTextLayer 层的模块列表,用于构建编码器的层。
        gradient_checkpointing (bool): 是否启用梯度检查点,默认为 False,表示不启用。

    Methods:
        forward: 前向传播方法,接受多个输入和参数,返回编码器的输出结果。
    """

    def __init__(self, config):
        """
        ChineseCLIPTextEncoder 的初始化方法。

        Args:
            config (object): 配置对象,包含编码器的参数设置。
        """
        super().__init__()
        self.config = config
        # 创建多个 ChineseCLIPTextLayer 层,构成编码器的层列表
        self.layer = nn.ModuleList([ChineseCLIPTextLayer(config) for _ in range(config.num_hidden_layers)])
        # 默认关闭梯度检查点
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
        # 如果设置了输出隐藏状态,初始化空元组;否则置为None
        all_hidden_states = () if output_hidden_states else None
        # 如果设置了输出注意力权重,初始化空元组;否则置为None
        all_self_attentions = () if output_attentions else None
        # 如果设置了输出注意力权重并且模型配置中包含跨注意力,初始化空元组;否则置为None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

        # 如果开启了梯度检查点且处于训练模式
        if self.gradient_checkpointing and self.training:
            # 如果use_cache为True,则给出警告并设置use_cache为False
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        # 如果use_cache为True,则初始化空元组;否则置为None
        next_decoder_cache = () if use_cache else None
        # 遍历每个解码层
        for i, layer_module in enumerate(self.layer):
            # 如果需要输出隐藏状态,将当前隐藏状态加入all_hidden_states
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # 如果有头部掩码,则获取当前解码层的头部掩码
            layer_head_mask = head_mask[i] if head_mask is not None else None
            # 如果有过去的键值对,则获取当前解码层的过去键值对
            past_key_value = past_key_values[i] if past_key_values is not None else None

            # 如果开启了梯度检查点且处于训练模式
            if self.gradient_checkpointing and self.training:
                # 使用梯度检查点函数进行前向传播
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )
            else:
                # 否则,直接调用解码层模块进行前向传播
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )

            # 更新隐藏状态为解码层输出的第一个元素
            hidden_states = layer_outputs[0]
            # 如果use_cache为True,则将解码层输出的最后一个元素加入next_decoder_cache
            if use_cache:
                next_decoder_cache += (layer_outputs[-1],)
            # 如果需要输出注意力权重,将解码层输出的第二个元素加入all_self_attentions
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                # 如果模型配置中包含跨注意力,将解码层输出的第三个元素加入all_cross_attentions
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        # 如果需要输出隐藏状态,将最终的隐藏状态加入all_hidden_states
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        # 如果不需要返回字典格式结果,以元组形式返回多个元素的结果
        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        # 否则,以BaseModelOutputWithPastAndCrossAttentions格式返回结果
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )
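
梯度检查点通常通过模型级别的开关启用,下面是一个最小示意(用随机初始化的小配置代替真实权重,配置数值仅为演示):

```
from transformers import ChineseCLIPTextConfig, ChineseCLIPTextModel

model = ChineseCLIPTextModel(ChineseCLIPTextConfig(num_hidden_layers=2))
model.gradient_checkpointing_enable()   # 训练时用重算激活来换取显存
model.train()
# 如上面的 forward 所示,开启梯度检查点后 use_cache 会被强制置为 False
```
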
class ChineseCLIPVisionEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`ChineseCLIPVisionEncoderLayer`].

    Args:
        config: ChineseCLIPConfig
    """

    def __init__(self, config: ChineseCLIPConfig):
        super().__init__()
        self.config = config
        # 创建一个由多个 `ChineseCLIPVisionLayer` 实例组成的列表,每个实例代表一个编码器层
        self.layers = nn.ModuleList([ChineseCLIPVisionLayer(config) for _ in range(config.num_hidden_layers)])
        # 梯度检查点默认关闭
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        # Determine whether to use the provided `output_attentions` or fall back to the model's default setting
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        # Determine whether to use the provided `output_hidden_states` or fall back to the model's default setting
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        # Determine whether to return a dictionary (`return_dict`) or a plain tuple based on the provided setting or model default
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Initialize empty tuples to store encoder states and attentions if not requested
        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        # Start with the input embeddings for processing through the transformer layers
        hidden_states = inputs_embeds
        # Iterate through each transformer layer
        for idx, encoder_layer in enumerate(self.layers):
            # Store hidden states of each layer if requested
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            
            # Use gradient checkpointing during training if enabled
            if self.gradient_checkpointing and self.training:
                # Apply the encoder layer function with gradient checkpointing
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer.__call__,
                    hidden_states,
                    output_attentions,
                )
            else:
                # Apply the encoder layer function without gradient checkpointing
                layer_outputs = encoder_layer(
                    hidden_states,
                    output_attentions=output_attentions,
                )

            # Update hidden states with the outputs of the current layer
            hidden_states = layer_outputs[0]

            # Store attentions of each layer if requested
            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        # Store final encoder states if requested
        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        # Return either a plain tuple or a `BaseModelOutput` depending on `return_dict` setting
        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class ChineseCLIPVisionTransformer(nn.Module):
    def __init__(self, config: ChineseCLIPVisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size

        # 初始化中文CLIP视觉嵌入层
        self.embeddings = ChineseCLIPVisionEmbeddings(config)
        # 添加预层归一化层(属性名沿用上游实现中的 pre_layrnorm 拼写,以保持与已发布权重的命名一致)
        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        # 初始化中文CLIP视觉编码器
        self.encoder = ChineseCLIPVisionEncoder(config)
        # 添加后层归一化层
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    @add_start_docstrings_to_model_forward(CHINESE_CLIP_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=ChineseCLIPVisionConfig)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        前向传播函数
        Args:
            pixel_values (Optional[torch.FloatTensor]): 输入像素值的张量
            output_attentions (Optional[bool]): 是否输出注意力权重
            output_hidden_states (Optional[bool]): 是否输出隐藏状态
            return_dict (Optional[bool]): 是否使用返回字典格式

        Returns:
            Union[Tuple, BaseModelOutputWithPooling]: 如果不使用返回字典,返回元组;否则返回带池化的基础模型输出
        """
        # 如果未提供像素值,抛出数值错误
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # 将像素值嵌入到嵌入层中
        hidden_states = self.embeddings(pixel_values)
        # 应用预层归一化
        hidden_states = self.pre_layrnorm(hidden_states)

        # 将隐藏状态传递给编码器
        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 获取最后的隐藏状态
        last_hidden_state = encoder_outputs[0]
        # 池化输出,取第一个位置的隐藏状态
        pooled_output = last_hidden_state[:, 0, :]
        # 应用后层归一化
        pooled_output = self.post_layernorm(pooled_output)

        # 如果不使用返回字典格式,返回元组
        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        # 使用返回字典格式,返回带池化的基础模型输出
        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@add_start_docstrings(
    "CHINESE_CLIP的文本模型,不包含任何顶部头部或投影。",
    CHINESE_CLIP_START_DOCSTRING,
)
class ChineseCLIPTextModel(ChineseCLIPPreTrainedModel):
    """
    
    模型可以作为编码器(仅自注意力)或解码器使用,在后一种情况下,会在自注意力层之间添加交叉注意力层,遵循[Attention is
    all you need](https://arxiv.org/abs/1706.03762)的架构描述,作者为Ashish Vaswani, Noam Shazeer, Niki Parmar,
    Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser和Illia Polosukhin。
    """
    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
    """



    # 设置配置类为 ChineseCLIPTextConfig
    config_class = ChineseCLIPTextConfig

    # 模型初始化函数,接受配置和是否添加池化层的标志
    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config

        # 初始化文本嵌入层
        self.embeddings = ChineseCLIPTextEmbeddings(config)
        
        # 初始化文本编码器
        self.encoder = ChineseCLIPTextEncoder(config)

        # 根据是否添加池化层来初始化池化层,或者设为 None
        self.pooler = ChineseCLIPTextPooler(config) if add_pooling_layer else None

        # 调用后续初始化函数
        self.post_init()

    # 获取输入嵌入层
    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    # 设置输入嵌入层
    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    # 剪枝模型中的注意力头
    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            # 对指定层的注意力头进行剪枝
            self.encoder.layer[layer].attention.prune_heads(heads)

    # 前向传播函数,接受多个输入参数,详细见下方装饰器说明
    @add_start_docstrings_to_model_forward(CHINESE_CLIP_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPoolingAndCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
@add_start_docstrings(
    """The vision model from CHINESE_CLIP without any head or projection on top.""",
    CHINESE_CLIP_START_DOCSTRING,
)
class ChineseCLIPVisionModel(ChineseCLIPPreTrainedModel):
    # 设置配置类
    config_class = ChineseCLIPVisionConfig
    # 主要输入名称
    main_input_name = "pixel_values"

    def __init__(self, config: ChineseCLIPVisionConfig):
        # 调用父类构造函数初始化
        super().__init__(config)
        # 初始化视觉模型
        self.vision_model = ChineseCLIPVisionTransformer(config)
        # 初始化权重并进行最终处理
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        # 返回嵌入的补丁嵌入层
        return self.vision_model.embeddings.patch_embedding

    @add_start_docstrings_to_model_forward(CHINESE_CLIP_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=ChineseCLIPVisionConfig)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:

        Examples:

        ```
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import CLIPProcessor, ChineseCLIPVisionModel

        >>> model = ChineseCLIPVisionModel.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16")
        >>> processor = CLIPProcessor.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16")

        >>> url = "https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/pokemon.jpeg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```"""
        # 如果未指定返回字典,则使用配置中的默认设置
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 委托给视觉模型进行前向传播
        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


@add_start_docstrings(CHINESE_CLIP_START_DOCSTRING)
class ChineseCLIPModel(ChineseCLIPPreTrainedModel):
    # 设置配置类
    config_class = ChineseCLIPConfig
    def __init__(self, config: ChineseCLIPConfig):
        # 调用父类的初始化方法,传入配置对象
        super().__init__(config)

        # 检查文本配置是否为正确类型,若不是则引发数值错误异常
        if not isinstance(config.text_config, ChineseCLIPTextConfig):
            raise ValueError(
                "config.text_config is expected to be of type ChineseCLIPTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        # 检查视觉配置是否为正确类型,若不是则引发数值错误异常
        if not isinstance(config.vision_config, ChineseCLIPVisionConfig):
            raise ValueError(
                "config.vision_config is expected to be of type ChineseCLIPVisionConfig but is of type"
                f" {type(config.vision_config)}."
            )

        # 从配置对象中获取文本配置和视觉配置
        text_config = config.text_config
        vision_config = config.vision_config

        # 设置投影维度、文本嵌入维度和视觉嵌入维度
        self.projection_dim = config.projection_dim
        self.text_embed_dim = text_config.hidden_size
        self.vision_embed_dim = vision_config.hidden_size

        # 初始化文本模型和视觉模型,其中文本模型不添加池化层
        self.text_model = ChineseCLIPTextModel(text_config, add_pooling_layer=False)
        self.vision_model = ChineseCLIPVisionTransformer(vision_config)

        # 初始化视觉投影层和文本投影层,不包含偏置项
        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)

        # 初始化对数尺度参数
        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))

        # 初始化权重并进行最终处理
        self.post_init()
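
logit_scale 与两个投影层在推理时共同决定图文相似度。下面是一个 CLIP 风格的相似度计算示意(假设 projection_dim=512、logit_scale_init_value=2.6592 即 log(1/0.07),数值仅为演示):

```
import torch

image_embeds = torch.nn.functional.normalize(torch.randn(2, 512), dim=-1)   # 2 张图的投影特征
text_embeds = torch.nn.functional.normalize(torch.randn(3, 512), dim=-1)    # 3 段文本的投影特征
logit_scale = torch.tensor(2.6592).exp()                                    # ≈ 14.3
logits_per_image = logit_scale * image_embeds @ text_embeds.t()             # (2, 3)
print(logits_per_image.softmax(dim=-1))                                     # 每张图对各文本的匹配概率
```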

    @add_start_docstrings_to_model_forward(CHINESE_CLIP_TEXT_INPUTS_DOCSTRING)
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the final [CLS] hidden state of Text-Transformer.

        Examples:

        ```
        >>> from transformers import AutoTokenizer, ChineseCLIPModel

        >>> model = ChineseCLIPModel.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16")
        >>> tokenizer = AutoTokenizer.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16")

        >>> inputs = tokenizer(["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        >>> text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True)
        ```"""
        # 根据需要使用 CHINESE_CLIP 模型的配置中的一些字段(如果指定),而不是视觉和文本组件的配置
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 使用文本模型处理输入,获取文本输出
        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 由于文本模型未附加池化层,这里直接取最后隐藏状态中第一个([CLS])位置的表示作为池化输出
        pooled_output = text_outputs[0][:, 0, :]
        # 将池化后的特征表示投影到文本特征空间
        text_features = self.text_projection(pooled_output)

        # 返回文本特征表示
        return text_features

    # 获取图像特征,参数与视觉模型的 forward 基本一致
    @add_start_docstrings_to_model_forward(CHINESE_CLIP_VISION_INPUTS_DOCSTRING)
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
            applying the projection layer to the final [CLS] hidden state of Vision-Transformer.

        Examples:

        ```
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, ChineseCLIPModel

        >>> model = ChineseCLIPModel.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16")
        >>> processor = AutoProcessor.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16")

        >>> url = "https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/pokemon.jpeg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
        >>> image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
        ```"""
        # 使用 CHINESE_CLIP 模型的配置来覆盖一些字段(如果指定了的话),而不是视觉和文本组件的字段。
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 调用视觉模型,传入像素值、是否输出注意力权重、是否输出隐藏状态以及是否返回字典等参数
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 从视觉模型的输出中获取第二个元素,即汇总的输出(pooled_output)
        pooled_output = vision_outputs[1]  # pooled_output
        # 使用视觉投影层对汇总的输出进行投影,得到最终的图像特征表示
        image_features = self.visual_projection(pooled_output)

        # 返回图像特征表示
        return image_features

    @add_start_docstrings_to_model_forward(CHINESE_CLIP_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=ChineseCLIPOutput, config_class=ChineseCLIPConfig)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,

.\models\chinese_clip\processing_chinese_clip.py

# coding=utf-8
# Copyright 2022 The OFA-Sys Team Authors and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Image/Text processor class for Chinese-CLIP
"""

import warnings

from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding


class ChineseCLIPProcessor(ProcessorMixin):
    r"""
    Constructs a Chinese-CLIP processor which wraps a Chinese-CLIP image processor and a Chinese-CLIP tokenizer into a
    single processor.

    [`ChineseCLIPProcessor`] offers all the functionalities of [`ChineseCLIPImageProcessor`] and [`BertTokenizerFast`].
    See the [`~ChineseCLIPProcessor.__call__`] and [`~ChineseCLIPProcessor.decode`] for more information.

    Args:
        image_processor ([`ChineseCLIPImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`BertTokenizerFast`], *optional*):
            The tokenizer is a required input.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "ChineseCLIPImageProcessor"
    tokenizer_class = ("BertTokenizer", "BertTokenizerFast")

    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        # Deprecated feature_extractor warning and migration
        feature_extractor = None
        if "feature_extractor" in kwargs:
            warnings.warn(
                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
                " instead.",
                FutureWarning,
            )
            feature_extractor = kwargs.pop("feature_extractor")

        # Determine the image_processor from feature_extractor or provided argument
        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        # Initialize the processor with image_processor and tokenizer
        super().__init__(image_processor, tokenizer)
        self.current_processor = self.image_processor

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to BertTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        # Delegate batch decoding to the underlying tokenizer
        return self.tokenizer.batch_decode(*args, **kwargs)
    # 将所有参数转发给 BertTokenizerFast 的 `~PreTrainedTokenizer.decode` 方法,并返回其结果
    def decode(self, *args, **kwargs):
        return self.tokenizer.decode(*args, **kwargs)

    # 返回模型输入名称的列表,这些名称由分词器和图像处理器的模型输入名称合并而成,且保持唯一性
    @property
    def model_input_names(self):
        # 获取分词器的模型输入名称列表
        tokenizer_input_names = self.tokenizer.model_input_names
        # 获取图像处理器的模型输入名称列表
        image_processor_input_names = self.image_processor.model_input_names
        # 将两个列表合并并去除重复项,返回结果列表
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))

    # 返回图像处理器的类名,并发出关于该属性即将移除的警告
    @property
    def feature_extractor_class(self):
        warnings.warn(
            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
            FutureWarning,
        )
        # 返回图像处理器的类名作为特征提取器类的代理
        return self.image_processor_class
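
下面是 ChineseCLIPProcessor 的一个用法示意(需要能够下载 OFA-Sys/chinese-clip-vit-base-patch16 的分词器与图像处理器配置;这里用纯色图代替真实图片,仅为演示):

```
from PIL import Image
from transformers import ChineseCLIPProcessor

processor = ChineseCLIPProcessor.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16")
inputs = processor(text=["皮卡丘"], images=Image.new("RGB", (224, 224)), return_tensors="pt", padding=True)
print(list(inputs.keys()))   # 预期包含 input_ids、token_type_ids、attention_mask、pixel_values
```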

.\models\chinese_clip\__init__.py

# 导入必要的模块和函数
from typing import TYPE_CHECKING
# 从当前包中导入自定义异常和模块惰性加载类
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available

# 定义模块的导入结构,包括不同功能模块的导入列表
_import_structure = {
    "configuration_chinese_clip": [
        "CHINESE_CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "ChineseCLIPConfig",
        "ChineseCLIPOnnxConfig",
        "ChineseCLIPTextConfig",
        "ChineseCLIPVisionConfig",
    ],
    "processing_chinese_clip": ["ChineseCLIPProcessor"],
}

# 检查视觉处理模块是否可用,若不可用则抛出自定义异常
try:
    if not is_vision_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 若可用,则添加视觉特征提取和图像处理模块到导入结构中
    _import_structure["feature_extraction_chinese_clip"] = ["ChineseCLIPFeatureExtractor"]
    _import_structure["image_processing_chinese_clip"] = ["ChineseCLIPImageProcessor"]

# 检查是否Torch模块可用,若不可用则抛出自定义异常
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 若Torch可用,则添加模型相关模块到导入结构中
    _import_structure["modeling_chinese_clip"] = [
        "CHINESE_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST",
        "ChineseCLIPModel",
        "ChineseCLIPPreTrainedModel",
        "ChineseCLIPTextModel",
        "ChineseCLIPVisionModel",
    ]

# 如果类型检查开启,导入相关配置和处理模块
if TYPE_CHECKING:
    from .configuration_chinese_clip import (
        CHINESE_CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP,
        ChineseCLIPConfig,
        ChineseCLIPOnnxConfig,
        ChineseCLIPTextConfig,
        ChineseCLIPVisionConfig,
    )
    from .processing_chinese_clip import ChineseCLIPProcessor

    # 检查视觉处理模块是否可用,若不可用则跳过导入
    try:
        if not is_vision_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 若可用,则导入视觉特征提取和图像处理模块
        from .feature_extraction_chinese_clip import ChineseCLIPFeatureExtractor, ChineseCLIPImageProcessor

    # 检查Torch模块是否可用,若不可用则跳过导入
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 若Torch可用,则导入模型相关模块
        from .modeling_chinese_clip import (
            CHINESE_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
            ChineseCLIPModel,
            ChineseCLIPPreTrainedModel,
            ChineseCLIPTextModel,
            ChineseCLIPVisionModel,
        )

# 若不是类型检查模式,则将当前模块定义为惰性加载模块
else:
    import sys

    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
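为说明上面这种惰性加载机制的效果,下面给出一个极简示意(编辑补充,仅表达思路,并非 transformers 中 `_LazyModule` 的真实实现):只有在首次访问某个属性时,才真正导入对应的子模块并缓存结果。

```
import importlib
import types


class DemoLazyModule(types.ModuleType):
    """极简的惰性模块示意:首次访问属性时才导入对应子模块。"""

    def __init__(self, name, import_structure):
        super().__init__(name)
        self._import_structure = import_structure
        # 反向映射:对象名 -> 子模块名
        self._object_to_module = {
            obj: module for module, objects in import_structure.items() for obj in objects
        }

    def __getattr__(self, attr):
        module_name = self._object_to_module[attr]
        module = importlib.import_module("." + module_name, self.__name__)
        value = getattr(module, attr)
        setattr(self, attr, value)  # 缓存结果,后续访问不再触发导入
        return value
```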

.\models\clap\configuration_clap.py

# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" CLAP model configuration"""

# 导入必要的模块
import os
from typing import Union

# 导入配置工具类和日志记录工具
from ...configuration_utils import PretrainedConfig
from ...utils import logging

# 获取logger对象用于日志记录
logger = logging.get_logger(__name__)

# 预训练模型的配置信息,包含模型名称和对应的配置文件URL
CLAP_PRETRAINED_MODEL_ARCHIVE_LIST = {
    "laion/clap-htsat-fused": "https://huggingface.co/laion/clap-htsat-fused/resolve/main/config.json",
    "laion/clap-htsat-unfused": "https://huggingface.co/laion/clap-htsat-unfused/resolve/main/config.json",
}

# CLAP模型的配置类,继承自PretrainedConfig
class ClapTextConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`ClapTextModel`]. It is used to instantiate a CLAP
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the CLAP
    [clap-htsat-fused](https://huggingface.co/laion/clap-htsat-fused) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Examples:

    ```
    >>> from transformers import ClapTextConfig, ClapTextModel

    >>> # Initializing a CLAP text configuration
    >>> configuration = ClapTextConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = ClapTextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    # 模型类型的字符串标识
    model_type = "clap_text_model"

    # 初始化方法,定义了模型的各种参数配置
    def __init__(
        self,
        vocab_size=50265,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=514,
        type_vocab_size=1,
        initializer_factor=1.0,
        layer_norm_eps=1e-12,
        projection_dim=512,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        position_embedding_type="absolute",
        use_cache=True,
        projection_hidden_act="relu",
        **kwargs,
    ):
        # 调用父类的初始化方法,设置特殊的令牌 ID 和其他关键字参数
        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

        # 设置配置类的属性
        self.vocab_size = vocab_size  # 词汇表大小
        self.hidden_size = hidden_size  # 隐藏层大小
        self.num_hidden_layers = num_hidden_layers  # 隐藏层数量
        self.num_attention_heads = num_attention_heads  # 注意力头的数量
        self.hidden_act = hidden_act  # 隐藏层激活函数类型
        self.intermediate_size = intermediate_size  # 中间层大小
        self.hidden_dropout_prob = hidden_dropout_prob  # 隐藏层的dropout概率
        self.attention_probs_dropout_prob = attention_probs_dropout_prob  # 注意力概率dropout概率
        self.max_position_embeddings = max_position_embeddings  # 最大位置嵌入长度
        self.type_vocab_size = type_vocab_size  # 类型词汇表大小
        self.initializer_factor = initializer_factor  # 初始化因子
        self.layer_norm_eps = layer_norm_eps  # 层归一化 epsilon 值
        self.position_embedding_type = position_embedding_type  # 位置嵌入类型
        self.use_cache = use_cache  # 是否使用缓存
        self.projection_hidden_act = projection_hidden_act  # 投影隐藏层激活函数类型
        self.projection_dim = projection_dim  # 投影维度

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        # 在 kwargs 中设置令牌相关参数
        cls._set_token_in_kwargs(kwargs)

        # 获取配置字典和可能更新后的 kwargs
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # 如果从 ClapConfig 加载,获取文本配置字典
        if config_dict.get("model_type") == "clap":
            config_dict = config_dict["text_config"]

        # 检查模型类型是否与当前类匹配,如果不匹配则发出警告
        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        # 根据配置字典和 kwargs 创建配置实例
        return cls.from_dict(config_dict, **kwargs)
# 定义一个配置类 `ClapAudioConfig`,用于存储 `ClapAudioModel` 的配置信息。
# 继承自 `PretrainedConfig` 类,可以用来控制模型的输出。
class ClapAudioConfig(PretrainedConfig):
    # 模型类型标识为 "clap_audio_model"
    model_type = "clap_audio_model"

    # 初始化方法,用于设置配置类的各项参数
    def __init__(
        self,
        window_size=8,  # 滑动窗口大小,默认为 8
        num_mel_bins=64,  # Mel 频谱的 bin 数量,默认为 64
        spec_size=256,  # 音频谱图的尺寸,默认为 256
        hidden_act="gelu",  # 隐藏层激活函数,默认为 "gelu"
        patch_size=4,  # Patch 的大小,默认为 4
        patch_stride=[4, 4],  # Patch 的步幅,默认为 [4, 4]
        num_classes=527,  # 分类的类别数量,默认为 527
        hidden_size=768,  # 隐藏层大小,默认为 768
        projection_dim=512,  # 投影维度,默认为 512
        depths=[2, 2, 6, 2],  # 不同 Transformer 层的堆叠深度,默认为 [2, 2, 6, 2]
        num_attention_heads=[4, 8, 16, 32],  # 不同 Transformer 层的注意力头数,默认为 [4, 8, 16, 32]
        enable_fusion=False,  # 是否启用融合,默认为 False
        hidden_dropout_prob=0.1,  # 隐藏层的 dropout 概率,默认为 0.1
        fusion_type=None,  # 融合类型,默认为 None
        patch_embed_input_channels=1,  # Patch 嵌入的输入通道数,默认为 1
        flatten_patch_embeds=True,  # 是否展平 Patch 嵌入,默认为 True
        patch_embeds_hidden_size=96,  # Patch 嵌入的隐藏层大小,默认为 96
        enable_patch_layer_norm=True,  # 是否启用 Patch 层归一化,默认为 True
        drop_path_rate=0.0,  # DropPath 的比率,默认为 0.0
        attention_probs_dropout_prob=0.0,  # 注意力矩阵的 dropout 概率,默认为 0.0
        qkv_bias=True,  # 是否使用 QKV 的偏置,默认为 True
        mlp_ratio=4.0,  # MLP 层中隐藏层和输入层的维度比率,默认为 4.0
        aff_block_r=4,  # 仿射块的参数 r,默认为 4
        num_hidden_layers=4,  # 隐藏层的数量,默认为 4
        projection_hidden_act="relu",  # 投影层的激活函数,默认为 "relu"
        layer_norm_eps=1e-5,  # LayerNorm 的 epsilon,默认为 1e-5
        initializer_factor=1.0,  # 初始化因子,默认为 1.0
        **kwargs,  # 其余未命名的参数
    ):
        # 调用父类的初始化方法,传入所有关键字参数
        super().__init__(**kwargs)
        # 设置模型的窗口大小
        self.window_size = window_size
        # 设置梅尔频谱的频道数量
        self.num_mel_bins = num_mel_bins
        # 设置规范化后的频谱大小
        self.spec_size = spec_size
        # 设置每个补丁的大小
        self.patch_size = patch_size
        # 设置补丁的步长
        self.patch_stride = patch_stride
        # 设置类别数量
        self.num_classes = num_classes
        # 设置隐藏层的大小
        self.hidden_size = hidden_size
        # 设置层级列表
        self.depths = depths
        # 设置隐藏层数量
        self.num_hidden_layers = num_hidden_layers
        # 设置注意力头的数量
        self.num_attention_heads = num_attention_heads
        # 重新设置窗口大小(这里可能是冗余的,因为之前已经设置过)
        self.window_size = window_size
        # 启用融合
        self.enable_fusion = enable_fusion
        # 设置融合类型
        self.fusion_type = fusion_type
        # 设置隐藏层激活函数
        self.hidden_act = hidden_act
        # 设置隐藏层的dropout概率
        self.hidden_dropout_prob = hidden_dropout_prob
        # 设置投影维度
        self.projection_dim = projection_dim
        # 是否展平补丁嵌入
        self.flatten_patch_embeds = flatten_patch_embeds
        # 补丁嵌入的隐藏层大小
        self.patch_embeds_hidden_size = patch_embeds_hidden_size
        # 是否启用补丁层的规范化
        self.enable_patch_layer_norm = enable_patch_layer_norm
        # 设置丢弃路径的率
        self.drop_path_rate = drop_path_rate
        # 注意力概率的dropout概率
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        # 是否在QKV上使用偏置
        self.qkv_bias = qkv_bias
        # 多层感知机的比率
        self.mlp_ratio = mlp_ratio
        # 补丁嵌入的输入通道数
        self.patch_embed_input_channels = patch_embed_input_channels
        # AffineBlock的半径
        self.aff_block_r = aff_block_r
        # 层归一化的epsilon值
        self.layer_norm_eps = layer_norm_eps
        # 初始化因子
        self.initializer_factor = initializer_factor
        # 投影的隐藏层激活函数
        self.projection_hidden_act = projection_hidden_act

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        # 设置kwargs中的token
        cls._set_token_in_kwargs(kwargs)

        # 获取预训练模型的配置字典和剩余的kwargs
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # 如果配置字典中的模型类型为"clap",则从中获取音频配置字典
        if config_dict.get("model_type") == "clap":
            config_dict = config_dict["audio_config"]

        # 如果配置字典中包含模型类型,并且类本身有model_type属性,并且配置的模型类型不是类本身的模型类型,则发出警告
        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        # 根据配置字典和kwargs创建配置对象
        return cls.from_dict(config_dict, **kwargs)
# `ClapConfig` 类,继承自 `PretrainedConfig`,用于存储 `ClapModel` 的配置信息。
# 该类用于实例化一个 CLAP 模型,根据指定的参数定义文本模型和音频模型的配置。
class ClapConfig(PretrainedConfig):
    r"""
    [`ClapConfig`] is the configuration class to store the configuration of a [`ClapModel`]. It is used to instantiate
    a CLAP model according to the specified arguments, defining the text model and audio model configs. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the CLAP
    [laion/clap-htsat-fused](https://huggingface.co/laion/clap-htsat-fused) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`ClapTextConfig`].
        audio_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`ClapAudioConfig`].
        logit_scale_init_value (`float`, *optional*, defaults to 14.29):
            The initial value of the *logit_scale* parameter. Default is used as per the original CLAP implementation.
        projection_dim (`int`, *optional*, defaults to 512):
            Dimensionality of text and audio projection layers.
        projection_hidden_act (`str`, *optional*, defaults to `"relu"`):
            Activation function for the projection layers.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            Factor to scale the initialization of the model weights.
        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```
    >>> from transformers import ClapConfig, ClapModel

    >>> # Initializing a ClapConfig with laion-ai/base style configuration
    >>> configuration = ClapConfig()

    >>> # Initializing a ClapModel (with random weights) from the laion-ai/base style configuration
    >>> model = ClapModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a ClapConfig from a ClapTextConfig and a ClapAudioConfig
    >>> from transformers import ClapTextConfig, ClapAudioConfig

    >>> # Initializing a ClapText and ClapAudioConfig configuration
    >>> config_text = ClapTextConfig()
    >>> config_audio = ClapAudioConfig()

    >>> config = ClapConfig.from_text_audio_configs(config_text, config_audio)
    ```"""

    # 类属性 `model_type`,指定为 "clap",用于标识模型类型
    model_type = "clap"

    # 初始化方法,用于创建 `ClapConfig` 的实例对象
    def __init__(
        self,
        text_config=None,  # 文本配置的字典,用于初始化 `ClapTextConfig`
        audio_config=None,  # 音频配置的字典,用于初始化 `ClapAudioConfig`
        logit_scale_init_value=(1 / 0.07),  # `logit_scale` 参数的初始值,默认为 CLAP 实现的原始值
        projection_dim=512,  # 文本和音频投影层的维度
        projection_hidden_act="relu",  # 投影层的激活函数,默认为 ReLU
        initializer_factor=1.0,  # 模型权重初始化的缩放因子,默认为 1.0
        **kwargs,  # 其他可选的关键字参数
    ):
        super().__init__(**kwargs)
        
        # 如果 text_config 参数为 None,则初始化为空字典,并记录日志信息
        if text_config is None:
            text_config = {}
            logger.info("text_config is None. Initializing the ClapTextConfig with default values.")
        
        # 如果 audio_config 参数为 None,则初始化为空字典,并记录日志信息
        if audio_config is None:
            audio_config = {}
            logger.info("audio_config is None. initializing the ClapAudioConfig with default values.")
        
        # 使用给定的 text_config 和 audio_config 创建 ClapTextConfig 和 ClapAudioConfig 实例
        self.text_config = ClapTextConfig(**text_config)
        self.audio_config = ClapAudioConfig(**audio_config)
        
        # 设置投影维度(projection_dim)到 text_config 和 audio_config 的实例中
        self.text_config.projection_dim = projection_dim
        self.audio_config.projection_dim = projection_dim
        
        # 设置投影隐藏层激活函数(projection_hidden_act)到 text_config 和 audio_config 的实例中
        self.text_config.projection_hidden_act = projection_hidden_act
        self.audio_config.projection_hidden_act = projection_hidden_act
        
        # 设置对象自身的投影维度和投影隐藏层激活函数
        self.projection_dim = projection_dim
        self.projection_hidden_act = projection_hidden_act
        
        # 设置隐藏层大小(hidden_size)为 text_config 的隐藏层大小
        self.hidden_size = self.text_config.hidden_size
        
        # 设置 logit_scale_init_value 和 initializer_factor
        self.logit_scale_init_value = logit_scale_init_value
        self.initializer_factor = initializer_factor
        
        # 计算总的隐藏层数,由 text_config 的隐藏层数和 audio_config 的深度之和得到
        self.num_hidden_layers = self.text_config.num_hidden_layers + len(self.audio_config.depths)

    @classmethod
    def from_text_audio_configs(cls, text_config: ClapTextConfig, audio_config: ClapAudioConfig, **kwargs):
        r"""
        Instantiate a [`ClapConfig`] (or a derived class) from clap text model configuration and clap audio model
        configuration.

        Returns:
            [`ClapConfig`]: An instance of a configuration object
        """

        # 从给定的 text_config 和 audio_config 创建一个 ClapConfig 类的实例,并返回
        return cls(text_config=text_config.to_dict(), audio_config=audio_config.to_dict(), **kwargs)
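下面补充一个简短的加载示意(编辑补充;假设可以访问 Hub 上的 laion/clap-htsat-unfused 检查点):`ClapTextConfig.from_pretrained` 在检测到完整的 `clap` 配置时会自动取出其中的 `text_config` 子字典,因此单独加载的文本配置与完整配置中的文本部分应当一致。

```
from transformers import ClapConfig, ClapTextConfig

# 从同一个完整的 CLAP 检查点分别加载完整配置与文本子配置
full_config = ClapConfig.from_pretrained("laion/clap-htsat-unfused")
text_config = ClapTextConfig.from_pretrained("laion/clap-htsat-unfused")

# 两者的文本部分应当一致
assert text_config.hidden_size == full_config.text_config.hidden_size
```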

.\models\clap\convert_clap_original_pytorch_to_hf.py

# 导入 argparse 模块,用于解析命令行参数
import argparse
# 导入 re 模块,用于正则表达式操作
import re

# 从 laion_clap 模块中导入 CLAP_Module 类
from laion_clap import CLAP_Module

# 从 transformers 库中导入 AutoFeatureExtractor、ClapConfig 和 ClapModel 类
from transformers import AutoFeatureExtractor, ClapConfig, ClapModel

# 定义一个字典,用于将旧模型中的键名映射到新模型中对应的键名
KEYS_TO_MODIFY_MAPPING = {
    "text_branch": "text_model",
    "audio_branch": "audio_model.audio_encoder",
    "attn": "attention.self",
    "self.proj": "output.dense",
    "attention.self_mask": "attn_mask",
    "mlp.fc1": "intermediate.dense",
    "mlp.fc2": "output.dense",
    "norm1": "layernorm_before",
    "norm2": "layernorm_after",
    "bn0": "batch_norm",
}

# 使用 laion/clap-htsat-unfused 模型来初始化自动特征提取器
processor = AutoFeatureExtractor.from_pretrained("laion/clap-htsat-unfused", truncation="rand_trunc")


# 初始化 CLAP 模型的函数,接受检查点路径、模型类型和是否启用融合作为参数
def init_clap(checkpoint_path, model_type, enable_fusion=False):
    model = CLAP_Module(
        amodel=model_type,
        enable_fusion=enable_fusion,
    )
    # 加载模型检查点
    model.load_ckpt(checkpoint_path)
    return model


# 从原始 CLAP 模型中获取配置信息的函数,返回包含音频和文本配置的 ClapConfig 对象
def get_config_from_original(clap_model):
    # 从 CLAP 模型中提取音频配置信息
    audio_config = {
        "patch_embeds_hidden_size": clap_model.model.audio_branch.embed_dim,
        "depths": clap_model.model.audio_branch.depths,
        "hidden_size": clap_model.model.audio_projection[0].in_features,
    }

    # 从 CLAP 模型中提取文本配置信息
    text_config = {"hidden_size": clap_model.model.text_branch.pooler.dense.in_features}

    return ClapConfig(audio_config=audio_config, text_config=text_config)


# 重命名状态字典中键名的函数
def rename_state_dict(state_dict):
    model_state_dict = {}

    # 正则表达式模式,用于匹配包含 "sequential" 的层次结构的键名
    sequential_layers_pattern = r".*sequential.(\d+).*"

    # 正则表达式模式,用于匹配包含 "_projection" 的文本投影层级结构的键名
    text_projection_pattern = r".*_projection.(\d+).*"
    for key, value in state_dict.items():
        # 检查是否有需要修改的键名
        for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items():
            if key_to_modify in key:
                # 替换需要修改的键名
                key = key.replace(key_to_modify, new_key)

        if re.match(sequential_layers_pattern, key):
            # 匹配顺序层模式,并进行替换
            sequential_layer = re.match(sequential_layers_pattern, key).group(1)

            key = key.replace(f"sequential.{sequential_layer}.", f"layers.{int(sequential_layer)//3}.linear.")
        elif re.match(text_projection_pattern, key):
            # 匹配文本投影模式,确定投影层编号
            projecton_layer = int(re.match(text_projection_pattern, key).group(1))

            # 根据 CLAP 中的使用情况,确定 Transformers 投影层编号
            transformers_projection_layer = 1 if projecton_layer == 0 else 2

            key = key.replace(f"_projection.{projecton_layer}.", f"_projection.linear{transformers_projection_layer}.")

        if "audio" and "qkv" in key:
            # 将 qkv 分割为查询、键和值
            mixed_qkv = value
            qkv_dim = mixed_qkv.size(0) // 3

            query_layer = mixed_qkv[:qkv_dim]
            key_layer = mixed_qkv[qkv_dim : qkv_dim * 2]
            value_layer = mixed_qkv[qkv_dim * 2 :]

            # 将分割后的查询、键和值存入模型状态字典
            model_state_dict[key.replace("qkv", "query")] = query_layer
            model_state_dict[key.replace("qkv", "key")] = key_layer
            model_state_dict[key.replace("qkv", "value")] = value_layer
        else:
            # 将未处理的键值对存入模型状态字典
            model_state_dict[key] = value

    # 返回最终的模型状态字典
    return model_state_dict
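下面用一个自包含的小例子(编辑补充,张量形状为随意构造的示例)演示上述 qkv 拆分逻辑:形状为 `(3*dim, dim)` 的 qkv 合并权重按行均分为 query、key、value 三份。

```
import torch

dim = 4
mixed_qkv = torch.randn(3 * dim, dim)  # 模拟一个 qkv 合并权重
qkv_dim = mixed_qkv.size(0) // 3

query_layer = mixed_qkv[:qkv_dim]
key_layer = mixed_qkv[qkv_dim : qkv_dim * 2]
value_layer = mixed_qkv[qkv_dim * 2 :]

assert query_layer.shape == key_layer.shape == value_layer.shape == (dim, dim)
```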
# 定义一个函数,用于将 CLAP 模型的检查点转换为 PyTorch 模型
def convert_clap_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path, model_type, enable_fusion=False):
    # 初始化 CLAP 模型,使用给定的检查点路径、模型类型和是否启用融合功能
    clap_model = init_clap(checkpoint_path, model_type, enable_fusion=enable_fusion)

    # 将 CLAP 模型设置为评估模式
    clap_model.eval()

    # 获取 CLAP 模型的状态字典
    state_dict = clap_model.model.state_dict()

    # 重命名状态字典的键名,确保适配 PyTorch 的命名规则
    state_dict = rename_state_dict(state_dict)

    # 从 CLAP 模型中获取原始的 Transformers 配置
    transformers_config = get_config_from_original(clap_model)

    # 根据配置创建一个新的 CLAP 模型
    transformers_config.audio_config.enable_fusion = enable_fusion
    model = ClapModel(transformers_config)

    # 加载模型的状态字典,忽略掉声谱图嵌入层(如果有的话)
    model.load_state_dict(state_dict, strict=False)

    # 将转换后的模型保存为 PyTorch 模型
    model.save_pretrained(pytorch_dump_folder_path)

    # 保存 Transformers 配置文件到指定路径
    transformers_config.save_pretrained(pytorch_dump_folder_path)


if __name__ == "__main__":
    # 创建命令行参数解析器
    parser = argparse.ArgumentParser()
    # 添加命令行参数,用于指定输出的 PyTorch 模型路径
    parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
    # 添加命令行参数,用于指定 Fairseq 检查点路径
    parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint")
    # 添加命令行参数,用于指定模型配置文件的路径(例如 hf config.json)
    parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
    # 添加命令行参数,用于指示是否启用融合功能
    parser.add_argument("--enable_fusion", action="store_true", help="Whether to enable fusion or not")
    # 添加命令行参数,用于指定模型类型,默认为 "HTSAT-tiny"
    parser.add_argument("--model_type", default="HTSAT-tiny", type=str, help="Whether to enable fusion or not")
    # 解析命令行参数
    args = parser.parse_args()

    # 调用转换函数,将 CLAP 模型的检查点转换为 PyTorch 模型
    convert_clap_checkpoint(
        args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.model_type, args.enable_fusion
    )

.\models\clap\feature_extraction_clap.py

# 设置文件编码为 UTF-8
# 版权声明和保留声明
#
# 根据 Apache 许可证 2.0 版本,除非符合许可证,否则不得使用此文件
# 您可以在以下网址获取许可证的副本
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意,否则按"原样"分发本软件
# 无论是明示的还是暗示的,都没有任何保证或条件
# 请参阅许可证以了解特定语言的详细信息
"""CLAP 的特征提取器类。"""


import copy  # 导入拷贝模块,用于对象的复制操作
from typing import Any, Dict, List, Optional, Union  # 导入类型提示模块

import numpy as np  # 导入 NumPy 库,用于数值计算
import torch  # 导入 PyTorch 库,用于深度学习

from ...audio_utils import mel_filter_bank, spectrogram, window_function  # 导入音频处理相关函数
from ...feature_extraction_sequence_utils import SequenceFeatureExtractor  # 导入序列特征提取器
from ...feature_extraction_utils import BatchFeature  # 导入批次特征处理工具
from ...utils import TensorType, logging  # 导入工具函数和日志记录

logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器


class ClapFeatureExtractor(SequenceFeatureExtractor):
    r"""
    构建一个 CLAP 特征提取器。

    此特征提取器继承自 [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`],其中包含
    大多数主要方法。用户应参考此超类以获取有关这些方法的更多信息。

    此类使用自定义的 NumPy 实现的 *短时傅里叶变换* (STFT) 从原始语音中提取梅尔滤波器组特征,该实现应与
    PyTorch 的 `torch.stft` 等效。

    """

    model_input_names = ["input_features", "is_longer"]  # 模型输入名称列表

    def __init__(
        self,
        feature_size=64,  # 特征大小,默认为 64
        sampling_rate=48_000,  # 采样率,默认为 48000 Hz
        hop_length=480,  # 跳跃长度,默认为 480
        max_length_s=10,  # 最大长度(秒),默认为 10 秒
        fft_window_size=1024,  # FFT 窗口大小,默认为 1024
        padding_value=0.0,  # 填充值,默认为 0.0
        return_attention_mask=False,  # 是否返回注意力掩码,默认为 False
        frequency_min: float = 0,  # 最小频率,默认为 0 Hz
        frequency_max: float = 14_000,  # 最大频率,默认为 14000 Hz
        top_db: int = None,  # 上分贝数,默认为 None
        truncation: str = "fusion",  # 截断方式,默认为 "fusion"
        padding: str = "repeatpad",  # 填充方式,默认为 "repeatpad"
        **kwargs,  # 其他参数
    ):
        # 调用父类构造函数,初始化特征提取器的参数
        super().__init__(
            feature_size=feature_size,
            sampling_rate=sampling_rate,
            padding_value=padding_value,
            return_attention_mask=return_attention_mask,
            **kwargs,
        )
        # 设置顶部动态范围的阈值
        self.top_db = top_db
        # 设置是否截断音频的标志
        self.truncation = truncation
        # 设置填充值
        self.padding = padding
        # 设置FFT窗口大小
        self.fft_window_size = fft_window_size
        # 计算频率频段的数量
        self.nb_frequency_bins = (fft_window_size >> 1) + 1
        # 设置帧移长度
        self.hop_length = hop_length
        # 设置最大长度(秒)的样本数量
        self.nb_max_samples = max_length_s * sampling_rate
        # 设置采样率
        self.sampling_rate = sampling_rate
        # 设置最小频率
        self.frequency_min = frequency_min
        # 设置最大频率
        self.frequency_max = frequency_max
        # 创建Mel滤波器组,基于HTK标准
        self.mel_filters = mel_filter_bank(
            num_frequency_bins=self.nb_frequency_bins,
            num_mel_filters=feature_size,
            min_frequency=frequency_min,
            max_frequency=frequency_max,
            sampling_rate=sampling_rate,
            norm=None,
            mel_scale="htk",
        )
        # 创建Slaney标准的Mel滤波器组
        self.mel_filters_slaney = mel_filter_bank(
            num_frequency_bins=self.nb_frequency_bins,
            num_mel_filters=feature_size,
            min_frequency=frequency_min,
            max_frequency=frequency_max,
            sampling_rate=sampling_rate,
            norm="slaney",
            mel_scale="slaney",
        )

    def to_dict(self) -> Dict[str, Any]:
        """
        Serializes this instance to a Python dictionary.

        Returns:
            `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, except for the
            mel filter banks, which do not need to be saved or printed as they are too long.
        """
        # 深拷贝当前实例的所有属性
        output = copy.deepcopy(self.__dict__)
        # 添加特征提取器的类型信息到输出字典中
        output["feature_extractor_type"] = self.__class__.__name__
        # 如果存在Mel滤波器,从输出中删除,因为它们太长不需要被保存或打印
        if "mel_filters" in output:
            del output["mel_filters"]
        # 如果存在Slaney标准的Mel滤波器,从输出中删除
        if "mel_filters_slaney" in output:
            del output["mel_filters_slaney"]
        # 返回序列化后的字典
        return output
    def _np_extract_fbank_features(self, waveform: np.array, mel_filters: Optional[np.array] = None) -> np.ndarray:
        """
        使用汉宁窗口计算给定 `waveform` 的对数梅尔频谱。在CLAP中,根据截断模式使用两种不同的滤波器组:
            - `self.mel_filters`:这些对应于`torchaudio`的默认参数,可以通过调用`torchaudio.transforms.MelSpectrogram().mel_scale.fb`获得。
              当`truncation`设置为`"fusion"`时使用这些滤波器。
            - `self.mel_filters_slaney`:这些对应于`librosa`的默认参数,在计算梅尔频谱时使用`librosa.filters.mel`。
              在原始实现中,仅当截断模式不是`"fusion"`时才使用这些滤波器。
        """
        # 计算对数梅尔频谱
        log_mel_spectrogram = spectrogram(
            waveform,
            window_function(self.fft_window_size, "hann"),  # 使用汉宁窗口函数
            frame_length=self.fft_window_size,
            hop_length=self.hop_length,
            power=2.0,
            mel_filters=mel_filters,
            log_mel="dB",  # 返回对数梅尔值
        )
        return log_mel_spectrogram.T  # 返回转置后的对数梅尔频谱

    def _random_mel_fusion(self, mel, total_frames, chunk_frames):
        ranges = np.array_split(list(range(0, total_frames - chunk_frames + 1)), 3)
        if len(ranges[1]) == 0:
            # 如果音频太短,只使用第一个块
            ranges[1] = [0]
        if len(ranges[2]) == 0:
            # 如果音频太短,只使用第一个块
            ranges[2] = [0]
        # 随机选择每个部分的索引
        idx_front = np.random.choice(ranges[0])
        idx_middle = np.random.choice(ranges[1])
        idx_back = np.random.choice(ranges[2])

        # 提取前、中、后各部分的梅尔频谱块
        mel_chunk_front = mel[idx_front : idx_front + chunk_frames, :]
        mel_chunk_middle = mel[idx_middle : idx_middle + chunk_frames, :]
        mel_chunk_back = mel[idx_back : idx_back + chunk_frames, :]

        # 对输入的mel进行调整大小
        mel = torch.tensor(mel[None, None, :])
        mel_shrink = torch.nn.functional.interpolate(
            mel, size=[chunk_frames, 64], mode="bilinear", align_corners=False
        )
        mel_shrink = mel_shrink[0][0].numpy()

        # 合并各部分的梅尔频谱块
        mel_fusion = np.stack([mel_shrink, mel_chunk_front, mel_chunk_middle, mel_chunk_back], axis=0)
        return mel_fusion

    def __call__(
        self,
        raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
        truncation: str = None,
        padding: Optional[str] = None,
        max_length: Optional[int] = None,
        sampling_rate: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,

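作为该特征提取器的整体用法示意(编辑补充;假设可以访问 laion/clap-htsat-unfused 检查点),下面用一段 2 秒的静音波形演示 `__call__` 的输入与输出:

```
import numpy as np

from transformers import ClapFeatureExtractor

feature_extractor = ClapFeatureExtractor.from_pretrained("laion/clap-htsat-unfused")

# 2 秒静音,采样率 48 kHz(与默认 sampling_rate 一致)
waveform = np.zeros(2 * 48_000, dtype=np.float32)

inputs = feature_extractor(waveform, sampling_rate=48_000, return_tensors="pt")
# 输出包含 model_input_names 中列出的 input_features 与 is_longer
print(inputs["input_features"].shape, inputs["is_longer"].shape)
```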
.\models\clap\modeling_clap.py

# 设置编码格式为 UTF-8
# 版权声明,指出了 LAION-AI 团队和 HuggingFace 团队对代码的所有权
# 根据 Apache 许可证版本 2.0 使用此文件,详细信息可以在指定网址获取
""" PyTorch CLAP model. """
# 导入必要的库和模块
import collections
import math
from dataclasses import dataclass
from typing import Any, List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
from torch import nn

# 导入来自其他路径的模块和函数
from ...activations import ACT2FN
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPooling,
    BaseModelOutputWithPoolingAndCrossAttentions,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, meshgrid, prune_linear_layer
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
# 导入 CLAP 的配置文件
from .configuration_clap import ClapAudioConfig, ClapConfig, ClapTextConfig

# 获取日志记录器
logger = logging.get_logger(__name__)

# 文档中的模型检查点
_CHECKPOINT_FOR_DOC = "laion/clap-htsat-fused"

# 预训练模型的存档列表
CLAP_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "laion/clap-htsat-fused",
    "laion/clap-htsat-unfused",
    # 更多 CLAP 模型可以在指定链接中查看
]


# 从 https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/utils.py#L191 改编
def interpolate(hidden_states, ratio):
    """
    在时间域内插值数据。这用于补偿 CNN 下采样导致的分辨率降低。

    Args:
        hidden_states (`torch.FloatTensor` of shape (batch_size, time_length, classes_num)):
            输入的隐藏状态
        ratio (`int`):
            输出长度与输入长度的比率。
    """
    # 获取隐藏状态的维度信息
    (batch_size, time_length, classes_num) = hidden_states.shape
    # 将隐藏状态进行上采样
    upsampled = hidden_states[:, :, None, :].repeat(1, 1, ratio, 1)
    # 重新调整上采样后的形状
    upsampled = upsampled.reshape(batch_size, time_length * ratio, classes_num)
    return upsampled
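下面的小例子(编辑补充)展示 `interpolate` 的效果:它等价于把每个时间步重复 `ratio` 次,因此每隔 `ratio` 个位置就能取回原始值。

```
import torch

batch_size, time_length, classes_num, ratio = 2, 10, 527, 4
hidden_states = torch.randn(batch_size, time_length, classes_num)

upsampled = hidden_states[:, :, None, :].repeat(1, 1, ratio, 1)
upsampled = upsampled.reshape(batch_size, time_length * ratio, classes_num)

assert upsampled.shape == (batch_size, time_length * ratio, classes_num)
# 每隔 ratio 个时间步取样,可恢复出原始序列
assert torch.equal(upsampled[:, ::ratio], hidden_states)
```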


# 从 https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/htsat.py#L249 改编
def window_partition(hidden_states, window_size):
    """
    返回调整大小后的隐藏状态。输出形状应为 `(batch_size * num_windows, window_size, window_size, num_channels)`

    Args:
        hidden_states (`torch.FloatTensor` of shape `(batch_size, height, width, num_channels)`):
            输入的隐藏状态
        window_size (`int`):
            窗口大小
    # 获取隐藏状态张量的形状信息,分别为批大小、高度、宽度、通道数
    batch_size, height, width, num_channels = hidden_states.shape
    
    # 将隐藏状态张量重塑为更小窗口大小的形状
    hidden_states = hidden_states.view(
        batch_size,                    # 新的批大小保持不变
        height // window_size,         # 将高度分割成窗口大小的部分
        window_size,                   # 窗口的高度
        width // window_size,          # 将宽度分割成窗口大小的部分
        window_size,                   # 窗口的宽度
        num_channels                   # 保持通道数不变
    )
    
    # 对重塑后的隐藏状态张量进行维度置换和连续化操作,以便形成窗口视图
    windows = hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous().view(
        -1,                            # 自动计算批大小乘以新窗口数的总数
        window_size,                   # 窗口的高度
        window_size,                   # 窗口的宽度
        num_channels                   # 通道数保持不变
    )
    
    # 返回重塑后的窗口视图
    return windows
# Adapted from https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/htsat.py#L263
def window_reverse(windows, window_size, height, width):
    """
    Merges windows to produce higher resolution features.
    Args:
        windows (`torch.FloatTensor` of shape `(num_windows * batch_size, window_size, window_size, num_channels)`):
            Input windows
        window_size (`int`):
            Window size
        height (`int`):
            Height of the resized audio
        width (`int`):
            Width of the resized audio
    """
    # 获取输入窗口的最后一个维度,即通道数
    num_channels = windows.shape[-1]
    # 重新排列窗口,以生成更高分辨率的特征
    windows = windows.view(-1, height // window_size, width // window_size, window_size, window_size, num_channels)
    windows = windows.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, height, width, num_channels)
    return windows
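`window_partition` 与 `window_reverse` 互为逆操作。下面是一个简短的验证示意(编辑补充;假设可以从 `transformers.models.clap.modeling_clap` 导入这两个模块级函数):

```
import torch

from transformers.models.clap.modeling_clap import window_partition, window_reverse

hidden_states = torch.randn(1, 8, 8, 3)          # (batch, height, width, channels)
windows = window_partition(hidden_states, 4)     # (num_windows * batch, 4, 4, 3)
restored = window_reverse(windows, 4, 8, 8)      # 还原回 (1, 8, 8, 3)

assert windows.shape == (4, 4, 4, 3)
assert torch.equal(restored, hidden_states)
```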


# Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        input_ids (`torch.Tensor`):
            Input tensor of token IDs
        padding_idx (`int`):
            Index of padding tokens in input_ids
        past_key_values_length (`int`, optional):
            Length of past key values, default is 0

    Returns:
        torch.Tensor:
            Tensor with position IDs corresponding to input_ids
    """
    # 创建一个 mask,标记非填充符号的位置
    mask = input_ids.ne(padding_idx).int()
    # 计算递增的位置索引,并考虑过去键值的长度
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx
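下面给出一个位置编号的具体算例(编辑补充;假设可以从 `transformers.models.clap.modeling_clap` 导入该模块级函数,`padding_idx` 取 1):

```
import torch

from transformers.models.clap.modeling_clap import create_position_ids_from_input_ids

input_ids = torch.tensor([[0, 7, 9, 1, 1]])  # 假设 1 为 padding_idx
position_ids = create_position_ids_from_input_ids(input_ids, padding_idx=1)

# 非填充位置从 padding_idx + 1 = 2 开始递增,填充位置保持为 padding_idx
assert position_ids.tolist() == [[2, 3, 4, 1, 1]]
```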


# contrastive loss function, adapted from
# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html#CLIP-loss-function
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    """
    Computes the contrastive loss using logits and labels.

    Args:
        logits (`torch.Tensor`):
            Logits from the model
    Returns:
        torch.Tensor:
            Computed contrastive loss
    """
    # 创建标签,长度与logits相同
    labels = torch.arange(len(logits), device=logits.device)
    return nn.functional.cross_entropy(logits, labels)
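在 CLAP 的完整模型中,总的对比损失通常由文本方向和音频方向两部分取平均得到。下面是一个简短示意(编辑补充;logits 为随机构造,函数假设可从 `transformers.models.clap.modeling_clap` 导入):

```
import torch

from transformers.models.clap.modeling_clap import contrastive_loss

logits_per_text = torch.randn(4, 4)  # (text_batch_size, audio_batch_size)

caption_loss = contrastive_loss(logits_per_text)
audio_loss = contrastive_loss(logits_per_text.t())
clap_loss = (caption_loss + audio_loss) / 2.0
print(clap_loss.item())
```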


@dataclass
# Copied from transformers.models.clip.modeling_clip.CLIPTextModelOutput with CLIP->Clap
class ClapTextModelOutput(ModelOutput):
    """
    Output class for CLAP text model that includes a pooling of the last hidden states.
    Inherits from transformers.modeling_outputs.ModelOutput.
    """
    pass
    Args:
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
            The text embeddings obtained by applying the projection layer to the pooler_output.
            通过将投影层应用于池化输出得到的文本嵌入向量。
        
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
            模型最后一层输出的隐藏状态序列。

        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
            模型每一层输出的隐藏状态元组,如果模型有嵌入层,则包括嵌入层输出,形状为`(batch_size, sequence_length, hidden_size)`。

        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            注意力权重元组,每层一个张量,形状为`(batch_size, num_heads, sequence_length, sequence_length)`。
    """

    text_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
# 定义一个数据类 ClapAudioModelOutput,用于存储 Clap 模型的输出结果,模仿原始实现的输出格式

class ClapAudioModelOutput(ModelOutput):
    """
    ClapAudio 模型的输出,模拟了原始实现的输出格式。

    Args:
        audio_embeds (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            应用投影层到汇聚输出得到的音频嵌入向量。
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            模型最后一层的隐藏状态序列输出。
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            元组,包含每一层的注意力权重 `torch.FloatTensor`,形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。

            在注意力 softmax 后的注意力权重,用于计算自注意力头中的加权平均值。
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            元组,包含模型的隐藏状态 `torch.FloatTensor` 输出,如果模型有嵌入层,还包含初始嵌入输出。
            形状为 `(batch_size, sequence_length, hidden_size)`。

            模型每一层的隐藏状态以及可选的初始嵌入层输出。
    """

    audio_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None


@dataclass
# 从 transformers.models.clip.modeling_clip.CLIPOutput 复制而来,替换 CLIP 为 Clap,vision 为 audio,Vision 为 Audio,image 为 audio

class ClapOutput(ModelOutput):
    """
    Clap 模型的输出,模仿了原始实现的输出格式。
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for audio-text similarity.
        logits_per_audio:(`torch.FloatTensor` of shape `(audio_batch_size, text_batch_size)`):
            The scaled dot product scores between `audio_embeds` and `text_embeds`. This represents the audio-text
            similarity scores.
        logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, audio_batch_size)`):
            The scaled dot product scores between `text_embeds` and `audio_embeds`. This represents the text-audio
            similarity scores.
        text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`ClapTextModel`].
        audio_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
            The audio embeddings obtained by applying the projection layer to the pooled output of [`ClapAudioModel`].
        text_model_output(`BaseModelOutputWithPooling`):
            The output of the [`ClapTextModel`].
        audio_model_output(`BaseModelOutputWithPooling`):
            The output of the [`ClapAudioModel`].
    """
    # Optional: Holds the contrastive loss if computed
    loss: Optional[torch.FloatTensor] = None
    # Holds the similarity scores between audio and text embeddings
    logits_per_audio: torch.FloatTensor = None
    # Holds the similarity scores between text and audio embeddings
    logits_per_text: torch.FloatTensor = None
    # Holds the text embeddings after projection from `ClapTextModel`
    text_embeds: torch.FloatTensor = None
    # Holds the audio embeddings after projection from `ClapAudioModel`
    audio_embeds: torch.FloatTensor = None
    # Stores the output of `ClapTextModel` including pooling
    text_model_output: BaseModelOutputWithPooling = None
    # Stores the output of `ClapAudioModel` including pooling
    audio_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        # Converts all attributes into a tuple, converting `text_model_output` and `audio_model_output` into tuples as well
        return tuple(
            self[k] if k not in ["text_model_output", "audio_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )
# Adapted from transformers.models.swin.modeling_swin.SwinDropPath
class ClapDropPath(nn.Module):
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). This is a slightly
    refactored version of the `SwinDropPath` implementation.
    """

    def __init__(self, drop_prob=None):
        super().__init__()
        self.drop_prob = drop_prob  # 初始化时设置 dropout 概率

    def forward(self, hidden_states):
        if self.drop_prob == 0.0 or not self.training:  # 如果 dropout 概率为0或者不在训练模式下,直接返回原始输入
            return hidden_states

        keep_prob = 1 - self.drop_prob  # 计算保留的概率
        # 根据输入 hidden_states 的维度,创建一个与之相同的随机张量
        shape = (hidden_states.shape[0],) + (1,) * (hidden_states.ndim - 1)
        random_tensor = keep_prob + torch.rand(shape, dtype=hidden_states.dtype, device=hidden_states.device)
        random_tensor.floor_()  # 将随机张量二值化
        output = hidden_states.div(keep_prob) * random_tensor  # 应用 dropout 操作
        return output
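下面的示意(编辑补充;假设可从 `transformers.models.clap.modeling_clap` 导入 `ClapDropPath`)说明该模块的行为:推理模式下为恒等映射,训练模式下按样本整体置零,其余样本按 `1 / keep_prob` 放大以保持期望不变。

```
import torch

from transformers.models.clap.modeling_clap import ClapDropPath

drop_path = ClapDropPath(drop_prob=0.2)
hidden_states = torch.ones(8, 16)

drop_path.eval()
assert torch.equal(drop_path(hidden_states), hidden_states)  # 推理时不做任何改动

drop_path.train()
output = drop_path(hidden_states)  # 每个样本要么整体为 0,要么被放大为 1 / 0.8 = 1.25
print(output[:, 0])
```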


# Adapted from https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/feature_fusion.py#L133
class ClapAudioAFFBlock(nn.Module):
    r"""
    ATTENTIONAL FEATURE FUSION Block from CLAP, since in CLAP we are always in 2D mode, it is not needed to implement
    the 1D version.
    """

    def __init__(self, config: ClapAudioConfig):
        super().__init__()
        channels = config.patch_embeds_hidden_size
        downsize_ratio = config.aff_block_r
        inter_channels = int(channels // downsize_ratio)

        # 局部注意力机制模块
        self.local_att = nn.Sequential(
            nn.Conv2d(channels, inter_channels, kernel_size=1, stride=1, padding=0),  # 1x1 卷积
            nn.BatchNorm2d(inter_channels),  # 批量归一化层
            nn.ReLU(inplace=True),  # ReLU 激活函数
            nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0),  # 1x1 卷积
            nn.BatchNorm2d(channels),  # 批量归一化层
        )

        # 全局注意力机制模块
        self.global_att = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),  # 自适应平均池化层
            nn.Conv2d(channels, inter_channels, kernel_size=1, stride=1, padding=0),  # 1x1 卷积
            nn.BatchNorm2d(inter_channels),  # 批量归一化层
            nn.ReLU(inplace=True),  # ReLU 激活函数
            nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0),  # 1x1 卷积
            nn.BatchNorm2d(channels),  # 批量归一化层
        )

        self.sigmoid = nn.Sigmoid()  # Sigmoid 激活函数

    def forward(self, hidden_states, residual):
        attention_input = hidden_states + residual  # 输入特征与残差连接

        # 融合层输出为局部注意力和全局注意力的加权和
        fused_layer_output = self.local_att(attention_input) + self.global_att(attention_input)
        fused_layer_output = self.sigmoid(fused_layer_output)  # 应用 Sigmoid 激活

        # 最终输出为经过加权后的输入特征与残差的线性组合
        output = 2 * hidden_states * fused_layer_output + 2 * residual * (1 - fused_layer_output)
        return output


class ClapAudioPatchEmbed(nn.Module):
    """
    This module converts the hidden states reshaped as an image to patch embeddings ready to be passed to the
    Transformer block.
    """
    # 初始化函数,接受一个 ClapAudioConfig 类型的配置对象作为参数
    def __init__(self, config: ClapAudioConfig):
        # 调用父类的初始化方法
        super().__init__()
        
        # 根据配置对象中的 spec_size 属性确定图像的尺寸,如果是整数则生成正方形尺寸,否则使用配置中的尺寸元组
        img_size = (config.spec_size, config.spec_size) if isinstance(config.spec_size, int) else config.spec_size
        
        # 根据配置对象中的 patch_size 属性确定 patch 的尺寸,如果是整数则生成正方形尺寸,否则使用配置中的尺寸元组
        patch_size = (
            (config.patch_size, config.patch_size) if isinstance(config.patch_size, int) else config.patch_size
        )
        
        # 根据配置对象中的 patch_stride 属性确定 patch 的步幅,如果是整数则生成相同步幅,否则使用配置中的步幅元组
        patch_stride = (
            (config.patch_stride, config.patch_stride) if isinstance(config.patch_stride, int) else config.patch_stride
        )

        # 将计算得到的图像尺寸和 patch 步幅存储到对象的 img_size 和 patch_stride 属性中
        self.img_size = img_size
        self.patch_stride = patch_stride

        # 计算图像网格的大小,即将图像尺寸按照 patch 步幅划分的块数
        self.grid_size = (img_size[0] // patch_stride[0], img_size[1] // patch_stride[1])
        
        # 计算总的 patch 数量,即图像网格的行数乘以列数
        self.num_patches = self.grid_size[0] * self.grid_size[1]

        # 根据配置对象中的 flatten_patch_embeds 属性确定是否展平 patch 的嵌入表示
        self.flatten = config.flatten_patch_embeds
        
        # 根据配置对象中的 enable_fusion 属性确定是否启用融合
        self.enable_fusion = config.enable_fusion

        # 根据 patch_size 和 patch_stride 计算用于卷积操作的 padding
        padding = ((patch_size[0] - patch_stride[0]) // 2, (patch_size[1] - patch_stride[1]) // 2)

        # 根据 enable_fusion 和 fusion_type 配置创建卷积核的尺寸缩放因子
        scale_factor = 4 if (self.enable_fusion) and (config.fusion_type == "channel_map") else 1

        # 创建一个卷积层,用于将 patch 的输入通道映射到隐藏表示空间,采用配置对象中的参数
        self.proj = nn.Conv2d(
            config.patch_embed_input_channels * scale_factor,  # 输入通道数为 patch 的输入通道数乘以尺寸缩放因子
            config.patch_embeds_hidden_size,  # 输出通道数为 patch 的嵌入表示的隐藏层大小
            kernel_size=patch_size,  # 卷积核尺寸为 patch_size
            stride=patch_stride,  # 步幅为 patch_stride
            padding=padding,  # 使用计算得到的 padding
        )

        # 根据配置对象中的 enable_patch_layer_norm 属性确定是否启用 patch 层的归一化
        self.norm = nn.LayerNorm(config.patch_embeds_hidden_size) if config.enable_patch_layer_norm else nn.Identity()
        
        # 如果启用融合,则创建融合模型和用于 mel 频谱的卷积层
        if self.enable_fusion:
            # 创建融合模型对象
            self.fusion_model = ClapAudioAFFBlock(config)
            
            # 创建用于 mel 频谱的卷积层,采用配置对象中的参数
            self.mel_conv2d = nn.Conv2d(
                config.patch_embed_input_channels,  # 输入通道数为 patch 的输入通道数
                config.patch_embeds_hidden_size,  # 输出通道数为 patch 的嵌入表示的隐藏层大小
                kernel_size=(patch_size[0], patch_size[1] * 3),  # 卷积核尺寸为 patch_size 的高,宽乘以3
                stride=(patch_stride[0], patch_stride[1] * 3),  # 步幅为 patch_stride 的高,宽乘以3
                padding=padding,  # 使用计算得到的 padding
            )
    # 前向传播函数,接受隐藏状态和可能的更长输入索引
    def forward(self, hidden_states, is_longer_idx=None):
        # 如果启用融合
        if self.enable_fusion:
            # 提取最后一个 mel,因为输入已经进行了转置
            global_hidden_states = hidden_states[:, 0:1, :, :]

            # 全局处理
            batch_size, num_channels, height, width = global_hidden_states.shape

            # 检查输入音频尺寸是否与模型期望的图像尺寸匹配
            if height != self.img_size[0] or width != self.img_size[1]:
                raise ValueError(
                    f"Input audio size ({height}*{width}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
                )

            # 对全局隐藏状态进行投影
            global_hidden_states = self.proj(global_hidden_states)
            output_width = global_hidden_states.size(-1)

            # 如果存在更长的输入索引
            if len(is_longer_idx) > 0:
                # 本地处理
                local_hidden_states = hidden_states[is_longer_idx, 1:, :, :].contiguous()
                batch_size, num_channels, height, width = local_hidden_states.shape
                # 重塑本地隐藏状态以便进行卷积操作
                local_hidden_states = local_hidden_states.view(batch_size * num_channels, 1, height, width)

                local_hidden_states = self.mel_conv2d(local_hidden_states)

                _, features, height, width = local_hidden_states.shape
                local_hidden_states = local_hidden_states.view(batch_size, num_channels, features, height, width)
                local_hidden_states = local_hidden_states.permute((0, 2, 3, 1, 4)).contiguous().flatten(3)

                local_width = local_hidden_states.size(-1)
                # 对本地隐藏状态进行填充,使其与全局隐藏状态的输出宽度一致
                local_hidden_states = torch.nn.functional.pad(
                    local_hidden_states, (0, output_width - local_width), "constant", 0
                )

                # 使用融合模型融合全局隐藏状态和本地隐藏状态
                global_hidden_states[is_longer_idx] = self.fusion_model(
                    global_hidden_states[is_longer_idx], local_hidden_states
                )
            # 更新隐藏状态为全局隐藏状态
            hidden_states = global_hidden_states
        else:
            # 如果未启用融合,直接进行投影
            _, _, height, width = hidden_states.shape
            # 检查输入音频尺寸是否与模型期望的图像尺寸匹配
            if height != self.img_size[0] or width != self.img_size[1]:
                raise ValueError(
                    f"Input audio size ({height}*{width}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
                )
            hidden_states = self.proj(hidden_states)

        # 如果设置了 flatten 标志,将隐藏状态展平并转置
        if self.flatten:
            hidden_states = hidden_states.flatten(2).transpose(1, 2)
        # 对隐藏状态进行归一化
        hidden_states = self.norm(hidden_states)
        # 返回最终处理后的隐藏状态
        return hidden_states
# 从 transformers.models.swin.modeling_swin.SwinSelfAttention 复制并改名为 ClapAudioSelfAttention 的类定义
class ClapAudioSelfAttention(nn.Module):
    # 初始化方法,接受 config、维度 dim、注意力头数 num_heads 和窗口大小 window_size 作为参数
    def __init__(self, config, dim, num_heads, window_size):
        super().__init__()
        # 如果维度 dim 不能被注意力头数 num_heads 整除,抛出数值错误异常
        if dim % num_heads != 0:
            raise ValueError(
                f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})"
            )

        # 设置注意力头数和每个头的大小
        self.num_attention_heads = num_heads
        self.attention_head_size = int(dim / num_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.window_size = (
            window_size if isinstance(window_size, collections.abc.Iterable) else (window_size, window_size)
        )

        # 创建相对位置偏置表格参数,维度为 ((2 * window_size[0] - 1) * (2 * window_size[1] - 1)) x num_heads
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), num_heads)
        )

        # 获取窗口内每个 token 的相对位置索引对
        coords_h = torch.arange(self.window_size[0])
        coords_w = torch.arange(self.window_size[1])
        coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij"))
        coords_flatten = torch.flatten(coords, 1)
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()
        relative_coords[:, :, 0] += self.window_size[0] - 1
        relative_coords[:, :, 1] += self.window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)
        # 将相对位置索引作为缓冲区注册到模块中
        self.register_buffer("relative_position_index", relative_position_index)

        # 定义查询、键、值的线性变换层,并考虑配置中的偏置
        self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)

        # 定义用于注意力掩码的丢弃层
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    # 将输入张量 x 转换为注意力分数矩阵的形状
    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    # 前向传播方法,接受隐藏状态 hidden_states 和可选的注意力掩码 attention_mask 等作为输入参数
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        ) -> Tuple[torch.Tensor]:
        # 获取隐藏状态张量的批大小、维度和通道数
        batch_size, dim, num_channels = hidden_states.shape
        # 使用 self.query 对隐藏状态进行查询,生成混合的查询层
        mixed_query_layer = self.query(hidden_states)

        # 使用 self.key 对隐藏状态进行键的转换,并为注意力打分做准备
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        # 使用 self.value 对隐藏状态进行值的转换,并为注意力的加权求和做准备
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        # 对混合的查询层进行转置,以便与键层进行点积运算
        query_layer = self.transpose_for_scores(mixed_query_layer)

        # 计算原始的注意力分数,即查询层和键层的点积
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        # 对注意力分数进行缩放,以减少梯度消失问题
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        # 获取相对位置偏置,根据预先计算的相对位置偏置表和索引
        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)]
        # 重新调整相对位置偏置的形状,以便与注意力分数相加
        relative_position_bias = relative_position_bias.view(
            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1
        )
        # 对相对位置偏置进行维度置换和连续性处理
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()
        # 将相对位置偏置添加到注意力分数中
        attention_scores = attention_scores + relative_position_bias.unsqueeze(0)

        # 如果存在注意力掩码,则应用该掩码
        if attention_mask is not None:
            # 调整注意力分数的形状以便与注意力掩码相加
            mask_shape = attention_mask.shape[0]
            attention_scores = attention_scores.view(
                batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim
            )
            attention_scores = attention_scores + attention_mask.unsqueeze(1).unsqueeze(0)
            attention_scores = attention_scores.view(-1, self.num_attention_heads, dim, dim)

        # 将注意力分数归一化为概率分布
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # 对注意力概率进行 dropout 处理
        attention_probs = self.dropout(attention_probs)

        # 如果存在头部掩码,则将注意力概率乘以头部掩码
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        # 计算加权求和后的值层,得到上下文层
        context_layer = torch.matmul(attention_probs, value_layer)
        # 对上下文层进行维度置换和连续性处理,以便后续计算
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        # 调整上下文层的形状,以便匹配预期的输出形状
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        # 根据需要选择输出内容,包括注意力概率
        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs
# Copied from transformers.models.swin.modeling_swin.SwinSelfOutput with Swin->ClapAudio
class ClapAudioSelfOutput(nn.Module):
    def __init__(self, config, dim):
        super().__init__()
        # 初始化一个全连接层,输入输出维度都为 dim
        self.dense = nn.Linear(dim, dim)
        # 初始化一个 dropout 层,使用给定的 dropout 概率
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # 全连接层处理隐藏状态
        hidden_states = self.dense(hidden_states)
        # 使用 dropout 处理全连接层输出
        hidden_states = self.dropout(hidden_states)

        return hidden_states


# Copied from transformers.models.swin.modeling_swin.SwinAttention with Swin->ClapAudio
class ClapAudioAttention(nn.Module):
    def __init__(self, config, dim, num_heads, window_size):
        super().__init__()
        # 初始化 self attention 层
        self.self = ClapAudioSelfAttention(config, dim, num_heads, window_size)
        # 初始化输出层
        self.output = ClapAudioSelfOutput(config, dim)
        # 初始化一个集合,用于存储需要剪枝的注意力头
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        # 调用剪枝函数,获取需要剪枝的注意力头和相应的索引
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # 对 self attention 层的查询、键、值线性层进行剪枝
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        # 对输出层的全连接层进行剪枝
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # 更新超参数并存储剪枝过的头
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        # 调用 self attention 层的 forward 方法
        self_outputs = self.self(hidden_states, attention_mask, head_mask, output_attentions)
        # 通过输出层处理 self attention 层的输出和输入的隐藏状态
        attention_output = self.output(self_outputs[0], hidden_states)
        # 如果需要输出注意力权重,则将其添加到输出中
        outputs = (attention_output,) + self_outputs[1:]
        return outputs
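
# A small illustration (made-up sizes, plain torch only) of what pruning one head
# amounts to: drop that head's rows from the query/key/value projections and the
# matching columns of the output projection; find_pruneable_heads_and_indices and
# prune_linear_layer perform this same bookkeeping on the real modules.
import torch
from torch import nn

num_heads, head_size = 4, 8
hidden = num_heads * head_size
full_query = nn.Linear(hidden, hidden)

keep = torch.cat([torch.arange(h * head_size, (h + 1) * head_size) for h in (0, 1, 3)])  # prune head 2
pruned_query = nn.Linear(hidden, keep.numel())
pruned_query.weight.data = full_query.weight.data[keep].clone()
pruned_query.bias.data = full_query.bias.data[keep].clone()
print(pruned_query.weight.shape)  # torch.Size([24, 32])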


# Copied from transformers.models.swin.modeling_swin.SwinIntermediate with Swin->ClapAudio
class ClapAudioIntermediate(nn.Module):
    def __init__(self, config, dim):
        super().__init__()
        # 初始化一个全连接层,输入维度为 dim,输出维度为 config.mlp_ratio * dim
        self.dense = nn.Linear(dim, int(config.mlp_ratio * dim))
        # 如果隐藏激活函数是字符串,则使用对应的函数映射;否则直接使用配置中的激活函数
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 使用全连接层处理隐藏状态
        hidden_states = self.dense(hidden_states)
        # 使用中间激活函数处理全连接层输出
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states
# 从transformers.models.swin.modeling_swin.SwinOutput复制过来,将Swin替换为ClapAudio
class ClapAudioOutput(nn.Module):
    def __init__(self, config, dim):
        super().__init__()
        # 创建一个线性层,输入维度为config.mlp_ratio乘以dim的整数部分,输出维度为dim
        self.dense = nn.Linear(int(config.mlp_ratio * dim), dim)
        # 创建一个dropout层,使用config.hidden_dropout_prob作为丢弃概率
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 将输入的hidden_states传递给线性层
        hidden_states = self.dense(hidden_states)
        # 对线性层的输出进行dropout处理
        hidden_states = self.dropout(hidden_states)
        # 返回处理后的hidden_states作为输出
        return hidden_states
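
# Taken together, ClapAudioIntermediate and ClapAudioOutput form the usual
# transformer MLP: expand the channel dimension by mlp_ratio, apply the activation,
# project back to dim and apply dropout. A minimal sketch with made-up sizes and a
# GELU activation assumed for config.hidden_act:
import torch
from torch import nn

dim, mlp_ratio = 96, 4.0
audio_mlp = nn.Sequential(
    nn.Linear(dim, int(mlp_ratio * dim)),  # ClapAudioIntermediate.dense
    nn.GELU(),                             # intermediate_act_fn
    nn.Linear(int(mlp_ratio * dim), dim),  # ClapAudioOutput.dense
    nn.Dropout(0.0),                       # ClapAudioOutput.dropout
)
print(audio_mlp(torch.randn(2, 49, dim)).shape)  # torch.Size([2, 49, 96])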


# 从transformers.models.swin.modeling_swin.SwinLayer复制过来,将Swin替换为ClapAudio,将SwinDropPath替换为ClapDropPath
class ClapAudioLayer(nn.Module):
    def __init__(self, config, dim, input_resolution, num_heads, shift_size=0):
        super().__init__()
        # 设定前馈分块大小为config.chunk_size_feed_forward
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        # 设定位移大小为shift_size
        self.shift_size = shift_size
        # 设定窗口大小为config.window_size
        self.window_size = config.window_size
        # 设定输入分辨率为input_resolution
        self.input_resolution = input_resolution
        # 在LayerNorm层之前添加LayerNorm,输入维度为dim,epsilon值为config.layer_norm_eps
        self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        # 创建ClapAudioAttention对象,使用config、dim、num_heads和window_size作为参数
        self.attention = ClapAudioAttention(config, dim, num_heads, window_size=self.window_size)
        # 如果config.drop_path_rate大于0.0,则创建ClapDropPath对象,否则创建一个恒等映射层nn.Identity()
        self.drop_path = ClapDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
        # 在LayerNorm层之后添加LayerNorm,输入维度为dim,epsilon值为config.layer_norm_eps
        self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        # 创建ClapAudioIntermediate对象,使用config和dim作为参数
        self.intermediate = ClapAudioIntermediate(config, dim)
        # 创建ClapAudioOutput对象,使用config和dim作为参数
        self.output = ClapAudioOutput(config, dim)

    def set_shift_and_window_size(self, input_resolution):
        if min(input_resolution) <= self.window_size:
            # 如果输入分辨率中的最小值小于等于窗口大小,则不对窗口进行分区
            self.shift_size = 0
            self.window_size = min(input_resolution)

    def get_attn_mask(self, height, width, dtype):
        if self.shift_size > 0:
            # 计算SW-MSA的注意力掩码
            img_mask = torch.zeros((1, height, width, 1), dtype=dtype)
            height_slices = (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            )
            width_slices = (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            )
            count = 0
            for height_slice in height_slices:
                for width_slice in width_slices:
                    img_mask[:, height_slice, width_slice, :] = count
                    count += 1

            mask_windows = window_partition(img_mask, self.window_size)
            mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
            attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
            attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
        else:
            attn_mask = None
        # 返回注意力掩码
        return attn_mask
    # 对输入的隐藏状态进行可能的填充,使其能够被窗口大小整除
    def maybe_pad(self, hidden_states, height, width):
        # 计算右侧需要填充的数量,确保宽度能够被窗口大小整除
        pad_right = (self.window_size - width % self.window_size) % self.window_size
        # 计算底部需要填充的数量,确保高度能够被窗口大小整除
        pad_bottom = (self.window_size - height % self.window_size) % self.window_size
        # 组成填充的数值,(top, bottom, left, right, 0, 0),这里只填充右侧和底部
        pad_values = (0, 0, 0, pad_right, 0, pad_bottom)
        # 使用PyTorch的函数对隐藏状态进行填充操作
        hidden_states = nn.functional.pad(hidden_states, pad_values)
        # 返回填充后的隐藏状态和填充数值
        return hidden_states, pad_values

    # 前向传播函数,接受隐藏状态、输入维度、头部遮罩、输出注意力权重、始终分区等参数
    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        always_partition: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # forward returns a tuple of torch.Tensor objects

        if not always_partition:  
            # 如果参数 always_partition 不为真,则执行以下操作
            self.set_shift_and_window_size(input_dimensions)  
            # 调用对象的方法设置位移和窗口大小
        else:  
            # 否则,如果 always_partition 为真,则执行以下操作
            pass  
            # 不执行任何操作,直接跳过

        height, width = input_dimensions  
        # 解包输入维度元组,获取高度和宽度

        batch_size, _, channels = hidden_states.size()  
        # 获取隐藏状态张量的批量大小、通道数等信息
        shortcut = hidden_states  
        # 将隐藏状态张量赋值给快捷变量 shortcut

        hidden_states = self.layernorm_before(hidden_states)  
        # 对隐藏状态张量应用前层归一化

        hidden_states = hidden_states.view(batch_size, height, width, channels)  
        # 将隐藏状态张量重新形状为四维张量,形状为(批量大小,高度,宽度,通道数)

        # pad hidden_states to multiples of window size
        hidden_states, pad_values = self.maybe_pad(hidden_states, height, width)  
        # 可能对隐藏状态张量进行填充,使其大小为窗口大小的倍数,并返回填充后的张量和填充值

        _, height_pad, width_pad, _ = hidden_states.shape  
        # 解包填充后的张量形状,获取填充后的高度和宽度

        # cyclic shift
        if self.shift_size > 0:  
            # 如果位移大小大于零,则执行以下操作
            shifted_hidden_states = torch.roll(hidden_states, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))  
            # 在指定维度上将隐藏状态张量进行循环位移
        else:  
            # 否则,如果位移大小不大于零,则执行以下操作
            shifted_hidden_states = hidden_states  
            # 将隐藏状态张量赋值给位移后的隐藏状态张量

        # partition windows
        hidden_states_windows = window_partition(shifted_hidden_states, self.window_size)  
        # 划分窗口,将位移后的隐藏状态张量划分为窗口
        hidden_states_windows = hidden_states_windows.view(-1, self.window_size * self.window_size, channels)  
        # 将划分后的窗口重新形状为三维张量

        attn_mask = self.get_attn_mask(height_pad, width_pad, dtype=hidden_states.dtype)  
        # 获取注意力掩码,用于注意力计算
        if attn_mask is not None:  
            # 如果注意力掩码不为空,则执行以下操作
            attn_mask = attn_mask.to(hidden_states_windows.device)  
            # 将注意力掩码移到与隐藏状态窗口相同的设备上

        attention_outputs = self.attention(
            hidden_states_windows, attn_mask, head_mask, output_attentions=output_attentions
        )  
        # 使用注意力机制计算输出,包括注意力权重和其它相关输出

        attention_output = attention_outputs[0]  
        # 获取注意力输出的第一个元素

        attention_windows = attention_output.view(-1, self.window_size, self.window_size, channels)  
        # 将注意力输出重新形状为四维张量,表示窗口形式的注意力输出

        shifted_windows = window_reverse(attention_windows, self.window_size, height_pad, width_pad)  
        # 将注意力窗口反转,逆操作

        # reverse cyclic shift
        if self.shift_size > 0:  
            # 如果位移大小大于零,则执行以下操作
            attention_windows = torch.roll(shifted_windows, shifts=(self.shift_size, self.shift_size), dims=(1, 2))  
            # 在指定维度上对注意力窗口进行反向循环位移
        else:  
            # 否则,如果位移大小不大于零,则执行以下操作
            attention_windows = shifted_windows  
            # 将反转后的注意力窗口赋值给注意力窗口

        was_padded = pad_values[3] > 0 or pad_values[5] > 0  
        # 检查是否进行了填充

        if was_padded:  
            # 如果进行了填充,则执行以下操作
            attention_windows = attention_windows[:, :height, :width, :].contiguous()  
            # 对注意力窗口进行切片,保留非填充部分,并确保内存连续性

        attention_windows = attention_windows.view(batch_size, height * width, channels)  
        # 将注意力窗口重新形状为三维张量

        hidden_states = shortcut + self.drop_path(attention_windows)  
        # 将快捷路径与注意力窗口加上 dropout 后的结果相加,作为隐藏状态的新值

        layer_output = self.layernorm_after(hidden_states)  
        # 对隐藏状态应用后层归一化
        layer_output = self.intermediate(layer_output)  
        # 对层输出应用中间层处理
        layer_output = hidden_states + self.output(layer_output)  
        # 将层输出与输出层处理后的结果相加,作为最终的层输出

        layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)  
        # 如果需要输出注意力,将注意力权重也作为输出之一
        return layer_outputs  
        # 返回层的输出元组
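

# A tiny self-contained sketch of the shifted-window mask built by get_attn_mask
# above, on a made-up 4x4 grid with window_size=2 and shift_size=1: positions that
# land in the same window but come from different regions of the (cyclically
# shifted) image receive -100 so the softmax effectively ignores those pairs.
import torch

height = width = 4
window_size, shift_size = 2, 1
img_mask = torch.zeros((1, height, width, 1))
region_slices = (slice(0, -window_size), slice(-window_size, -shift_size), slice(-shift_size, None))
count = 0
for height_slice in region_slices:
    for width_slice in region_slices:
        img_mask[:, height_slice, width_slice, :] = count
        count += 1

# inline window partition: (1, height, width, 1) -> (num_windows, window_size * window_size)
mask_windows = img_mask.view(1, height // window_size, window_size, width // window_size, window_size, 1)
mask_windows = mask_windows.permute(0, 1, 3, 2, 4, 5).reshape(-1, window_size * window_size)

attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
attn_mask = attn_mask.masked_fill(attn_mask != 0, -100.0).masked_fill(attn_mask == 0, 0.0)
print(attn_mask.shape)  # torch.Size([4, 4, 4]): one (4, 4) additive mask per 2x2 window
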
# 从transformers.models.swin.modeling_swin.SwinStage复制而来,将Swin替换为ClapAudio
class ClapAudioStage(nn.Module):
    def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, downsample):
        super().__init__()
        self.config = config
        self.dim = dim
        # 创建包含多个ClapAudioLayer的模块列表
        self.blocks = nn.ModuleList(
            [
                ClapAudioLayer(
                    config=config,
                    dim=dim,
                    input_resolution=input_resolution,
                    num_heads=num_heads,
                    shift_size=0 if (i % 2 == 0) else config.window_size // 2,
                )
                for i in range(depth)
            ]
        )

        # 如果downsample不为None,则使用给定的输入分辨率和维度创建下采样层
        if downsample is not None:
            self.downsample = downsample(input_resolution, dim=dim, norm_layer=nn.LayerNorm)
        else:
            self.downsample = None

        # 是否进行指向性操作的标志
        self.pointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        always_partition: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        height, width = input_dimensions
        # 对每个ClapAudioLayer进行前向传播
        for i, layer_module in enumerate(self.blocks):
            layer_head_mask = head_mask[i] if head_mask is not None else None

            layer_outputs = layer_module(
                hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition
            )

            hidden_states = layer_outputs[0]

        hidden_states_before_downsampling = hidden_states
        # 如果存在下采样层,则进行下采样操作
        if self.downsample is not None:
            height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2
            output_dimensions = (height, width, height_downsampled, width_downsampled)
            hidden_states = self.downsample(hidden_states_before_downsampling, input_dimensions)
        else:
            output_dimensions = (height, width, height, width)

        stage_outputs = (hidden_states, hidden_states_before_downsampling, output_dimensions)

        # 如果需要输出注意力权重,则将它们包含在输出中
        if output_attentions:
            stage_outputs += layer_outputs[1:]
        return stage_outputs
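
# Two quick made-up-number illustrations of the stage logic above: the blocks
# alternate between regular windows (shift 0) and shifted windows
# (shift = window_size // 2), and the optional downsampling halves each spatial
# side, rounding up.
depth, window_size = 4, 8
print([0 if i % 2 == 0 else window_size // 2 for i in range(depth)])  # [0, 4, 0, 4]
height, width = 64, 64
print(((height + 1) // 2, (width + 1) // 2))  # (32, 32)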


# 从transformers.models.swin.modeling_swin.SwinPatchMerging复制而来,将Swin替换为ClapAudio
class ClapAudioPatchMerging(nn.Module):
    """
    Patch Merging Layer.

    Args:
        input_resolution (`Tuple[int]`):
            输入特征的分辨率。
        dim (`int`):
            输入通道数。
        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
            标准化层的类。
    """
    def __init__(self, input_resolution: Tuple[int], dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None:
        super().__init__()
        self.input_resolution = input_resolution  # 初始化输入分辨率
        self.dim = dim  # 初始化维度
        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)  # 创建线性变换层,从4*dim到2*dim
        self.norm = norm_layer(4 * dim)  # 初始化归一化层,输入为4*dim

    def maybe_pad(self, input_feature, height, width):
        should_pad = (height % 2 == 1) or (width % 2 == 1)  # 判断是否需要填充,如果高或宽为奇数则需要
        if should_pad:
            pad_values = (0, 0, 0, width % 2, 0, height % 2)  # 计算填充值,使得宽和高都为偶数
            input_feature = nn.functional.pad(input_feature, pad_values)  # 对输入特征进行填充

        return input_feature

    def forward(self, input_feature: torch.Tensor, input_dimensions: Tuple[int, int]) -> torch.Tensor:
        height, width = input_dimensions  # 解包输入维度
        # `dim` is height * width
        batch_size, dim, num_channels = input_feature.shape  # 获取输入特征的形状信息

        input_feature = input_feature.view(batch_size, height, width, num_channels)  # 将输入特征重塑为四维张量
        # pad input so that height and width are divisible by 2, if needed
        input_feature = self.maybe_pad(input_feature, height, width)  # 如果需要,对输入特征进行填充
        # [batch_size, height/2, width/2, num_channels]
        input_feature_0 = input_feature[:, 0::2, 0::2, :]  # 提取特征的子采样部分,步长为2
        # [batch_size, height/2, width/2, num_channels]
        input_feature_1 = input_feature[:, 1::2, 0::2, :]  # 提取特征的子采样部分,步长为2
        # [batch_size, height/2, width/2, num_channels]
        input_feature_2 = input_feature[:, 0::2, 1::2, :]  # 提取特征的子采样部分,步长为2
        # [batch_size, height/2, width/2, num_channels]
        input_feature_3 = input_feature[:, 1::2, 1::2, :]  # 提取特征的子采样部分,步长为2
        # batch_size height/2 width/2 4*num_channels
        input_feature = torch.cat([input_feature_0, input_feature_1, input_feature_2, input_feature_3], -1)  # 将四个子采样特征拼接在一起
        input_feature = input_feature.view(batch_size, -1, 4 * num_channels)  # 将特征重新展平,变成三维张量

        input_feature = self.norm(input_feature)  # 对特征进行归一化
        input_feature = self.reduction(input_feature)  # 对特征进行线性变换

        return input_feature
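

# A shape-only sketch (made-up sizes) of the 2x2 patch merging above: neighbouring
# patches are gathered with strided slicing, concatenated on the channel axis
# (4 * C), normalised, then projected down to 2 * C, halving each spatial side.
import torch
from torch import nn

batch, height, width, channels = 2, 8, 8, 96
feature = torch.randn(batch, height * width, channels).view(batch, height, width, channels)
merged = torch.cat(
    [feature[:, 0::2, 0::2, :], feature[:, 1::2, 0::2, :], feature[:, 0::2, 1::2, :], feature[:, 1::2, 1::2, :]],
    dim=-1,
).view(batch, -1, 4 * channels)                                  # (2, 16, 384)
merged = nn.Linear(4 * channels, 2 * channels, bias=False)(nn.LayerNorm(4 * channels)(merged))
print(merged.shape)  # torch.Size([2, 16, 192])
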
# 定义 ClapAudioEncoder 类,继承自 nn.Module,用于音频编码器的定义和处理
class ClapAudioEncoder(nn.Module):
    # 初始化方法,接受一个配置对象 config
    def __init__(self, config):
        super().__init__()
        # 计算层数并保存到 self.num_layers 中
        self.num_layers = len(config.depths)

        # 保存配置对象到 self.config 中
        self.config = config
        # 创建 ClapAudioPatchEmbed 对象并保存到 self.patch_embed 中
        self.patch_embed = ClapAudioPatchEmbed(config)
        # 从配置中获取是否启用融合,并保存到 self.enable_fusion 中
        self.enable_fusion = config.enable_fusion
        # 从 patch_embed 中获取 patch 的步幅并保存到 self.patch_stride 中
        self.patch_stride = self.patch_embed.patch_stride
        # 从配置中获取 spec_size 并保存到 self.spec_size 中
        self.spec_size = config.spec_size
        # 计算频率比率并保存到 self.freq_ratio 中
        self.freq_ratio = config.spec_size // config.num_mel_bins

        # 计算特征数量并保存到 self.num_features 中
        self.num_features = int(config.patch_embeds_hidden_size * 2 ** (self.num_layers - 1))

        # 根据 drop_path_rate 创建一个列表,用于后续的层级设置
        drop_path_rate = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]

        # 计算 patch embed 的网格大小并保存到 grid_size 中
        grid_size = self.patch_embed.grid_size
        # 根据层数创建输入分辨率列表,并保存到 self.input_resolutions 中
        self.input_resolutions = [(grid_size[0] // (2**i), grid_size[1] // (2**i)) for i in range(self.num_layers)]

        # 创建一个 nn.ModuleList,包含多个 ClapAudioStage 层,并保存到 self.layers 中
        self.layers = nn.ModuleList(
            [
                ClapAudioStage(
                    config=config,
                    dim=int(config.patch_embeds_hidden_size * 2**i_layer),
                    input_resolution=self.input_resolutions[i_layer],
                    depth=config.depths[i_layer],
                    num_heads=config.num_attention_heads[i_layer],
                    drop_path=drop_path_rate[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])],
                    downsample=ClapAudioPatchMerging if (i_layer < self.num_layers - 1) else None,
                )
                for i_layer in range(self.num_layers)
            ]
        )

        # 设置梯度检查点为 False
        self.gradient_checkpointing = False

        # 创建一个 nn.BatchNorm2d 层,用于批量归一化,并保存到 self.batch_norm 中
        self.batch_norm = nn.BatchNorm2d(config.num_mel_bins)
        # 创建一个 nn.LayerNorm 层,用于层归一化,并保存到 self.norm 中
        self.norm = nn.LayerNorm(self.num_features)
        # 从配置中获取 depths 并保存到 self.depths 中
        self.depths = config.depths
        # 创建一个 nn.AdaptiveAvgPool1d 层,用于自适应平均池化,并保存到 self.avgpool 中
        self.avgpool = nn.AdaptiveAvgPool1d(1)
    def reshape_mel2img(self, normalized_input_features):
        """
        The input is 4 normalized log mel spectrograms. It is reshape to the common shape of images. Each channel
        should represent 1 of the 4 crops of the spectrogram. For more details, refer to the [`ClapFeatureExtractor`].
        """
        # 获取输入特征的形状信息:batch_size, channels, time_length, freq_length
        _, _, time_length, freq_length = normalized_input_features.shape

        # 计算目标图像的宽度和高度
        spec_width = int(self.spec_size * self.freq_ratio)
        spec_heigth = self.spec_size // self.freq_ratio

        # 检查输入的时间长度和频率长度是否超过了目标图像的大小
        if time_length > spec_width or freq_length > spec_heigth:
            raise ValueError("the wav size should be less than or equal to the swin input size")

        # 为了避免双三次插值时的零值错误,对输入进行插值处理
        if time_length < spec_width:
            normalized_input_features = nn.functional.interpolate(
                normalized_input_features, (spec_width, freq_length), mode="bicubic", align_corners=True
            )
        if freq_length < spec_heigth:
            normalized_input_features = nn.functional.interpolate(
                normalized_input_features, (time_length, spec_heigth), mode="bicubic", align_corners=True
            )

        # 获取调整后的输入特征的新形状信息
        batch, channels, time, freq = normalized_input_features.shape

        # 将输入特征重塑为目标形状
        # batch_size, channels, spec_width, spec_heigth --> batch_size, channels * freq_ratio, spec_heigth, spec_width // freq_ratio
        normalized_input_features = normalized_input_features.reshape(
            batch, channels * self.freq_ratio, time // self.freq_ratio, freq
        )
        # 转置特征以匹配期望的维度顺序
        normalized_input_features = normalized_input_features.permute(0, 1, 3, 2).contiguous()
        # 再次重塑特征以最终形状返回
        normalized_input_features = normalized_input_features.reshape(
            batch, channels, freq * self.freq_ratio, time // self.freq_ratio
        )

        return normalized_input_features
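

# A shape walk-through of reshape_mel2img with typical values (spec_size=256,
# num_mel_bins=64, hence freq_ratio=4; the sizes here are illustrative): a
# (1, 1, 1024, 64) mel spectrogram is folded into a square 256 x 256 "image" that
# the Swin-style patch embedding can consume.
import torch

batch, channels, time, freq, freq_ratio = 1, 1, 1024, 64, 4
mel = torch.randn(batch, channels, time, freq)
img = mel.reshape(batch, channels * freq_ratio, time // freq_ratio, freq)    # (1, 4, 256, 64)
img = img.permute(0, 1, 3, 2).contiguous()                                   # (1, 4, 64, 256)
img = img.reshape(batch, channels, freq * freq_ratio, time // freq_ratio)    # (1, 1, 256, 256)
print(img.shape)
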
# CLAP_START_DOCSTRING 变量,包含模型继承自 `PreTrainedModel` 的描述,建议查看超类文档以获取关于模型的通用方法,如下载、保存、调整输入嵌入大小、修剪头等。
CLAP_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`ClapConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# CLAP_TEXT_INPUTS_DOCSTRING 变量,包含描述模型文本输入参数的文档字符串,包括 input_ids、attention_mask、position_ids 等。
CLAP_TEXT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

# CLAP_AUDIO_INPUTS_DOCSTRING documents the audio-branch inputs (input_features, is_longer, etc.).
CLAP_AUDIO_INPUTS_DOCSTRING = r"""
    Args:
        input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Input audio features. This should be returned by the [`ClapFeatureExtractor`] class that you can also
            retrieve from [`AutoFeatureExtractor`]. See [`ClapFeatureExtractor.__call__`] for details.
        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attention tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

CLAP_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Input audio features. This should be returned by the [`ClapFeatureExtractor`] class that you can also
            retrieve from [`AutoFeatureExtractor`]. See [`ClapFeatureExtractor.__call__`] for details.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


class ClapProjectionLayer(nn.Module):
    """
    Projection layer for CLAP model.

    Args:
        config (Union[ClapAudioConfig, ClapTextConfig]): Configuration object for CLAP model.

    Attributes:
        linear1 (nn.Linear): First linear transformation layer.
        activation: Activation function applied after the first linear transformation.
        linear2 (nn.Linear): Second linear transformation layer.
    """

    def __init__(self, config: Union[ClapAudioConfig, ClapTextConfig]):
        super().__init__()
        self.config = config
        hidden_size = config.hidden_size
        projection_dim = config.projection_dim

        # Initialize linear layers
        self.linear1 = nn.Linear(hidden_size, projection_dim)
        self.activation = ACT2FN[config.projection_hidden_act]
        self.linear2 = nn.Linear(projection_dim, projection_dim)

    def forward(self, hidden_states):
        """
        Perform forward pass of the projection layer.

        Args:
            hidden_states (torch.Tensor): Input tensor of shape `(batch_size, hidden_size)`.

        Returns:
            torch.Tensor: Output tensor of shape `(batch_size, projection_dim)`.
        """
        hidden_states = self.linear1(hidden_states)
        hidden_states = self.activation(hidden_states)
        hidden_states = self.linear2(hidden_states)
        return hidden_states
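
# A minimal usage sketch with a toy config object; the attribute names mirror the
# ones read in __init__ (hidden_size, projection_dim, projection_hidden_act) and
# the values below are made up for illustration.
import torch
from types import SimpleNamespace

toy_projection_config = SimpleNamespace(hidden_size=768, projection_dim=512, projection_hidden_act="relu")
toy_projection = ClapProjectionLayer(toy_projection_config)
print(toy_projection(torch.randn(2, 768)).shape)  # torch.Size([2, 512])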


# Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->ClapText, persistent=False->persistent=True
class ClapTextEmbeddings(nn.Module):
    """
    CLAP model text embeddings.

    Inherits from nn.Module and handles the embeddings for text input.

    """
    # Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """
    
    # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__
    def __init__(self, config):
        super().__init__()
        # 初始化词嵌入层,使用 nn.Embedding 类,配置词汇大小、隐藏大小,并设置填充索引为 config.pad_token_id
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        # 初始化位置嵌入层,使用 nn.Embedding 类,配置最大位置嵌入大小和隐藏大小
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        # 初始化类型嵌入层,使用 nn.Embedding 类,配置类型词汇表大小和隐藏大小
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
    
        # self.LayerNorm 没有使用蛇形命名以保持与 TensorFlow 模型变量名的一致性,以便能够加载任何 TensorFlow 检查点文件
        # 初始化 LayerNorm 层,使用 nn.LayerNorm 类,配置隐藏大小和 eps 参数为 config.layer_norm_eps
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # 初始化 Dropout 层,使用 nn.Dropout 类,配置丢弃率为 config.hidden_dropout_prob
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # position_ids(1,长度位置 emb)在内存中是连续的,并在序列化时导出
        # 设置位置嵌入类型,默认为 "absolute",或从 config 中获取 position_embedding_type 属性
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        # 注册缓冲区,创建位置 ID 张量,大小为 (1, config.max_position_embeddings),持久化为 True
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=True
        )
        # 注册缓冲区,创建类型 ID 张量,大小与位置 ID 张量相同,类型为 long 型,持久化为 True
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=True
        )
    
        # End copy
        # 设置填充索引为 config.pad_token_id
        self.padding_idx = config.pad_token_id
        # 重新初始化位置嵌入层,使用 nn.Embedding 类,配置最大位置嵌入大小、隐藏大小,并设置填充索引为 self.padding_idx
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )

    def forward(
        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
            # 如果未提供位置标识符,则根据输入的标记标识符创建位置标识符。任何填充的标记保持填充状态。
            if position_ids is None:
                if input_ids is not None:
                    position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
                else:
                    position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

            # 如果提供了输入标记标识符,则确定其形状
            if input_ids is not None:
                input_shape = input_ids.size()
            else:
                input_shape = inputs_embeds.size()[:-1]

            # 获取序列的长度
            seq_length = input_shape[1]

            # 将token_type_ids设置为构造函数中注册的缓冲区,通常是全零,这在自动生成时很有用,注册的缓冲区有助于用户在不传递token_type_ids的情况下跟踪模型,解决问题#5664
            if token_type_ids is None:
                if hasattr(self, "token_type_ids"):
                    buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                    buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                    token_type_ids = buffered_token_type_ids_expanded
                else:
                    token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

            # 如果未提供inputs_embeds,则使用input_ids获取word_embeddings
            if inputs_embeds is None:
                inputs_embeds = self.word_embeddings(input_ids)
            token_type_embeddings = self.token_type_embeddings(token_type_ids)

            # 计算最终的嵌入向量
            embeddings = inputs_embeds + token_type_embeddings
            if self.position_embedding_type == "absolute":
                position_embeddings = self.position_embeddings(position_ids)
                embeddings += position_embeddings
            embeddings = self.LayerNorm(embeddings)
            embeddings = self.dropout(embeddings)
            return embeddings

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """
        给定直接的嵌入向量,我们无法推断哪些是填充的,因此只生成顺序位置标识符。

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        # 创建顺序的位置标识符
        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape)
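

# A small sketch of how RoBERTa-style position ids are derived from input ids (the
# create_position_ids_from_input_ids helper used above, without the
# past_key_values_length offset): non-padding tokens get consecutive positions
# starting at padding_idx + 1, while padding tokens keep padding_idx.
import torch

padding_idx = 1
input_ids = torch.tensor([[0, 31, 7, 2, 1, 1]])  # 1 is the pad token here
pad_mask = input_ids.ne(padding_idx).long()
position_ids = torch.cumsum(pad_mask, dim=1) * pad_mask + padding_idx
print(position_ids)  # tensor([[2, 3, 4, 5, 1, 1]])
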
# 从transformers.models.bert.modeling_bert.BertSelfAttention复制并修改为ClapTextSelfAttention
class ClapTextSelfAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        # 检查隐藏层大小是否能被注意力头数整除,若不满足条件且config没有embedding_size属性则引发错误
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        # 设置注意力头数和每个头的大小
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # 初始化查询、键和值的线性层
        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        # Dropout层用于注意力概率的随机失活
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        
        # 设置位置嵌入类型,默认为绝对位置嵌入
        self.position_embedding_type = position_embedding_type or getattr(
            config, "position_embedding_type", "absolute"
        )

        # 如果位置嵌入类型为相对键或相对键查询,则初始化距离嵌入层
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

        # 标记是否为解码器
        self.is_decoder = config.is_decoder

    # 将输入张量重塑为注意力分数张量的形状
    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    # 前向传播函数定义
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        ...


# 从transformers.models.bert.modeling_bert.BertSelfOutput复制并修改为ClapTextSelfOutput
class ClapTextSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 全连接层,将隐藏状态映射回原始维度
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # LayerNorm层,用于归一化隐藏状态
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # Dropout层,用于隐藏状态的随机失活
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    # 前向传播函数定义
    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # 全连接层映射
        hidden_states = self.dense(hidden_states)
        # Dropout随机失活
        hidden_states = self.dropout(hidden_states)
        # LayerNorm归一化,并加上残差连接
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states
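

# transpose_for_scores in ClapTextSelfAttention above only reorganises the hidden
# dimension into per-head slices, (batch, seq_len, all_head_size) ->
# (batch, num_heads, seq_len, head_size); ClapTextSelfOutput then adds the residual
# and normalises. A quick shape sketch with made-up sizes:
import torch

batch, seq_len, num_heads, head_size = 2, 5, 12, 64
states = torch.randn(batch, seq_len, num_heads * head_size)
states = states.view(batch, seq_len, num_heads, head_size).permute(0, 2, 1, 3)
print(states.shape)  # torch.Size([2, 12, 5, 64])
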
# 定义 ClapTextAttention 类,继承自 nn.Module
class ClapTextAttention(nn.Module):
    # 初始化函数,接收 config 和 position_embedding_type 参数
    def __init__(self, config, position_embedding_type=None):
        # 调用父类的初始化函数
        super().__init__()
        # 创建 self 属性,调用 ClapTextSelfAttention 类,传入 config 和 position_embedding_type 参数
        self.self = ClapTextSelfAttention(config, position_embedding_type=position_embedding_type)
        # 创建 output 属性,调用 ClapTextSelfOutput 类,传入 config 参数
        self.output = ClapTextSelfOutput(config)
        # 创建 pruned_heads 属性,初始化为空集合
        self.pruned_heads = set()

    # 头部修剪函数,接收 heads 参数
    def prune_heads(self, heads):
        # 如果 heads 长度为 0,则直接返回
        if len(heads) == 0:
            return
        # 调用 find_pruneable_heads_and_indices 函数,获取可以修剪的头部及其索引
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # 修剪线性层
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # 更新超参数并记录修剪的头部
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    # 前向传播函数,接收多个输入参数,返回元组类型的张量
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        # 调用 self 的 forward 方法,传入多个参数,获取 self_outputs
        self_outputs = self.self(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        # 调用 output 属性的 forward 方法,传入 self_outputs[0] 和 hidden_states,获取 attention_output
        attention_output = self.output(self_outputs[0], hidden_states)
        # 如果需要输出注意力,将 attentions 添加到 outputs 中
        outputs = (attention_output,) + self_outputs[1:]
        return outputs


# 从 transformers.models.bert.modeling_bert.BertIntermediate 类复制而来
class ClapTextIntermediate(nn.Module):
    # 初始化函数,接收 config 参数
    def __init__(self, config):
        # 调用父类的初始化函数
        super().__init__()
        # 创建 dense 属性,使用 nn.Linear 类,输入为 config.hidden_size 和 config.intermediate_size
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # 如果 config.hidden_act 是字符串类型,则使用 ACT2FN 字典中对应的激活函数
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    # 前向传播函数,接收 hidden_states 参数,返回 torch.Tensor 类型的张量
    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 使用 dense 属性对 hidden_states 进行线性变换
        hidden_states = self.dense(hidden_states)
        # 使用 intermediate_act_fn 激活函数对 hidden_states 进行激活
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


# 从 transformers.models.bert.modeling_bert.BertOutput 类复制而来
class ClapTextOutput(nn.Module):
    def __init__(self, config):
        # 调用父类构造函数进行初始化
        super().__init__()
        # 创建一个全连接层,将输入大小设为config中的中间层大小,输出大小为config中的隐藏层大小
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        # 创建一个LayerNorm层,输入大小为config中的隐藏层大小,设置epsilon为config中的layer_norm_eps
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # 创建一个Dropout层,设置dropout概率为config中的hidden_dropout_prob
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # 使用全连接层进行前向传播,将隐藏状态转换为新的表示
        hidden_states = self.dense(hidden_states)
        # 对转换后的表示进行dropout操作,以减少过拟合风险
        hidden_states = self.dropout(hidden_states)
        # 对dropout后的表示进行LayerNorm操作,并将输入张量与LayerNorm后的结果相加
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        # 返回经过处理后的隐藏状态张量
        return hidden_states
# 从transformers.models.bert.modeling_bert.BertLayer复制代码,并将Bert->ClapText
class ClapTextLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 初始化层参数
        self.chunk_size_feed_forward = config.chunk_size_feed_forward  # 设置前馈过程的分块大小
        self.seq_len_dim = 1  # 序列长度维度设为1
        self.attention = ClapTextAttention(config)  # 创建ClapTextAttention对象
        self.is_decoder = config.is_decoder  # 是否为解码器模型
        self.add_cross_attention = config.add_cross_attention  # 是否添加交叉注意力
        if self.add_cross_attention:
            # 如果添加了交叉注意力,且不是解码器模型,则抛出错误
            if not self.is_decoder:
                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
            # 创建具有绝对位置嵌入类型的ClapTextAttention对象
            self.crossattention = ClapTextAttention(config, position_embedding_type="absolute")
        # 创建ClapTextIntermediate对象
        self.intermediate = ClapTextIntermediate(config)
        # 创建ClapTextOutput对象
        self.output = ClapTextOutput(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        # 解码器单向自注意力的缓存键/值元组在位置1,2
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        # 执行自注意力层的前向传播
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            past_key_value=self_attn_past_key_value,
        )
        # 获取自注意力层的输出
        attention_output = self_attention_outputs[0]

        # 如果是解码器,最后一个输出是自注意力缓存的元组
        if self.is_decoder:
            # 输出中排除最后一个元素,因为它是自注意力的缓存
            outputs = self_attention_outputs[1:-1]
            # 获取当前的键/值元组
            present_key_value = self_attention_outputs[-1]
        else:
            # 如果需要输出注意力权重,则包括自注意力层的输出
            outputs = self_attention_outputs[1:]
        
        cross_attn_present_key_value = None
        # 如果是解码器且有编码器隐藏状态作为输入
        if self.is_decoder and encoder_hidden_states is not None:
            # 如果未定义crossattention,抛出错误
            if not hasattr(self, "crossattention"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
                    " by setting `config.add_cross_attention=True`"
                )

            # 交叉注意力的缓存键/值元组在过去键/值元组的第3,4位置
            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
            # 执行交叉注意力层的前向传播
            cross_attention_outputs = self.crossattention(
                attention_output,
                attention_mask,
                head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                cross_attn_past_key_value,
                output_attentions,
            )
            # 获取交叉注意力层的输出
            attention_output = cross_attention_outputs[0]
            # 将交叉注意力层的输出添加到总输出中
            outputs = outputs + cross_attention_outputs[1:-1]

            # 将交叉注意力的键/值元组添加到当前键/值元组中
            cross_attn_present_key_value = cross_attention_outputs[-1]
            present_key_value = present_key_value + cross_attn_present_key_value

        # 将注意力输出应用于前向传播的分块处理
        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        # 将层输出添加到总输出中
        outputs = (layer_output,) + outputs

        # 如果是解码器,将注意力键/值作为最后一个输出返回
        if self.is_decoder:
            outputs = outputs + (present_key_value,)

        return outputs

    def feed_forward_chunk(self, attention_output):
        # 执行前馈网络的分块处理
        intermediate_output = self.intermediate(attention_output)
        # 应用激活函数和残差连接
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output
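

# apply_chunking_to_forward splits the sequence dimension into chunks of
# chunk_size_feed_forward, runs feed_forward_chunk on each piece and concatenates
# the results; because the MLP acts on every position independently, this matches
# the unchunked pass while lowering peak memory. A standalone sketch of that
# equivalence with a plain linear layer and made-up sizes:
import torch
from torch import nn

position_wise = nn.Linear(16, 16)
states = torch.randn(2, 8, 16)  # (batch, seq_len, hidden)
full = position_wise(states)
chunked = torch.cat([position_wise(chunk) for chunk in states.chunk(4, dim=1)], dim=1)
print(torch.allclose(full, chunked))  # True
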
# 从transformers.models.bert.modeling_bert.BertEncoder复制代码并修改为ClapTextEncoder
class ClapTextEncoder(nn.Module):
    # 初始化方法
    def __init__(self, config):
        super().__init__()
        # 保存配置信息
        self.config = config
        # 创建一个包含多个ClapTextLayer对象的层列表,数量由配置文件中的num_hidden_layers指定
        self.layer = nn.ModuleList([ClapTextLayer(config) for _ in range(config.num_hidden_layers)])
        # 是否使用梯度检查点,默认为False
        self.gradient_checkpointing = False

    # 前向传播方法
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
        # end of the parameter declarations for forward()
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
        # 如果输出隐藏状态,则初始化一个空元组用于存储所有隐藏状态
        all_hidden_states = () if output_hidden_states else None
        # 如果输出注意力权重,则初始化一个空元组用于存储所有自注意力权重
        all_self_attentions = () if output_attentions else None
        # 如果输出注意力权重且模型配置支持交叉注意力,则初始化一个空元组用于存储所有交叉注意力权重
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

        # 如果启用梯度检查点且在训练模式下,检查是否与使用缓存同时设置。如果是,则发出警告并强制将use_cache设置为False
        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        # 如果不使用缓存,则初始化一个空元组来存储下一个解码器缓存
        next_decoder_cache = () if use_cache else None
        # 遍历所有解码器层
        for i, layer_module in enumerate(self.layer):
            # 如果需要输出隐藏状态,则将当前隐藏状态添加到all_hidden_states元组中
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # 获取当前层的头部掩码(如果有的话)
            layer_head_mask = head_mask[i] if head_mask is not None else None
            # 获取当前层的过去键值对(如果有的话)
            past_key_value = past_key_values[i] if past_key_values is not None else None

            # 如果启用梯度检查点且在训练模式下,使用梯度检查点函数来计算当前层的输出
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )
            else:
                # 否则,直接调用当前层模块计算当前层的输出
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )

            # 更新当前隐藏状态为当前层的输出的第一个元素
            hidden_states = layer_outputs[0]
            # 如果使用缓存,则将当前层的缓存添加到next_decoder_cache元组中
            if use_cache:
                next_decoder_cache += (layer_outputs[-1],)
            # 如果需要输出注意力权重,则将当前层的自注意力权重添加到all_self_attentions元组中
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                # 如果模型配置支持交叉注意力,则将当前层的交叉注意力权重添加到all_cross_attentions元组中
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        # 如果需要输出隐藏状态,则将最终隐藏状态添加到all_hidden_states元组中
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        # 如果不返回字典形式的输出,则返回一个元组,包含所有需要返回的结果,过滤掉值为None的项
        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        # 否则,返回一个BaseModelOutputWithPastAndCrossAttentions对象,包含特定的输出结果
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )
# Copied from transformers.models.bert.modeling_bert.BertPooler
class ClapTextPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 定义一个全连接层,输入和输出维度均为config.hidden_size
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # 定义激活函数为双曲正切函数
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 从输入的hidden_states中获取第一个token对应的隐藏状态
        first_token_tensor = hidden_states[:, 0]
        # 将获取的隐藏状态输入全连接层,进行线性变换
        pooled_output = self.dense(first_token_tensor)
        # 将线性变换的结果输入激活函数,得到最终的池化输出
        pooled_output = self.activation(pooled_output)
        # 返回池化后的输出张量
        return pooled_output
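
# The pooler just takes the hidden state of the first ([CLS]) token and passes it
# through a dense layer with tanh. A shape sketch with made-up sizes:
import torch
from torch import nn

token_states = torch.randn(2, 7, 768)  # (batch, seq_len, hidden_size)
pooled = torch.tanh(nn.Linear(768, 768)(token_states[:, 0]))
print(pooled.shape)  # torch.Size([2, 768])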


class ClapPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # 配置类为ClapConfig
    config_class = ClapConfig
    # 模型基础名称前缀为"clap"
    base_model_prefix = "clap"
    # 不支持梯度检查点
    supports_gradient_checkpointing = False

    def _init_weights(self, module):
        """Initialize the weights"""
        # 获取初始化因子
        factor = self.config.initializer_factor

        if isinstance(module, ClapTextEmbeddings):
            # 如果是文本嵌入模块,初始化位置嵌入和token类型嵌入的权重
            module.position_embeddings.weight.data.normal_(mean=0.0, std=factor * 0.02)
            module.token_type_embeddings.weight.data.normal_(mean=0.0, std=factor * 0.02)
        elif isinstance(module, ClapModel):
            # 如果是ClapModel,初始化logit_scale_a和logit_scale_t
            nn.init.normal_(module.logit_scale_a, std=factor * 0.02)
            nn.init.normal_(module.logit_scale_t, std=factor * 0.02)
        elif isinstance(module, nn.Embedding):
            # 如果是嵌入层,初始化权重
            module.weight.data.normal_(mean=0.0, std=factor * 0.02)
        elif isinstance(module, nn.LayerNorm):
            # 如果是LayerNorm层,初始化偏置为零,权重为1
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, (nn.Conv2d, nn.Linear)):
            # 如果是卷积层或线性层,根据特定的初始化公式初始化权重
            in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor
            nn.init.normal_(module.weight, std=in_proj_std)
            if module.bias is not None:
                module.bias.data.zero_()
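
# The scaled standard deviation used above for Linear/Conv2d weights shrinks with
# model width and depth. With made-up values hidden_size=768, num_hidden_layers=12
# and initializer_factor=1.0:
hidden_size, num_hidden_layers, factor = 768, 12, 1.0
in_proj_std = (hidden_size**-0.5) * ((2 * num_hidden_layers) ** -0.5) * factor
print(round(in_proj_std, 4))  # 0.0074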


class ClapAudioModel(ClapPreTrainedModel):
    # 配置类为ClapAudioConfig
    config_class = ClapAudioConfig
    # 主要输入名称为"input_features"
    main_input_name = "input_features"

    def __init__(self, config: ClapAudioConfig):
        super().__init__(config)
        # 初始化音频编码器
        self.audio_encoder = ClapAudioEncoder(config)
        # 初始化权重并进行最终处理
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        # 返回音频编码器中的投影嵌入层
        return self.audio_encoder.patch_embed.proj

    @add_start_docstrings_to_model_forward(CLAP_AUDIO_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=ClapAudioConfig)
    def forward(
        self,
        input_features: Optional[torch.FloatTensor] = None,
        is_longer: Optional[torch.BoolTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # 如果 return_dict 参数不为 None,则使用其自身的值;否则使用 self.config.use_return_dict 的值

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        # 如果 output_attentions 参数不为 None,则使用其自身的值;否则使用 self.config.output_attentions 的值

        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        # 如果 output_hidden_states 参数不为 None,则使用其自身的值;否则使用 self.config.output_hidden_states 的值

        return self.audio_encoder(
            input_features=input_features,
            is_longer=is_longer,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # 调用 self.audio_encoder 方法,传入以下参数:
        # - input_features: 输入特征
        # - is_longer: 布尔值,指示输入是否较长
        # - output_attentions: 是否输出注意力权重,根据前面的处理得到的值
        # - output_hidden_states: 是否输出隐藏状态,根据前面的处理得到的值
        # - return_dict: 是否返回字典形式的输出结果,根据前面的处理得到的值
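

# A usage sketch for the audio tower on its own, mirroring the text example later
# in this file; it downloads the pretrained checkpoint, and the random waveform is
# only a stand-in for real audio sampled at 48 kHz.
import numpy as np
from transformers import AutoFeatureExtractor

clap_audio_model = ClapAudioModel.from_pretrained("laion/clap-htsat-unfused")
clap_feature_extractor = AutoFeatureExtractor.from_pretrained("laion/clap-htsat-unfused")
audio_inputs = clap_feature_extractor(np.random.rand(48_000), sampling_rate=48_000, return_tensors="pt")
print(clap_audio_model(**audio_inputs).pooler_output.shape)
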
# 定义一个名为 ClapTextModel 的类,它继承自 ClapPreTrainedModel 类
class ClapTextModel(ClapPreTrainedModel):
    """
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
    """

    # 设置配置类为 ClapTextConfig
    config_class = ClapTextConfig

    # 从 transformers.models.bert.modeling_bert.BertModel.__init__ 复制的初始化函数,将 Bert 替换为 ClapText
    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config

        # 初始化嵌入层为 ClapTextEmbeddings 对象
        self.embeddings = ClapTextEmbeddings(config)
        # 初始化编码器为 ClapTextEncoder 对象
        self.encoder = ClapTextEncoder(config)

        # 如果 add_pooling_layer 为 True,则初始化池化层为 ClapTextPooler 对象,否则为 None
        self.pooler = ClapTextPooler(config) if add_pooling_layer else None

        # 调用后处理函数,初始化权重并进行最终处理
        self.post_init()

    # 从 transformers.models.bert.modeling_bert.BertModel.forward 复制的前向传播函数
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        ...


# Decorate ClapModel with the shared CLAP start docstring
@add_start_docstrings(CLAP_START_DOCSTRING)
class ClapModel(ClapPreTrainedModel):
    config_class = ClapConfig
    # 初始化方法,接受一个 ClapConfig 类型的参数 config
    def __init__(self, config: ClapConfig):
        # 调用父类的初始化方法,传入 config 参数
        super().__init__(config)

        # 检查 config.text_config 是否为 ClapTextConfig 类型,如果不是则抛出 ValueError 异常
        if not isinstance(config.text_config, ClapTextConfig):
            raise ValueError(
                "config.text_config is expected to be of type ClapTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        # 检查 config.audio_config 是否为 ClapAudioConfig 类型,如果不是则抛出 ValueError 异常
        if not isinstance(config.audio_config, ClapAudioConfig):
            raise ValueError(
                "config.audio_config is expected to be of type ClapAudioConfig but is of type"
                f" {type(config.audio_config)}."
            )

        # Assign config.text_config and config.audio_config to local variables
        text_config = config.text_config
        audio_config = config.audio_config

        # Initialize logit_scale_a and logit_scale_t with the log of the initial scale value
        self.logit_scale_a = nn.Parameter(torch.tensor(math.log(config.logit_scale_init_value)))
        self.logit_scale_t = nn.Parameter(torch.tensor(math.log(config.logit_scale_init_value)))

        # Set projection_dim to the value specified in the config
        self.projection_dim = config.projection_dim

        # Initialize the text model and the text projection layer
        self.text_model = ClapTextModel(text_config)
        self.text_projection = ClapProjectionLayer(text_config)

        # Initialize the audio model and the audio projection layer
        self.audio_model = ClapAudioModel(audio_config)
        self.audio_projection = ClapProjectionLayer(audio_config)

        # Initialize weights and apply final processing
        self.post_init()
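
# The two logit scales are stored in log space (as in CLIP) and are exponentiated back to a positive
# scale factor when the similarity logits are computed. A minimal sketch of that pattern; the value of
# `init_scale` below is an assumed example, the real default comes from ClapConfig.logit_scale_init_value:
```
import math
import torch
from torch import nn

init_scale = 14.2857  # assumed example value for illustration
logit_scale_a = nn.Parameter(torch.tensor(math.log(init_scale)))

# At similarity time the parameter is exponentiated back into a positive scale factor
scale = logit_scale_a.exp()
print(round(scale.item(), 2))  # ~14.29
```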

    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`ClapTextModel`].

        Examples:

        ```
        >>> from transformers import AutoTokenizer, ClapModel

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")

        >>> inputs = tokenizer(["the sound of a cat", "the sound of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```"""
        # Use CLAP model's config for some fields (if specified) instead of those of audio & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Pass input_ids, attention_mask, position_ids, and other relevant parameters to the text_model
        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # If return_dict is disabled, take the pooled output from the tuple; otherwise read the pooler_output attribute
        pooled_output = text_outputs[1] if not return_dict else text_outputs.pooler_output

        # Project pooled_output to obtain text_features
        text_features = self.text_projection(pooled_output)

        # Normalize text_features along the last dimension
        text_features = F.normalize(text_features, dim=-1)

        # Return the normalized text_features
        return text_features
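
# Because get_text_features L2-normalizes its output, a plain matrix product of the returned rows gives
# pairwise cosine similarities. A short usage sketch, reusing the checkpoint from the docstring example above:
```
import torch
from transformers import AutoTokenizer, ClapModel

model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")

texts = ["the sound of a cat", "the sound of a dog", "a person typing on a keyboard"]
inputs = tokenizer(texts, padding=True, return_tensors="pt")

with torch.no_grad():
    text_features = model.get_text_features(**inputs)  # (3, projection_dim), L2-normalized

# Normalized embeddings: dot products are cosine similarities
similarity = text_features @ text_features.T
print(similarity)
```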

    def get_audio_features(
        self,
        input_features: Optional[torch.FloatTensor] = None,
        is_longer: Optional[torch.BoolTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            audio_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The audio embeddings obtained by
            applying the projection layer to the pooled output of [`ClapAudioModel`].

        Examples:

        ```
        >>> from transformers import AutoFeatureExtractor, ClapModel
        >>> import torch

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("laion/clap-htsat-unfused")
        >>> random_audio = torch.rand((16_000))
        >>> inputs = feature_extractor(random_audio, return_tensors="pt")
        >>> audio_features = model.get_audio_features(**inputs)
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Call the audio model to obtain the audio encoder outputs
        audio_outputs = self.audio_model(
            input_features=input_features,
            is_longer=is_longer,
            return_dict=return_dict,
        )

        # Select the pooled output depending on whether a dict is returned
        pooled_output = audio_outputs[1] if not return_dict else audio_outputs.pooler_output

        # Project the pooled output through the audio projection layer
        audio_features = self.audio_projection(pooled_output)

        # L2-normalize the audio features along the last dimension
        audio_features = F.normalize(audio_features, dim=-1)

        # Return the normalized audio features
        return audio_features
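
# Combining the two getters gives a manual audio-to-text matching score. A minimal sketch that mirrors what
# ClapModel.forward computes; it uses a random waveform in place of real audio, and reads `logit_scale_a`
# directly, which is an internal attribute rather than public API:
```
import torch
from transformers import AutoFeatureExtractor, AutoTokenizer, ClapModel

model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")
feature_extractor = AutoFeatureExtractor.from_pretrained("laion/clap-htsat-unfused")

text_inputs = tokenizer(["the sound of a cat", "the sound of a dog"], padding=True, return_tensors="pt")
audio_inputs = feature_extractor(torch.rand(16_000), return_tensors="pt")  # random waveform for illustration

with torch.no_grad():
    text_features = model.get_text_features(**text_inputs)     # (2, projection_dim), L2-normalized
    audio_features = model.get_audio_features(**audio_inputs)  # (1, projection_dim), L2-normalized
    # Scaled cosine similarities between the audio clip and each text prompt
    logits_per_audio = (audio_features @ text_features.T) * model.logit_scale_a.exp()

print(logits_per_audio.softmax(dim=-1))
```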

    @add_start_docstrings_to_model_forward(CLAP_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=ClapOutput, config_class=ClapConfig)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        input_features: Optional[torch.FloatTensor] = None,
        is_longer: Optional[torch.BoolTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
"""
CLAP Audio Model with a projection layer on top (a linear layer on top of the pooled output).
"""

@add_start_docstrings(
    """
    CLAP Text Model with a projection layer on top (a linear layer on top of the pooled output).
    """,
    CLAP_START_DOCSTRING,
)
class ClapTextModelWithProjection(ClapPreTrainedModel):
    # Specify the configuration class
    config_class = ClapTextConfig

    def __init__(self, config: ClapTextConfig):
        # Call the parent class initializer
        super().__init__(config)
        # Initialize the text model
        self.text_model = ClapTextModel(config)
        # Initialize the projection layer
        self.text_projection = ClapProjectionLayer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        # Return the word embedding layer of the text model
        return self.text_model.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        # Set the word embedding layer of the text model
        self.text_model.embeddings.word_embeddings = value

    @add_start_docstrings_to_model_forward(CLAP_TEXT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=ClapTextModelOutput, config_class=ClapTextConfig)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, ClapTextModelOutput]:
        """
        Returns:

        Examples:

        ```
        >>> from transformers import AutoTokenizer, ClapTextModelWithProjection

        >>> model = ClapTextModelWithProjection.from_pretrained("laion/clap-htsat-unfused")
        >>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")

        >>> inputs = tokenizer(["a sound of a cat", "a sound of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> text_embeds = outputs.text_embeds
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Call the text model's forward method
        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Get the pooled output
        pooled_output = text_outputs[1] if not return_dict else text_outputs.pooler_output

        # Obtain the text embeddings through the projection layer
        text_embeds = self.text_projection(pooled_output)

        if not return_dict:
            # If not returning a dict, return the outputs as a tuple
            outputs = (text_embeds, text_outputs[0]) + text_outputs[2:]
            return tuple(output for output in outputs if output is not None)

        # Otherwise, build and return a ClapTextModelOutput object
        return ClapTextModelOutput(
            text_embeds=text_embeds,
            last_hidden_state=text_outputs.last_hidden_state,
            hidden_states=text_outputs.hidden_states,
            attentions=text_outputs.attentions,
        )
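
# The tuple branch above filters out None entries, so with return_dict=False the first element is
# text_embeds and the second is the last hidden state. A short sketch of both access patterns, using the
# same checkpoint as the docstring example:
```
from transformers import AutoTokenizer, ClapTextModelWithProjection

model = ClapTextModelWithProjection.from_pretrained("laion/clap-htsat-unfused")
tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")
inputs = tokenizer(["a sound of a cat"], return_tensors="pt")

# Dict-style output (default): named attributes
outputs = model(**inputs)
print(outputs.text_embeds.shape)

# Tuple-style output: positional access; None entries have been filtered out
text_embeds, last_hidden_state = model(**inputs, return_dict=False)[:2]
print(text_embeds.shape, last_hidden_state.shape)
```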


@add_start_docstrings(
    """
    CLAP Audio Model with a projection layer on top (a linear layer on top of the pooled output).
    """,
    CLAP_START_DOCSTRING,
)
class ClapAudioModelWithProjection(ClapPreTrainedModel):
    config_class = ClapAudioConfig  # Set the configuration class for this model to ClapAudioConfig

    main_input_name = "input_features"  # 主要输入名称为"input_features"

    def __init__(self, config: ClapAudioConfig):
        super().__init__(config)  # Call the parent constructor to initialize the model with its configuration

        self.audio_model = ClapAudioModel(config)  # Initialize the audio model
        self.audio_projection = ClapProjectionLayer(config)  # Initialize the audio projection layer

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.audio_model.audio_encoder.patch_embed.proj  # Return the projection layer used for the input (patch) embeddings

    @add_start_docstrings_to_model_forward(CLAP_AUDIO_INPUTS_DOCSTRING)  # Add the model forward docstring
    @replace_return_docstrings(output_type=ClapAudioModelOutput, config_class=ClapAudioConfig)  # Document the return type as ClapAudioModelOutput
    def forward(
        self,
        input_features: Optional[torch.FloatTensor] = None,
        is_longer: Optional[torch.BoolTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, ClapAudioModelOutput]:
        r"""
        Forward pass. It accepts the following arguments and returns the corresponding output:

        - input_features (Optional[torch.FloatTensor]): the input feature tensor, defaults to None.
        - is_longer (Optional[torch.BoolTensor]): tensor indicating whether the input is longer, defaults to None.
        - output_attentions (Optional[bool]): whether to output attentions, defaults to None.
        - output_hidden_states (Optional[bool]): whether to output hidden states, defaults to None.
        - return_dict (Optional[bool]): whether to return a dict-style output, defaults to None.

        Returns:
            Union[Tuple, ClapAudioModelOutput]: the audio embeddings or a ClapAudioModelOutput object.

        Examples:

        ```
        >>> from datasets import load_dataset
        >>> from transformers import ClapAudioModelWithProjection, ClapProcessor

        >>> model = ClapAudioModelWithProjection.from_pretrained("laion/clap-htsat-fused")
        >>> processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused")

        >>> dataset = load_dataset("ashraq/esc50")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> inputs = processor(audios=audio_sample, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> audio_embeds = outputs.audio_embeds
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict  # Resolve whether to return a dict
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions  # Resolve whether to output attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )  # Resolve whether to output hidden states

        audio_outputs = self.audio_model(
            input_features=input_features,
            is_longer=is_longer,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )  # Get the outputs of the audio model

        pooled_output = audio_outputs[1] if not return_dict else audio_outputs.pooler_output  # Get the pooled output

        audio_embeds = self.audio_projection(pooled_output)  # Obtain the audio embeddings through the audio projection layer

        if not return_dict:
            outputs = (audio_embeds, audio_outputs[0]) + audio_outputs[2:]  # Build the output tuple
            return tuple(output for output in outputs if output is not None)  # Return only the non-None entries

        return ClapAudioModelOutput(
            audio_embeds=audio_embeds,
            last_hidden_state=audio_outputs.last_hidden_state,
            attentions=audio_outputs.attentions,
            hidden_states=audio_outputs.hidden_states,
        )  # Return a ClapAudioModelOutput object
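
# To close the CLAP section, here is the end-to-end pattern tying the pieces together: ClapProcessor prepares
# both modalities and ClapModel.forward returns the scaled similarity logits. Checkpoint and dataset names
# follow the docstring examples above; treat the rest as a usage sketch:
```
import torch
from datasets import load_dataset
from transformers import ClapModel, ClapProcessor

model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
processor = ClapProcessor.from_pretrained("laion/clap-htsat-unfused")

dataset = load_dataset("ashraq/esc50")
audio_sample = dataset["train"]["audio"][0]["array"]

texts = ["the sound of a dog", "the sound of a vacuum cleaner"]
inputs = processor(text=texts, audios=audio_sample, return_tensors="pt", padding=True)

with torch.no_grad():
    outputs = model(**inputs)

# logits_per_audio: (num_audios, num_texts) similarity scores scaled by the learned logit scale
probs = outputs.logits_per_audio.softmax(dim=-1)
print(probs)
```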