Transformers-Source-Code-Walkthrough-20

Transformers Source Code Walkthrough (Part 20)

.\models\blenderbot_small\tokenization_blenderbot_small.py

# coding=utf-8
# Copyright 2021 The Facebook Inc. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization class for BlenderbotSmall."""

import json  # JSON, for loading/saving the vocabulary file
import os  # filesystem path handling
from typing import Dict, List, Optional, Tuple  # type hints

import regex as re  # regex module used for pre-tokenization

from ...tokenization_utils import PreTrainedTokenizer  # base class for slow tokenizers
from ...utils import logging  # logging utilities


logger = logging.get_logger(__name__)  # module-level logger


VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",  # vocabulary file name
    "merges_file": "merges.txt",  # BPE merges file name
    "tokenizer_config_file": "tokenizer_config.json",  # tokenizer configuration file name
}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/vocab.json"
    },  # pretrained vocabulary file map
    "merges_file": {
        "facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/merges.txt"
    },  # pretrained merges file map
    "tokenizer_config_file": {
        "facebook/blenderbot_small-90M": (
            "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/tokenizer_config.json"
        )
    },  # pretrained tokenizer configuration file map
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/blenderbot_small-90M": 512}  # pretrained positional embedding sizes


def get_pairs(word):
    """
    Return the set of symbol pairs in a word.

    A word is represented as a tuple of symbols (symbols being variable-length strings).
    """
    pairs = set()  # collects the adjacent symbol pairs
    prev_char = word[0]  # first symbol of the word
    for char in word[1:]:  # iterate over the remaining symbols
        pairs.add((prev_char, char))  # add the (previous, current) pair
        prev_char = char  # slide the window forward

    pairs = set(pairs)  # redundant (pairs is already a set); kept to mirror the original source
    return pairs  # set of symbol pairs


class BlenderbotSmallTokenizer(PreTrainedTokenizer):
    """
    Constructs a Blenderbot-90M tokenizer based on BPE (Byte-Pair Encoding).

    This tokenizer inherits from [`PreTrainedTokenizer`], which contains most of the main methods. Users should refer
    to the superclass for more information regarding those methods.
    """

    # file names of the vocabulary files used by this tokenizer
    vocab_files_names = VOCAB_FILES_NAMES
    # map from pretrained model identifiers to their vocabulary file URLs
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    # map from pretrained model identifiers to their maximum input sizes (positional embedding size)
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    # names of the inputs the model expects
    model_input_names = ["input_ids", "attention_mask"]

    # Constructor: build a new tokenizer object
    def __init__(
        self,
        vocab_file,  # path to the vocabulary file
        merges_file,  # path to the BPE merges file
        bos_token="__start__",  # beginning-of-sentence token, defaults to "__start__"
        eos_token="__end__",  # end-of-sentence token, defaults to "__end__"
        unk_token="__unk__",  # token used for out-of-vocabulary words, defaults to "__unk__"
        pad_token="__null__",  # padding token, defaults to "__null__"
        **kwargs,  # additional keyword arguments
    ):
        # load the vocabulary (token -> id) from the JSON file
        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        # build the reverse mapping (id -> token)
        self.decoder = {v: k for k, v in self.encoder.items()}
        # read the merges file, dropping the header line and the trailing empty line
        with open(merges_file, encoding="utf-8") as merges_handle:
            merges = merges_handle.read().split("\n")[1:-1]
        # turn each merge line into a tuple and rank the merges by their position in the file
        merges = [tuple(merge.split()) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        # cache of already BPE-encoded tokens
        self.cache = {}
        # initialize the PreTrainedTokenizer base class with the special tokens
        super().__init__(unk_token=unk_token, bos_token=bos_token, eos_token=eos_token, pad_token=pad_token, **kwargs)

    # vocab_size property: size of the base vocabulary
    @property
    def vocab_size(self) -> int:
        return len(self.encoder)

    # get_vocab: the full vocabulary, including any added tokens
    def get_vocab(self) -> Dict:
        return dict(self.encoder, **self.added_tokens_encoder)
    def bpe(self, token: str) -> str:
        # return the cached result if this token was processed before
        if token in self.cache:
            return self.cache[token]

        # add spaces around punctuation and apostrophes, then collapse repeated whitespace
        token = re.sub("([.,!?()])", r" \1", token)
        token = re.sub("(')", r" \1 ", token)
        token = re.sub(r"\s{2,}", " ", token)

        # replace newlines with the special "__newln__" token
        if "\n" in token:
            token = token.replace("\n", " __newln__")

        # split the (possibly expanded) token on spaces
        tokens = token.split(" ")
        words = []

        # process each sub-token independently
        for token in tokens:
            if not len(token):
                continue

            # lowercase the sub-token
            token = token.lower()

            # represent the sub-token as a tuple of characters
            word = tuple(token)

            # append the end-of-word marker "</w>" to the last character
            word = tuple(list(word[:-1]) + [word[-1] + "</w>"])

            # collect all adjacent symbol pairs (bigrams)
            pairs = get_pairs(word)

            # single-character tokens have no pairs; keep them as-is
            if not pairs:
                words.append(token)
                continue

            # iteratively apply the BPE merges
            while True:
                # pick the bigram with the smallest merge rank
                bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))

                # stop when no remaining bigram is in the merge table
                if bigram not in self.bpe_ranks:
                    break

                # rebuild the word, merging every occurrence of the selected bigram
                first, second = bigram
                new_word = []
                i = 0

                while i < len(word):
                    try:
                        j = word.index(first, i)
                        new_word.extend(word[i:j])
                        i = j
                    except ValueError:
                        new_word.extend(word[i:])
                        break

                    if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                        new_word.append(first + second)
                        i += 2
                    else:
                        new_word.append(word[i])
                        i += 1

                # continue with the merged word
                new_word = tuple(new_word)
                word = new_word

                # stop once the word has collapsed to a single symbol
                if len(word) == 1:
                    break
                else:
                    pairs = get_pairs(word)

            # join the symbols with the "@@ " continuation marker and strip the trailing "</w>"
            word = "@@ ".join(word)
            word = word[:-4]

            # cache the result for this sub-token
            self.cache[token] = word
            words.append(word)

        # join all processed sub-tokens back into one string
        return " ".join(words)

    def _tokenize(self, text: str) -> List[str]:
        """Split a string into tokens using BPE."""
        split_tokens = []

        # find all runs of non-whitespace characters, optionally followed by a newline
        words = re.findall(r"\S+\n?", text)

        # BPE-encode each token and collect the resulting sub-tokens
        for token in words:
            split_tokens.extend(list(self.bpe(token).split(" ")))

        # return the list of sub-tokens
        return split_tokens

    def _convert_token_to_id(self, token: str) -> int:
        """Converts a token to an id using the vocab."""
        # lowercase the token and look it up in the vocabulary, falling back to the unknown-token id
        token = token.lower()
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index: int) -> str:
        """Converts an index (integer) in a token (str) using the vocab."""
        # map the index back to a token, falling back to the unknown token
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Converts a sequence of tokens in a single string."""
        # join the tokens, drop the "@@ " continuation markers, and strip surrounding whitespace
        out_string = " ".join(tokens).replace("@@ ", "").strip()
        return out_string
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        # the save directory must already exist; otherwise log an error and return
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        
        # build the full path of the vocabulary file, honoring the optional filename prefix
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        # build the full path of the merges file, honoring the optional filename prefix
        merge_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        # dump the encoder (token -> id mapping) as JSON into the vocabulary file
        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        index = 0
        # write the BPE merges to the merges file in rank order
        with open(merge_file, "w", encoding="utf-8") as writer:
            # version header expected by the merges file format
            writer.write("#version: 0.2\n")
            # iterate over the merges sorted by rank and check that the ranks are consecutive
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    # non-consecutive ranks indicate a corrupted tokenizer
                    logger.warning(
                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!"
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        # return the paths of the saved vocabulary and merges files
        return vocab_file, merge_file

    @property
    # Copied from transformers.models.blenderbot.tokenization_blenderbot.BlenderbotTokenizer.default_chat_template
    def default_chat_template(self):
        """
        A very simple chat template that just adds whitespace between messages.
        """
        # warn once that no chat template is defined and that the class default will be used
        logger.warning_once(
            "\nNo chat template is defined for this tokenizer - using the default template "
            f"for the {self.__class__.__name__} class. If the default is not appropriate for "
            "your model, please set `tokenizer.chat_template` to an appropriate template. "
            "See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
        )
        # return the default chat template, which simply joins messages with whitespace
        return (
            "{% for message in messages %}"
            "{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}"
            "{{ message['content'] }}"
            "{% if not loop.last %}{{ '  ' }}{% endif %}"
            "{% endfor %}"
            "{{ eos_token }}"
        )
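
A minimal usage sketch for the class above (assuming the `facebook/blenderbot_small-90M` files referenced in `PRETRAINED_VOCAB_FILES_MAP` are reachable; the comments describe expected behavior, not verified outputs):

```
from transformers import BlenderbotSmallTokenizer

tokenizer = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot_small-90M")

tokens = tokenizer.tokenize("Hello, world!\nHow are you?")  # lowercased, punctuation split off, "\n" -> "__newln__"
ids = tokenizer.convert_tokens_to_ids(tokens)               # vocabulary lookup, unknown words map to "__unk__"
text = tokenizer.convert_tokens_to_string(tokens)           # "@@ " continuation markers are removed again
```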

.\models\blenderbot_small\tokenization_blenderbot_small_fast.py

# coding=utf-8
# Copyright 2021, The Facebook, Inc. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fast tokenization class for BlenderbotSmall."""
from typing import List, Optional

from tokenizers import ByteLevelBPETokenizer

from ...tokenization_utils_fast import PreTrainedTokenizerFast  # base class for fast tokenizers
from ...utils import logging  # logging utilities
from .tokenization_blenderbot_small import BlenderbotSmallTokenizer  # the corresponding slow tokenizer


logger = logging.get_logger(__name__)  # module-level logger

VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",  # vocabulary file name
    "merges_file": "merges.txt",  # BPE merges file name
    "tokenizer_config_file": "tokenizer_config.json",  # tokenizer configuration file name
}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/vocab.json"
    },  # pretrained vocabulary file map
    "merges_file": {
        "facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/merges.txt"
    },  # pretrained merges file map
    "tokenizer_config_file": {
        "facebook/blenderbot_small-90M": (
            "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/tokenizer_config.json"
        )
    },  # pretrained tokenizer configuration file map
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "facebook/blenderbot_small-90M": 512,  # pretrained positional embedding size
}


class BlenderbotSmallTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" BlenderbotSmall tokenizer (backed by HuggingFace's *tokenizers* library).

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
    """

    vocab_files_names = VOCAB_FILES_NAMES  # vocabulary file names
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP  # pretrained vocabulary file map
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES  # maximum model input sizes
    slow_tokenizer_class = BlenderbotSmallTokenizer  # corresponding slow tokenizer class

    def __init__(
        self,
        vocab_file=None,  # path to the vocabulary file
        merges_file=None,  # path to the BPE merges file
        unk_token="<|endoftext|>",  # unknown token
        bos_token="<|endoftext|>",  # beginning-of-sequence token
        eos_token="<|endoftext|>",  # end-of-sequence token
        add_prefix_space=False,  # whether to add a leading space before the first word
        trim_offsets=True,  # whether to trim whitespace from the produced offsets
        **kwargs,
    ):
        super().__init__(
            ByteLevelBPETokenizer(
                vocab=vocab_file,  # initialize the backend ByteLevelBPETokenizer with the vocabulary file
                merges=merges_file,  # ... and the merges file
                add_prefix_space=add_prefix_space,  # forward the prefix-space behavior
                trim_offsets=trim_offsets,  # forward the offset-trimming behavior
            ),
            bos_token=bos_token,  # beginning-of-sequence token
            eos_token=eos_token,  # end-of-sequence token
            unk_token=unk_token,  # unknown token
            **kwargs,
        )
        self.add_prefix_space = add_prefix_space  # remember the prefix-space setting
    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        # build an input sequence with the special tokens added
        output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
        if token_ids_1 is None:
            # single sequence: <bos> tokens <eos>
            return output

        # pair of sequences: <bos> tokens_0 <eos> <eos> tokens_1 <eos>
        return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed, to be used in a sequence-pair classification task.
        BlenderbotSmall does not make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            # single sequence: one zero per position, including the special tokens
            return len(cls + token_ids_0 + sep) * [0]
        # pair of sequences: still all zeros, covering both sequences and their special tokens
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

    @property
    # Copied from transformers.models.blenderbot.tokenization_blenderbot.BlenderbotTokenizer.default_chat_template
    def default_chat_template(self):
        """
        A very simple chat template that just adds whitespace between messages.
        """
        logger.warning_once(
            "\nNo chat template is defined for this tokenizer - using the default template "
            f"for the {self.__class__.__name__} class. If the default is not appropriate for "
            "your model, please set `tokenizer.chat_template` to an appropriate template. "
            "See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
        )
        # return the default chat template, which simply joins messages with whitespace
        return (
            "{% for message in messages %}"
            "{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}"
            "{{ message['content'] }}"
            "{% if not loop.last %}{{ '  ' }}{% endif %}"
            "{% endfor %}"
            "{{ eos_token }}"
        )
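
To make the special-token layout of `build_inputs_with_special_tokens` concrete, here is a small illustrative sketch (it assumes the checkpoint's tokenizer files can be loaded as a fast tokenizer; the token ids 5..9 are placeholders, not entries of the real vocabulary):

```
from transformers import BlenderbotSmallTokenizerFast

tokenizer = BlenderbotSmallTokenizerFast.from_pretrained("facebook/blenderbot_small-90M")

single = tokenizer.build_inputs_with_special_tokens([5, 6, 7])
pair = tokenizer.build_inputs_with_special_tokens([5, 6, 7], [8, 9])
# single: [bos_id, 5, 6, 7, eos_id]
# pair:   [bos_id, 5, 6, 7, eos_id, eos_id, 8, 9, eos_id]
```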

.\models\blenderbot_small\__init__.py

# Required imports and helpers
from typing import TYPE_CHECKING

# Helpers and exceptions from the package-level utils module
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_flax_available,
    is_tf_available,
    is_tokenizers_available,
    is_torch_available,
)

# Declare the import structure of this sub-package: module name -> public objects
_import_structure = {
    "configuration_blenderbot_small": [
        "BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "BlenderbotSmallConfig",
        "BlenderbotSmallOnnxConfig",
    ],
    "tokenization_blenderbot_small": ["BlenderbotSmallTokenizer"],
}

# If the tokenizers library is unavailable, skip registering the fast tokenizer
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # tokenizers is available: expose the fast tokenizer module in _import_structure
    _import_structure["tokenization_blenderbot_small_fast"] = ["BlenderbotSmallTokenizerFast"]

# If torch is unavailable, skip registering the PyTorch models
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # torch is available: expose the PyTorch modeling module in _import_structure
    _import_structure["modeling_blenderbot_small"] = [
        "BLENDERBOT_SMALL_PRETRAINED_MODEL_ARCHIVE_LIST",
        "BlenderbotSmallForCausalLM",
        "BlenderbotSmallForConditionalGeneration",
        "BlenderbotSmallModel",
        "BlenderbotSmallPreTrainedModel",
    ]

# If TensorFlow is unavailable, skip registering the TensorFlow models
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # TensorFlow is available: expose the TensorFlow modeling module in _import_structure
    _import_structure["modeling_tf_blenderbot_small"] = [
        "TFBlenderbotSmallForConditionalGeneration",
        "TFBlenderbotSmallModel",
        "TFBlenderbotSmallPreTrainedModel",
    ]

# If Flax is unavailable, skip registering the Flax models
try:
    if not is_flax_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # Flax is available: expose the Flax modeling module in _import_structure
    _import_structure["modeling_flax_blenderbot_small"] = [
        "FlaxBlenderbotSmallForConditionalGeneration",
        "FlaxBlenderbotSmallModel",
        "FlaxBlenderbotSmallPreTrainedModel",
    ]

# Under static type checking, import everything eagerly so type checkers can resolve the names
if TYPE_CHECKING:
    # configuration classes and constants
    from .configuration_blenderbot_small import (
        BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP,
        BlenderbotSmallConfig,
        BlenderbotSmallOnnxConfig,
    )
    # slow tokenizer
    from .tokenization_blenderbot_small import BlenderbotSmallTokenizer

    # fast tokenizer, only if the tokenizers library is installed
    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # tokenizers is available: import the fast tokenizer
        from .tokenization_blenderbot_small_fast import BlenderbotSmallTokenizerFast
    # PyTorch models, only if torch is installed
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    # torch missing: silently skip the PyTorch imports
    except OptionalDependencyNotAvailable:
        pass
    # torch available: import the PyTorch BlenderbotSmall classes and constants
    else:
        from .modeling_blenderbot_small import (
            BLENDERBOT_SMALL_PRETRAINED_MODEL_ARCHIVE_LIST,
            BlenderbotSmallForCausalLM,
            BlenderbotSmallForConditionalGeneration,
            BlenderbotSmallModel,
            BlenderbotSmallPreTrainedModel,
        )

    # TensorFlow models, only if TensorFlow is installed
    try:
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    # TensorFlow missing: silently skip the TensorFlow imports
    except OptionalDependencyNotAvailable:
        pass
    # TensorFlow available: import the TensorFlow BlenderbotSmall classes
    else:
        from .modeling_tf_blenderbot_small import (
            TFBlenderbotSmallForConditionalGeneration,
            TFBlenderbotSmallModel,
            TFBlenderbotSmallPreTrainedModel,
        )

    # Flax models, only if Flax is installed
    try:
        if not is_flax_available():
            raise OptionalDependencyNotAvailable()
    # Flax missing: silently skip the Flax imports
    except OptionalDependencyNotAvailable:
        pass
    # Flax available: import the Flax BlenderbotSmall classes
    else:
        from .modeling_flax_blenderbot_small import (
            FlaxBlenderbotSmallForConditionalGeneration,
            FlaxBlenderbotSmallModel,
            FlaxBlenderbotSmallPreTrainedModel,
        )
else:
    # At runtime (no static type checking), defer all imports via a lazy module proxy
    import sys
    # sys gives access to the interpreter's module registry

    # Replace this module in sys.modules with a _LazyModule that resolves _import_structure on first access
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
    # __name__: the name of the current module
    # globals()["__file__"]: the path of this file
    # _import_structure: the mapping of sub-module names to their public objects defined above
    # module_spec=__spec__: the module spec describing this package
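
What this lazy pattern buys at the user level, as a short illustrative sketch (not part of the file above): importing a lightweight name from the sub-package should not pull in torch, TensorFlow, or Flax, because each sub-module is only loaded when one of its attributes is first accessed.

```
# No deep-learning framework is needed for this import:
from transformers.models.blenderbot_small import BlenderbotSmallConfig

config = BlenderbotSmallConfig()
print(config.model_type)  # expected: "blenderbot-small"
```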

.\models\blip\configuration_blip.py

# Standard library imports for path handling and type hints
import os
from typing import Union

# Base configuration class
from ...configuration_utils import PretrainedConfig
# Logging utilities
from ...utils import logging

# Module-level logger
logger = logging.get_logger(__name__)

# Map from pretrained BLIP checkpoint names to the URLs of their configuration files
BLIP_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "Salesforce/blip-vqa-base": "https://huggingface.co/Salesforce/blip-vqa-base/resolve/main/config.json",
    "Salesforce/blip-vqa-capfit-large": (
        "https://huggingface.co/Salesforce/blip-vqa-base-capfit/resolve/main/config.json"
    ),
    "Salesforce/blip-image-captioning-base": (
        "https://huggingface.co/Salesforce/blip-image-captioning-base/resolve/main/config.json"
    ),
    "Salesforce/blip-image-captioning-large": (
        "https://huggingface.co/Salesforce/blip-image-captioning-large/resolve/main/config.json"
    ),
    "Salesforce/blip-itm-base-coco": "https://huggingface.co/Salesforce/blip-itm-base-coco/resolve/main/config.json",
    "Salesforce/blip-itm-large-coco": "https://huggingface.co/Salesforce/blip-itm-large-coco/resolve/main/config.json",
    "Salesforce/blip-itm-base-flikr": "https://huggingface.co/Salesforce/blip-itm-base-flikr/resolve/main/config.json",
    "Salesforce/blip-itm-large-flikr": (
        "https://huggingface.co/Salesforce/blip-itm-large-flikr/resolve/main/config.json"
    ),
}

# Configuration class for the BLIP text model
class BlipTextConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`BlipTextModel`]. It is used to instantiate a BLIP
    text model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the `BlipText` used by the [base
    architectures](https://huggingface.co/Salesforce/blip-vqa-base).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Example:

    ```
    >>> from transformers import BlipTextConfig, BlipTextModel

    >>> # Initializing a BlipTextConfig with Salesforce/blip-vqa-base style configuration
    >>> configuration = BlipTextConfig()

    >>> # Initializing a BlipTextModel (with random weights) from the Salesforce/blip-vqa-base style configuration
    >>> model = BlipTextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    # model type identifier for the BLIP text model
    model_type = "blip_text_model"

    def __init__(
        self,
        vocab_size=30524,
        hidden_size=768,
        encoder_hidden_size=768,
        intermediate_size=3072,
        projection_dim=768,
        num_hidden_layers=12,
        num_attention_heads=8,
        max_position_embeddings=512,
        hidden_act="gelu",
        layer_norm_eps=1e-12,
        hidden_dropout_prob=0.0,
        attention_probs_dropout_prob=0.0,
        initializer_range=0.02,
        bos_token_id=30522,
        eos_token_id=2,
        pad_token_id=0,
        sep_token_id=102,
        is_decoder=True,
        use_cache=True,
        label_smoothing=0.0,
        **kwargs,
    ):
        # initialize the base class, forwarding the special token ids and any extra keyword arguments
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            sep_token_id=sep_token_id,
            **kwargs,
        )

        # store the model hyper-parameters on the configuration object
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.encoder_hidden_size = encoder_hidden_size
        self.intermediate_size = intermediate_size
        self.projection_dim = projection_dim
        self.hidden_dropout_prob = hidden_dropout_prob
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.max_position_embeddings = max_position_embeddings
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.is_decoder = is_decoder
        self.use_cache = use_cache
        self.label_smoothing = label_smoothing

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        # handle the authentication token passed through the keyword arguments
        cls._set_token_in_kwargs(kwargs)

        # fetch the configuration dictionary and the remaining keyword arguments
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # when loading from a full BlipConfig, use its nested text_config
        if config_dict.get("model_type") == "blip":
            config_dict = config_dict["text_config"]

        # warn when instantiating this class from a configuration of a different model type
        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        # build the configuration instance from the dictionary and keyword arguments
        return cls.from_dict(config_dict, **kwargs)
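
A short sketch of the branch above that extracts the nested text configuration when loading from a full BLIP checkpoint (assuming the `Salesforce/blip-vqa-base` checkpoint listed in `BLIP_PRETRAINED_CONFIG_ARCHIVE_MAP` is reachable; printed values reflect the documented defaults):

```
from transformers import BlipTextConfig

# The checkpoint's config.json has model_type == "blip", so only its "text_config" section is kept.
text_config = BlipTextConfig.from_pretrained("Salesforce/blip-vqa-base")
print(text_config.model_type)   # "blip_text_model"
print(text_config.hidden_size)  # 768
```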
# Configuration class for the BLIP vision model. It is used to instantiate a BLIP vision model according to the
# specified arguments; instantiating it with the defaults yields a configuration similar to the Blip-base architecture.
class BlipVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`BlipVisionModel`]. It is used to instantiate a
    BLIP vision model according to the specified arguments, defining the model architecture. Instantiating a
    configuration defaults will yield a similar configuration to that of the Blip-base
    [Salesforce/blip-vqa-base](https://huggingface.co/Salesforce/blip-vqa-base) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        image_size (`int`, *optional*, defaults to 384):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 1e-10):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

    Example:

    ```
    >>> from transformers import BlipVisionConfig, BlipVisionModel

    >>> # Initializing a BlipVisionConfig with Salesforce/blip-vqa-base style configuration
    >>> configuration = BlipVisionConfig()

    >>> # Initializing a BlipVisionModel (with random weights) from the Salesforce/blip-vqa-base style configuration
    >>> model = BlipVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "blip_vision_model"
    # Constructor: store the vision model hyper-parameters
    def __init__(
        self,
        hidden_size=768,                  # dimensionality of the encoder layers, defaults to 768
        intermediate_size=3072,           # dimensionality of the feed-forward layer, defaults to 3072
        projection_dim=512,               # dimensionality of the projection head, defaults to 512
        num_hidden_layers=12,             # number of Transformer encoder layers, defaults to 12
        num_attention_heads=12,           # number of attention heads per layer, defaults to 12
        image_size=384,                   # input image resolution, defaults to 384
        patch_size=16,                    # patch resolution, defaults to 16
        hidden_act="gelu",                # activation function of the hidden layers, defaults to gelu
        layer_norm_eps=1e-5,              # epsilon used by the layer-norm layers, defaults to 1e-5
        attention_dropout=0.0,            # dropout ratio of the attention probabilities, defaults to 0.0
        initializer_range=1e-10,          # std of the truncated-normal weight initializer, defaults to 1e-10
        **kwargs,                         # extra keyword arguments forwarded to the base class
    ):
        super().__init__(**kwargs)        # initialize the PretrainedConfig base class

        # store all hyper-parameters on the configuration object
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.patch_size = patch_size
        self.image_size = image_size
        self.initializer_range = initializer_range
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        cls._set_token_in_kwargs(kwargs)  # handle the authentication token passed through the keyword arguments

        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)  # fetch the config dict and remaining kwargs

        # when loading from a full BlipConfig, use its nested vision_config
        if config_dict.get("model_type") == "blip":
            config_dict = config_dict["vision_config"]

        # warn when instantiating this class from a configuration of a different model type
        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        # build the configuration instance from the dictionary and keyword arguments
        return cls.from_dict(config_dict, **kwargs)
# BlipConfig stores the configuration of a full BLIP model (text + vision).
class BlipConfig(PretrainedConfig):
    r"""
    [`BlipConfig`] is the configuration class to store the configuration of a [`BlipModel`]. It is used to instantiate
    a BLIP model according to the specified arguments, defining the text model and vision model configs. Instantiating
    a configuration with the defaults will yield a similar configuration to that of the BLIP-base
    [Salesforce/blip-vqa-base](https://huggingface.co/Salesforce/blip-vqa-base) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`BlipTextConfig`].
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`BlipVisionConfig`].
        projection_dim (`int`, *optional*, defaults to 512):
            Dimensionality of the text and vision projection layers.
        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
            The initial value of the *logit_scale* parameter. The default follows the original BLIP implementation.
        image_text_hidden_size (`int`, *optional*, defaults to 256):
            Dimensionality of the hidden state of the image-text fusion layer.
        label_smoothing (`float`, *optional*, defaults to 0.0):
            Amount of smoothing used when computing the loss, in [0.0, 1.0], where 0.0 means no smoothing. The targets
            become a mixture of the original labels and a uniform distribution, as described in
            `Rethinking the Inception Architecture for Computer Vision <https://arxiv.org/abs/1512.00567>`__.
        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```
    >>> from transformers import BlipConfig, BlipModel

    >>> # Initializing a BlipConfig with Salesforce/blip-vqa-base style configuration
    >>> configuration = BlipConfig()

    >>> # Initializing a BlipModel (with random weights) from the Salesforce/blip-vqa-base style configuration
    >>> model = BlipModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # A BlipConfig can also be initialized from a BlipTextConfig and a BlipVisionConfig

    >>> # Initializing BLIPText and BLIPVision configurations
    >>> config_text = BlipTextConfig()
    >>> config_vision = BlipVisionConfig()

    >>> config = BlipConfig.from_text_vision_configs(config_text, config_vision)
    ```"""

    # model type identifier
    model_type = "blip"

    # Constructor: build the nested text/vision configs and store the top-level hyper-parameters
    def __init__(
        self,
        text_config=None,
        vision_config=None,
        projection_dim=512,
        logit_scale_init_value=2.6592,
        image_text_hidden_size=256,
        label_smoothing=0.0,
        **kwargs,
    ):
        super().__init__(**kwargs)

        if text_config is None:
            text_config = {}
            # no text_config provided: fall back to the defaults and log it
            logger.info("`text_config` is `None`. Initializing the `BlipTextConfig` with default values.")

        if vision_config is None:
            vision_config = {}
            # no vision_config provided: fall back to the defaults and log it
            logger.info("`vision_config` is `None`. Initializing the `BlipVisionConfig` with default values.")

        # build the nested BlipTextConfig and BlipVisionConfig from the provided dictionaries
        self.text_config = BlipTextConfig(**text_config)
        self.vision_config = BlipVisionConfig(**vision_config)

        # tie the text encoder's cross-attention width to the vision model's hidden size
        self.text_config.encoder_hidden_size = self.vision_config.hidden_size

        # store the projection dim, logit scale init, initializer settings, image-text hidden size and label smoothing
        self.projection_dim = projection_dim
        self.logit_scale_init_value = logit_scale_init_value
        self.initializer_factor = 1.0
        self.initializer_range = 0.02
        self.image_text_hidden_size = image_text_hidden_size
        self.label_smoothing = label_smoothing

    @classmethod
    def from_text_vision_configs(cls, text_config: BlipTextConfig, vision_config: BlipVisionConfig, **kwargs):
        r"""
        Instantiate a [`BlipConfig`] (or a derived class) from a BLIP text model configuration and a BLIP vision model
        configuration.

        Returns:
            [`BlipConfig`]: An instance of a configuration object
        """

        # serialize both configs to dictionaries and build the combined configuration
        return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
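
A small sketch of the width-tying behavior implemented in `__init__` above (purely illustrative):

```
from transformers import BlipConfig, BlipVisionConfig

config = BlipConfig(vision_config=BlipVisionConfig(hidden_size=1024).to_dict())
# __init__ copies the vision hidden size into the text config's encoder_hidden_size
assert config.text_config.encoder_hidden_size == config.vision_config.hidden_size == 1024
```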

.\models\blip\convert_blip_original_pytorch_to_hf.py

# Convert the original BLIP checkpoint weights into the Transformers format; run without tracking gradients
@torch.no_grad()
def convert_blip_checkpoint(pytorch_dump_folder_path, config_path=None):
    """
    Copy/paste/tweak model's weights to transformers design.
    """
    # load the BLIP configuration from the given path, if provided
    if config_path is not None:
        config = BlipConfig.from_pretrained(config_path)
    else:
        # otherwise fall back to a default configuration
        config = BlipConfig(projection_dim=512, text_config={}, vision_config={})

    # build the HF captioning model and put it in eval mode
    hf_model = BlipForConditionalGeneration(config).eval()

    # URL of the original captioning checkpoint
    model_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth"

    # load the original model from the URL and put it in eval mode
    pt_model = blip_decoder(pretrained=model_url, image_size=384, vit="base")
    pt_model = pt_model.eval()

    # copy the state dict and rename every key to the HF naming scheme
    modified_state_dict = pt_model.state_dict()
    for key in modified_state_dict.copy():
        value = modified_state_dict.pop(key)
        renamed_key = rename_key(key)
        modified_state_dict[renamed_key] = value

    # load the renamed weights into the HF model
    hf_model.load_state_dict(modified_state_dict)

    # load the demo image at the expected resolution, on CPU
    image_size = 384
    image = load_demo_image(image_size=image_size, device="cpu")

    # BLIP uses the BERT tokenizer for its text side
    tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")

    # encode the caption prompt into input_ids
    input_ids = tokenizer(["a picture of"]).input_ids

    # generate a caption conditioned on the prompt
    out = hf_model.generate(image, input_ids)

    # check the generated token ids against the expected reference output
    assert out[0].tolist() == [30522, 1037, 3861, 1997, 1037, 2450, 3564, 2006, 1996, 3509, 2007, 2014, 3899, 102]

    # generate a caption without the prompt
    out = hf_model.generate(image)

    # check the unconditional generation as well
    assert out[0].tolist() == [30522, 1037, 2450, 3564, 2006, 1996, 3509, 2007, 2014, 3899, 102]

    # optionally save the converted captioning model
    if pytorch_dump_folder_path is not None:
        hf_model.save_pretrained(pytorch_dump_folder_path)

    # URL of the original VQA checkpoint
    model_url = (
        "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth"
    )

    # load the original VQA model and put it in eval mode
    vqa_model = blip_vqa(pretrained=model_url, image_size=image_size, vit="base")
    vqa_model.eval()

    # copy the state dict and rename every key to the HF naming scheme
    modified_state_dict = vqa_model.state_dict()
    for key in modified_state_dict.copy():
        value = modified_state_dict.pop(key)
        renamed_key = rename_key(key)
        modified_state_dict[renamed_key] = value

    # build the HF VQA model
    hf_vqa_model = BlipForQuestionAnswering(config)

    # load the renamed weights into the HF VQA model
    hf_vqa_model.load_state_dict(modified_state_dict)

    # a sample VQA question
    question = ["How many dogs are in this image?"]
    # encode the question into question_input_ids
    question_input_ids = tokenizer(question, return_tensors="pt").input_ids

    # generate the answer for the question/image pair
    answer = hf_vqa_model.generate(question_input_ids, image)

    # print the decoded answer
    print(tokenizer.decode(answer[0]))

    # check the decoded answer against the expected reference output
    assert tokenizer.decode(answer[0]) == "[UNK] 1 [SEP]"

    # optionally save the converted VQA model under a "_vqa" suffix
    if pytorch_dump_folder_path is not None:
        hf_vqa_model.save_pretrained(pytorch_dump_folder_path + "_vqa")

    # URL of the original image-text matching (ITM) checkpoint
    model_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth"

    # load the original ITM model and put it in eval mode
    itm_model = blip_itm(pretrained=model_url, image_size=image_size, vit="base")
    itm_model.eval()

    # copy the state dict and rename every key to the HF naming scheme
    modified_state_dict = itm_model.state_dict()
    for key in modified_state_dict.copy():
        value = modified_state_dict.pop(key)
        renamed_key = rename_key(key)
        modified_state_dict[renamed_key] = value

    # build the HF image-text retrieval model
    hf_itm_model = BlipForImageTextRetrieval(config)

    # load the renamed weights into the HF ITM model
    hf_itm_model.load_state_dict(modified_state_dict)

    # a sample caption for image-text matching
    question = ["A picture of a woman with a dog sitting in a beach"]
    # encode the caption as PyTorch tensors, padded/truncated to a maximum length of 35 tokens
    question_input_ids = tokenizer(
        question,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=35,
    ).input_ids

    # (re)load the renamed weights into the ITM model
    hf_itm_model.load_state_dict(modified_state_dict)
    # switch to eval mode (disables dropout and other training-only behavior)
    hf_itm_model.eval()

    # run the model with the ITM classification head
    out_itm = hf_itm_model(question_input_ids, image, use_itm_head=True)
    # run the model without the ITM head (raw image-text similarity)
    out = hf_itm_model(question_input_ids, image, use_itm_head=False)

    # check the image-text similarity score (no ITM head)
    assert out[0].item() == 0.2110687494277954
    # check the softmax-normalized ITM probability of the "match" class
    assert torch.nn.functional.softmax(out_itm[0], dim=1)[:, 1].item() == 0.45698845386505127

    # optionally save the converted ITM model under an "_itm" suffix
    if pytorch_dump_folder_path is not None:
        hf_itm_model.save_pretrained(pytorch_dump_folder_path + "_itm")
# Entry point when the script is executed directly
if __name__ == "__main__":
    # build the command-line argument parser
    parser = argparse.ArgumentParser()
    # path where the converted PyTorch model(s) will be written
    parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
    # optional path to an HF config.json to use instead of the default configuration
    parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
    # parse the command-line arguments
    args = parser.parse_args()

    # run the conversion with the parsed arguments (matching the signature of convert_blip_checkpoint above)
    convert_blip_checkpoint(args.pytorch_dump_folder_path, args.config_path)
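
The helpers `rename_key`, `load_demo_image`, `blip_decoder`, `blip_vqa`, and `blip_itm` used above come from earlier parts of this file and from the original BLIP repository and are not shown in this excerpt. A hypothetical sketch of the key-renaming pattern the conversion relies on (the prefix mapping below is illustrative only, not the actual BLIP mapping):

```
import torch

def rename_key_example(key: str) -> str:
    # illustrative mapping from a hypothetical original prefix to a hypothetical HF prefix
    return key.replace("visual_encoder.", "vision_model.encoder.")

state_dict = {"visual_encoder.layers.0.weight": torch.zeros(2, 2)}
converted = {rename_key_example(k): v for k, v in state_dict.items()}
print(list(converted))  # ['vision_model.encoder.layers.0.weight']
```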

.\models\blip\image_processing_blip.py

# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for BLIP."""

from typing import Dict, List, Optional, Union

import numpy as np

from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import convert_to_rgb, resize, to_channel_dimension_format
from ...image_utils import (
    OPENAI_CLIP_MEAN,
    OPENAI_CLIP_STD,
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    infer_channel_dimension_format,
    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
    validate_kwargs,
    validate_preprocess_arguments,
)
from ...utils import TensorType, is_vision_available, logging


if is_vision_available():
    import PIL


logger = logging.get_logger(__name__)


class BlipImageProcessor(BaseImageProcessor):
    r"""
    Constructs a BLIP image processor.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
            `do_resize` parameter in the `preprocess` method.
        size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`):
            Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
            method.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
            overridden by the `resample` parameter in the `preprocess` method.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
            `do_rescale` parameter in the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
            overridden by the `rescale_factor` parameter in the `preprocess` method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method.
        image_mean (`float` or `List[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
            Mean to use if normalizing the image. This is a float or list of floats of length equal to the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `List[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
            Standard deviation to use if normalizing the image. This is a float or list of floats of length equal to
            the number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess`
            method.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
    """

    # names of the inputs produced by this image processor
    model_input_names = ["pixel_values"]

    def __init__(
        self,
        do_resize: bool = True,
        size: Dict[str, int] = None,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        **kwargs,
    ) -> None:
        # initialize the BaseImageProcessor base class
        super().__init__(**kwargs)
        # default to a 384x384 output size if none is given
        size = size if size is not None else {"height": 384, "width": 384}
        # normalize the size argument into a {"height": ..., "width": ...} dictionary
        size = get_size_dict(size, default_to_square=True)

        # whether to resize the image, and the target size
        self.do_resize = do_resize
        self.size = size
        # resampling filter used when resizing (BICUBIC by default)
        self.resample = resample
        # whether to rescale pixel values
        self.do_rescale = do_rescale
        # rescaling factor (typically 1/255)
        self.rescale_factor = rescale_factor
        # whether to normalize the image
        self.do_normalize = do_normalize
        # normalization mean, defaulting to OPENAI_CLIP_MEAN
        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
        # normalization standard deviation, defaulting to OPENAI_CLIP_STD
        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
        # whether to convert images to RGB
        self.do_convert_rgb = do_convert_rgb
        # keyword arguments accepted by the preprocess method
        self._valid_processor_keys = [
            "images",
            "do_resize",
            "size",
            "resample",
            "do_rescale",
            "rescale_factor",
            "do_normalize",
            "image_mean",
            "image_std",
            "do_convert_rgb",
            "return_tensors",
            "data_format",
            "input_data_format",
        ]

    # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize, with the resampling filter changed from BILINEAR to BICUBIC
    def resize(
        self,
        # input image as an ndarray
        image: np.ndarray,
        # target size dictionary with "height" and "width"
        size: Dict[str, int],
        # resampling filter, BICUBIC by default
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        # channel-dimension format of the output image
        data_format: Optional[Union[str, ChannelDimension]] = None,
        # channel-dimension format of the input image
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        # extra keyword arguments
        **kwargs,
    ) -> np.ndarray:
        """
        Resize an image to `(size["height"], size["width"])`.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`Dict[str, int]`):
                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
            data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

        Returns:
            `np.ndarray`: The resized image.
        """
        # normalize the size argument into a dictionary
        size = get_size_dict(size)
        # both "height" and "width" must be present
        if "height" not in size or "width" not in size:
            raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
        # target output size as a (height, width) tuple
        output_size = (size["height"], size["width"])
        # delegate to the generic resize transform and return the resized image
        return resize(
            image,
            size=output_size,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )
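
A minimal usage sketch for the processor defined above (the expected output shape follows from the defaults: resize to 384x384, channels-first output):

```
import numpy as np
from transformers import BlipImageProcessor

processor = BlipImageProcessor()  # defaults: resize to 384x384, rescale by 1/255, CLIP mean/std normalization
dummy = (np.random.rand(480, 640, 3) * 255).astype(np.uint8)  # a fake HWC uint8 image
batch = processor(images=dummy, return_tensors="np")
print(batch["pixel_values"].shape)  # (1, 3, 384, 384)
```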

    def preprocess(
        self,
        images: ImageInput,
        do_resize: Optional[bool] = None,
        size: Optional[Dict[str, int]] = None,
        resample: PILImageResampling = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        do_convert_rgb: bool = None,
        data_format: ChannelDimension = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,

.\models\blip\modeling_blip.py

# coding=utf-8

# Warning utilities (used for deprecation notices)
import warnings

# Dataclass decorator for the model output classes
from dataclasses import dataclass

# Type hints
from typing import Any, Optional, Tuple, Union

# PyTorch and related modules
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn.functional import normalize

# Shared building blocks from the library
from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)

# BLIP configuration classes
from .configuration_blip import BlipConfig, BlipTextConfig, BlipVisionConfig

# Text-side BLIP model classes
from .modeling_blip_text import BlipTextLMHeadModel, BlipTextModel

# Module-level logger
logger = logging.get_logger(__name__)

# Default checkpoint referenced in the docstrings
_CHECKPOINT_FOR_DOC = "Salesforce/blip-vqa-base"

# List of pretrained BLIP checkpoints
BLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "Salesforce/blip-vqa-base",
    "Salesforce/blip-vqa-capfilt-large",
    "Salesforce/blip-image-captioning-base",
    "Salesforce/blip-image-captioning-large",
    "Salesforce/blip-itm-base-coco",
    "Salesforce/blip-itm-large-coco",
    "Salesforce/blip-itm-base-flickr",
    "Salesforce/blip-itm-large-flickr",
    # See all BLIP models at https://huggingface.co/models?filter=blip
]


# Copied from transformers.models.clip.modeling_clip.contrastive_loss
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    """
    Contrastive loss: cross-entropy where the target for row i is index i (the matching pair lies on the diagonal).

    Args:
        logits (torch.Tensor): similarity logits of shape (batch_size, batch_size).

    Returns:
        torch.Tensor: scalar contrastive loss.
    """
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))


# Copied from transformers.models.clip.modeling_clip.clip_loss, renamed to blip_loss
def blip_loss(similarity: torch.Tensor) -> torch.Tensor:
    """
    BLIP loss: the average of the caption-to-image and image-to-caption contrastive losses.

    Args:
        similarity (torch.Tensor): image-text similarity matrix.

    Returns:
        torch.Tensor: scalar BLIP loss.
    """
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(similarity.t())
    return (caption_loss + image_loss) / 2.0
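
A tiny numeric sketch of the symmetric loss above (assuming `blip_loss` is importable from `transformers.models.blip.modeling_blip`):

```
import torch
from transformers.models.blip.modeling_blip import blip_loss

# 3 image-text pairs; the correct match for row i is column i
similarity = torch.tensor([[5.0, 0.0, 0.0],
                           [0.0, 5.0, 0.0],
                           [0.0, 0.0, 5.0]])
print(blip_loss(similarity))  # small loss, since the diagonal dominates in both directions
```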


@dataclass
class BlipForConditionalGenerationModelOutput(ModelOutput):
    """
    Adapted from the base class for vision model outputs: it also contains the pooled image embeddings of the last
    hidden states, and adds the loss term from the text decoder.

    Args:
        loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            Language modeling loss from the text decoder.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`, *optional*):
            Prediction scores of the language modeling head of the text decoder model.
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*):
            The image embeddings obtained after applying the Vision Transformer model to the input image.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[Tuple[torch.FloatTensor]] = None
    logits: Optional[Tuple[torch.FloatTensor]] = None
    image_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None

    @property
    def decoder_logits(self):
        """
        Deprecated property to access logits. Use `logits` attribute instead.
        获取logits的过时属性。请使用`logits`属性。
        """
        warnings.warn(
            "`decoder_logits` attribute is deprecated and will be removed in version 5 of Transformers."
            " Please use the `logits` attribute to retrieve the final output instead.",
            FutureWarning,
        )
        return self.logits
# 数据类,用于表示BlipTextVision模型的输出结果,继承自ModelOutput基类
@dataclass
class BlipTextVisionModelOutput(ModelOutput):
    """
    从视觉模型输出基类改编而来,还包含了最后隐藏状态的图像嵌入。这个类还添加了文本解码器的损失项。

    Args:
        loss (`torch.FloatTensor`,形状为 `(1,)`,可选,当提供`labels`时返回):
            文本解码器的语言建模损失。
        image_embeds (`torch.FloatTensor`,形状为 `(batch_size, output_dim)`,可选,当模型初始化时使用 `with_projection=True` 返回):
            通过将池化输出应用于投影层获得的图像嵌入。
        last_hidden_state (`torch.FloatTensor`,形状为 `(batch_size, sequence_length, hidden_size)`):
            模型最后一层的隐藏状态序列输出。
        hidden_states (`tuple(torch.FloatTensor)`,可选,当传入 `output_hidden_states=True` 或 `config.output_hidden_states=True` 时返回):
            `torch.FloatTensor` 元组(如果模型有嵌入层,则返回一个用于每层输出的嵌入输出 + 每层输出的隐藏状态),
            形状为 `(batch_size, sequence_length, hidden_size)`。

            模型每层输出的隐藏状态,以及可选的初始嵌入输出。
        attentions (`tuple(torch.FloatTensor)`,可选,当传入 `output_attentions=True` 或 `config.output_attentions=True` 时返回):
            `torch.FloatTensor` 元组(每层一个),
            形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。

            注意力 softmax 后的注意力权重,用于计算自注意力头中的加权平均值。
    """

    loss: Optional[torch.FloatTensor] = None  # 损失项,默认为None
    image_embeds: Optional[torch.FloatTensor] = None  # 图像嵌入,默认为None
    last_hidden_state: torch.FloatTensor = None  # 最后一层隐藏状态的输出,默认为None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None  # 隐藏状态的元组,默认为None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None  # 注意力权重的元组,默认为None


# 数据类,用于表示BlipImageTextMatching模型的输出结果,继承自ModelOutput基类
@dataclass
class BlipImageTextMatchingModelOutput(ModelOutput):
    """
    从视觉模型输出基类改编而来,还包含了最后隐藏状态的图像嵌入。这个类还添加了文本解码器的损失项以及图像文本相似度分数。
    """
    """
    Args:
        itm_score (`torch.FloatTensor`):
            The image-text similarity scores.
            图像与文本之间的相似性分数。
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss from the text decoder.
            文本解码器产生的语言建模损失,当提供了`labels`时返回。
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
            通过将投影层应用于池化输出得到的图像嵌入。
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
            模型最后一层输出的隐藏状态序列。
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
            每层模型输出的隐藏状态组成的元组,如果模型有嵌入层则包括嵌入输出,形状为`(batch_size, sequence_length, hidden_size)`。
        vision_pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*):
            Last layer hidden-state of the vision-only branch of the model.
            模型视觉分支的最后一层隐藏状态。
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
            注意力权重,经过注意力softmax后的加权平均值,用于自注意力头的计算。
        question_embeds (`torch.FloatTensor`):
            The question embeddings obtained by the text projection layer.
            通过文本投影层得到的问题嵌入。
    """

    itm_score: Optional[torch.FloatTensor] = None
    loss: Optional[torch.FloatTensor] = None
    image_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    vision_pooler_output: Optional[torch.FloatTensor] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    question_embeds: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class BlipOutput(ModelOutput):
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image: (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text: (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds: (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`BlipTextModel`].
        image_embeds: (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The image embeddings obtained by applying the projection layer to the pooled output of [`BlipVisionModel`].
        text_model_output: (`BaseModelOutputWithPooling`):
            The output of the [`BlipTextModel`].
        vision_model_output: (`BaseModelOutputWithPooling`):
            The output of the [`BlipVisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None  # 初始化为可选的浮点数张量,用于存储图像-文本相似性的对比损失
    logits_per_image: torch.FloatTensor = None  # 存储图像嵌入与文本嵌入之间的点积得分,表示图像-文本的相似性分数
    logits_per_text: torch.FloatTensor = None  # 存储文本嵌入与图像嵌入之间的点积得分,表示文本-图像的相似性分数
    text_embeds: torch.FloatTensor = None  # 存储通过投影层应用到[`BlipTextModel`]池化输出得到的文本嵌入
    image_embeds: torch.FloatTensor = None  # 存储通过投影层应用到[`BlipVisionModel`]池化输出得到的图像嵌入
    text_model_output: BaseModelOutputWithPooling = None  # 存储[`BlipTextModel`]的输出,包含池化层的基本模型输出
    vision_model_output: BaseModelOutputWithPooling = None  # 存储[`BlipVisionModel`]的输出,包含池化层的基本模型输出

    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


class BlipVisionEmbeddings(nn.Module):
    """
    A module for handling vision embeddings in the Blip model.

    Args:
        config (BlipVisionConfig): Configuration object for the BlipVisionEmbeddings module.
    """

    def __init__(self, config: BlipVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size  # 设置嵌入维度为配置中的隐藏尺寸
        self.image_size = config.image_size  # 设置图像大小为配置中的图像尺寸
        self.patch_size = config.patch_size  # 设置补丁大小为配置中的补丁尺寸

        self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim))  # 初始化类别嵌入参数

        self.patch_embedding = nn.Conv2d(
            in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
        )  # 创建卷积层,用于从图像中提取补丁特征嵌入

        self.num_patches = (self.image_size // self.patch_size) ** 2  # 计算图像中的补丁数量
        self.num_positions = self.num_patches + 1  # 计算位置嵌入的数量,包括额外的类别嵌入

        self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))  # 初始化位置嵌入参数
    # 定义前向传播方法,接收像素数值作为输入,并返回张量
    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        # 获取输入张量的批量大小
        batch_size = pixel_values.shape[0]
        # 获取目标数据类型,与补丁嵌入权重的数据类型相同
        target_dtype = self.patch_embedding.weight.dtype
        # 使用补丁嵌入层处理输入像素值,将像素值转换为指定数据类型,并形成补丁嵌入
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
        # 将补丁嵌入展平,并调换维度以适应后续操作
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        # 扩展类别嵌入以匹配批次大小,并转换为目标数据类型
        class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
        # 将类别嵌入与补丁嵌入连接起来形成最终嵌入
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        # 将位置嵌入截取到当前序列长度后,加到嵌入张量中
        embeddings = embeddings + self.position_embedding[:, : embeddings.size(1), :].to(target_dtype)
        # 返回最终的嵌入张量作为前向传播的输出
        return embeddings
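
下面是一个形状检查的小草例(假设可以从 transformers 导入 `BlipVisionConfig`,并直接实例化上文的 `BlipVisionEmbeddings`;图像尺寸、补丁大小等均为示意取值):

```
import torch
from transformers import BlipVisionConfig

# 假设:image_size=224、patch_size=16、hidden_size=768(仅作示意)
config = BlipVisionConfig(hidden_size=768, image_size=224, patch_size=16)
embeddings = BlipVisionEmbeddings(config)

pixel_values = torch.randn(2, 3, 224, 224)  # (batch, channels, H, W)
out = embeddings(pixel_values)

# 补丁数 = (224 // 16) ** 2 = 196,再加 1 个类别嵌入,共 197 个位置
print(out.shape)  # torch.Size([2, 197, 768])
```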
# 从 transformers.models.clip.modeling_clip.CLIPTextEmbeddings 复制而来,将 CLIP 替换为 Blip
class BlipTextEmbeddings(nn.Module):
    def __init__(self, config: BlipTextConfig):
        super().__init__()
        embed_dim = config.hidden_size

        # 初始化 token_embedding,用于词嵌入
        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
        # 初始化 position_embedding,用于位置编码
        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)

        # 注册 position_ids 缓冲区,用于位置编码;persistent=False 表示该缓冲区不会写入 state_dict
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        # 获取序列长度
        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]

        # 如果未提供 position_ids,则使用预先初始化的 position_ids
        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        # 如果未提供 inputs_embeds,则通过 token_embedding 获取嵌入
        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        # 获取位置编码
        position_embeddings = self.position_embedding(position_ids)
        # 将输入嵌入和位置编码相加作为最终的嵌入表示
        embeddings = inputs_embeds + position_embeddings

        return embeddings
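
类似地,可以用一个随机的 token 序列粗略验证 `BlipTextEmbeddings` 的输出形状(假设可从 transformers 导入 `BlipTextConfig`,参数取接近默认值的示意值):

```
import torch
from transformers import BlipTextConfig

config = BlipTextConfig(vocab_size=30524, hidden_size=768, max_position_embeddings=512)
text_embeddings = BlipTextEmbeddings(config)

input_ids = torch.randint(0, config.vocab_size, (2, 10))  # (batch, seq_len)
out = text_embeddings(input_ids=input_ids)

# 词嵌入与位置嵌入逐元素相加,形状保持 (batch, seq_len, hidden_size)
print(out.shape)  # torch.Size([2, 10, 768])
```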


class BlipAttention(nn.Module):
    """来自 'Attention Is All You Need' 论文的多头注意力机制"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        # 检查 embed_dim 必须被 num_heads 整除
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        # 缩放因子为 head_dim 的 -0.5 次方,即 1 / sqrt(head_dim)
        self.scale = self.head_dim**-0.5
        # dropout 层
        self.dropout = nn.Dropout(config.attention_dropout)

        # 线性层 qkv,用于查询、键、值的线性变换
        self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim)

        # 输出投影层,用于最终的线性映射
        self.projection = nn.Linear(self.embed_dim, self.embed_dim)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        # 将输入张量重塑为多头注意力矩阵的形状
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        # 获取隐藏状态张量的维度信息
        bsz, tgt_len, embed_dim = hidden_states.size()

        # 使用 self.qkv 对隐藏状态进行变换,生成混合的查询、键、值张量
        mixed_qkv = (
            self.qkv(hidden_states)
            .reshape(bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads)
            .permute(2, 0, 3, 1, 4)
        )
        query_states, key_states, value_states = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2]

        # 计算注意力分数,使用 query 和 key 的点积
        attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2))

        # 缩放注意力分数
        attention_scores = attention_scores * self.scale

        # 将注意力分数归一化为概率分布
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # 对注意力分数应用 dropout
        attention_probs = self.dropout(attention_probs)

        # 如果有头部掩码,则应用到注意力概率上
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        # 计算加权后的值张量,生成上下文层
        context_layer = torch.matmul(attention_probs, value_states).permute(0, 2, 1, 3)

        # 重新调整上下文层的形状以匹配 self.projection 的输入要求
        new_context_layer_shape = context_layer.size()[:-2] + (self.embed_dim,)
        context_layer = context_layer.reshape(new_context_layer_shape)

        # 使用 self.projection 将上下文层映射到输出空间
        output = self.projection(context_layer)

        # 根据需要决定是否输出注意力分数
        outputs = (output, attention_probs) if output_attentions else (output, None)

        return outputs
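
为了直观理解各张量的形状,下面用一个很小的配置跑一次 `BlipAttention`(假设可从 transformers 导入 `BlipVisionConfig`;hidden_size、头数等均为示意取值):

```
import torch
from transformers import BlipVisionConfig

config = BlipVisionConfig(hidden_size=64, num_attention_heads=4, attention_dropout=0.0)
attn = BlipAttention(config)
attn.eval()  # 关闭 dropout,保证结果确定

hidden_states = torch.randn(2, 5, 64)  # (batch, seq_len, embed_dim)
output, attn_probs = attn(hidden_states, output_attentions=True)

print(output.shape)      # torch.Size([2, 5, 64])
print(attn_probs.shape)  # torch.Size([2, 4, 5, 5]),即 (batch, num_heads, seq_len, seq_len)
```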
# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Blip
class BlipMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]  # 从配置中获取激活函数
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)  # 第一个全连接层
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)  # 第二个全连接层

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)  # 输入经过第一个全连接层
        hidden_states = self.activation_fn(hidden_states)  # 应用激活函数
        hidden_states = self.fc2(hidden_states)  # 经过第二个全连接层
        return hidden_states


class BlipEncoderLayer(nn.Module):
    def __init__(self, config: BlipConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = BlipAttention(config)  # 自注意力机制
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)  # 第一个层标准化层
        self.mlp = BlipMLP(config)  # MLP网络
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)  # 第二个层标准化层

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states  # 残差连接

        hidden_states = self.layer_norm1(hidden_states)  # 应用第一个层标准化
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            head_mask=attention_mask,
            output_attentions=output_attentions,
        )  # 自注意力机制计算
        hidden_states = hidden_states + residual  # 添加残差连接
        residual = hidden_states  # 更新残差连接

        hidden_states = self.layer_norm2(hidden_states)  # 应用第二个层标准化
        hidden_states = self.mlp(hidden_states)  # 经过MLP网络

        hidden_states = hidden_states + residual  # 再次添加残差连接

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)  # 如果需要输出注意力权重,添加到输出中

        return outputs


class BlipPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = BlipConfig  # 模型配置类
    base_model_prefix = "blip"  # 基础模型前缀
    supports_gradient_checkpointing = True  # 支持梯度检查点
    # 初始化模型中特定模块的权重和偏置
    def _init_weights(self, module):
        """Initialize the weights"""
        # 获取初始化因子
        factor = self.config.initializer_range
        
        # 如果模块是卷积层、嵌入层或线性层,则使用正态分布初始化权重,并将偏置置零
        if isinstance(module, nn.Conv2d) or isinstance(module, nn.Embedding) or isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=factor)
            if hasattr(module, "bias") and module.bias is not None:
                module.bias.data.zero_()

        # 如果模块是 BlipVisionEmbeddings 类型,则根据视觉配置初始化位置嵌入和类别嵌入
        if isinstance(module, BlipVisionEmbeddings):
            if hasattr(self.config, "vision_config"):
                factor = self.config.vision_config.initializer_range
            nn.init.trunc_normal_(
                module.position_embedding,
                mean=0.0,
                std=factor,
            )

            nn.init.trunc_normal_(
                module.class_embedding,
                mean=0.0,
                std=factor,
            )

        # 如果模块是 LayerNorm 类型,则将偏置置零并将权重填充为 1.0
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

        # 对于线性层,如果存在偏置,则将偏置置零
        elif isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()
# BLIP_START_DOCSTRING 是一个包含模型描述信息的原始文本字符串,用于指示此模型继承自 PreTrainedModel,
# 并提供了有关模型类通用方法的信息。详细内容可以在 PreTrainedModel 类的文档中找到,
# 包括下载、保存、调整输入嵌入大小、修剪头等功能。
BLIP_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`BlipConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# BLIP_TEXT_INPUTS_DOCSTRING 是关于模型文本输入参数的描述信息,包括 input_ids、attention_mask、position_ids 等参数的说明。
# 每个参数的数据类型和形状都有详细描述,以及如何获取输入 IDs 和如何使用注意力掩码等细节。
BLIP_TEXT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoProcessor`]. See [`BlipProcessor.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

# BLIP_VISION_INPUTS_DOCSTRING 是关于模型视觉输入参数的描述信息,说明 pixel_values 以及各输出控制参数。
BLIP_VISION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`BlipImageProcessor`]. See [`BlipImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

BLIP_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoProcessor`]. See [`BlipProcessor.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`BlipImageProcessor`]. See [`BlipImageProcessor.__call__`] for details.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


class BlipEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`BlipEncoderLayer`].

    Args:
        config (`BlipConfig`):
            The corresponding vision configuration for the `BlipEncoder`.
    """

    def __init__(self, config: BlipConfig):
        super().__init__()
        self.config = config
        # 创建一个包含多个 BlipEncoderLayer 实例的列表,列表长度为 config.num_hidden_layers
        self.layers = nn.ModuleList([BlipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        # 是否使用梯度检查点,默认为 False
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embedded representation of the inputs. Should be float, not int tokens.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        # Determine whether to use the provided `output_attentions` value or fallback to the model's default
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        # Determine whether to use the provided `output_hidden_states` value or fallback to the model's default
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        # Determine whether to use the provided `return_dict` value or fallback to the model's default
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Initialize empty tuples based on output configuration to store encoder states and attentions
        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        # Start with the embedded inputs as the initial hidden states
        hidden_states = inputs_embeds

        # Iterate through each encoder layer in the model
        for idx, encoder_layer in enumerate(self.layers):
            # If configured to return hidden states, append current hidden states to encoder states
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            
            # Perform gradient checkpointing if enabled during training
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    output_attentions,
                )
            else:
                # Otherwise, directly pass inputs to the encoder layer
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    output_attentions=output_attentions,
                )

            # Update hidden states with the output from the encoder layer
            hidden_states = layer_outputs[0]

            # If configured to return attentions, append current layer's attentions to all_attentions
            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        # If configured to return hidden states, append final hidden states to encoder states
        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        # If return_dict is False, return a tuple of relevant outputs; otherwise, return a ModelOutput object
        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )
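
下面用一个缩小的配置演示 `BlipEncoder` 的输入输出约定(假设可从 transformers 导入 `BlipVisionConfig`,与 `BlipVisionModel` 中一样直接传入视觉配置;注意它接收的是已经嵌入好的 `inputs_embeds`,而不是原始 token 或像素):

```
import torch
from transformers import BlipVisionConfig

config = BlipVisionConfig(
    hidden_size=64, intermediate_size=128, num_hidden_layers=2, num_attention_heads=4
)
encoder = BlipEncoder(config)

inputs_embeds = torch.randn(2, 5, 64)  # (batch, seq_len, hidden_size)
outputs = encoder(inputs_embeds=inputs_embeds, output_hidden_states=True, return_dict=True)

print(outputs.last_hidden_state.shape)  # torch.Size([2, 5, 64])
print(len(outputs.hidden_states))       # num_hidden_layers + 1 = 3(包含输入嵌入)
```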
class BlipVisionModel(BlipPreTrainedModel):
    main_input_name = "pixel_values"  # 设置主要输入名称为"pixel_values"
    config_class = BlipVisionConfig  # 指定配置类为BlipVisionConfig

    def __init__(self, config: BlipVisionConfig):
        super().__init__(config)
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = BlipVisionEmbeddings(config)  # 初始化图像嵌入模块
        self.encoder = BlipEncoder(config)  # 初始化编码器模块
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)  # 初始化后层归一化模块

        self.post_init()  # 执行额外的初始化步骤

    @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=BlipVisionConfig)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        前向传播函数

        Returns:
            根据return_dict返回相应的输出对象
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")  # 如果未提供pixel_values则抛出数值错误

        hidden_states = self.embeddings(pixel_values)  # 将输入的pixel_values转换为嵌入向量

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )  # 使用编码器处理嵌入向量,得到编码器的输出

        last_hidden_state = encoder_outputs[0]  # 获取编码器输出的最后隐藏状态
        last_hidden_state = self.post_layernorm(last_hidden_state)  # 对最后隐藏状态进行层归一化处理

        pooled_output = last_hidden_state[:, 0, :]  # 获取池化输出
        pooled_output = self.post_layernorm(pooled_output)  # 对池化输出进行层归一化处理

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]  # 如果不返回字典,则返回元组形式的输出

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )  # 返回包含池化输出和其他编码器输出的BaseModelOutputWithPooling对象

    def get_input_embeddings(self):
        return self.embeddings  # 返回嵌入模块的实例
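
下面用一个随机初始化的小配置单独跑一遍 `BlipVisionModel`,观察 `last_hidden_state` 与 `pooler_output` 两种输出(配置数值仅作示意):

```
import torch
from transformers import BlipVisionConfig

config = BlipVisionConfig(
    hidden_size=64, intermediate_size=128, num_hidden_layers=2,
    num_attention_heads=4, image_size=96, patch_size=16,
)
model = BlipVisionModel(config)
model.eval()

pixel_values = torch.randn(1, 3, 96, 96)
with torch.no_grad():
    outputs = model(pixel_values=pixel_values, return_dict=True)

# 序列长度 = (96 // 16) ** 2 + 1 = 37(36 个补丁 + 1 个类别嵌入)
print(outputs.last_hidden_state.shape)  # torch.Size([1, 37, 64])
print(outputs.pooler_output.shape)      # torch.Size([1, 64])
```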


@add_start_docstrings(BLIP_START_DOCSTRING)
class BlipModel(BlipPreTrainedModel):
    config_class = BlipConfig

    def __init__(self, config: BlipConfig):
        # 调用父类的初始化方法,传入配置对象
        super().__init__(config)

        # 检查配置对象中的文本配置是否为BlipTextConfig类型,如果不是则抛出数值错误异常
        if not isinstance(config.text_config, BlipTextConfig):
            raise ValueError(
                "config.text_config is expected to be of type BlipTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        # 检查配置对象中的视觉配置是否为BlipVisionConfig类型,如果不是则抛出数值错误异常
        if not isinstance(config.vision_config, BlipVisionConfig):
            raise ValueError(
                "config.vision_config is expected to be of type BlipVisionConfig but is of type"
                f" {type(config.vision_config)}."
            )

        # 从配置对象中获取文本配置和视觉配置
        text_config = config.text_config
        vision_config = config.vision_config

        # 初始化模型的投影维度、文本嵌入维度和视觉嵌入维度
        self.projection_dim = config.projection_dim
        self.text_embed_dim = text_config.hidden_size
        self.vision_embed_dim = vision_config.hidden_size

        # 初始化文本模型和视觉模型,分别使用文本配置和视觉配置
        self.text_model = BlipTextModel(text_config)
        self.vision_model = BlipVisionModel(vision_config)

        # 初始化视觉投影层和文本投影层,分别映射视觉和文本嵌入到投影维度空间,无偏置
        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)

        # 初始化对数尺度参数,使用配置中的初始值
        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))

        # 调用后初始化函数,用于权重初始化和最终处理
        self.post_init()

    @add_start_docstrings_to_model_forward(BLIP_TEXT_INPUTS_DOCSTRING)
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`BlipTextModel`].

        Examples:

        ```
        >>> from transformers import AutoProcessor, BlipModel

        >>> model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base")
        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

        >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```"""
        # 如果未指定返回字典,则使用配置中的默认值
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 调用文本模型,传入输入的ids、注意力掩码、位置ids和是否返回字典的标志
        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            return_dict=return_dict,
        )

        # 获取文本模型的汇总输出(pooled output)
        pooled_output = text_outputs[1]

        # 将汇总输出投影到文本投影层,得到文本特征
        text_features = self.text_projection(pooled_output)

        return text_features

    @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`BlipVisionModel`].

        Examples:

        ```
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, BlipModel

        >>> model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base")
        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
        ```

        """
        # 如果未提供 return_dict,则使用 self.config.use_return_dict 作为默认值
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 获取视觉模型的输出,可以选择是否返回字典格式的输出
        vision_outputs = self.vision_model(pixel_values=pixel_values, return_dict=return_dict)

        # 从视觉模型的输出中获取池化后的特征向量
        pooled_output = vision_outputs[1]  # pooled_output
        # 将池化后的特征向量应用于视觉投影层,得到最终的图像特征表示
        image_features = self.visual_projection(pooled_output)

        # 返回图像特征表示
        return image_features

    @add_start_docstrings_to_model_forward(BLIP_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BlipOutput, config_class=BlipConfig)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """
        BLIP模型的前向传播方法。

        Args:
            input_ids (Optional[torch.LongTensor], optional): 输入的token IDs. Defaults to None.
            pixel_values (Optional[torch.FloatTensor], optional): 输入的像素值. Defaults to None.
            attention_mask (Optional[torch.Tensor], optional): 注意力遮罩. Defaults to None.
            position_ids (Optional[torch.LongTensor], optional): 位置 IDs. Defaults to None.
            return_loss (Optional[bool], optional): 是否返回损失值. Defaults to None.
            output_attentions (Optional[bool], optional): 是否返回注意力权重. Defaults to None.
            output_hidden_states (Optional[bool], optional): 是否返回隐藏状态. Defaults to None.
            return_dict (Optional[bool], optional): 是否以字典格式返回输出. Defaults to None.

        Returns:
            BLIP模型的输出,类型为`BlipOutput`,根据`return_dict`参数决定返回方式.

        """
@add_start_docstrings(
    """
    BLIP Model for image captioning. The model consists of a vision encoder and a text decoder. One can optionally pass
    `input_ids` to the model, which serve as a text prompt, to make the text decoder continue the prompt. Otherwise,
    the decoder starts generating text from the [BOS] (beginning-of-sequence) token.
    """,
    BLIP_START_DOCSTRING,
)
class BlipForConditionalGeneration(BlipPreTrainedModel):
    # 定义配置类为 BlipConfig
    config_class = BlipConfig
    # 定义权重共享的键列表
    _tied_weights_keys = ["text_decoder.cls.predictions.decoder.bias"]
    # 主要输入名称为 "pixel_values"
    main_input_name = "pixel_values"

    def __init__(self, config: BlipConfig):
        # 调用父类的初始化方法
        super().__init__(config)

        # 使用 BlipVisionModel 初始化视觉模型
        self.vision_model = BlipVisionModel(config.vision_config)

        # 使用 BlipTextLMHeadModel 初始化文本解码器
        self.text_decoder = BlipTextLMHeadModel(config.text_config)

        # 设置解码器的起始输入为 BOS 标记的 ID
        self.decoder_input_ids = config.text_config.bos_token_id
        # 设置解码器的填充标记的 ID
        self.decoder_pad_token_id = config.text_config.pad_token_id

        # 初始化权重并应用最终处理
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        # 返回视觉模型的嵌入模块
        return self.vision_model.embeddings.patch_embedding

    @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BlipForConditionalGenerationModelOutput, config_class=BlipVisionConfig)
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
        return_dict: Optional[bool] = None,
        # forward 方法的参数说明文档添加 BLIP_VISION_INPUTS_DOCSTRING
        # 替换返回文档字符串的输出类型和配置类为 BlipVisionConfig
    ) -> Union[Tuple, BlipForConditionalGenerationModelOutput]:
        r"""
        Returns:

        Examples:

        ```
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, BlipForConditionalGeneration

        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        >>> model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "A picture of"

        >>> inputs = processor(images=image, text=text, return_tensors="pt")

        >>> outputs = model(**inputs)
        ```"""

        # 如果 return_dict 参数未指定,则使用模型配置中的默认值
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # 如果 output_attentions 参数未指定,则使用模型配置中的默认值
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        # 如果 output_hidden_states 参数未指定,则使用模型配置中的默认值
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        # 使用视觉模型处理像素值,根据参数返回不同的结果
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 提取视觉输出的第一个元素,即图像嵌入
        image_embeds = vision_outputs[0]

        # 使用文本解码器处理输入的信息,生成输出结果
        outputs = self.text_decoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_hidden_states=image_embeds,
            labels=labels,
            return_dict=return_dict,
            reduction="mean",
        )

        # 如果 return_dict 为 False,则返回多个输出元组
        if not return_dict:
            outputs = (outputs[0], outputs[1], image_embeds, vision_outputs[0]) + vision_outputs[2:]
            return tuple(output for output in outputs if output is not None)

        # 如果 return_dict 为 True,则返回 BlipForConditionalGenerationModelOutput 对象
        return BlipForConditionalGenerationModelOutput(
            loss=outputs.loss,
            logits=outputs.logits,
            image_embeds=image_embeds,
            last_hidden_state=vision_outputs.last_hidden_state,
            hidden_states=vision_outputs.hidden_states,
            attentions=vision_outputs.attentions,
        )

    @torch.no_grad()
    def generate(
        self,
        pixel_values: torch.FloatTensor,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        **generate_kwargs,
    ) -> torch.LongTensor:
        r"""
        Overrides *generate* function to be able to use the model as a conditional generator

        Parameters:
            pixel_values (*torch.FloatTensor* of shape *(batch_size, num_channels, image_height, image_width)*):
                Input image to be processed
            input_ids (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*):
                The sequence used as a prompt for the generation.
            attention_mask (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

        Examples:
        ```
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, BlipForConditionalGeneration

        >>> model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model.generate(**inputs)
        >>> print(processor.decode(outputs[0], skip_special_tokens=True))
        two cats sleeping on a couch
        ```
        """

        # 获取批处理大小
        batch_size = pixel_values.shape[0]
        
        # 使用视觉模型处理输入图像,获取视觉输出
        vision_outputs = self.vision_model(pixel_values=pixel_values)

        # 从视觉输出中提取图像嵌入
        image_embeds = vision_outputs[0]

        # 为图像嵌入创建全 1 的注意力掩码(图像序列不存在填充)
        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(image_embeds.device)

        # 如果输入的input_ids是列表,则转换为torch.LongTensor
        if isinstance(input_ids, list):
            input_ids = torch.LongTensor(input_ids)
        # 如果input_ids为None,则创建包含开始和结束标记的输入序列
        elif input_ids is None:
            input_ids = (
                torch.LongTensor([[self.decoder_input_ids, self.config.text_config.eos_token_id]])
                .repeat(batch_size, 1)
                .to(image_embeds.device)
            )

        # 设置输入序列的开始标记为配置中的开始标记
        input_ids[:, 0] = self.config.text_config.bos_token_id

        # 调整注意力掩码,移除最后一个标记以对齐输入序列
        attention_mask = attention_mask[:, :-1] if attention_mask is not None else None

        # 使用文本解码器生成文本输出
        outputs = self.text_decoder.generate(
            input_ids=input_ids[:, :-1],
            eos_token_id=self.config.text_config.sep_token_id,
            pad_token_id=self.config.text_config.pad_token_id,
            attention_mask=attention_mask,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            **generate_kwargs,
        )

        # 返回生成的输出
        return outputs
@add_start_docstrings(
    """
    BLIP Model for visual question answering. The model consists of a vision encoder, a text encoder as well as a text
    decoder. The vision encoder will encode the input image, the text encoder will encode the input question together
    with the encoding of the image, and the text decoder will output the answer to the question.
    """,
    BLIP_START_DOCSTRING,
)
class BlipForQuestionAnswering(BlipPreTrainedModel):
    config_class = BlipConfig
    _tied_weights_keys = ["text_decoder.cls.predictions.decoder.bias"]

    def __init__(self, config: BlipConfig):
        super().__init__(config)

        # Initialize the vision encoder model using the provided vision configuration
        self.vision_model = BlipVisionModel(config.vision_config)

        # Initialize the text encoder model using the provided text configuration,
        # with pooling layer excluded
        self.text_encoder = BlipTextModel(config.text_config, add_pooling_layer=False)

        # Initialize the text decoder model using the provided text configuration
        self.text_decoder = BlipTextLMHeadModel(config.text_config)

        # Store special token IDs for decoder inputs
        self.decoder_pad_token_id = config.text_config.pad_token_id
        self.decoder_start_token_id = config.text_config.bos_token_id

        # Initialize weights and perform any necessary post-initialization steps
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        # Return the patch embedding module from the vision encoder's embeddings
        return self.vision_model.embeddings.patch_embedding

    @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BlipTextVisionModelOutput, config_class=BlipVisionConfig)
    def forward(
        self,
        input_ids: torch.LongTensor,
        pixel_values: torch.FloatTensor,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BlipTextVisionModelOutput]:
        """
        Forward pass of the BLIP model for question answering.

        Args:
            input_ids (:obj:`torch.LongTensor`):
                Indices of input sequence tokens in the vocabulary.
            pixel_values (:obj:`torch.FloatTensor`):
                Pixel values of images (shape batch_size x channels x height x width).
            decoder_input_ids (:obj:`torch.LongTensor`, optional):
                Optional input for decoder. If provided, computes the loss and returns the logits.
            decoder_attention_mask (:obj:`torch.LongTensor`, optional):
                Optional attention mask for the decoder input.
            attention_mask (:obj:`torch.LongTensor`, optional):
                Optional attention mask for the input.
            output_attentions (:obj:`bool`, optional):
                Whether to return attentions weights.
            output_hidden_states (:obj:`bool`, optional):
                Whether to return hidden states.
            labels (:obj:`torch.LongTensor`, optional):
                Labels for computing the cross-entropy loss.
            return_dict (:obj:`bool`, optional):
                Whether to return a dictionary.

        Returns:
            :class:`~transformers.BlipTextVisionModelOutput`: A subclass of :class:`~transformers.ModelOutput`.
        """
        # forward 的完整实现会先用视觉模型编码图像,再用文本编码器融合问题与图像特征,最后由文本解码器计算答案的 logits 与损失(此处未展开)

    @torch.no_grad()
    def generate(
        self,
        input_ids: torch.LongTensor,
        pixel_values: torch.FloatTensor,
        attention_mask: Optional[torch.LongTensor] = None,
        **generate_kwargs,
    ) -> torch.LongTensor:
        r"""
        重写 *generate* 函数以便将模型用作条件生成器

        Parameters:
            input_ids (*torch.LongTensor* of shape *(batch_size, sequence_length)*):
                用作生成提示的序列。
            pixel_values (*torch.FloatTensor* of shape *(batch_size, num_channels, image_height, image_width)*:
                要处理的输入图像。
            attention_mask (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*):
                遮罩,避免在填充令牌索引上执行注意力。遮罩值选在 `[0, 1]` 中。`1` 表示未被掩盖的令牌,`0` 表示被掩盖的令牌。
            **generate_kwargs:
                传递给解码器 *generate* 函数的额外参数

        Examples:
        ```
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, BlipForQuestionAnswering

        >>> model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "How many cats are in the picture?"

        >>> inputs = processor(images=image, text=text, return_tensors="pt")

        >>> outputs = model.generate(**inputs)
        >>> print(processor.decode(outputs[0], skip_special_tokens=True))
        2
        ```
        """
        vision_outputs = self.vision_model(pixel_values=pixel_values)

        image_embeds = vision_outputs[0]  # 提取视觉模型的输出中的图像嵌入表示

        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(image_embeds.device)  # 创建图像的注意力遮罩

        if isinstance(input_ids, list):
            input_ids = torch.LongTensor(input_ids)  # 如果输入的是列表,将其转换为 torch.LongTensor

        question_outputs = self.text_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            return_dict=False,
        )  # 使用文本编码器处理输入的文本和图像嵌入

        question_embeds = question_outputs[0]  # 提取文本编码器的输出中的问题嵌入表示

        question_attention_mask = torch.ones(question_embeds.size()[:-1], dtype=torch.long).to(question_embeds.device)  # 创建问题的注意力遮罩

        bos_ids = torch.full(
            (question_embeds.size(0), 1), fill_value=self.decoder_start_token_id, device=question_embeds.device
        )  # 创建包含起始标记的张量

        outputs = self.text_decoder.generate(
            input_ids=bos_ids,
            eos_token_id=self.config.text_config.sep_token_id,
            pad_token_id=self.config.text_config.pad_token_id,
            encoder_hidden_states=question_embeds,
            encoder_attention_mask=question_attention_mask,
            **generate_kwargs,
        )  # 使用文本解码器生成输出序列

        return outputs  # 返回生成的输出序列
# 定义 BLIP 图像文本检索模型,包含视觉和文本投影器以及顶部的分类头部。用于图像文本检索任务,给定图像和文本,模型返回文本与图像相关性的概率。
@add_start_docstrings(
    """
    BLIP Model with a vision and text projector, and a classification head on top. The model is used in the context of
    image-text retrieval. Given an image and a text, the model returns the probability of the text being relevant to
    the image.
    """,
    BLIP_START_DOCSTRING,
)
class BlipForImageTextRetrieval(BlipPreTrainedModel):
    # 使用 BlipConfig 类型的配置
    config_class = BlipConfig

    def __init__(self, config: BlipConfig):
        # 调用父类构造函数,传入配置
        super().__init__(config)

        # 初始化视觉模型,使用 BlipVisionModel 和视觉配置
        self.vision_model = BlipVisionModel(config.vision_config)

        # 初始化文本编码器,使用 BlipTextModel 和文本配置,不添加池化层
        self.text_encoder = BlipTextModel(config.text_config, add_pooling_layer=False)

        # 视觉投影层,线性变换视觉隐藏状态的维度到图像文本隐藏大小
        self.vision_proj = nn.Linear(config.vision_config.hidden_size, config.image_text_hidden_size)

        # 文本投影层,线性变换文本隐藏状态的维度到图像文本隐藏大小
        self.text_proj = nn.Linear(config.text_config.hidden_size, config.image_text_hidden_size)

        # 图像文本匹配头部,线性层输出大小为 2,用于二分类任务
        self.itm_head = nn.Linear(config.text_config.hidden_size, 2)

        # 解码器的填充标记 ID,根据配置的填充标记 ID 初始化
        self.decoder_pad_token_id = (
            config.text_config.pad_token_id
            if not hasattr(config, "decoder_pad_token_id")
            else config.decoder_pad_token_id
        )

        # 解码器的起始标记 ID,根据配置的起始标记 ID 初始化
        self.decoder_start_token_id = (
            config.text_config.bos_token_id
            if not hasattr(config, "decoder_start_token_id")
            else config.decoder_start_token_id
        )

        # 初始化权重并应用最终处理
        self.post_init()

    # 获取输入嵌入,返回视觉模型的 patch 嵌入层
    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BlipTextVisionModelOutput, config_class=BlipVisionConfig)
    # 重写 forward 方法,使用 BLIP_VISION_INPUTS_DOCSTRING 和 BlipTextVisionModelOutput 来替换返回值的文档字符串
    def forward(
        self,
        input_ids: torch.LongTensor,
        pixel_values: torch.FloatTensor,
        use_itm_head: Optional[bool] = True,
        attention_mask: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        # 输入参数:input_ids 是文本输入的 token ID,pixel_values 是视觉输入的像素值
        # use_itm_head 控制是否使用图像文本匹配头部,attention_mask 控制注意力机制的掩码
        # output_attentions 和 output_hidden_states 控制是否输出注意力权重和隐藏状态
        # return_dict 控制是否返回字典形式的输出
        # 输出类型为 BlipTextVisionModelOutput,配置类为 BlipVisionConfig
    ) -> Union[Tuple, BlipTextVisionModelOutput]:
        r"""
        Returns:

        Examples:

        ```
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, BlipForImageTextRetrieval

        >>> model = BlipForImageTextRetrieval.from_pretrained("Salesforce/blip-itm-base-coco")
        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-itm-base-coco")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "an image of a cat"

        >>> inputs = processor(images=image, text=text, return_tensors="pt")
        >>> outputs = model(**inputs)
        ```
        """
        # 如果 return_dict 参数不为 None,则使用该值;否则使用 self.config.use_return_dict 的设置
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # 如果 output_attentions 参数不为 None,则使用该值;否则使用 self.config.output_attentions 的设置
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        # 如果 output_hidden_states 参数不为 None,则使用该值;否则使用 self.config.output_hidden_states 的设置
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        # 使用 vision_model 处理图像数据,获取视觉模型的输出
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 提取图像的嵌入表示
        image_embeds = vision_outputs[0]
        # 创建与图像嵌入相同大小的注意力掩码
        image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long)

        # 如果 use_itm_head 为真,则使用 text_encoder 处理输入问题文本,并应用 itm_head 进行匹配分数计算
        if use_itm_head:
            # 使用 text_encoder 处理文本数据,将图像嵌入作为 encoder_hidden_states 提供给文本编码器
            question_embeds = self.text_encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                encoder_hidden_states=image_embeds,
                encoder_attention_mask=image_atts,
                return_dict=return_dict,
            )
            # 如果 return_dict 为 False,则使用第一个元素作为输出;否则使用 last_hidden_state
            question_embeds = question_embeds[0] if not return_dict else question_embeds.last_hidden_state

            # 使用 itm_head 计算问题嵌入的匹配分数
            output = self.itm_head(question_embeds[:, 0, :])
        else:
            # 使用 text_encoder 处理文本数据,获取问题文本的嵌入表示
            question_embeds = self.text_encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                return_dict=return_dict,
            )
            # 如果 return_dict 为 False,则使用第一个元素作为输出;否则使用 last_hidden_state
            question_embeds = question_embeds[0] if not return_dict else question_embeds.last_hidden_state

            # 规范化图像嵌入,并通过 vision_proj 将其投影到与问题文本嵌入相同的空间中
            image_feat = normalize(self.vision_proj(image_embeds[:, 0, :]), dim=-1)
            # 规范化问题文本嵌入,并通过 text_proj 进行同样的投影
            text_feat = normalize(self.text_proj(question_embeds[:, 0, :]), dim=-1)

            # 计算图像嵌入与问题文本嵌入之间的相似度分数
            output = image_feat @ text_feat.t()

        # 如果 return_dict 为 False,则返回多个元组,确保输出中没有 None 值
        if not return_dict:
            outputs = (output, vision_outputs[0]) + vision_outputs[2:] + (question_embeds,)
            return tuple(output for output in outputs if output is not None)

        # 如果 return_dict 为 True,则返回 BlipImageTextMatchingModelOutput 对象,包含 ITM 计算的结果和相关信息
        return BlipImageTextMatchingModelOutput(
            itm_score=output,
            last_hidden_state=vision_outputs.last_hidden_state,
            hidden_states=vision_outputs.hidden_states,
            attentions=vision_outputs.attentions,
            question_embeds=question_embeds,
        )
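
# 补充使用示例(注释性草图,并非 modeling_blip.py 源码的一部分):基于上面 forward 的两个分支,
# 演示如何解读 BlipForImageTextRetrieval 的输出;检查点与输入沿用前面 docstring 示例中的写法。
if __name__ == "__main__":
    import requests
    import torch
    from PIL import Image

    from transformers import AutoProcessor, BlipForImageTextRetrieval

    model = BlipForImageTextRetrieval.from_pretrained("Salesforce/blip-itm-base-coco")
    processor = AutoProcessor.from_pretrained("Salesforce/blip-itm-base-coco")

    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw)
    inputs = processor(images=image, text="an image of a cat", return_tensors="pt")

    with torch.no_grad():
        # use_itm_head=True(默认):itm_score 形状为 (batch, 2),softmax 后第 1 列即图文匹配的概率
        itm_out = model(**inputs, use_itm_head=True)
        match_prob = torch.softmax(itm_out.itm_score, dim=1)[:, 1]

        # use_itm_head=False:itm_score 为归一化特征的点积,即图文余弦相似度分数
        sim_out = model(**inputs, use_itm_head=False)
        cosine_score = sim_out.itm_score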

.\models\blip\modeling_blip_text.py

# coding=utf-8
# 版权 2022 年 Salesforce 团队作者和 HuggingFace 团队。保留所有权利。
#
# 根据 BSD-3-clause 许可证授权(“许可证”);
# 除非符合许可证的规定,否则您不得使用此文件。
# 您可以在以下网址获取许可证的副本:
#
#     https://opensource.org/licenses/BSD-3-Clause
#
# 除非适用法律要求或书面同意,否则依照“原样”分发的软件
# 不附带任何形式的明示或暗示担保或条件。
# 有关特定语言的详细信息,请参阅许可证。
#

import math  # 导入数学模块
from typing import List, Optional, Tuple, Union  # 导入类型提示相关的模块

import torch  # 导入 PyTorch 库
import torch.utils.checkpoint  # 导入 PyTorch 的 checkpoint 工具
from torch import Tensor, device, nn  # 从 PyTorch 导入 Tensor、device、nn 等
from torch.nn import CrossEntropyLoss  # 导入交叉熵损失函数

from ...activations import ACT2FN  # 导入激活函数
from ...modeling_outputs import (  # 导入模型输出相关的类
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
)
from ...modeling_utils import (  # 导入模型工具函数
    PreTrainedModel,
    apply_chunking_to_forward,
    find_pruneable_heads_and_indices,
    prune_linear_layer,
)
from ...utils import logging  # 导入日志工具
from .configuration_blip import BlipTextConfig  # 导入 BLIP 文本配置类


logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器


# 从 https://github.com/salesforce/BLIP/blob/main/models/med.py#L52 进行了适配
class BlipTextEmbeddings(nn.Module):
    """根据单词和位置嵌入构建嵌入层。"""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)

        # self.LayerNorm 未使用蛇形命名,以保持与 TensorFlow 模型变量名一致,并能够加载任何 TensorFlow 检查点文件
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # position_ids(1,长度位置嵌入)在序列化时是连续的内存并可导出
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

        self.config = config

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values_length: int = 0,
    # 定义函数的输入和输出类型,此处返回一个 torch.Tensor 对象
    ) -> torch.Tensor:
        # 如果传入了 input_ids,则获取其形状
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            # 否则获取 inputs_embeds 的形状,排除最后一个维度
            input_shape = inputs_embeds.size()[:-1]

        # 获取序列长度,这里假设 input_shape 是一个元组,其第二个维度表示序列长度
        seq_length = input_shape[1]

        # 如果未提供 position_ids,则从预设的 position_ids 中选择对应序列长度的部分
        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]

        # 如果 inputs_embeds 为空,则使用 input_ids 加载对应设备的 word_embeddings 来获取 inputs_embeds
        if inputs_embeds is None:
            input_ids = input_ids.to(self.word_embeddings.weight.device)
            inputs_embeds = self.word_embeddings(input_ids)

        # 将 embeddings 初始化为 inputs_embeds
        embeddings = inputs_embeds

        # 如果使用绝对位置编码(absolute),则添加位置编码到 embeddings 中
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings

        # 对 embeddings 进行 LayerNorm(归一化处理)
        embeddings = self.LayerNorm(embeddings)

        # 对 embeddings 应用 dropout 处理
        embeddings = self.dropout(embeddings)

        # 返回处理后的 embeddings
        return embeddings
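# 注释性示例(假设性数值,仅作理解辅助):若 batch=2、seq_length=6、hidden_size=768,
# 则 word_embeddings(input_ids) 得到 (2, 6, 768);绝对位置编码按 position_ids[:, :6] 查表得到 (1, 6, 768),
# 与词嵌入相加(广播到 batch 维)后经 LayerNorm 与 dropout,最终输出形状仍为 (2, 6, 768)。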
# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L97
# 定义一个自注意力机制模块,继承自 nn.Module
class BlipTextSelfAttention(nn.Module):
    def __init__(self, config, is_cross_attention):
        super().__init__()
        self.config = config
        # 检查隐藏大小是否可以被注意力头数整除,若不是则引发错误
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention heads (%d)"
                % (config.hidden_size, config.num_attention_heads)
            )

        # 初始化注意力头数和每个头的大小
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # 定义查询、键、值的线性变换层
        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        if is_cross_attention:
            self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size)
            self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size)
        else:
            self.key = nn.Linear(config.hidden_size, self.all_head_size)
            self.value = nn.Linear(config.hidden_size, self.all_head_size)

        # 定义注意力概率的 dropout 层
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        # 根据配置确定位置嵌入的类型,默认为绝对位置编码
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            # 如果是相对位置编码,则初始化距离嵌入的 Embedding 层
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

    # 保存注意力梯度
    def save_attn_gradients(self, attn_gradients):
        self.attn_gradients = attn_gradients

    # 获取保存的注意力梯度
    def get_attn_gradients(self):
        return self.attn_gradients

    # 保存注意力映射
    def save_attention_map(self, attention_map):
        self.attention_map = attention_map

    # 获取保存的注意力映射
    def get_attention_map(self):
        return self.attention_map

    # 将输入 tensor 转置为 scores 的形状
    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)
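    # 注释性示例:假设 hidden_size=768、num_attention_heads=12,则 attention_head_size=64;
    # 输入形状 (batch, seq, 768) 先 view 为 (batch, seq, 12, 64),再 permute 为 (batch, 12, seq, 64),
    # 以便各注意力头并行计算打分。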

    # 前向传播函数定义
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ):
        # 此处省略了具体的前向传播逻辑,根据输入参数计算注意力输出
        pass
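        # (注释性补充,非源码本体)省略的前向逻辑大致为:query/key/value 线性层得到 Q、K、V,
        # 经 transpose_for_scores 拆分为多头后计算 Q·K^T / sqrt(attention_head_size),叠加 attention_mask,
        # softmax 得到注意力概率并施加 dropout,再与 V 加权求和、合并多头维度作为输出;
        # 交叉注意力时 K、V 改由 encoder_hidden_states 计算,解码时还会拼接 past_key_value 缓存。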

# 从 transformers.models.bert.modeling_bert.BertSelfOutput 复制并修改为 BlipTextSelfOutput
# 定义一个自注意力输出模块,继承自 nn.Module
class BlipTextSelfOutput(nn.Module):
    # 初始化函数,用于初始化一个新的实例
    def __init__(self, config):
        # 调用父类(nn.Module)的初始化函数
        super().__init__()
        # 创建一个全连接层,输入和输出的维度都是config.hidden_size
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # 创建一个 LayerNorm 层,对输入进行归一化处理,eps是用于数值稳定性的小值
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # 创建一个 Dropout 层,以config.hidden_dropout_prob的概率随机将输入置零,用于防止过拟合
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    # 前向传播函数,定义了模型的计算流程
    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # 全连接层处理输入的隐藏状态,生成新的隐藏状态
        hidden_states = self.dense(hidden_states)
        # 对新的隐藏状态进行 Dropout 操作,以防止过拟合
        hidden_states = self.dropout(hidden_states)
        # 将 Dropout 后的隐藏状态和输入张量相加,然后经过 LayerNorm 处理
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        # 返回处理后的隐藏状态作为输出
        return hidden_states
# 从 https://github.com/salesforce/BLIP/blob/main/models/med.py#242 改编而来的代码

# 定义了一个用于 BlipText 模型中注意力机制的自定义 PyTorch 模块
class BlipTextAttention(nn.Module):
    def __init__(self, config, is_cross_attention=False):
        super().__init__()
        # 初始化 self 层,使用 BlipTextSelfAttention 类处理自注意力或交叉注意力
        self.self = BlipTextSelfAttention(config, is_cross_attention)
        # 初始化 output 层,用于处理自注意力的输出
        self.output = BlipTextSelfOutput(config)
        # 初始化一个用于记录被修剪掉的注意力头的集合
        self.pruned_heads = set()

    # 修剪模型中的注意力头
    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        # 找到可以修剪的注意力头及其索引
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # 在 self 层中修剪线性层
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # 更新超参数并记录修剪掉的注意力头
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    # 前向传播函数
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        # 调用 self 层的前向传播,获取自注意力的输出
        self_outputs = self.self(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        # 将自注意力的输出传递给 output 层进行处理
        attention_output = self.output(self_outputs[0], hidden_states)
        # 如果需要输出注意力,将它们添加到 outputs 中
        outputs = (attention_output,) + self_outputs[1:]  # 如果有的话,添加注意力
        return outputs


# 从 transformers.models.bert.modeling_bert.BertIntermediate 复制并修改为 BlipTextIntermediate
# 定义了 BlipText 模型中间层的自定义 PyTorch 模块
class BlipTextIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 使用线性层将隐藏状态转换为中间层状态
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # 根据配置中的激活函数设置中间层的激活函数
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    # 中间层的前向传播函数
    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 通过线性层计算中间层的输出
        hidden_states = self.dense(hidden_states)
        # 使用配置中指定的激活函数对中间层进行非线性变换
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states
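# 注释性说明:中间层将隐藏维度升到 intermediate_size(通常为 hidden_size 的 4 倍,例如 768 -> 3072)
# 并施加非线性激活,随后由 BlipTextOutput 投影回 hidden_size 并做残差相加与归一化。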


# 从 transformers.models.bert.modeling_bert.BertOutput 复制并修改为 BlipTextOutput
# 定义了 BlipText 模型输出层的自定义 PyTorch 模块
class BlipTextOutput(nn.Module):
    # 初始化函数,用于设置模型的各个组件和参数
    def __init__(self, config):
        # 调用父类的初始化方法
        super().__init__()
        # 创建一个全连接层,输入维度为config中的中间大小,输出维度为config中的隐藏大小
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        # 创建一个Layer Normalization层,输入维度为config中的隐藏大小,设置epsilon为config中的layer_norm_eps
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # 创建一个Dropout层,用于随机置零输入张量的一些元素,防止过拟合
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    # 前向传播函数,定义了模型的计算流程
    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # 将输入张量经过全连接层变换
        hidden_states = self.dense(hidden_states)
        # 对变换后的张量应用Dropout层
        hidden_states = self.dropout(hidden_states)
        # 将Dropout后的张量与输入张量相加,并应用Layer Normalization
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        # 返回经过处理后的张量作为输出
        return hidden_states
# BLIP 文本层的定义,继承自 nn.Module 类
class BlipTextLayer(nn.Module):
    # 初始化函数,接受配置对象 config 和层编号 layer_num 作为参数
    def __init__(self, config, layer_num):
        super().__init__()
        # 将配置对象保存在实例中
        self.config = config
        # 设置前馈过程的块大小为配置中的块大小
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        # 序列长度维度设定为 1
        self.seq_len_dim = 1
        # 创建 BLIP 文本注意力层对象
        self.attention = BlipTextAttention(config)
        # 保存层编号
        self.layer_num = layer_num
        # 如果配置中包含解码器,则创建 BLIP 文本交叉注意力层对象
        if self.config.is_decoder:
            self.crossattention = BlipTextAttention(config, is_cross_attention=self.config.is_decoder)
        # 创建 BLIP 文本中间层对象
        self.intermediate = BlipTextIntermediate(config)
        # 创建 BLIP 文本输出层对象
        self.output = BlipTextOutput(config)

    # 前向传播函数,接受多个输入参数并返回一个元组
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        # 如果过去键值不为空,提取解码器单向自注意力的缓存键/值对
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        # 调用自注意力层的前向传播函数,传入隐藏状态等参数
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            past_key_value=self_attn_past_key_value,
        )
        # 获取自注意力层的输出
        attention_output = self_attention_outputs[0]

        # 提取除了第一个和最后一个元素之外的所有输出,用于后续处理
        outputs = self_attention_outputs[1:-1]
        # 获取当前键值
        present_key_value = self_attention_outputs[-1]

        # 如果存在编码器隐藏状态,调用交叉注意力层的前向传播函数
        if encoder_hidden_states is not None:
            cross_attention_outputs = self.crossattention(
                attention_output,
                attention_mask,
                head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                output_attentions=output_attentions,
            )
            # 获取交叉注意力层的输出
            attention_output = cross_attention_outputs[0]
            # 如果输出注意力权重,则添加到输出中
            outputs = outputs + cross_attention_outputs[1:-1]  # add cross attentions if we output attention weights
        
        # 将注意力输出应用到前馈块的处理函数中
        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        # 将层输出添加到输出元组中
        outputs = (layer_output,) + outputs

        # 添加当前键值到输出元组中
        outputs = outputs + (present_key_value,)

        # 返回所有输出的元组
        return outputs

    # 前馈块处理函数,接受注意力输出并返回处理后的层输出
    def feed_forward_chunk(self, attention_output):
        # 调用中间层对象处理注意力输出
        intermediate_output = self.intermediate(attention_output)
        # 调用输出层对象生成最终的层输出
        layer_output = self.output(intermediate_output, attention_output)
        # 返回处理后的层输出
        return layer_output
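# 注释性说明(基于 apply_chunking_to_forward 的通用语义):当 chunk_size_feed_forward > 0 时,
# attention_output 会沿 seq_len_dim(此处为 1)切分成若干块,逐块调用 feed_forward_chunk 后再拼接,
# 结果与一次性整体前向一致,但峰值显存更低;chunk_size 为 0 时则直接整体调用。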


# 从 https://github.com/salesforce/BLIP/blob/main/models/med.py#L386 调整的代码
# BLIP 文本编码器的定义,继承自 nn.Module 类
class BlipTextEncoder(nn.Module):
    # 初始化函数,接受配置对象 config 作为参数
    def __init__(self, config):
        super().__init__()
        # 保存配置对象到实例中
        self.config = config
        # 创建包含多个 BLIP 文本层对象的模块列表,根据配置中的隐藏层数量进行创建
        self.layer = nn.ModuleList([BlipTextLayer(config, i) for i in range(config.num_hidden_layers)])
        # 关闭梯度检查点
        self.gradient_checkpointing = False
    # 定义前向传播方法,处理 Transformer 编码器的一个完整前向步骤
    def forward(
        self,
        hidden_states: torch.Tensor,  # 模型的隐藏状态张量,通常是嵌入层或上一层的输出
        attention_mask: Optional[torch.FloatTensor] = None,  # 注意力掩码,屏蔽不需要参与注意力计算的位置
        head_mask: Optional[torch.FloatTensor] = None,  # 头部掩码,控制哪些注意力头参与计算
        encoder_hidden_states: Optional[torch.FloatTensor] = None,  # 编码器的隐藏状态,供交叉注意力使用
        encoder_attention_mask: Optional[torch.FloatTensor] = None,  # 编码器输出对应的注意力掩码
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,  # 过去的键值对缓存,用于长序列生成时的记忆优化
        use_cache: Optional[bool] = None,  # 是否使用缓存,以便复用中间计算结果
        output_attentions: Optional[bool] = False,  # 是否输出注意力权重
        output_hidden_states: Optional[bool] = False,  # 是否输出所有层的隐藏状态,而不仅是最后一层
        return_dict: Optional[bool] = True,  # 是否以字典形式返回结果;为 False 时返回元组
    # 返回值类型:由 torch.Tensor 组成的元组,或 BaseModelOutputWithPastAndCrossAttentions 对象
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
        # 如果启用了梯度检查点且处于训练模式
        if self.gradient_checkpointing and self.training:
            # 如果 use_cache 被设置为 True,则警告 use_cache=True 与梯度检查点不兼容,并强制设置 use_cache=False
            if use_cache:
                logger.warning(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False
        # 如果不输出隐藏状态,则初始化 all_hidden_states 为空元组,否则为 None
        all_hidden_states = () if output_hidden_states else None
        # 如果不输出注意力权重,则初始化 all_self_attentions 为空元组,否则为 None
        all_self_attentions = () if output_attentions else None
        # 如果不输出注意力权重或者当前模型不是解码器,则初始化 all_cross_attentions 为空元组,否则为 None
        all_cross_attentions = () if output_attentions and self.config.is_decoder else None

        # 如果 use_cache 为 True,则初始化 next_decoder_cache 为空元组,否则为 None
        next_decoder_cache = () if use_cache else None

        # 遍历每个隐藏层
        for i in range(self.config.num_hidden_layers):
            # 获取第 i 层的模块
            layer_module = self.layer[i]
            # 如果需要输出隐藏状态,则将当前隐藏状态添加到 all_hidden_states 中
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # 获取第 i 层的头部掩码,如果 head_mask 不为 None,则为其赋值,否则为 None
            layer_head_mask = head_mask[i] if head_mask is not None else None
            # 获取第 i 层的过去键值对,如果 past_key_values 不为 None,则为其赋值,否则为 None
            past_key_value = past_key_values[i] if past_key_values is not None else None

            # 如果启用了梯度检查点且处于训练模式
            if self.gradient_checkpointing and self.training:
                # 使用梯度检查点函数来计算当前层的输出
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )
            else:
                # 直接调用当前层的模块来计算输出
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )

            # 更新隐藏状态为当前层输出的第一个元素
            hidden_states = layer_outputs[0]
            # 如果 use_cache 为 True,则将当前层的输出的最后一个元素添加到 next_decoder_cache 中
            if use_cache:
                next_decoder_cache += (layer_outputs[-1],)
            # 如果需要输出注意力权重,则将当前层输出的第二个元素添加到 all_self_attentions 中,
            # 并将当前层输出的第三个元素(如果存在)添加到 all_cross_attentions 中
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        # 如果需要输出隐藏状态,则将最终的隐藏状态添加到 all_hidden_states 中
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        # 如果 return_dict 为 False,则返回包含非 None 元素的元组
        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        # 如果 return_dict 为 True,则返回一个 BaseModelOutputWithPastAndCrossAttentions 对象,
        # 包含指定的隐藏状态、缓存、隐藏状态历史、自注意力权重和交叉注意力权重
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )
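# 注释性说明:当 return_dict=False 时,上面返回的元组按
# (last_hidden_state, next_decoder_cache, all_hidden_states, all_self_attentions, all_cross_attentions)
# 的顺序排列并滤掉 None 项,因此下游按下标取值时需注意实际开启了哪些输出选项。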
# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->BlipText
class BlipTextPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)  # 定义一个全连接层,输入输出维度都是 config.hidden_size
        self.activation = nn.Tanh()  # 定义激活函数为双曲正切函数

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0]  # 获取每个样本中第一个 token 对应的隐藏状态
        pooled_output = self.dense(first_token_tensor)  # 将第一个 token 的隐藏状态通过全连接层
        pooled_output = self.activation(pooled_output)  # 应用激活函数
        return pooled_output


# Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform with Bert->BlipText
class BlipTextPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)  # 定义一个全连接层,输入输出维度都是 config.hidden_size
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = ACT2FN[config.hidden_act]  # 根据配置选择激活函数
        else:
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)  # 应用 Layer Normalization

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)  # 通过全连接层变换隐藏状态
        hidden_states = self.transform_act_fn(hidden_states)  # 应用激活函数
        hidden_states = self.LayerNorm(hidden_states)  # 应用 Layer Normalization
        return hidden_states


# Copied from transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert->BlipText
class BlipTextLMPredictionHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.transform = BlipTextPredictionHeadTransform(config)  # 使用上面定义的头部变换层

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)  # 定义一个线性层,将隐藏状态映射到词汇表大小的空间
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))  # 定义一个偏置参数,大小为词汇表大小

        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias  # 将偏置参数与解码器层关联,以便与 `resize_token_embeddings` 正确调整大小

    def forward(self, hidden_states):
        hidden_states = self.transform(hidden_states)  # 应用头部变换层
        hidden_states = self.decoder(hidden_states)  # 将变换后的隐藏状态映射到词汇表空间
        return hidden_states


# Copied from transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert->BlipText
class BlipTextOnlyMLMHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.predictions = BlipTextLMPredictionHead(config)  # 使用上面定义的语言模型预测头部

    def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
        prediction_scores = self.predictions(sequence_output)  # 生成预测分数
        return prediction_scores
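# 注释性示例:prediction_scores 形状为 (batch, seq_len, vocab_size),训练时与标签一起送入
# CrossEntropyLoss;推理时可按 prediction_scores[:, -1, :].argmax(dim=-1) 取下一个 token 的贪心预测。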


# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L548
class BlipTextPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    """
    pass  # 空类,用于处理权重初始化和预训练模型的简单下载和加载接口
    models.
    """

    # 定义配置类为BlipTextConfig
    config_class = BlipTextConfig
    # 设置基础模型前缀为"bert"
    base_model_prefix = "bert"

    def _init_weights(self, module):
        """Initialize the weights"""
        # 如果模块是线性层或嵌入层
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # 使用正态分布初始化权重,均值为0.0,标准差为配置中的初始化范围
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        # 如果模块是 LayerNorm 层
        elif isinstance(module, nn.LayerNorm):
            # 将偏置项初始化为零
            module.bias.data.zero_()
            # 将权重初始化为1.0
            module.weight.data.fill_(1.0)
        # 如果模块是线性层并且具有偏置项
        if isinstance(module, nn.Linear) and module.bias is not None:
            # 将偏置项初始化为零
            module.bias.data.zero_()
# Adapted from https://github.com/salesforce/BLIP/blob/3a29b7410476bf5f2ba0955827390eb6ea1f4f9d/models/med.py#L571
class BlipTextModel(BlipTextPreTrainedModel):
    """
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. To behave as a decoder the model needs to be
    initialized with the `is_decoder` argument of the configuration set to `True`; an `encoder_hidden_states` is then
    expected as an input to the forward pass.
    """

    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config

        # Initialize embeddings layer for text
        self.embeddings = BlipTextEmbeddings(config)
        # Initialize encoder layer for processing text
        self.encoder = BlipTextEncoder(config)
        # Optionally initialize pooling layer if specified
        self.pooler = BlipTextPooler(config) if add_pooling_layer else None

        # Perform any post-initialization steps
        self.post_init()

    def get_input_embeddings(self):
        # Return the word embeddings from the embeddings layer
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        # Set new word embeddings in the embeddings layer
        self.embeddings.word_embeddings = value

    # Copied from transformers.models.bert.modeling_bert.BertModel._prune_heads
    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            # Prune specified heads in the attention mechanism of each layer
            self.encoder.layer[layer].attention.prune_heads(heads)

    def get_extended_attention_mask(
        self, attention_mask: Tensor, input_shape: Tuple[int], device: device, is_decoder: bool
    ):
        # Create an extended attention mask to handle different attention scenarios
        # Not fully implemented in the provided snippet
        pass

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        is_decoder: Optional[bool] = False,
    ):
        # Forward pass through the model, not fully implemented here
        pass

# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L811
class BlipTextLMHeadModel(BlipTextPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        # Initialize the base BlipTextModel for language modeling
        self.bert = BlipTextModel(config, add_pooling_layer=False)
        # Initialize the MLM (Masked Language Modeling) head
        self.cls = BlipTextOnlyMLMHead(config)
        # Define label smoothing factor
        self.label_smoothing = config.label_smoothing

    def get_output_embeddings(self):
        # Return the decoder part of the MLM head's predictions
        return self.cls.predictions.decoder
    # 设置新的输出嵌入到模型预测的解码器中
    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings

    # 模型的前向传播函数,接受多个输入参数并返回模型输出或损失
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,  # 输入的token ID序列,默认为None
        attention_mask: Optional[torch.Tensor] = None,  # 注意力掩码,用于指示哪些token需要注意力,默认为None
        position_ids: Optional[torch.Tensor] = None,  # 位置ID,用于指示每个token的位置信息,默认为None
        head_mask: Optional[torch.Tensor] = None,  # 头部掩码,用于控制不同头部的注意力,默认为None
        inputs_embeds: Optional[torch.Tensor] = None,  # 输入的嵌入表示,默认为None
        encoder_hidden_states: Optional[torch.Tensor] = None,  # 编码器的隐藏状态,默认为None
        encoder_attention_mask: Optional[torch.Tensor] = None,  # 编码器的注意力掩码,默认为None
        labels: Optional[torch.Tensor] = None,  # 真实标签,默认为None
        past_key_values: Optional[List[torch.Tensor]] = None,  # 过去的键值对,用于生成,默认为None
        use_cache: Optional[bool] = None,  # 是否使用缓存,默认为None
        output_attentions: Optional[bool] = None,  # 是否输出注意力权重,默认为None
        output_hidden_states: Optional[bool] = None,  # 是否输出隐藏状态,默认为None
        return_dict: Optional[bool] = None,  # 是否以字典形式返回,默认为None
        return_logits: Optional[bool] = False,  # 是否返回logits,默认为False
        is_decoder: Optional[bool] = True,  # 是否作为解码器,默认为True
        reduction: Optional[str] = "mean",  # 损失函数的减少方式,默认为"mean"
    ):
        # 前向传播的具体实现(含语言建模损失的计算)在本节选中省略
        pass

    # 准备生成输入的函数,为生成(generate)过程准备每一步的输入数据
    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
        input_shape = input_ids.shape
        # 如果注意力掩码为None,则创建一个全1的注意力掩码,形状与输入ID相同
        if attention_mask is None:
            attention_mask = input_ids.new_ones(input_shape)

        # 如果过去的键值对不为None,则根据过去的长度截取输入ID
        if past_key_values is not None:
            past_length = past_key_values[0][0].shape[2]

            # 一些生成方法已经只传递了最后一个输入ID
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                # 默认行为:只保留最后一个ID
                remove_prefix_length = input_ids.shape[1] - 1

            input_ids = input_ids[:, remove_prefix_length:]

        # 返回准备好的输入数据作为字典形式
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "past_key_values": past_key_values,
            "encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None),
            "encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None),
            "is_decoder": True,
        }

    # 重新排序缓存中的过去键值对,根据beam索引重排
    def _reorder_cache(self, past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past
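    # 注释性示例:beam search 的每一步会给出 beam_idx,例如 beam_idx = tensor([2, 0, 1]) 表示
    # 新的第 0 个 beam 继承旧的第 2 个 beam;上面的 index_select(0, beam_idx) 即对每层缓存中
    # 每个张量的 batch*beam 维做这一重排,使缓存与被保留的 beam 一一对应。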

.\models\blip\modeling_tf_blip.py

# coding=utf-8
# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" TensorFlow BLIP model."""

from __future__ import annotations

import warnings
from dataclasses import dataclass
from typing import Any, Optional, Tuple, Union

import tensorflow as tf

# Importing specific modules and functions from custom TensorFlow utility files
from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling
from ...modeling_tf_utils import (
    TFPreTrainedModel,
    get_initializer,
    get_tf_activation,
    keras,
    keras_serializable,
    shape_list,
    unpack_inputs,
)
from ...tf_utils import check_embeddings_within_bounds, stable_softmax
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
# Importing configuration specific to BLIP models
from .configuration_blip import BlipConfig, BlipTextConfig, BlipVisionConfig
# Importing components from text modeling module for BLIP
from .modeling_tf_blip_text import BLIP_TEXT_INPUTS_DOCSTRING, TFBlipTextLMHeadModel, TFBlipTextModel


# Setting up logging specific to the current module
logger = logging.get_logger(__name__)

# Specifying a checkpoint reference for documentation purposes
_CHECKPOINT_FOR_DOC = "Salesforce/blip-vqa-base"

# List of pre-trained model archives for BLIP models
TF_BLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "Salesforce/blip-vqa-base",
    "Salesforce/blip-vqa-capfilt-large",
    "Salesforce/blip-image-captioning-base",
    "Salesforce/blip-image-captioning-large",
    "Salesforce/blip-itm-base-coco",
    "Salesforce/blip-itm-large-coco",
    "Salesforce/blip-itm-base-flickr",
    "Salesforce/blip-itm-large-flickr",
    # See all BLIP models at https://huggingface.co/models?filter=blip
]


# Function for contrastive loss computation, adapted from transformers.models.clip.modeling_tf_clip.contrastive_loss
def contrastive_loss(logits: tf.Tensor) -> tf.Tensor:
    """
    Computes the contrastive loss based on the sparse categorical crossentropy.

    Args:
        logits (tf.Tensor): Logits tensor representing predictions.

    Returns:
        tf.Tensor: Mean contrastive loss value.
    """
    return tf.math.reduce_mean(
        keras.metrics.sparse_categorical_crossentropy(
            y_true=tf.range(shape_list(logits)[0]), y_pred=logits, from_logits=True
        )
    )


# Function for BLIP-specific loss computation, adapted from transformers.models.clip.modeling_tf_clip.clip_loss
def blip_loss(similarity: tf.Tensor) -> tf.Tensor:
    """
    Computes the BLIP loss, which is an average of contrastive losses calculated for captions and images.

    Args:
        similarity (tf.Tensor): Tensor representing similarity between captions and images.

    Returns:
        tf.Tensor: Computed BLIP loss value.
    """
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(tf.transpose(similarity))
    return (caption_loss + image_loss) / 2.0
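
# 补充草图(假设性的小例子,非源码本体):演示上面两个损失函数对一个 3x3 图文相似度矩阵的行为。
# 对角线元素对应"正确配对"的分数;contrastive_loss 把第 i 行当作以第 i 列为正确类别的分类问题,
# blip_loss 再对 caption->image 与 image->caption 两个方向取平均。
def _blip_loss_demo() -> tf.Tensor:
    example_similarity = tf.constant(
        [[5.0, 0.1, 0.2],
         [0.0, 4.0, 0.3],
         [0.1, 0.2, 6.0]]
    )
    # 对角线占优时,损失应当明显小于随机打分的情形
    return blip_loss(example_similarity)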


@dataclass
class TFBlipForConditionalGenerationModelOutput(ModelOutput):
    """
    Output data structure for TFBlipForConditionalGenerationModel, inheriting from ModelOutput.

    Args:
        loss (`tf.Tensor`, *optional*, returned when `labels` is provided, `tf.Tensor` of shape `(1,)`):
            Language modeling loss from the text decoder.
        logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`, *optional*):
            Prediction scores of the language modeling head of the text decoder model.
        image_embeds (`tf.Tensor` of shape `(batch_size, output_dim)`, *optional*):
            The image embeddings obtained after applying the Vision Transformer model to the input image.
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
            the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """
    # 初始化各个属性为 None,这些属性用于存储模型推断的结果
    loss: Tuple[tf.Tensor] | None = None
    logits: Tuple[tf.Tensor] | None = None
    image_embeds: tf.Tensor | None = None
    last_hidden_state: tf.Tensor = None
    hidden_states: Tuple[tf.Tensor, ...] | None = None
    attentions: Tuple[tf.Tensor, ...] | None = None

    @property
    def decoder_logits(self):
        # 发出警告,提醒用户 `decoder_logits` 属性即将被移除,建议使用 `logits` 属性来获取最终输出
        warnings.warn(
            "`decoder_logits` attribute is deprecated and will be removed in version 5 of Transformers."
            " Please use the `logits` attribute to retrieve the final output instead.",
            FutureWarning,
        )
        # 返回 `logits` 属性的值作为输出
        return self.logits
# 定义一个用于 TFBlip 文本视觉模型输出的数据类,继承自 ModelOutput 基类
@dataclass
class TFBlipTextVisionModelOutput(ModelOutput):
    """
    从基类适配的视觉模型输出的扩展,还包含了最后隐藏状态的图像嵌入。该类还添加了文本解码器的损失项。

    Args:
        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            文本解码器的语言建模损失。
        image_embeds (`tf.Tensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
            图像嵌入,通过将投影层应用于池化器输出获得。
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            模型最后一层的隐藏状态序列。
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            元组 `tf.Tensor` 的隐藏状态(如果模型具有嵌入层,则为输出的初始嵌入输出 + 每一层的输出),
            形状为 `(batch_size, sequence_length, hidden_size)`。
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            元组 `tf.Tensor` 的注意力权重(每层一个),形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
            注意力 softmax 后的注意力权重,用于计算自注意力头中的加权平均值。
    """

    # 损失项,类型为 tf.Tensor 或 None
    loss: tf.Tensor | None = None
    # 图像嵌入,类型为 tf.Tensor 或 None
    image_embeds: tf.Tensor | None = None
    # 最后一层隐藏状态,类型为 tf.Tensor 或 None
    last_hidden_state: tf.Tensor = None
    # 隐藏状态元组,包含模型每层的隐藏状态,类型为 Tuple[tf.Tensor] 或 None
    hidden_states: Tuple[tf.Tensor, ...] | None = None
    # 注意力权重元组,包含每层的注意力权重,类型为 Tuple[tf.Tensor] 或 None
    attentions: Tuple[tf.Tensor, ...] | None = None


# 定义一个用于 TFBlip 图像文本匹配模型输出的数据类,继承自 ModelOutput 基类
@dataclass
class TFBlipImageTextMatchingModelOutput(ModelOutput):
    """
    从基类适配的视觉模型输出的扩展,还包含了最后隐藏状态的图像嵌入。该类还添加了文本解码器的损失项以及图像文本相似度分数。

    Args:
        itm_score (`tf.Tensor`):
            图像和文本的相似度分数。
        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            文本解码器的语言建模损失。
        image_embeds (`tf.Tensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
            通过投影层应用到池化输出得到的图像嵌入。
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            模型最后一层的隐藏状态序列输出。
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            模型每一层的隐藏状态元组,包括可能的初始嵌入层输出。
        vision_pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`, *optional*):
            模型视觉分支中视觉池化层的最后一层隐藏状态。
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            注意力权重的元组,用于计算自注意力头中的加权平均值。
        question_embeds (`tf.Tensor`):
            文本投影层得到的问题嵌入。
    """

    itm_score: tf.Tensor | None = None  # 初始化图像和文本的相似度分数,默认为 None
    loss: tf.Tensor | None = None  # 初始化语言建模损失,默认为 None;在提供 `labels` 时返回
    image_embeds: tf.Tensor | None = None  # 初始化图像嵌入,默认为 None;当 `with_projection=True` 时返回
    last_hidden_state: tf.Tensor = None  # 初始化最后一层隐藏状态,默认为 None
    hidden_states: Tuple[tf.Tensor, ...] | None = None  # 初始化隐藏状态元组,默认为 None;在 `output_hidden_states=True` 时返回
    vision_pooler_output: tf.Tensor | None = None  # 初始化视觉池化层的最后一层隐藏状态,默认为 None
    attentions: Tuple[tf.Tensor, ...] | None = None  # 初始化注意力权重元组,默认为 None;在 `output_attentions=True` 时返回
    question_embeds: Tuple[tf.Tensor] | None = None  # 初始化问题嵌入,默认为 None
@dataclass
class TFBlipOutput(ModelOutput):
    """
    Args:
        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image: (`tf.Tensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text: (`tf.Tensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds: (`tf.Tensor` of shape `(batch_size, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`BlipTextModel`].
        image_embeds: (`tf.Tensor` of shape `(batch_size, output_dim)`):
            The image embeddings obtained by applying the projection layer to the pooled output of [`BlipVisionModel`].
        text_model_output: (`BaseModelOutputWithPooling`):
            The output of the [`BlipTextModel`].
        vision_model_output: (`BaseModelOutputWithPooling`):
            The output of the [`BlipVisionModel`].
    """

    loss: tf.Tensor | None = None
    logits_per_image: tf.Tensor = None
    logits_per_text: tf.Tensor = None
    text_embeds: tf.Tensor = None
    image_embeds: tf.Tensor = None
    text_model_output: TFBaseModelOutputWithPooling = None
    vision_model_output: TFBaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        """
        Convert TFBlipOutput object to a tuple, excluding `text_model_output` and `vision_model_output` which are 
        converted to tuples separately.

        Returns:
            Tuple[Any]: A tuple representation of the object.
        """
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


class TFBlipVisionEmbeddings(keras.layers.Layer):
    def __init__(self, config: BlipVisionConfig, **kwargs):
        """
        Initialize the TFBlipVisionEmbeddings layer.

        Args:
            config (BlipVisionConfig): Configuration object for BlipVisionModel.
            **kwargs: Additional keyword arguments passed to the Layer constructor.
        """
        super().__init__(**kwargs)
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        # Define patch embedding layer
        self.patch_embedding = keras.layers.Conv2D(
            filters=self.embed_dim,
            kernel_size=self.patch_size,
            strides=self.patch_size,
            kernel_initializer=get_initializer(self.config.initializer_range),
            data_format="channels_last",
            name="patch_embedding",
        )

        # Calculate number of patches and positions
        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1
    # 构建模型的方法,在输入形状为 `input_shape` 的情况下进行构建
    def build(self, input_shape=None):
        # 添加类别嵌入权重,形状为 (1, 1, embed_dim),使用给定范围的初始化器进行初始化
        self.class_embedding = self.add_weight(
            shape=(1, 1, self.embed_dim),
            initializer=get_initializer(self.config.initializer_range),
            trainable=True,
            name="class_embedding",
        )

        # 添加位置嵌入权重,形状为 (1, num_positions, embed_dim),使用给定范围的初始化器进行初始化
        self.position_embedding = self.add_weight(
            shape=(1, self.num_positions, self.embed_dim),
            initializer=get_initializer(self.config.initializer_range),
            trainable=True,
            name="position_embedding",
        )

        # 如果模型已经构建,则直接返回
        if self.built:
            return
        self.built = True

        # 如果存在 `patch_embedding` 属性,则对其进行构建
        if getattr(self, "patch_embedding", None) is not None:
            with tf.name_scope(self.patch_embedding.name):
                self.patch_embedding.build([None, None, None, 3])

    # 模型的调用方法,接受像素值张量作为输入,返回嵌入张量作为输出
    def call(self, pixel_values: tf.Tensor) -> tf.Tensor:
        # 输入张量是通道优先的,进行转置以适应模型的通道次序(通道在最后的顺序)
        batch_size = tf.shape(pixel_values)[0]
        pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
        
        # 使用 `patch_embedding` 对转置后的像素值进行嵌入
        patch_embeds = self.patch_embedding(pixel_values)
        # 将嵌入张量重新调整形状为 (batch_size, num_patches, -1)
        patch_embeds = tf.reshape(patch_embeds, (batch_size, self.num_patches, -1))

        # 扩展类别嵌入以匹配批次大小,并与 patch 嵌入连接起来
        class_embeds = tf.broadcast_to(self.class_embedding, (batch_size, 1, self.embed_dim))
        embeddings = tf.concat([class_embeds, patch_embeds], axis=1)
        
        # 将位置嵌入加到嵌入张量中(仅限于嵌入张量的长度部分)
        embeddings = embeddings + self.position_embedding[:, : tf.shape(embeddings)[1], :]
        
        # 返回最终的嵌入张量作为模型的输出
        return embeddings
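# 注释性示例:若 image_size=384、patch_size=16,则 num_patches = (384 // 16) ** 2 = 576,
# 加上 1 个类别(class)token 后 num_positions = 577,call 的输出形状为 (batch, 577, embed_dim)。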
# Copied from transformers.models.clip.modeling_tf_clip.TFCLIPTextEmbeddings with CLIP->Blip
class TFBlipTextEmbeddings(keras.layers.Layer):
    def __init__(self, config: BlipTextConfig, **kwargs):
        super().__init__(**kwargs)

        self.embed_dim = config.hidden_size  # 设置嵌入维度为配置中的隐藏层大小
        self.config = config  # 保存配置信息

    def build(self, input_shape: tf.TensorShape = None):
        with tf.name_scope("token_embedding"):
            # 创建 token 嵌入权重,形状为 (词汇大小, 嵌入维度),使用指定初始化方法
            self.weight = self.add_weight(
                shape=(self.config.vocab_size, self.embed_dim),
                initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range),
                trainable=True,
                name="weight",
            )

        with tf.name_scope("position_embedding"):
            # 创建位置嵌入权重,形状为 (最大位置嵌入数, 嵌入维度),使用指定初始化方法
            self.position_embedding = self.add_weight(
                shape=(self.config.max_position_embeddings, self.embed_dim),
                initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range),
                trainable=True,
                name="embeddings",
            )

        super().build(input_shape)  # 调用父类的 build 方法

    def call(
        self,
        input_ids: tf.Tensor = None,
        position_ids: tf.Tensor = None,
        inputs_embeds: tf.Tensor = None,
    ) -> tf.Tensor:
        """
        Applies embedding based on inputs tensor.

        Args:
            input_ids (tf.Tensor, optional): 输入的 token ID 张量
            position_ids (tf.Tensor, optional): 输入的位置 ID 张量
            inputs_embeds (tf.Tensor, optional): 输入的嵌入张量

        Returns:
            final_embeddings (`tf.Tensor`): 输出的嵌入张量.
        """
        if input_ids is None and inputs_embeds is None:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if inputs_embeds is None:
            check_embeddings_within_bounds(input_ids, self.config.vocab_size)  # 检查嵌入是否在合理范围内
            inputs_embeds = tf.gather(params=self.weight, indices=input_ids)  # 根据 input_ids 获取嵌入向量

        input_shape = shape_list(inputs_embeds)[:-1]  # 获取输入嵌入张量的形状,去除最后一个维度

        if position_ids is None:
            position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)  # 自动生成位置 ID

        position_embeds = tf.gather(params=self.position_embedding, indices=position_ids)  # 根据 position_ids 获取位置嵌入
        position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1))  # 复制位置嵌入以匹配输入的形状
        final_embeddings = inputs_embeds + position_embeds  # 计算最终的嵌入张量

        return final_embeddings


class TFBlipAttention(keras.layers.Layer):
    """Multi-headed attention from 'Attention Is All You Need' paper"""
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.config = config  # 将传入的配置保存为实例变量
        self.embed_dim = config.hidden_size  # 设置嵌入维度为配置中的隐藏大小
        self.num_heads = config.num_attention_heads  # 设置注意力头的数量为配置中的值
        self.head_dim = self.embed_dim // self.num_heads  # 计算每个注意力头的维度
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5  # 计算缩放因子,用于注意力分数的缩放
        self.dropout = keras.layers.Dropout(config.attention_dropout, name="dropout")  # 初始化丢弃层

        self.qkv = keras.layers.Dense(
            3 * self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="qkv"
        )  # 创建用于查询、键、值的全连接层

        self.projection = keras.layers.Dense(
            self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="projection"
        )  # 创建用于投影的全连接层

    def call(
        self,
        hidden_states: tf.Tensor,
        head_mask: tf.Tensor | None = None,
        output_attentions: Optional[bool] = False,
        training: Optional[bool] = None,
    ) -> Tuple[tf.Tensor, tf.Tensor | None, Tuple[tf.Tensor] | None]:
        """Input shape: Batch x Time x Channel"""

        bsz, tgt_len, embed_dim = shape_list(hidden_states)  # 获取隐藏状态张量的形状信息

        mixed_qkv = self.qkv(hidden_states)  # 将隐藏状态张量映射到查询、键、值空间
        mixed_qkv = tf.reshape(mixed_qkv, (bsz, tgt_len, 3, self.num_heads, self.head_dim))  # 重塑成多头查询、键、值张量
        mixed_qkv = tf.transpose(mixed_qkv, perm=(2, 0, 3, 1, 4))  # 调整张量顺序以便后续操作

        query_states, key_states, value_states = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2]  # 分离查询、键、值张量

        # 计算注意力分数,即查询和键的点积
        attention_scores = query_states @ tf.transpose(key_states, (0, 1, 3, 2))

        attention_scores = attention_scores * self.scale  # 缩放注意力分数

        # 将注意力分数归一化为注意力概率
        attention_probs = stable_softmax(attention_scores, axis=-1)

        # 使用丢弃层在训练时随机丢弃注意力概率中的值
        attention_probs = self.dropout(attention_probs, training=training)

        # 如果存在头部掩码,则应用头部掩码
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = tf.transpose(attention_probs @ value_states, perm=(0, 2, 1, 3))  # 计算加权值张量

        new_context_layer_shape = shape_list(context_layer)[:-2] + [self.embed_dim]  # 调整上下文层的形状
        context_layer = tf.reshape(context_layer, new_context_layer_shape)

        output = self.projection(context_layer)  # 对上下文层进行投影

        outputs = (output, attention_probs) if output_attentions else (output, None)  # 根据需求返回输出

        return outputs
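    # 注释性示例:若 embed_dim=768、num_heads=12,则 head_dim=64;qkv 输出 (batch, seq, 2304),
    # reshape 成 (batch, seq, 3, 12, 64) 并 transpose 为 (3, batch, 12, seq, 64) 后拆出 Q、K、V,
    # 注意力打分再乘以 scale = 64 ** -0.5。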
    # 构建函数,用于构建模型的层
    def build(self, input_shape=None):
        # 如果已经构建过,直接返回,避免重复构建
        if self.built:
            return
        # 设置标志为已构建
        self.built = True
        
        # 如果存在 dropout 层,构建 dropout 层
        if getattr(self, "dropout", None) is not None:
            # 使用 dropout 层的名称作为命名空间
            with tf.name_scope(self.dropout.name):
                # 调用 dropout 层的 build 方法,传入空的输入形状
                self.dropout.build(None)
        
        # 如果存在 qkv 层,构建 qkv 层
        if getattr(self, "qkv", None) is not None:
            # 使用 qkv 层的名称作为命名空间
            with tf.name_scope(self.qkv.name):
                # 调用 qkv 层的 build 方法,传入输入形状为 [None, None, self.embed_dim]
                self.qkv.build([None, None, self.embed_dim])
        
        # 如果存在 projection 层,构建 projection 层
        if getattr(self, "projection", None) is not None:
            # 使用 projection 层的名称作为命名空间
            with tf.name_scope(self.projection.name):
                # 调用 projection 层的 build 方法,传入输入形状为 [None, None, self.embed_dim]
                self.projection.build([None, None, self.embed_dim])
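The fused `qkv` projection above produces one tensor that is reshaped and transposed into per-head query/key/value tensors before the scaled dot-product. Below is a minimal standalone sketch of that shape manipulation in plain TensorFlow; the small dimensions are hypothetical and chosen only for illustration, this is not the library implementation itself.

```python
import tensorflow as tf

# Hypothetical toy sizes
bsz, tgt_len, embed_dim, num_heads = 2, 5, 8, 2
head_dim = embed_dim // num_heads          # 4
scale = head_dim ** -0.5

hidden_states = tf.random.normal((bsz, tgt_len, embed_dim))
qkv = tf.keras.layers.Dense(3 * embed_dim)(hidden_states)          # (2, 5, 24)

qkv = tf.reshape(qkv, (bsz, tgt_len, 3, num_heads, head_dim))      # (2, 5, 3, 2, 4)
qkv = tf.transpose(qkv, perm=(2, 0, 3, 1, 4))                      # (3, 2, 2, 5, 4)
q, k, v = qkv[0], qkv[1], qkv[2]                                   # each (batch, heads, len, head_dim)

scores = tf.matmul(q, k, transpose_b=True) * scale                 # (2, 2, 5, 5)
probs = tf.nn.softmax(scores, axis=-1)
context = tf.transpose(tf.matmul(probs, v), perm=(0, 2, 1, 3))     # (2, 5, 2, 4)
context = tf.reshape(context, (bsz, tgt_len, embed_dim))           # back to (2, 5, 8)
```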
class TFBlipMLP(keras.layers.Layer):
    # TFBlipMLP 类,用于定义一个多层感知机(MLP)的自定义层
    def __init__(self, config: BlipConfig, **kwargs):
        super().__init__(**kwargs)

        # 获取激活函数
        self.activation_fn = get_tf_activation(config.hidden_act)

        # 计算输入投影的标准差
        in_proj_std = (config.hidden_size**-0.5) * ((2 * config.num_hidden_layers) ** -0.5)
        # 计算全连接层的初始化标准差
        fc_std = (2 * config.hidden_size) ** -0.5

        # 创建全连接层 fc1,用于中间层,初始化权重
        self.fc1 = keras.layers.Dense(
            units=config.intermediate_size, kernel_initializer=get_initializer(fc_std), name="fc1"
        )
        # 创建全连接层 fc2,用于输入投影,初始化权重
        self.fc2 = keras.layers.Dense(
            units=config.hidden_size, kernel_initializer=get_initializer(in_proj_std), name="fc2"
        )
        self.config = config

    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        # 前向传播函数 call,接收隐藏状态张量并返回处理后的张量

        # 使用 fc1 进行全连接操作
        hidden_states = self.fc1(inputs=hidden_states)
        # 应用激活函数
        hidden_states = self.activation_fn(hidden_states)
        # 使用 fc2 进行全连接操作
        hidden_states = self.fc2(inputs=hidden_states)
        return hidden_states

    def build(self, input_shape=None):
        # 构建函数 build,在第一次调用时构建层的权重

        # 如果已经构建过,则直接返回
        if self.built:
            return
        # 设置为已构建状态
        self.built = True

        # 如果存在 fc1 层,则在 tf 的名称作用域下构建 fc1 层
        if getattr(self, "fc1", None) is not None:
            with tf.name_scope(self.fc1.name):
                self.fc1.build([None, None, self.config.hidden_size])

        # 如果存在 fc2 层,则在 tf 的名称作用域下构建 fc2 层
        if getattr(self, "fc2", None) is not None:
            with tf.name_scope(self.fc2.name):
                self.fc2.build([None, None, self.config.intermediate_size])
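For reference, a quick sketch of the shape flow through this MLP with hypothetical sizes (`hidden_size=8`, `intermediate_size=32`); the activation here is a plain GELU stand-in for whatever `config.hidden_act` selects.

```python
import tensorflow as tf

hidden_size, intermediate_size = 8, 32        # hypothetical values
x = tf.random.normal((2, 5, hidden_size))

fc1 = tf.keras.layers.Dense(intermediate_size)
fc2 = tf.keras.layers.Dense(hidden_size)

h = fc1(x)                  # (2, 5, 32) - expand
h = tf.nn.gelu(h)           # activation (stand-in for config.hidden_act)
h = fc2(h)                  # (2, 5, 8)  - project back to the embedding size
```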


class TFBlipEncoderLayer(keras.layers.Layer):
    # TFBlipEncoderLayer 类,用于定义一个编码器层
    def __init__(self, config: BlipConfig, **kwargs):
        super().__init__(**kwargs)
        
        # 设置嵌入维度
        self.embed_dim = config.hidden_size
        # 初始化自注意力层 self_attn
        self.self_attn = TFBlipAttention(config, name="self_attn")
        # 初始化第一层规范化层 layer_norm1
        self.layer_norm1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1")
        # 初始化多层感知机 MLP
        self.mlp = TFBlipMLP(config, name="mlp")
        # 初始化第二层规范化层 layer_norm2
        self.layer_norm2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2")

    def call(
        self,
        hidden_states: tf.Tensor,
        attention_mask: tf.Tensor,
        output_attentions: Optional[bool] = False,
        training: Optional[bool] = None,
    ) -> Tuple[tf.Tensor]:
        """
        Args:
            hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`tf.Tensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states  # 保存输入的隐藏状态作为残差连接的起点

        hidden_states = self.layer_norm1(hidden_states)  # 执行第一个层归一化操作

        # 使用自注意力机制处理隐藏状态,并获取注意力权重
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            head_mask=attention_mask,
            output_attentions=output_attentions,
            training=training,
        )

        hidden_states = hidden_states + residual  # 将残差连接添加到处理后的隐藏状态中
        residual = hidden_states  # 更新残差连接的起点为当前的隐藏状态

        hidden_states = self.layer_norm2(hidden_states)  # 执行第二个层归一化操作

        hidden_states = self.mlp(hidden_states)  # 使用多层感知机处理隐藏状态

        hidden_states = hidden_states + residual  # 再次将残差连接添加到处理后的隐藏状态中

        outputs = (hidden_states,)  # 准备输出为一个元组,包含处理后的隐藏状态

        if output_attentions:
            outputs += (attn_weights,)  # 如果需要输出注意力权重,则添加到输出中

        return outputs  # 返回最终的输出结果

    def build(self, input_shape=None):
        if self.built:
            return  # 如果已经构建过,则直接返回

        self.built = True  # 标记模型已经构建完成

        if getattr(self, "self_attn", None) is not None:
            with tf.name_scope(self.self_attn.name):
                self.self_attn.build(None)  # 构建自注意力层

        if getattr(self, "layer_norm1", None) is not None:
            with tf.name_scope(self.layer_norm1.name):
                self.layer_norm1.build([None, None, self.embed_dim])  # 构建第一个层归一化层

        if getattr(self, "mlp", None) is not None:
            with tf.name_scope(self.mlp.name):
                self.mlp.build(None)  # 构建多层感知机层

        if getattr(self, "layer_norm2", None) is not None:
            with tf.name_scope(self.layer_norm2.name):
                self.layer_norm2.build([None, None, self.embed_dim])  # 构建第二个层归一化层
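The encoder layer above uses a pre-LayerNorm ordering: each sub-block normalizes its input first, applies attention or the MLP, and only then adds the residual. A self-contained sketch of that ordering with toy sizes and stock Keras layers standing in for the BLIP classes:

```python
import tensorflow as tf

embed_dim = 8
ln1 = tf.keras.layers.LayerNormalization(epsilon=1e-5)
ln2 = tf.keras.layers.LayerNormalization(epsilon=1e-5)
attn = tf.keras.layers.MultiHeadAttention(num_heads=2, key_dim=4)   # stand-in for TFBlipAttention
mlp = tf.keras.Sequential([tf.keras.layers.Dense(32, activation="gelu"),
                           tf.keras.layers.Dense(embed_dim)])       # stand-in for TFBlipMLP

x = tf.random.normal((2, 5, embed_dim))

residual = x
h = ln1(x)                       # normalize first ...
h = attn(h, h)                   # ... then self-attention ...
x = h + residual                 # ... then add the residual

residual = x
h = ln2(x)
h = mlp(h)
x = h + residual                 # output of the block
```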
BLIP_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it. Indices can be obtained using [`AutoProcessor`]. See [`BlipProcessor.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence token in the position embeddings. Selected in the range
            `[0, config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`BlipImageProcessor`]. See [`BlipImageProcessor.__call__`] for details.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@keras_serializable
class TFBlipEncoder(keras.layers.Layer):
    config_class = BlipConfig
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`BlipEncoderLayer`].

    Args:
        config (`BlipConfig`):
            The corresponding vision configuration for the `BlipEncoder`.
    """

    def __init__(self, config: BlipConfig, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        # 创建一个由多个 `TFBlipEncoderLayer` 组成的列表,每个层使用配置参数并命名
        self.layers = [TFBlipEncoderLayer(config, name=f"layers_._{i}") for i in range(config.num_hidden_layers)]

    @unpack_inputs
    def call(
        self,
        inputs_embeds,
        attention_mask: tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = None,
    ) -> Union[Tuple, TFBaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embedded representation of the inputs. Should be float, not int tokens.
            attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        # Determine if `output_attentions` should be overridden by `self.config.output_attentions`
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        # Determine if `output_hidden_states` should be overridden by `self.config.output_hidden_states`
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        # Determine if `return_dict` should be overridden by `self.config.use_return_dict`
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Initialize empty tuple for encoder states if `output_hidden_states` is False
        encoder_states = () if output_hidden_states else None
        # Initialize empty tuple for all attentions if `output_attentions` is False
        all_attentions = () if output_attentions else None

        # Start with the embedded inputs as the initial hidden states
        hidden_states = inputs_embeds

        # Iterate through each encoder layer
        for idx, encoder_layer in enumerate(self.layers):
            # Append current hidden states to encoder states if `output_hidden_states` is True
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            
            # Pass the current hidden states through the encoder layer
            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask,
                output_attentions=output_attentions,
                training=training,
            )

            # Update hidden states with the output of the encoder layer
            hidden_states = layer_outputs[0]

            # Append attention weights of the current layer if `output_attentions` is True
            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        # Append final hidden states to encoder states if `output_hidden_states` is True
        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        # Return outputs based on `return_dict` flag
        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return TFBaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )

    def build(self, input_shape=None):
        # Check if the model is already built; if yes, return immediately
        if self.built:
            return
        
        # Mark the model as built
        self.built = True
        
        # If `self.layers` attribute exists, iterate through each layer and build it
        if getattr(self, "layers", None) is not None:
            for layer in self.layers:
                with tf.name_scope(layer.name):
                    # Build each layer with `None` input shape
                    layer.build(None)
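The loop in `call` accumulates per-layer outputs into tuples: hidden states are recorded before each layer plus once after the last one, and attention weights are collected per layer. A condensed sketch of that bookkeeping, assuming a list of callables `layers` that each return `(hidden_states, attn_weights)`:

```python
def run_encoder(layers, hidden_states, output_hidden_states=True, output_attentions=True):
    # Mirrors the tuple bookkeeping in TFBlipEncoder.call (simplified sketch)
    encoder_states = () if output_hidden_states else None
    all_attentions = () if output_attentions else None

    for layer in layers:
        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)   # state *before* this layer
        hidden_states, attn = layer(hidden_states)
        if output_attentions:
            all_attentions = all_attentions + (attn,)

    if output_hidden_states:
        encoder_states = encoder_states + (hidden_states,)       # final state, so len == num_layers + 1
    return hidden_states, encoder_states, all_attentions
```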
class TFBlipVisionModel(TFBlipPreTrainedModel):
    # The main input name is "pixel_values"
    main_input_name = "pixel_values"
    # The configuration class is BlipVisionConfig
    config_class = BlipVisionConfig

    def __init__(self, config: BlipVisionConfig, *args, **kwargs):
        # Call the parent class initializer
        super().__init__(config, *args, **kwargs)
        # Keep a reference to the configuration object
        self.config = config

        # Patch/position embedding layer, built with TFBlipVisionEmbeddings
        self.embeddings = TFBlipVisionEmbeddings(config, name="embeddings")
        # Transformer encoder, built with TFBlipEncoder
        self.encoder = TFBlipEncoder(config, name="encoder")
        # Final layer normalization with the configured epsilon
        self.post_layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="post_layernorm")
        # The embedding dimension equals the configured hidden size
        self.embed_dim = config.hidden_size

    def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling:
        # Convert hidden states to a tensor if the config requests them
        hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
        # Convert attentions to a tensor if the config requests them
        attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None

        # Return a TFBaseModelOutputWithPooling carrying the selected fields
        return TFBaseModelOutputWithPooling(
            last_hidden_state=output.last_hidden_state,
            pooler_output=output.pooler_output,
            hidden_states=hs,
            attentions=attns,
        )

    @unpack_inputs
    @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=BlipVisionConfig)
    def call(
        self,
        pixel_values: tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = None,
    ) -> Union[Tuple, TFBaseModelOutputWithPooling]:
        r"""
        Returns:

        """
        # 如果未指定 output_attentions 参数,则使用配置中的默认值
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        # 如果未指定 output_hidden_states 参数,则使用配置中的默认值
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        # 如果未指定 return_dict 参数,则使用配置中的默认值
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 如果 pixel_values 参数为 None,则抛出数值错误异常
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # 将像素值通过嵌入层处理,得到隐藏状态
        hidden_states = self.embeddings(pixel_values)

        # 使用编码器处理隐藏状态,获取编码器的输出
        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # 获取编码器的最后隐藏状态,并通过后层归一化处理
        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.post_layernorm(last_hidden_state)

        # 提取汇聚输出,即编码器输出的第一个位置
        pooled_output = last_hidden_state[:, 0, :]
        # TensorFlow 对输入的秩(rank)不一致时可能会出错,因此插入一个单维度来确保一致性
        pooled_output = self.post_layernorm(tf.expand_dims(pooled_output, 1))
        pooled_output = tf.squeeze(pooled_output, 1)

        # 如果不要求返回字典形式,则返回一个元组
        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        # 否则,返回 TFBaseModelOutputWithPooling 对象,其中包含编码器输出的各项属性
        return TFBaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

    def get_input_embeddings(self):
        return self.embeddings

    def build(self, input_shape=None):
        # 如果模型已经构建,则直接返回
        if self.built:
            return
        self.built = True
        # 如果 embeddings 属性存在,则构建 embeddings 层
        if getattr(self, "embeddings", None) is not None:
            with tf.name_scope(self.embeddings.name):
                self.embeddings.build(None)
        # 如果 encoder 属性存在,则构建 encoder 层
        if getattr(self, "encoder", None) is not None:
            with tf.name_scope(self.encoder.name):
                self.encoder.build(None)
        # 如果 post_layernorm 属性存在,则构建 post_layernorm 层,输入形状为 [None, None, self.embed_dim]
        if getattr(self, "post_layernorm", None) is not None:
            with tf.name_scope(self.post_layernorm.name):
                self.post_layernorm.build([None, None, self.embed_dim])
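The vision model pools by taking the hidden state of the first (class-token-like) position and running it through `post_layernorm`; the temporary `expand_dims`/`squeeze` pair keeps the tensor rank consistent with how the LayerNorm was built. A minimal sketch of that step with toy shapes:

```python
import tensorflow as tf

last_hidden_state = tf.random.normal((2, 197, 8))        # (batch, 1 + num_patches, hidden) - toy sizes
post_layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-5)

pooled = last_hidden_state[:, 0, :]                      # take the first token
pooled = post_layernorm(tf.expand_dims(pooled, 1))       # rank-3 so the LayerNorm input shapes match
pooled = tf.squeeze(pooled, 1)                           # (2, 8)
```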
# 定义 TFBlipMainLayer 类,继承自 keras.layers.Layer,用于实现主层逻辑
class TFBlipMainLayer(keras.layers.Layer):
    # 设置类属性 config_class 为 BlipConfig 类型
    config_class = BlipConfig

    # 初始化方法,接受 BlipConfig 类型的 config 参数及其他位置和关键字参数
    def __init__(self, config: BlipConfig, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # 检查 config.text_config 是否为 BlipTextConfig 类型,若不是则抛出 ValueError 异常
        if not isinstance(config.text_config, BlipTextConfig):
            raise ValueError(
                "config.text_config is expected to be of type BlipTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        # 检查 config.vision_config 是否为 BlipVisionConfig 类型,若不是则抛出 ValueError 异常
        if not isinstance(config.vision_config, BlipVisionConfig):
            raise ValueError(
                "config.vision_config is expected to be of type BlipVisionConfig but is of type"
                f" {type(config.vision_config)}."
            )

        # 从 config 中获取 text_config 和 vision_config 对象
        text_config = config.text_config
        vision_config = config.vision_config

        # 设置实例变量,分别表示投影维度、文本嵌入维度和视觉嵌入维度
        self.projection_dim = config.projection_dim
        self.text_embed_dim = text_config.hidden_size
        self.vision_embed_dim = vision_config.hidden_size

        # 创建 TFBlipTextModel 实例并赋给 self.text_model,命名为 "text_model"
        self.text_model = TFBlipTextModel(text_config, name="text_model")
        
        # 创建 TFBlipVisionModel 实例并赋给 self.vision_model,命名为 "vision_model"
        self.vision_model = TFBlipVisionModel(vision_config, name="vision_model")

        # 创建 Dense 层实例 self.visual_projection,用于视觉投影,设置投影维度、不使用偏置、使用指定初始化器
        self.visual_projection = keras.layers.Dense(
            self.projection_dim,
            use_bias=False,
            kernel_initializer=get_initializer(config.initializer_range),
            name="visual_projection",
        )

        # 创建 Dense 层实例 self.text_projection,用于文本投影,设置投影维度、不使用偏置、使用指定初始化器
        self.text_projection = keras.layers.Dense(
            self.projection_dim,
            use_bias=False,
            kernel_initializer=get_initializer(config.initializer_range),
            name="text_projection",
        )

        # 将 config 参数赋给实例变量 self.config
        self.config = config

    # build 方法,用于构建层,接受 input_shape 参数
    def build(self, input_shape=None):
        # 创建并添加名为 logit_scale 的可训练权重,初始化为 config.logit_scale_init_value
        self.logit_scale = self.add_weight(
            name="logit_scale",
            shape=[],
            initializer=keras.initializers.Constant(self.config.logit_scale_init_value),
            trainable=True,
        )

        # 如果已经构建过,则直接返回
        if self.built:
            return
        
        # 标记为已构建
        self.built = True
        
        # 如果存在 self.text_model,则构建 self.text_model
        if getattr(self, "text_model", None) is not None:
            with tf.name_scope(self.text_model.name):
                self.text_model.build(None)
        
        # 如果存在 self.vision_model,则构建 self.vision_model
        if getattr(self, "vision_model", None) is not None:
            with tf.name_scope(self.vision_model.name):
                self.vision_model.build(None)
        
        # 如果存在 self.visual_projection,则构建 self.visual_projection
        if getattr(self, "visual_projection", None) is not None:
            with tf.name_scope(self.visual_projection.name):
                self.visual_projection.build([None, None, self.vision_embed_dim])
        
        # 如果存在 self.text_projection,则构建 self.text_projection
        if getattr(self, "text_projection", None) is not None:
            with tf.name_scope(self.text_projection.name):
                self.text_projection.build([None, None, self.text_embed_dim])

    # unpack_inputs 装饰器用于处理输入参数的解包操作
    @unpack_inputs
    # 定义 BLIP 模型的调用方法,接受多个输入参数和可选的输出参数,并返回 TFBlipOutput 或元组
    def call(
        self,
        input_ids: tf.Tensor | None = None,  # 输入的文本序列的张量,可选
        pixel_values: tf.Tensor | None = None,  # 输入的图像像素值的张量,可选
        attention_mask: tf.Tensor | None = None,  # 文本的注意力遮罩张量,可选
        position_ids: tf.Tensor | None = None,  # 文本的位置编码张量,可选
        return_loss: Optional[bool] = None,  # 是否返回损失值,可选
        output_attentions: Optional[bool] = None,  # 是否输出注意力权重,可选
        output_hidden_states: Optional[bool] = None,  # 是否输出隐藏状态,可选
        return_dict: Optional[bool] = None,  # 是否返回字典形式的输出,可选
        training: Optional[bool] = None,  # 是否处于训练模式,可选
    ) -> Union[Tuple, TFBlipOutput]:  # 返回值可以是元组或 TFBlipOutput 对象

        # 如果没有显式指定,使用 BLIP 模型配置中的设定值来填充相应的输出参数
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 使用视觉模型处理图像输入,并根据指定参数输出相应的结果
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # 使用文本模型处理文本输入,并根据指定参数输出相应的结果
        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # 从视觉模型的输出中获取图像嵌入表示,并应用视觉投影层
        image_embeds = vision_outputs[1]
        image_embeds = self.visual_projection(image_embeds)

        # 从文本模型的输出中获取文本嵌入表示,并应用文本投影层
        text_embeds = text_outputs[1]
        text_embeds = self.text_projection(text_embeds)

        # 对图像嵌入进行 L2 范数归一化
        image_embeds = image_embeds / tf.norm(image_embeds, ord=2, axis=-1, keepdims=True)
        # 对文本嵌入进行 L2 范数归一化
        text_embeds = text_embeds / tf.norm(text_embeds, ord=2, axis=-1, keepdims=True)

        # 使用余弦相似度计算作为对数概率(logits)
        logit_scale = tf.exp(self.logit_scale)
        logits_per_text = tf.matmul(text_embeds, image_embeds, transpose_b=True) * logit_scale
        logits_per_image = tf.transpose(logits_per_text)

        # 如果需要返回损失值,则计算 BLIP 损失
        loss = None
        if return_loss:
            loss = blip_loss(logits_per_text)
            loss = tf.reshape(loss, (1,))

        # 如果不需要返回字典形式的输出,则返回一个包含多个输出的元组
        if not return_dict:
            output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
            return ((loss,) + output) if loss is not None else output

        # 如果需要返回字典形式的输出,则创建 TFBlipOutput 对象并返回
        return TFBlipOutput(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )
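The main layer L2-normalizes both embeddings and turns their cosine similarities into logits scaled by `exp(logit_scale)`; `blip_loss` then applies a symmetric cross-entropy over those logits. A self-contained sketch of the computation with toy shapes, using a plain cross-entropy as a stand-in for `blip_loss` (the init value for `logit_scale` is an assumption taken from common CLIP-style defaults):

```python
import tensorflow as tf

batch, dim = 4, 8
text_embeds = tf.math.l2_normalize(tf.random.normal((batch, dim)), axis=-1)
image_embeds = tf.math.l2_normalize(tf.random.normal((batch, dim)), axis=-1)

logit_scale = tf.exp(tf.constant(2.6592))                       # assumed init value, CLIP-style
logits_per_text = tf.matmul(text_embeds, image_embeds, transpose_b=True) * logit_scale
logits_per_image = tf.transpose(logits_per_text)

# Symmetric contrastive loss: the i-th text is the positive pair of the i-th image
labels = tf.range(batch)
ce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
loss = (ce(labels, logits_per_text) + ce(labels, logits_per_image)) / 2.0
```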
class TFBlipModel(TFBlipPreTrainedModel):
    # 指定配置类为BlipConfig
    config_class = BlipConfig
    # 在加载模型时忽略的键列表
    _keys_to_ignore_on_load_missing = [r"text_decoder.cls.predictions.decoder.bias"]
    # 主输入名称为"input_ids"
    main_input_name = "input_ids"

    def __init__(self, config: BlipConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        # 初始化TFBlipMainLayer作为模型的主要层,使用给定的配置
        self.blip = TFBlipMainLayer(config, name="blip")

    def serving_output(self, output: TFBlipOutput) -> TFBlipOutput:
        # 用于模型服务输出,直接返回给定的TFBlipOutput对象
        return TFBlipOutput(
            logits_per_image=output.logits_per_image,
            logits_per_text=output.logits_per_text,
            text_embeds=output.text_embeds,
            image_embeds=output.image_embeds,
        )

    @unpack_inputs
    @add_start_docstrings_to_model_forward(BLIP_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFBlipOutput, config_class=BlipConfig)
    def call(
        self,
        input_ids: tf.Tensor | None = None,
        pixel_values: tf.Tensor | None = None,
        attention_mask: tf.Tensor | None = None,
        position_ids: tf.Tensor | None = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = None,
    ) -> Union[Tuple, TFBlipOutput]:
        r"""
        模型的前向传播方法,接受多种输入参数并返回输出。

        Args:
            input_ids: 输入的token IDs张量,可以为None。
            pixel_values: 图像像素值张量,可以为None。
            attention_mask: 注意力遮罩张量,可以为None。
            position_ids: 位置IDs张量,可以为None。
            return_loss: 是否返回损失值,可选布尔值。
            output_attentions: 是否输出注意力张量,可选布尔值。
            output_hidden_states: 是否输出隐藏状态张量,可选布尔值。
            return_dict: 是否返回字典格式输出,可选布尔值。
            training: 是否处于训练模式,可选布尔值。

        Returns:
            模型的输出结果,类型为TFBlipOutput或一个元组。

        Examples:
        
        ```
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, TFBlipModel

        >>> model = TFBlipModel.from_pretrained("Salesforce/blip-image-captioning-base")
        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="tf", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # 图像文本相似度得分
        >>> probs = tf.nn.softmax(logits_per_image, axis=1)  # 可以使用softmax获取标签概率
        ```
        """
        # 调用self.blip对象的call方法,传递所有参数,并返回其输出
        outputs = self.blip(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            return_loss=return_loss,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        return outputs

    @add_start_docstrings_to_model_forward(BLIP_TEXT_INPUTS_DOCSTRING)
    def get_text_features(
        self,
        input_ids: tf.Tensor | None = None,
        attention_mask: tf.Tensor | None = None,
        position_ids: tf.Tensor | None = None,
        return_dict: Optional[bool] = None,
    ) -> tf.Tensor:
        # Returns the text features: the pooled text-model output passed through the text projection layer
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        text_outputs = self.blip.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            return_dict=return_dict,
        )

        pooled_output = text_outputs[1]
        text_features = self.blip.text_projection(pooled_output)

        return text_features

    @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
    def get_image_features(
        self,
        pixel_values: tf.Tensor | None = None,
        return_dict: Optional[bool] = None,
    ) -> tf.Tensor:
        # Use the configured default when return_dict is not given
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Run the BLIP vision model on the pixel values
        vision_outputs = self.blip.vision_model(pixel_values=pixel_values, return_dict=return_dict)

        # The second element of the vision output is the pooled representation
        pooled_output = vision_outputs[1]  # pooled_output

        # Project the pooled output with the visual projection layer to obtain the image features
        image_features = self.blip.visual_projection(pooled_output)

        return image_features
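For reference, a hedged usage sketch of the two feature extractors above; the checkpoint name and processor usage mirror the docstring examples earlier in this file, and the output shapes assume the standard projection dimension of the loaded config.

```python
from PIL import Image
import requests
from transformers import AutoProcessor, TFBlipModel

model = TFBlipModel.from_pretrained("Salesforce/blip-image-captioning-base")
processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

text_inputs = processor(text=["a photo of a cat"], return_tensors="tf", padding=True)
image_inputs = processor(images=image, return_tensors="tf")

text_features = model.get_text_features(
    input_ids=text_inputs["input_ids"], attention_mask=text_inputs["attention_mask"]
)  # (1, projection_dim)
image_features = model.get_image_features(pixel_values=image_inputs["pixel_values"])  # (1, projection_dim)
```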
@add_start_docstrings(
    """
    BLIP Model for image captioning. The model consists of a vision encoder and a text decoder. One can optionally pass
    `input_ids` to the model, which serve as a text prompt, to make the text decoder continue the prompt. Otherwise,
    the decoder starts generating the caption from the [BOS] (beginning-of-sequence) token alone.
    """,
    BLIP_START_DOCSTRING,
)
class TFBlipForConditionalGeneration(TFBlipPreTrainedModel):
    """
    TFBlipForConditionalGeneration 类,继承自 TFBlipPreTrainedModel,用于图像字幕生成任务。

    Attributes:
        config_class (BlipConfig): 配置类为 BlipConfig。
        _keys_to_ignore_on_load_missing (list): 在加载时忽略的缺失键列表。
        main_input_name (str): 主要输入名称为 "pixel_values"。
    """

    config_class = BlipConfig
    _keys_to_ignore_on_load_missing = [r"text_decoder.cls.predictions.decoder.bias"]
    main_input_name = "pixel_values"

    def __init__(self, config: BlipConfig, *args, **kwargs):
        """
        初始化方法,接受 BlipConfig 类型的配置参数。

        Args:
            config (BlipConfig): BLIP 模型的配置参数。
            *args: 位置参数。
            **kwargs: 关键字参数。
        """
        super().__init__(config, *args, **kwargs)

        # Vision encoder, initialized from the vision sub-config
        self.vision_model = TFBlipVisionModel(config.vision_config, name="vision_model")

        # Text decoder (language-modeling head), initialized from the text sub-config
        self.text_decoder = TFBlipTextLMHeadModel(config.text_config, name="text_decoder")

        # Start-of-sequence token ID used to prompt the text decoder
        self.decoder_input_ids = config.text_config.bos_token_id
        # Padding token ID used by the text decoder
        self.decoder_pad_token_id = config.text_config.pad_token_id

    def get_input_embeddings(self) -> keras.layers.Layer:
        """
        获取输入嵌入层的方法。

        Returns:
            keras.layers.Layer: 返回视觉模型的 patch_embedding 层作为输入嵌入层。
        """
        return self.vision_model.embeddings.patch_embedding

    @unpack_inputs
    @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFBlipForConditionalGenerationModelOutput, config_class=BlipConfig)
    def call(
        self,
        pixel_values: tf.Tensor,
        input_ids: tf.Tensor | None = None,
        attention_mask: tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        labels: tf.Tensor | None = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = None,
    ) -> Union[Tuple, TFBlipForConditionalGenerationModelOutput]:
        r"""
        Forward pass used for both inference and training.

        Args:
            pixel_values (`tf.Tensor`): Input pixel values.
            input_ids (`tf.Tensor`, *optional*): Token IDs of an optional text prompt. Defaults to `None`.
            attention_mask (`tf.Tensor`, *optional*): Attention mask for the prompt. Defaults to `None`.
            output_attentions (`bool`, *optional*): Whether to return attention weights.
            output_hidden_states (`bool`, *optional*): Whether to return all hidden states.
            labels (`tf.Tensor`, *optional*): Labels for computing the captioning (language-modeling) loss.
            return_dict (`bool`, *optional*): Whether to return a `ModelOutput` instead of a plain tuple.
            training (`bool`, *optional*): Whether the model is run in training mode.

        Returns:
            `TFBlipForConditionalGenerationModelOutput` or a tuple, depending on `return_dict`.

        Examples:

        ```
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, TFBlipForConditionalGeneration

        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        >>> model = TFBlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "A picture of"

        >>> inputs = processor(images=image, text=text, return_tensors="tf")

        >>> outputs = model(**inputs)
        ```"""

        # 检查是否需要返回字典形式的输出
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # 使用视觉模型处理输入的像素值,返回视觉特征
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        # 提取视觉特征的第一个输出,通常是图像嵌入
        image_embeds = vision_outputs[0]

        # 使用文本解码器生成文本输出
        outputs = self.text_decoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_hidden_states=image_embeds,
            labels=labels,
            return_dict=False,  # 强制不返回字典
            training=training,
        )

        # 如果不需要返回字典,则按预期输出格式返回结果元组
        if not return_dict:
            outputs = (outputs[0], outputs[1], image_embeds, vision_outputs[0]) + vision_outputs[2:]
            return tuple(output for output in outputs if output is not None)

        # 如果有标签,提取损失和逻辑回归结果
        if labels is not None:
            loss = outputs[0]
            logits = outputs[1]
        else:
            loss = None
            logits = outputs[0]

        # 如果存在损失并且其维度为0,则进行形状调整以保证一致性
        if loss is not None and loss.shape.rank == 0:
            loss = tf.reshape(loss, (1,))

        # 返回模型输出的命名元组,包括损失、逻辑回归结果、图像嵌入和视觉模型的隐藏状态等
        return TFBlipForConditionalGenerationModelOutput(
            loss=loss,
            logits=logits,
            image_embeds=image_embeds,
            last_hidden_state=vision_outputs.last_hidden_state,
            hidden_states=vision_outputs.hidden_states,
            attentions=vision_outputs.attentions,
        )

    def generate(
        self,
        pixel_values: tf.Tensor,
        input_ids: tf.Tensor | None = None,
        attention_mask: tf.Tensor | None = None,
        **generate_kwargs,
    ) -> tf.Tensor:
        r"""
        Overrides *generate* function to be able to use the model as a conditional generator

        Parameters:
            pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, image_height, image_width)`:
                Input image to be processed
            input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                The sequence used as a prompt for the generation.
            attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:


        Examples:
        ```
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, TFBlipForConditionalGeneration

        >>> model = TFBlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="tf")

        >>> outputs = model.generate(**inputs)
        >>> print(processor.decode(outputs[0], skip_special_tokens=True))
        two cats sleeping on a couch
        ```
        """

        # 获取批次大小
        batch_size = pixel_values.shape[0]
        
        # 使用视觉模型处理输入图像,返回视觉输出
        vision_outputs = self.vision_model(pixel_values=pixel_values)

        # 从视觉输出中获取图像嵌入
        image_embeds = vision_outputs[0]

        # 创建图像注意力掩码,默认全为1,形状与图像嵌入维度相同
        image_attention_mask = tf.ones(shape_list(image_embeds)[:-1], dtype=tf.int32)

        # 如果输入的input_ids是列表,则转换为张量
        if isinstance(input_ids, list):
            input_ids = tf.convert_to_tensor(input_ids, dtype=tf.int32)
        # 如果input_ids为None,则使用默认的decoder输入ID和结束标记创建张量
        elif input_ids is None:
            input_ids = tf.convert_to_tensor(
                [[self.decoder_input_ids, self.config.text_config.eos_token_id]], dtype=tf.int32
            )
            # 扩展为与批次大小匹配的形状
            input_ids = tf.tile(input_ids, (batch_size, 1))

        # 添加起始标记到input_ids的开头,与PyTorch中的操作等效
        input_ids = tf.concat(
            [tf.ones((batch_size, 1), dtype=tf.int32) * self.config.text_config.bos_token_id, input_ids[:, 1:]], axis=1
        )
        
        # 调整attention_mask的长度,与输入序列长度相匹配
        attention_mask = attention_mask[:, :-1] if attention_mask is not None else None

        # 调用文本解码器的generate方法生成文本序列
        outputs = self.text_decoder.generate(
            input_ids=input_ids[:, :-1],
            eos_token_id=self.config.text_config.sep_token_id,
            pad_token_id=self.config.text_config.pad_token_id,
            attention_mask=attention_mask,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            **generate_kwargs,
        )

        # 返回生成的输出序列
        return outputs
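The prompt handling in `generate` above tiles a default `[BOS, EOS]` prompt over the batch when no `input_ids` are given, and forces the first position to be the BOS token. A toy sketch of those tensor manipulations with hypothetical token IDs (not the real vocabulary values):

```python
import tensorflow as tf

batch_size = 2
bos_token_id, eos_token_id = 30522, 102        # hypothetical IDs for illustration

# Default prompt when input_ids is None: [BOS, EOS] tiled over the batch
input_ids = tf.convert_to_tensor([[bos_token_id, eos_token_id]], dtype=tf.int32)
input_ids = tf.tile(input_ids, (batch_size, 1))                     # (2, 2)

# Force the first position to BOS (mirrors the tf.concat in the method above)
input_ids = tf.concat(
    [tf.ones((batch_size, 1), dtype=tf.int32) * bos_token_id, input_ids[:, 1:]], axis=1
)
print(input_ids.numpy())   # [[30522  102]
                           #  [30522  102]]
```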
    # 定义模型构建方法,如果已经构建过则直接返回
    def build(self, input_shape=None):
        if self.built:
            return
        # 设置标志位,表示模型已经构建
        self.built = True
        # 如果存在视觉模型,则构建视觉模型
        if getattr(self, "vision_model", None) is not None:
            # 使用视觉模型的名称作为命名空间
            with tf.name_scope(self.vision_model.name):
                # 构建视觉模型,传入空的输入形状
                self.vision_model.build(None)
        # 如果存在文本解码器,则构建文本解码器
        if getattr(self, "text_decoder", None) is not None:
            # 使用文本解码器的名称作为命名空间
            with tf.name_scope(self.text_decoder.name):
                # 构建文本解码器,传入空的输入形状
                self.text_decoder.build(None)
"""
BLIP Model for visual question answering. The model consists of a vision encoder, a text encoder as well as a text
decoder. The vision encoder will encode the input image, the text encoder will encode the input question together
with the encoding of the image, and the text decoder will output the answer to the question.
"""
# Attach the shared model docstring via the add_start_docstrings decorator
@add_start_docstrings(
    """
    BLIP Model for visual question answering. The model consists of a vision encoder, a text encoder as well as a text
    decoder. The vision encoder will encode the input image, the text encoder will encode the input question together
    with the encoding of the image, and the text decoder will output the answer to the question.
    """,
    BLIP_START_DOCSTRING,
)
# 继承自 TFBlipPreTrainedModel 类
class TFBlipForQuestionAnswering(TFBlipPreTrainedModel):
    # 使用 BlipConfig 类来配置模型
    config_class = BlipConfig
    # 在加载时忽略的关键字列表
    _keys_to_ignore_on_load_missing = [r"text_decoder.cls.predictions.decoder.bias"]

    # 模型初始化方法
    def __init__(self, config: BlipConfig, *args, **kwargs):
        # 调用父类的初始化方法
        super().__init__(config, *args, **kwargs)

        # 创建视觉模型,使用 TFBlipVisionModel 类
        self.vision_model = TFBlipVisionModel(config.vision_config, name="vision_model")

        # 创建文本编码器,使用 TFBlipTextModel 类
        self.text_encoder = TFBlipTextModel(config.text_config, name="text_encoder", add_pooling_layer=False)

        # 创建文本解码器,使用 TFBlipTextLMHeadModel 类
        self.text_decoder = TFBlipTextLMHeadModel(config.text_config, name="text_decoder")

        # 解码器的填充标记 ID
        self.decoder_pad_token_id = config.text_config.pad_token_id
        # 解码器的起始标记 ID
        self.decoder_start_token_id = config.text_config.bos_token_id

    # 获取输入嵌入的方法
    def get_input_embeddings(self) -> keras.layers.Layer:
        # 返回视觉模型的补丁嵌入层
        return self.vision_model.embeddings.patch_embedding

    # 定义的方法来实现标记右移,类似于 transformers.models.t5.modeling_tf_t5.TFT5PreTrainedModel._shift_right 方法
    def _shift_right(self, input_ids):
        # 获取解码器的起始标记 ID 和填充标记 ID
        decoder_start_token_id = self.decoder_start_token_id
        pad_token_id = self.decoder_pad_token_id

        # 如果起始标记 ID 或填充标记 ID 未定义,则抛出 ValueError
        if decoder_start_token_id is None or pad_token_id is None:
            raise ValueError("decoder_start_token_id and pad_token_id must be defined!")

        # 创建起始标记序列,并确保与输入标记兼容的数据类型
        start_tokens = tf.fill((shape_list(input_ids)[0], 1), decoder_start_token_id)
        start_tokens = tf.cast(start_tokens, input_ids.dtype)  # 确保拼接时数据类型兼容
        # 将起始标记序列与输入标记序列右移一位进行拼接
        shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1)

        # 将标签中可能存在的 -100 值替换为填充标记 ID
        shifted_input_ids = tf.where(
            shifted_input_ids == -100,
            tf.cast(tf.fill(shape_list(shifted_input_ids), pad_token_id), shifted_input_ids.dtype),
            shifted_input_ids,
        )

        # 断言确保 `labels` 只包含正值和 -100
        tf.debugging.assert_greater_equal(shifted_input_ids, tf.constant(0, dtype=shifted_input_ids.dtype))

        return shifted_input_ids
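A small worked example of the `_shift_right` logic: labels are shifted one position to the right, the decoder start token is prepended, and any remaining `-100` (ignored-label marker) becomes the pad token. The token IDs here are hypothetical and chosen only for illustration.

```python
import tensorflow as tf

decoder_start_token_id, pad_token_id = 101, 0    # hypothetical IDs

labels = tf.constant([[5, 6, -100, -100]], dtype=tf.int32)

start_tokens = tf.fill((labels.shape[0], 1), decoder_start_token_id)
shifted = tf.concat([tf.cast(start_tokens, labels.dtype), labels[:, :-1]], axis=-1)
# Replace any -100 that survived the shift with the pad token
shifted = tf.where(shifted == -100, tf.fill(tf.shape(shifted), pad_token_id), shifted)

print(shifted.numpy())   # [[101   5   6   0]]
```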

    # 装饰器函数,用于将输入拆包并添加模型前向传播的文档字符串
    @unpack_inputs
    @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
    # 替换返回值文档字符串的装饰器函数
    @replace_return_docstrings(output_type=TFBlipTextVisionModelOutput, config_class=BlipVisionConfig)
    # 定义一个方法 `call`,用于执行模型推理或训练过程
    def call(
        self,
        input_ids: tf.Tensor,  # 输入文本的 token IDs,作为模型的输入
        pixel_values: tf.Tensor | None = None,  # 图像像素值,可选,用于图像输入模型
        decoder_input_ids: tf.Tensor | None = None,  # 解码器的输入 token IDs,可选
        decoder_attention_mask: tf.Tensor | None = None,  # 解码器的注意力遮罩,可选
        attention_mask: tf.Tensor | None = None,  # 注意力遮罩,控制模型哪些部分需要关注,可选
        output_attentions: Optional[bool] = None,  # 是否输出注意力权重,可选
        output_hidden_states: Optional[bool] = None,  # 是否输出隐藏状态,可选
        labels: tf.Tensor | None = None,  # 标签,用于模型的监督学习,可选
        return_dict: Optional[bool] = None,  # 是否以字典形式返回输出,可选
        training: Optional[bool] = None,  # 是否处于训练模式,可选
    ):
    # The generate method produces the model's text output (the answer) conditioned on the image and the question
    def generate(
        self,
        input_ids: tf.Tensor,  # Token IDs of the question used as the prompt
        pixel_values: tf.Tensor,  # Pixel values of the input image
        attention_mask: tf.Tensor | None = None,  # Optional mask selecting which tokens to attend to
        **generate_kwargs,  # Additional generation arguments forwarded to the decoder's generate
    ) -> tf.Tensor:
        r"""
        Overrides *generate* function to be able to use the model as a conditional generator

        Parameters:
            input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
            pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, image_height, image_width)`:
                Input image to be processed
            attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`. `1` for
                tokens that are NOT MASKED, `0` for MASKED tokens.
            generate_kwargs (dict, *optional*):
                Additional arguments passed to the `generate` function of the decoder


        Examples:
        ```
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, TFBlipForQuestionAnswering

        >>> model = TFBlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "How many cats are in the picture?"

        >>> inputs = processor(images=image, text=text, return_tensors="tf")

        >>> outputs = model.generate(**inputs)
        >>> print(processor.decode(outputs[0], skip_special_tokens=True))
        2
        ```
        """
        # 使用视觉模型处理输入图像,获取视觉输出
        vision_outputs = self.vision_model(pixel_values=pixel_values)

        # 提取图像嵌入表示
        image_embeds = vision_outputs[0]

        # 生成图像注意力掩码,形状与图像嵌入表示的前几维相同,最后一维是整数类型
        image_attention_mask = tf.ones(shape_list(image_embeds)[:-1], dtype=tf.int32)

        # 如果输入的input_ids是列表,则转换为Tensor类型
        if isinstance(input_ids, list):
            input_ids = tf.Tensor(input_ids)

        # 使用文本编码器处理输入文本序列,得到文本输出
        question_outputs = self.text_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            return_dict=False,
        )

        # 提取问题嵌入表示
        question_embeds = question_outputs[0]

        # 生成问题的注意力掩码,形状与问题嵌入表示的前几维相同,最后一维是整数类型
        question_attention_mask = tf.ones(shape_list(question_embeds)[:-1], dtype=tf.int32)

        # 构造起始标记的Tensor,形状为(batch_size, 1),值为self.decoder_start_token_id
        bos_ids = tf.fill(
            (tf.shape(question_embeds)[0], 1), value=tf.cast(self.decoder_start_token_id, input_ids.dtype)
        )

        # 使用文本解码器生成输出序列
        outputs = self.text_decoder.generate(
            input_ids=bos_ids,
            eos_token_id=self.config.text_config.sep_token_id,
            pad_token_id=self.config.text_config.pad_token_id,
            encoder_hidden_states=question_embeds,
            encoder_attention_mask=question_attention_mask,
            **generate_kwargs,
        )

        # 返回生成的输出序列
        return outputs
    # 定义神经网络层的构建方法,用于建立模型的输入形状
    def build(self, input_shape=None):
        # 如果模型已经构建过,则直接返回,避免重复构建
        if self.built:
            return
        # 设置标志位,表示模型已经构建
        self.built = True
        
        # 如果存在视觉模型,使用 TensorFlow 的命名空间来构建视觉模型
        if getattr(self, "vision_model", None) is not None:
            with tf.name_scope(self.vision_model.name):
                # 调用视觉模型的build方法,传入空输入形状
                self.vision_model.build(None)
        
        # 如果存在文本编码器,使用 TensorFlow 的命名空间来构建文本编码器
        if getattr(self, "text_encoder", None) is not None:
            with tf.name_scope(self.text_encoder.name):
                # 调用文本编码器的build方法,传入空输入形状
                self.text_encoder.build(None)
        
        # 如果存在文本解码器,使用 TensorFlow 的命名空间来构建文本解码器
        if getattr(self, "text_decoder", None) is not None:
            with tf.name_scope(self.text_decoder.name):
                # 调用文本解码器的build方法,传入空输入形状
                self.text_decoder.build(None)
"""
BLIP Model with a vision and text projector, and a classification head on top. The model is used in the context of
image-text retrieval. Given an image and a text, the model returns the probability of the text being relevant to
the image.
"""
# 继承自 TFBlipPreTrainedModel 的 BLIP 图像文本检索模型类
class TFBlipForImageTextRetrieval(TFBlipPreTrainedModel):
    # 使用 BlipConfig 类作为配置类
    config_class = BlipConfig

    def __init__(self, config: BlipConfig, *args, **kwargs):
        # 调用父类的初始化方法
        super().__init__(config, *args, **kwargs)

        # 创建 BLIP 视觉模型,使用配置中的视觉配置
        self.vision_model = TFBlipVisionModel(config.vision_config, name="vision_model")

        # 创建 BLIP 文本编码器,使用配置中的文本配置,并禁用池化层
        self.text_encoder = TFBlipTextModel(config.text_config, name="text_encoder", add_pooling_layer=False)

        # 视觉投影层,用于将视觉特征投影到共享空间
        self.vision_proj = keras.layers.Dense(
            config.image_text_hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="vision_proj",
        )

        # 文本投影层,用于将文本特征投影到共享空间
        self.text_proj = keras.layers.Dense(
            config.image_text_hidden_size,
            kernel_initializer=get_initializer(config.initializer_range),
            name="text_proj",
        )

        # 图像文本匹配头部,用于预测文本与图像相关性的概率
        self.itm_head = keras.layers.Dense(
            2, kernel_initializer=get_initializer(config.initializer_range), name="itm_head"
        )

        # 解码器的填充标记 ID,根据配置中的文本填充标记 ID 或解码器的开始标记 ID
        self.decoder_pad_token_id = (
            config.text_config.pad_token_id
            if not hasattr(config, "decoder_pad_token_id")
            else config.decoder_pad_token_id
        )
        self.decoder_start_token_id = (
            config.text_config.bos_token_id
            if not hasattr(config, "decoder_start_token_id")
            else config.decoder_start_token_id
        )
        self.config = config

    # 获取输入嵌入的方法,返回视觉模型的补丁嵌入层
    def get_input_embeddings(self) -> keras.layers.Layer:
        return self.vision_model.embeddings.patch_embedding

    # 调用方法,对输入数据进行前向传播
    @unpack_inputs
    @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFBlipImageTextMatchingModelOutput, config_class=BlipVisionConfig)
    def call(
        self,
        input_ids: tf.Tensor,
        pixel_values: tf.Tensor | None = None,
        use_itm_head: Optional[bool] = True,
        attention_mask: tf.Tensor | None = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = None,
        # 其他参数用于模型前向传播,如像素值、注意力掩码、是否返回字典等
    ):
    # 构建方法,用于构造模型结构。如果已经构建过,直接返回。
    def build(self, input_shape=None):
        if self.built:
            return
        # 标记模型已经构建
        self.built = True
        
        # 如果存在视觉模型,使用视觉模型的名称作为命名空间,构建视觉模型
        if getattr(self, "vision_model", None) is not None:
            with tf.name_scope(self.vision_model.name):
                self.vision_model.build(None)
        
        # 如果存在文本编码器,使用文本编码器的名称作为命名空间,构建文本编码器
        if getattr(self, "text_encoder", None) is not None:
            with tf.name_scope(self.text_encoder.name):
                self.text_encoder.build(None)
        
        # 如果存在视觉投影层,使用视觉投影层的名称作为命名空间,构建视觉投影层
        if getattr(self, "vision_proj", None) is not None:
            with tf.name_scope(self.vision_proj.name):
                self.vision_proj.build([None, None, self.config.vision_config.hidden_size])
        
        # 如果存在文本投影层,使用文本投影层的名称作为命名空间,构建文本投影层
        if getattr(self, "text_proj", None) is not None:
            with tf.name_scope(self.text_proj.name):
                self.text_proj.build([None, None, self.config.text_config.hidden_size])
        
        # 如果存在itm_head,使用itm_head的名称作为命名空间,构建itm_head
        if getattr(self, "itm_head", None) is not None:
            with tf.name_scope(self.itm_head.name):
                self.itm_head.build([None, None, self.config.text_config.hidden_size])
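The retrieval model can score an image-text pair in two ways: through the ITM head (a binary match/no-match classifier on the multimodal text-encoder output) or through the cosine similarity of the projected unimodal features. Below is a rough sketch of the second, simpler path with toy tensors and stock Dense layers; it is not the full `call` implementation, which also runs the cross-attention text encoder.

```python
import tensorflow as tf

hidden, proj_dim = 8, 4                     # toy sizes
vision_proj = tf.keras.layers.Dense(proj_dim)
text_proj = tf.keras.layers.Dense(proj_dim)

image_cls = tf.random.normal((2, hidden))   # pooled vision features (e.g. the first token)
text_cls = tf.random.normal((2, hidden))    # pooled text features

image_feat = tf.math.l2_normalize(vision_proj(image_cls), axis=-1)
text_feat = tf.math.l2_normalize(text_proj(text_cls), axis=-1)

itc_scores = tf.matmul(image_feat, text_feat, transpose_b=True)   # cosine similarities
print(itc_scores.shape)   # (2, 2): every image scored against every text
```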