diffusers-源码解析-十九-

diffusers 源码解析（十九）

`.\diffusers\pipelines\audioldm2\pipeline_audioldm2.py`

# 版权信息，声明该文件的所有权和使用条款
# Copyright 2024 CVSSP, ByteDance and The HuggingFace Team. All rights reserved.
#
# 根据 Apache License 2.0 许可证使用该文件
# Licensed under the Apache License, Version 2.0 (the "License");
# 仅在遵循许可证的情况下使用该文件
# you may not use this file except in compliance with the License.
# 可以在以下网址获取许可证副本
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律另有规定或书面协议，否则根据许可证分发的软件是按“原样”提供的
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# 不提供任何形式的明示或暗示的担保或条件
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 请参阅许可证以获取有关权限和限制的具体条款
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect  # 导入 inspect 模块以进行对象检查和获取信息
from typing import Any, Callable, Dict, List, Optional, Union  # 从 typing 模块导入类型注解

import numpy as np  # 导入 numpy 作为 np 以进行数值计算
import torch  # 导入 PyTorch 库以进行深度学习操作
from transformers import (  # 从 transformers 模块导入多个类
    ClapFeatureExtractor,  # 用于提取 Clap 特征的类
    ClapModel,  # Clap 模型类
    GPT2Model,  # GPT-2 模型类
    RobertaTokenizer,  # Roberta 分词器类
    RobertaTokenizerFast,  # 快速 Roberta 分词器类
    SpeechT5HifiGan,  # SpeechT5 HifiGan 类
    T5EncoderModel,  # T5 编码器模型类
    T5Tokenizer,  # T5 分词器类
    T5TokenizerFast,  # 快速 T5 分词器类
    VitsModel,  # VITS 模型类
    VitsTokenizer,  # VITS 分词器类
)

from ...models import AutoencoderKL  # 从上级模块导入 AutoencoderKL 类
from ...schedulers import KarrasDiffusionSchedulers  # 从上级模块导入 KarrasDiffusionSchedulers 类
from ...utils import (  # 从上级模块导入多个工具函数
    is_accelerate_available,  # 检查 accelerate 是否可用的函数
    is_accelerate_version,  # 检查 accelerate 版本的函数
    is_librosa_available,  # 检查 librosa 是否可用的函数
    logging,  # 导入日志记录模块
    replace_example_docstring,  # 替换示例文档字符串的函数
)
from ...utils.torch_utils import randn_tensor  # 从上级模块导入 randn_tensor 函数
from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline  # 从同级模块导入音频管道输出和扩散管道
from .modeling_audioldm2 import AudioLDM2ProjectionModel, AudioLDM2UNet2DConditionModel  # 从当前模块导入音频 LDM2 模型

if is_librosa_available():  # 如果 librosa 可用
    import librosa  # 导入 librosa 库用于音频处理

logger = logging.get_logger(__name__)  # 创建一个日志记录器，使用当前模块名作为标识

# Usage examples for the pipeline; passed to `replace_example_docstring` on `__call__`.
# The literal holds exactly two fenced examples: text-to-audio and text-to-speech.
EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> import scipy  # 导入 scipy 库，用于处理音频文件
        >>> import torch  # 导入 PyTorch 库，用于深度学习模型的计算
        >>> from diffusers import AudioLDM2Pipeline  # 从 diffusers 库导入 AudioLDM2Pipeline 类，用于音频生成

        >>> repo_id = "cvssp/audioldm2"  # 定义模型的仓库 ID
        >>> pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16)  # 从预训练模型加载管道，并指定数据类型为 float16
        >>> pipe = pipe.to("cuda")  # 将管道移动到 GPU 上以加速计算

        >>> # define the prompts
        >>> prompt = "The sound of a hammer hitting a wooden surface."  # 定义正向提示语，描述想要生成的音频内容
        >>> negative_prompt = "Low quality."  # 定义负向提示语，表明不希望生成的音频质量

        >>> # set the seed for generator
        >>> generator = torch.Generator("cuda").manual_seed(0)  # 创建一个 GPU 上的随机数生成器并设置种子

        >>> # run the generation
        >>> audio = pipe(  # 调用生成管道生成音频
        ...     prompt,  # 使用正向提示语
        ...     negative_prompt=negative_prompt,  # 使用负向提示语
        ...     num_inference_steps=200,  # 设置推理步骤数为 200
        ...     audio_length_in_s=10.0,  # 设置生成音频的时长为 10 秒
        ...     num_waveforms_per_prompt=3,  # 为每个提示生成 3 个波形
        ...     generator=generator,  # 使用之前创建的随机数生成器
        ... ).audios  # 获取生成的音频数据

        >>> # save the best audio sample (index 0) as a .wav file
        >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio[0])  # 将最佳音频样本（索引 0）保存为 .wav 文件，采样率为 16000
        ```
        ```py
        #Using AudioLDM2 for Text To Speech
        >>> import scipy  # 导入 scipy 库，用于处理音频文件
        >>> import torch  # 导入 PyTorch 库，用于深度学习模型的计算
        >>> from diffusers import AudioLDM2Pipeline  # 从 diffusers 库导入 AudioLDM2Pipeline 类，用于音频生成

        >>> repo_id = "anhnct/audioldm2_gigaspeech"  # 定义 TTS 模型的仓库 ID
        >>> pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16)  # 从预训练模型加载管道，并指定数据类型为 float16
        >>> pipe = pipe.to("cuda")  # 将管道移动到 GPU 上以加速计算

        >>> # define the prompts
        >>> prompt = "A female reporter is speaking"  # 定义正向提示语，描述想要生成的语音内容
        >>> transcript = "wish you have a good day"  # 定义要生成的语音的转录文本

        >>> # set the seed for generator
        >>> generator = torch.Generator("cuda").manual_seed(0)  # 创建一个 GPU 上的随机数生成器并设置种子

        >>> # run the generation
        >>> audio = pipe(  # 调用生成管道生成音频
        ...     prompt,  # 使用正向提示语
        ...     transcription=transcript,  # 使用转录文本
        ...     num_inference_steps=200,  # 设置推理步骤数为 200
        ...     audio_length_in_s=10.0,  # 设置生成音频的时长为 10 秒
        ...     num_waveforms_per_prompt=2,  # 为每个提示生成 2 个波形
        ...     generator=generator,  # 使用之前创建的随机数生成器
        ...     max_new_tokens=512,          #必须将 max_new_tokens 设置为 512 以用于 TTS
        ... ).audios  # 获取生成的音频数据

        >>> # save the best audio sample (index 0) as a .wav file
        >>> scipy.io.wavfile.write("tts.wav", rate=16000, data=audio[0])  # 将最佳音频样本（索引 0）保存为 .wav 文件，采样率为 16000
        ```
"""


# 定义用于生成输入的函数，接收嵌入和其他参数
def prepare_inputs_for_generation(
    inputs_embeds,
    attention_mask=None,
    past_key_values=None,
    **kwargs,
):
    """Assemble the keyword arguments for one forward pass of the language model.

    Args:
        inputs_embeds: Embedding sequence accumulated so far; sliced as `[:, -1:]` when a cache is given.
        attention_mask: Passed through unchanged.
        past_key_values: Cached key/value states from earlier steps, or `None` on the first step.
        **kwargs: Only `use_cache` is read; every other entry is ignored.

    Returns:
        A dict with the keys `inputs_embeds`, `attention_mask`, `past_key_values` and `use_cache`.
    """
    # Once a cache exists, only the most recent position needs to be fed to the model.
    step_embeds = inputs_embeds if past_key_values is None else inputs_embeds[:, -1:]

    model_inputs = dict(
        inputs_embeds=step_embeds,
        attention_mask=attention_mask,
        past_key_values=past_key_values,
    )
    # `use_cache` is optional; a missing entry becomes `None`.
    model_inputs["use_cache"] = kwargs.get("use_cache")
    return model_inputs


# 定义音频生成的管道类，继承自 DiffusionPipeline
class AudioLDM2Pipeline(DiffusionPipeline):
    r"""
    用于基于文本生成音频的管道，使用 AudioLDM2 模型。

    该模型继承自 [`DiffusionPipeline`]。请查看超类文档以了解所有管道的通用方法
    （下载、保存、在特定设备上运行等）。
    # 参数说明部分，描述各个参数的用途
        Args:
            vae ([`AutoencoderKL`]):
                # 变分自编码器 (VAE) 模型，用于将图像编码和解码为潜在表示
            text_encoder ([`~transformers.ClapModel`]):
                # 第一个被冻结的文本编码器。AudioLDM2 使用联合音频-文本嵌入模型
                # [CLAP](https://huggingface.co/docs/transformers/model_doc/clap#transformers.CLAPTextModelWithProjection)，
                # 特别是 [laion/clap-htsat-unfused](https://huggingface.co/laion/clap-htsat-unfused) 变体。
                # 文本分支用于将文本提示编码为提示嵌入。完整的音频-文本模型用于
                # 通过计算相似度分数来对生成的波形进行排名。
            text_encoder_2 ([`~transformers.T5EncoderModel`, `~transformers.VitsModel`]):
                # 第二个被冻结的文本编码器。AudioLDM2 使用
                # [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel) 的编码器，
                # 特别是 [google/flan-t5-large](https://huggingface.co/google/flan-t5-large) 变体。第二个被冻结的文本编码器
                # 用于文本转语音（TTS）。AudioLDM2 使用
                # [Vits](https://huggingface.co/docs/transformers/model_doc/vits#transformers.VitsModel) 的编码器。
            projection_model ([`AudioLDM2ProjectionModel`]):
                # 一个训练过的模型，用于线性投影第一个和第二个文本编码器模型的隐藏状态，并插入学习到的 SOS 和 EOS 令牌嵌入。
                # 来自两个文本编码器的投影隐藏状态被连接，作为语言模型的输入。
                # 为 Vits 隐藏状态提供学习的位置嵌入。
            language_model ([`~transformers.GPT2Model`]):
                # 自回归语言模型，用于生成一系列基于两个文本编码器的投影输出的隐藏状态。
            tokenizer ([`~transformers.RobertaTokenizer`]):
                # 用于对第一个被冻结的文本编码器进行文本标记化的标记器。
            tokenizer_2 ([`~transformers.T5Tokenizer`, `~transformers.VitsTokenizer`]):
                # 用于对第二个被冻结的文本编码器进行文本标记化的标记器。
            feature_extractor ([`~transformers.ClapFeatureExtractor`]):
                # 特征提取器，用于将生成的音频波形预处理为对数-梅尔谱图，以便进行自动评分。
            unet ([`UNet2DConditionModel`]):
                # 一个 `UNet2DConditionModel`，用于对编码的音频潜在变量进行去噪。
            scheduler ([`SchedulerMixin`]):
                # 调度器，与 `unet` 一起用于去噪编码的音频潜在变量。可以是
                # [`DDIMScheduler`], [`LMSDiscreteScheduler`] 或 [`PNDMScheduler`] 之一。
            vocoder ([`~transformers.SpeechT5HifiGan`]):
                # 类 `SpeechT5HifiGan` 的声码器，用于将梅尔谱图潜在变量转换为最终音频波形。
        """
    # 初始化方法，设置类的属性
    def __init__(
        self,
        vae: AutoencoderKL,
        text_encoder: ClapModel,
        text_encoder_2: Union[T5EncoderModel, VitsModel],
        projection_model: AudioLDM2ProjectionModel,
        language_model: GPT2Model,
        tokenizer: Union[RobertaTokenizer, RobertaTokenizerFast],
        tokenizer_2: Union[T5Tokenizer, T5TokenizerFast, VitsTokenizer],
        feature_extractor: ClapFeatureExtractor,
        unet: AudioLDM2UNet2DConditionModel,
        scheduler: KarrasDiffusionSchedulers,
        vocoder: SpeechT5HifiGan,
    ):
        """Register every pipeline component and derive `vae_scale_factor` from the VAE config."""
        super().__init__()

        # Each component is registered under the same name as its constructor argument.
        components = {
            "vae": vae,
            "text_encoder": text_encoder,
            "text_encoder_2": text_encoder_2,
            "projection_model": projection_model,
            "language_model": language_model,
            "tokenizer": tokenizer,
            "tokenizer_2": tokenizer_2,
            "feature_extractor": feature_extractor,
            "unet": unet,
            "scheduler": scheduler,
            "vocoder": vocoder,
        }
        self.register_modules(**components)

        # The scale factor is 2 raised to (number of `block_out_channels` entries - 1).
        num_vae_blocks = len(self.vae.config.block_out_channels)
        self.vae_scale_factor = 2 ** (num_vae_blocks - 1)

    # 启用 VAE 切片解码的方法
    # 当此选项启用时，VAE 将输入张量分成切片进行多步骤解码
    # 有助于节省内存并允许更大的批处理大小
    def enable_vae_slicing(self):
        r"""
        Switch the VAE to sliced decoding by delegating to `self.vae.enable_slicing()`. With slicing on, the VAE
        splits the input tensor into slices and decodes them in several steps, which saves memory and allows larger
        batch sizes.
        """
        autoencoder = self.vae
        autoencoder.enable_slicing()

    # 禁用 VAE 切片解码的方法
    # 如果之前启用了 `enable_vae_slicing`，则返回到单步解码
    def disable_vae_slicing(self):
        r"""
        Switch the VAE back to single-step decoding by delegating to `self.vae.disable_slicing()`. This undoes a
        previous call to `enable_vae_slicing`.
        """
        autoencoder = self.vae
        autoencoder.disable_slicing()
    # 定义一个方法，用于将所有模型迁移到 CPU，降低内存使用并保持较低的性能影响
    def enable_model_cpu_offload(self, gpu_id=0):
        r"""
        Offload all models to CPU with `accelerate`, chaining one offload hook per sub-model.

        Compared to `enable_sequential_cpu_offload`, one whole model at a time is moved to the GPU when its
        `forward` method is called, and it stays there until the next model runs. Memory savings are lower than with
        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.

        Args:
            gpu_id (`int`, defaults to 0): Index of the CUDA device the models are moved to on demand.

        Raises:
            ImportError: If `accelerate` is unavailable or older than `0.17.0.dev0`.
        """
        accelerate_is_usable = is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0")
        if not accelerate_is_usable:
            raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
        from accelerate import cpu_offload_with_hook

        target_device = torch.device(f"cuda:{gpu_id}")

        if self.device.type != "cpu":
            self.to("cpu", silence_dtype_warnings=True)
            # Without emptying the cache the memory savings are not visible (though they probably exist).
            torch.cuda.empty_cache()

        # Hooks are chained in exactly this order.
        offload_order = (
            self.text_encoder.text_model,
            self.text_encoder.text_projection,
            self.text_encoder_2,
            self.projection_model,
            self.language_model,
            self.unet,
            self.vae,
            self.vocoder,
            self.text_encoder,
        )

        previous_hook = None
        for module in offload_order:
            # Each hook receives its predecessor through `prev_module_hook`.
            previous_hook = cpu_offload_with_hook(module, target_device, prev_module_hook=previous_hook)[1]

        # Keep the hook of the last model so it can be offloaded manually.
        self.final_offload_hook = previous_hook
    
    # 定义生成语言模型的方法，接受输入和其他参数
    def generate_language_model(
        self,
        inputs_embeds: torch.Tensor = None,
        max_new_tokens: int = 8,
        **model_kwargs,
    ):
        """
        Generate a sequence of hidden states from the language model, conditioned on the embedding inputs.

        At every step the language model's last hidden state for the final position is appended to
        `inputs_embeds` and becomes part of the input for the next step.

        Args:
            inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
                The sequence used as a prompt for the generation.
            max_new_tokens (`int`):
                Number of new positions to generate. When `None`, `self.language_model.config.max_new_tokens` is
                used instead.
            model_kwargs (`Dict[str, Any]`, *optional*):
                Extra keyword arguments. They are passed to `prepare_inputs_for_generation` at every step and are
                updated by the language model's `_get_initial_cache_position` and
                `_update_model_kwargs_for_generation` helpers.

        Returns:
            `torch.Tensor` of shape `(batch_size, num_generated, hidden_size)`:
                Only the newly generated hidden states (the prompt is stripped). The sequence dimension is empty
                when no generation step runs.
        """
        max_new_tokens = max_new_tokens if max_new_tokens is not None else self.language_model.config.max_new_tokens
        # Remember where the prompt ends so that the generated part can be sliced off afterwards.
        prompt_length = inputs_embeds.shape[1]
        model_kwargs = self.language_model._get_initial_cache_position(inputs_embeds, model_kwargs)
        for _ in range(max_new_tokens):
            # Prepare the model inputs for this step.
            model_inputs = prepare_inputs_for_generation(inputs_embeds, **model_kwargs)

            # Forward pass to get the next hidden states.
            output = self.language_model(**model_inputs, return_dict=True)

            next_hidden_states = output.last_hidden_state

            # Append the hidden state of the final position to the running input sequence.
            inputs_embeds = torch.cat([inputs_embeds, next_hidden_states[:, -1:, :]], dim=1)

            # Let the language model update its generation kwargs for the next step.
            model_kwargs = self.language_model._update_model_kwargs_for_generation(output, model_kwargs)

        # Slice from the end of the prompt rather than with `-max_new_tokens:`; the latter becomes `-0:` when
        # `max_new_tokens` is 0 and would wrongly return the entire prompt.
        return inputs_embeds[:, prompt_length:, :]

    def encode_prompt(
        self,
        prompt,
        device,
        num_waveforms_per_prompt,
        do_classifier_free_guidance,
        transcription=None,
        negative_prompt=None,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        generated_prompt_embeds: Optional[torch.Tensor] = None,
        negative_generated_prompt_embeds: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        negative_attention_mask: Optional[torch.LongTensor] = None,
        max_new_tokens: Optional[int] = None,
    # 从 diffusers.pipelines.audioldm.pipeline_audioldm.AudioLDMPipeline 复制的函数
    def mel_spectrogram_to_waveform(self, mel_spectrogram):
        """Run the vocoder on a mel spectrogram and return the waveform as a float32 CPU tensor.

        A 4-dimensional input has dimension 1 squeezed before it is passed to `self.vocoder`.
        """
        vocoder_input = mel_spectrogram.squeeze(1) if mel_spectrogram.dim() == 4 else mel_spectrogram
        # Always cast to float32: this keeps compatibility with bfloat16 without significant overhead.
        return self.vocoder(vocoder_input).cpu().float()
    # 定义一个方法来评分音频波形与文本的相似度
    def score_waveforms(self, text, audio, num_waveforms_per_prompt, device, dtype):
        """Reorder the generated waveforms by their similarity to the prompt text.

        The text is tokenized, the audio is resampled from the vocoder's sampling rate to the feature extractor's
        sampling rate, and both are passed to `self.text_encoder`. Waveforms are then selected in descending order
        of `logits_per_text`, keeping `num_waveforms_per_prompt` per row. When `librosa` is not installed the
        audio is returned unchanged, in generation order.
        """
        if not is_librosa_available():
            logger.info(
                "Automatic scoring of the generated audio waveforms against the input prompt text requires the "
                "`librosa` package to resample the generated waveforms. Returning the audios in the order they were "
                "generated. To enable automatic scoring, install `librosa` with: `pip install librosa`."
            )
            return audio

        scoring_inputs = self.tokenizer(text, return_tensors="pt", padding=True)

        # Resample from the vocoder's rate to the rate the feature extractor expects.
        resampled = librosa.resample(
            audio.numpy(), orig_sr=self.vocoder.config.sampling_rate, target_sr=self.feature_extractor.sampling_rate
        )
        extracted = self.feature_extractor(
            list(resampled), return_tensors="pt", sampling_rate=self.feature_extractor.sampling_rate
        )
        scoring_inputs["input_features"] = extracted.input_features.type(dtype)
        scoring_inputs = scoring_inputs.to(device)

        # Text-audio similarity scores from the text encoder.
        similarity = self.text_encoder(**scoring_inputs).logits_per_text
        # Best matches first; keep the top `num_waveforms_per_prompt` indices per row.
        ranking = torch.argsort(similarity, dim=1, descending=True)
        selected = ranking[:, :num_waveforms_per_prompt].reshape(-1).cpu()
        return torch.index_select(audio, 0, selected)

    # 从 diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs 复制的方法
    def prepare_extra_step_kwargs(self, generator, eta):
        """Collect the optional keyword arguments that `self.scheduler.step` accepts.

        Schedulers do not all share one `step` signature, so `eta` and `generator` are only forwarded when the
        signature declares them. `eta` corresponds to η in the DDIM paper (https://arxiv.org/abs/2010.02502) and
        should lie in [0, 1]; it is only used with the DDIMScheduler.

        Returns:
            A dict holding `eta` and/or `generator`, restricted to the names `step` declares.
        """
        step_parameters = inspect.signature(self.scheduler.step).parameters
        candidates = {"eta": eta, "generator": generator}
        return {name: value for name, value in candidates.items() if name in step_parameters}

    # 定义检查输入参数的方法
    def check_inputs(
        self,
        prompt,
        audio_length_in_s,
        vocoder_upsample_factor,
        callback_steps,
        transcription=None,
        negative_prompt=None,
        prompt_embeds=None,
        negative_prompt_embeds=None,
        generated_prompt_embeds=None,
        negative_generated_prompt_embeds=None,
        attention_mask=None,
        negative_attention_mask=None,
    # 从 diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents 复制的方法，宽度->self.vocoder.config.model_in_dim
    # 准备潜在变量，返回处理后的潜在变量张量
        def prepare_latents(self, batch_size, num_channels_latents, height, dtype, device, generator, latents=None):
            """Return the initial latents, scaled by the scheduler's `init_noise_sigma`.

            Fresh noise is drawn with `randn_tensor` when `latents` is `None`; otherwise the given latents are
            moved to `device`. The noise shape is `(batch_size, num_channels_latents, height // vae_scale_factor,
            vocoder.config.model_in_dim // vae_scale_factor)`.

            Raises:
                ValueError: If `generator` is a list whose length differs from `batch_size`.
            """
            latent_height = int(height) // self.vae_scale_factor
            latent_width = int(self.vocoder.config.model_in_dim) // self.vae_scale_factor
            latent_shape = (batch_size, num_channels_latents, latent_height, latent_width)

            # A list of generators must supply exactly one generator per batch element.
            generator_count_mismatch = isinstance(generator, list) and len(generator) != batch_size
            if generator_count_mismatch:
                raise ValueError(
                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
                )

            if latents is not None:
                initial_latents = latents.to(device)
            else:
                initial_latents = randn_tensor(latent_shape, generator=generator, device=device, dtype=dtype)

            # Scale the initial noise by the standard deviation required by the scheduler.
            return initial_latents * self.scheduler.init_noise_sigma
    
        # 禁用梯度计算，优化内存使用
        @torch.no_grad()
        # 替换示例文档字符串
        @replace_example_docstring(EXAMPLE_DOC_STRING)
        def __call__(
            # 可选的提示字符串或字符串列表
            prompt: Union[str, List[str]] = None,
            # 可选的转录字符串或字符串列表
            transcription: Union[str, List[str]] = None,
            # 可选的音频时长，以秒为单位
            audio_length_in_s: Optional[float] = None,
            # 进行推理的步数
            num_inference_steps: int = 200,
            # 引导尺度
            guidance_scale: float = 3.5,
            # 可选的负面提示字符串或字符串列表
            negative_prompt: Optional[Union[str, List[str]]] = None,
            # 可选的每个提示生成的波形数量
            num_waveforms_per_prompt: Optional[int] = 1,
            # 控制随机性的参数
            eta: float = 0.0,
            # 可选的生成器或生成器列表
            generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
            # 可选的潜在变量张量
            latents: Optional[torch.Tensor] = None,
            # 可选的提示嵌入张量
            prompt_embeds: Optional[torch.Tensor] = None,
            # 可选的负面提示嵌入张量
            negative_prompt_embeds: Optional[torch.Tensor] = None,
            # 可选的生成的提示嵌入张量
            generated_prompt_embeds: Optional[torch.Tensor] = None,
            # 可选的负面生成提示嵌入张量
            negative_generated_prompt_embeds: Optional[torch.Tensor] = None,
            # 可选的注意力掩码
            attention_mask: Optional[torch.LongTensor] = None,
            # 可选的负面注意力掩码
            negative_attention_mask: Optional[torch.LongTensor] = None,
            # 可选的最大新令牌数量
            max_new_tokens: Optional[int] = None,
            # 是否返回字典格式的输出
            return_dict: bool = True,
            # 可选的回调函数
            callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
            # 可选的回调步骤
            callback_steps: Optional[int] = 1,
            # 可选的交叉注意力参数
            cross_attention_kwargs: Optional[Dict[str, Any]] = None,
            # 可选的输出类型
            output_type: Optional[str] = "np",

`.\diffusers\pipelines\audioldm2\__init__.py`

# 从类型检查模块导入 TYPE_CHECKING，用于静态类型检查
from typing import TYPE_CHECKING

# 从上级模块导入所需的工具函数和常量
from ...utils import (
    DIFFUSERS_SLOW_IMPORT,  # 导入用于慢导入的常量
    OptionalDependencyNotAvailable,  # 导入用于处理可选依赖不可用的异常
    _LazyModule,  # 导入延迟加载模块的类
    get_objects_from_module,  # 导入从模块获取对象的函数
    is_torch_available,  # 导入检查 PyTorch 是否可用的函数
    is_transformers_available,  # 导入检查 Transformers 是否可用的函数
    is_transformers_version,  # 导入检查 Transformers 版本的函数
)

# Placeholder objects exposed when the optional dependencies are missing.
_dummy_objects = {}
# Maps submodule name -> list of public names; handed to `_LazyModule` below.
_import_structure = {}

# Decide which of the two dicts to fill, based on the optional dependencies.
try:
    # Requires transformers (>= 4.27.0) and torch to both be available.
    if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")):
        # Signal the missing dependency so the `except` branch below runs.
        raise OptionalDependencyNotAvailable()
# Dependencies missing: fall back to the dummy objects.
except OptionalDependencyNotAvailable:
    # Imported here so the dummy module is only loaded when it is needed.
    from ...utils import dummy_torch_and_transformers_objects

    # Collect every object of the dummy module into `_dummy_objects`.
    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
# Dependencies present: declare the real submodules and their public names.
else:
    # Model classes and the pipeline class of this package.
    _import_structure["modeling_audioldm2"] = ["AudioLDM2ProjectionModel", "AudioLDM2UNet2DConditionModel"]
    _import_structure["pipeline_audioldm2"] = ["AudioLDM2Pipeline"]

# Under type checking or slow-import mode, import everything eagerly.
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    # Same dependency check as above.
    try:
        # Requires transformers (>= 4.27.0) and torch to both be available.
        if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")):
            # Signal the missing dependency so the `except` branch below runs.
            raise OptionalDependencyNotAvailable()
    # Dependencies missing: import the dummy objects instead.
    except OptionalDependencyNotAvailable:
        # Star-import of the dummy placeholders.
        from ...utils.dummy_torch_and_transformers_objects import *

    # Dependencies present: import the real classes.
    else:
        # Model classes.
        from .modeling_audioldm2 import AudioLDM2ProjectionModel, AudioLDM2UNet2DConditionModel
        # Pipeline class.
        from .pipeline_audioldm2 import AudioLDM2Pipeline

# Otherwise install a lazy module in place of this one.
else:
    # Only needed for the `sys.modules` replacement below.
    import sys

    # Replace this module's `sys.modules` entry with a `_LazyModule` built from `_import_structure`.
    sys.modules[__name__] = _LazyModule(
        __name__,  # name of this module
        globals()["__file__"],  # file path of this module
        _import_structure,  # submodule -> public names mapping
        module_spec=__spec__,  # module spec of this module
    )
    # Attach each dummy object to the replaced module as an attribute.
    for name, value in _dummy_objects.items():
        setattr(sys.modules[__name__], name, value)

`.\diffusers\pipelines\aura_flow\pipeline_aura_flow.py`

# 版权声明，2024年AuraFlow作者和HuggingFace团队保留所有权利
#
# 根据Apache许可证第2.0版（“许可证”）进行授权；
# 除非遵守该许可证，否则不得使用此文件。
# 您可以在以下网址获取许可证副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律或书面协议另有约定，软件
# 按“原样”分发，不附带任何形式的保证或条件，
# 明示或暗示。有关许可证的具体权限和
# 限制，请参见许可证。
import inspect  # 导入inspect模块，用于获取对象的信息
from typing import List, Optional, Tuple, Union  # 从typing模块导入类型提示工具

import torch  # 导入torch库，用于张量计算和深度学习
from transformers import T5Tokenizer, UMT5EncoderModel  # 从transformers导入T5分词器和UMT5编码模型

from ...image_processor import VaeImageProcessor  # 导入变分自编码器图像处理器
from ...models import AuraFlowTransformer2DModel, AutoencoderKL  # 导入模型类
from ...models.attention_processor import AttnProcessor2_0, FusedAttnProcessor2_0, XFormersAttnProcessor  # 导入注意力处理器类
from ...schedulers import FlowMatchEulerDiscreteScheduler  # 导入调度器类
from ...utils import logging, replace_example_docstring  # 导入日志工具和文档替换工具
from ...utils.torch_utils import randn_tensor  # 导入生成随机张量的工具
from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput  # 导入扩散管道和图像输出类

logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器

# Usage example for the AuraFlow pipeline. The code fence is closed with a plain "```"
# so the example forms one well-formed fenced block.
EXAMPLE_DOC_STRING = """
    示例：
        ```py
        >>> import torch
        >>> from diffusers import AuraFlowPipeline

        >>> pipe = AuraFlowPipeline.from_pretrained("fal/AuraFlow", torch_dtype=torch.float16)  # 从预训练模型创建管道
        >>> pipe = pipe.to("cuda")  # 将管道移动到GPU设备
        >>> prompt = "A cat holding a sign that says hello world"  # 定义输入提示
        >>> image = pipe(prompt).images[0]  # 生成图像
        >>> image.save("aura_flow.png")  # 保存生成的图像
        ```
"""

# 从diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion导入的函数
def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    """
    Call the scheduler's `set_timesteps` method and retrieve the timesteps from the scheduler afterwards. Handles
    custom timesteps and custom sigmas. Any kwargs are passed on to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.

    Raises:
        ValueError: If both `timesteps` and `sigmas` are passed, or if the scheduler's `set_timesteps` signature does
            not declare the custom argument that was passed.
    """
    # Custom timesteps and custom sigmas are mutually exclusive.
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
    if timesteps is not None:
        # Only forward `timesteps` when the scheduler's `set_timesteps` declares that parameter.
        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accepts_timesteps:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
        timesteps = scheduler.timesteps
        # The step count is taken from the schedule the scheduler actually produced.
        num_inference_steps = len(timesteps)
    elif sigmas is not None:
        # Only forward `sigmas` when the scheduler's `set_timesteps` declares that parameter.
        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accept_sigmas:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    else:
        # No custom schedule: let the scheduler build one from `num_inference_steps`.
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        timesteps = scheduler.timesteps
    return timesteps, num_inference_steps
# 定义一个名为 AuraFlowPipeline 的类，继承自 DiffusionPipeline
class AuraFlowPipeline(DiffusionPipeline):
    r"""
    参数：
        tokenizer (`T5TokenizerFast`):
            T5Tokenizer 类的分词器
        text_encoder ([`T5EncoderModel`]):
            冻结的文本编码器。AuraFlow 使用 T5，具体是 
            [EleutherAI/pile-t5-xl](https://huggingface.co/EleutherAI/pile-t5-xl) 变体
        vae ([`AutoencoderKL`]):
            用于将图像编码和解码为潜在表示的变分自编码器模型
        transformer ([`AuraFlowTransformer2DModel`]):
            条件 Transformer 架构 (MMDiT 和 DiT) 用于去噪编码的图像潜在表示
        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
            用于与 `transformer` 结合使用的调度器，以去噪编码的图像潜在表示
    """

    # 可选组件的列表，初始化为空
    _optional_components = []
    # 定义模型在 CPU 上卸载的顺序
    model_cpu_offload_seq = "text_encoder->transformer->vae"

    # 初始化方法，定义类的参数
    def __init__(
        self,
        tokenizer: T5Tokenizer,
        text_encoder: UMT5EncoderModel,
        vae: AutoencoderKL,
        transformer: AuraFlowTransformer2DModel,
        scheduler: FlowMatchEulerDiscreteScheduler,
    ):
        """Register the pipeline components and derive the VAE-dependent helpers.

        Args:
            tokenizer: Tokenizer paired with `text_encoder`.
            text_encoder: Text encoder used to embed prompts.
            vae: Autoencoder used to move between image space and latent space.
            transformer: Denoising transformer.
            scheduler: Scheduler used together with `transformer`.
        """
        super().__init__()

        # Make every component available as an attribute of the pipeline.
        self.register_modules(
            tokenizer=tokenizer,
            text_encoder=text_encoder,
            vae=vae,
            transformer=transformer,
            scheduler=scheduler,
        )

        # The spatial down-scaling factor follows from the number of VAE blocks;
        # fall back to 8 when no VAE is registered.
        if hasattr(self, "vae") and self.vae is not None:
            self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        else:
            self.vae_scale_factor = 8

        # Image pre/post-processor configured with the same scale factor.
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)

    # 检查输入参数的方法
    def check_inputs(
        self,
        prompt,
        height,
        width,
        negative_prompt,
        prompt_embeds=None,
        negative_prompt_embeds=None,
        prompt_attention_mask=None,
        negative_prompt_attention_mask=None,
    ):
        """Validate the user-facing arguments before any work is done.

        Raises:
            ValueError: If `height`/`width` are not multiples of 8, if `prompt` and
                `prompt_embeds` are both given or both missing, if `prompt` is not a
                `str`/`list`, if raw prompts are combined with `negative_prompt_embeds`,
                if embeddings are passed without their attention mask, or if the
                positive and negative embeddings/masks differ in shape.
        """
        has_prompt = prompt is not None
        has_prompt_embeds = prompt_embeds is not None
        has_negative_embeds = negative_prompt_embeds is not None

        # Spatial size must be a multiple of 8.
        if any(dim % 8 != 0 for dim in (height, width)):
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

        # Exactly one of `prompt` / `prompt_embeds` must be supplied, and a raw prompt
        # has to be a string or a list.
        if has_prompt and has_prompt_embeds:
            raise ValueError(
                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
                " only forward one of the two."
            )
        if not has_prompt and not has_prompt_embeds:
            raise ValueError(
                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
            )
        if has_prompt and not isinstance(prompt, (str, list)):
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

        # Pre-computed negative embeddings cannot be mixed with raw text inputs.
        if has_prompt and has_negative_embeds:
            raise ValueError(
                f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:"
                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
            )
        if negative_prompt is not None and has_negative_embeds:
            raise ValueError(
                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
            )

        # Embeddings always travel together with their attention mask.
        if has_prompt_embeds and prompt_attention_mask is None:
            raise ValueError("Must provide `prompt_attention_mask` when specifying `prompt_embeds`.")
        if has_negative_embeds and negative_prompt_attention_mask is None:
            raise ValueError("Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.")

        # Shape comparisons only make sense when both embedding sets were passed directly.
        if not (has_prompt_embeds and has_negative_embeds):
            return

        if prompt_embeds.shape != negative_prompt_embeds.shape:
            raise ValueError(
                "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                f" {negative_prompt_embeds.shape}."
            )
        # Both masks are guaranteed non-None here by the checks above.
        if prompt_attention_mask.shape != negative_prompt_attention_mask.shape:
            raise ValueError(
                "`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed directly, but"
                f" got: `prompt_attention_mask` {prompt_attention_mask.shape} != `negative_prompt_attention_mask`"
                f" {negative_prompt_attention_mask.shape}."
            )
    # 定义一个编码提示的函数，接受多个参数以构建提示
    def encode_prompt(
        self,
        prompt: Union[str, List[str]],  # 提示文本，可以是字符串或字符串列表
        negative_prompt: Union[str, List[str]] = None,  # 负面提示文本，可以是字符串或字符串列表，默认为 None
        do_classifier_free_guidance: bool = True,  # 是否使用无分类器引导，默认为 True
        num_images_per_prompt: int = 1,  # 每个提示生成的图像数量，默认为 1
        device: Optional[torch.device] = None,  # 指定设备（如 CPU 或 GPU），默认为 None
        prompt_embeds: Optional[torch.Tensor] = None,  # 提示的嵌入张量，默认为 None
        negative_prompt_embeds: Optional[torch.Tensor] = None,  # 负面提示的嵌入张量，默认为 None
        prompt_attention_mask: Optional[torch.Tensor] = None,  # 提示的注意力掩码，默认为 None
        negative_prompt_attention_mask: Optional[torch.Tensor] = None,  # 负面提示的注意力掩码，默认为 None
        max_sequence_length: int = 256,  # 最大序列长度，默认为 256
    # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3.StableDiffusion3Pipeline.prepare_latents
    def prepare_latents(
        self,
        batch_size,
        num_channels_latents,
        height,
        width,
        dtype,
        device,
        generator,
        latents=None,
    ):
        """Return the starting latents for the denoising loop.

        When `latents` is supplied it is only moved to `device`/`dtype`. Otherwise a
        tensor of shape `(batch_size, num_channels_latents, height // f, width // f)` is
        sampled with `randn_tensor`, where `f` is `self.vae_scale_factor`.

        Raises:
            ValueError: If `generator` is a list whose length differs from `batch_size`.
        """
        if latents is None:
            # Latent resolution is the pixel resolution divided by the VAE scale factor.
            latent_shape = (
                batch_size,
                num_channels_latents,
                int(height) // self.vae_scale_factor,
                int(width) // self.vae_scale_factor,
            )

            # A list of generators must provide exactly one generator per sample.
            generator_count_mismatch = isinstance(generator, list) and len(generator) != batch_size
            if generator_count_mismatch:
                raise ValueError(
                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
                )

            return randn_tensor(latent_shape, generator=generator, device=device, dtype=dtype)

        # Caller-provided latents are used as-is, only cast/moved.
        return latents.to(device=device, dtype=dtype)

    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.upcast_vae
    def upcast_vae(self):
        """Cast the VAE to float32, keeping selected submodules in the original dtype when possible."""
        # Remember the dtype the VAE had before upcasting.
        original_dtype = self.vae.dtype
        self.vae.to(dtype=torch.float32)

        attn_processor = self.vae.decoder.mid_block.attentions[0].processor
        memory_efficient_processors = (
            AttnProcessor2_0,
            XFormersAttnProcessor,
            FusedAttnProcessor2_0,
        )
        if not isinstance(attn_processor, memory_efficient_processors):
            return

        # With xformers or torch 2.0 attention the attention block does not need to be
        # float32, which saves a lot of memory: move these parts back to the original dtype.
        for submodule in (
            self.vae.post_quant_conv,
            self.vae.decoder.conv_in,
            self.vae.decoder.mid_block,
        ):
            submodule.to(original_dtype)

    # 不计算梯度装饰器，通常用于推理时以节省内存和计算
    @torch.no_grad()
    # 替换示例文档字符串的装饰器
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    # 定义一个可调用的方法，支持多种参数配置
        def __call__(
            # 提示文本，可以是单个字符串或字符串列表
            self,
            prompt: Union[str, List[str]] = None,
            # 负面提示文本，可以是单个字符串或字符串列表
            negative_prompt: Union[str, List[str]] = None,
            # 推理步骤的数量，默认为50
            num_inference_steps: int = 50,
            # 时间步列表，用于推理过程
            timesteps: List[int] = None,
            # sigma值列表，控制噪声级别
            sigmas: List[float] = None,
            # 引导缩放因子，默认为3.5
            guidance_scale: float = 3.5,
            # 每个提示生成的图像数量，默认为1
            num_images_per_prompt: Optional[int] = 1,
            # 生成图像的高度，默认为1024
            height: Optional[int] = 1024,
            # 生成图像的宽度，默认为1024
            width: Optional[int] = 1024,
            # 随机数生成器，可以是单个或多个生成器
            generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
            # 初始潜在向量，可以是一个张量
            latents: Optional[torch.Tensor] = None,
            # 提示文本的嵌入向量，可以是一个张量
            prompt_embeds: Optional[torch.Tensor] = None,
            # 提示文本的注意力掩码，可以是一个张量
            prompt_attention_mask: Optional[torch.Tensor] = None,
            # 负面提示文本的嵌入向量，可以是一个张量
            negative_prompt_embeds: Optional[torch.Tensor] = None,
            # 负面提示文本的注意力掩码，可以是一个张量
            negative_prompt_attention_mask: Optional[torch.Tensor] = None,
            # 最大序列长度，默认为256
            max_sequence_length: int = 256,
            # 输出类型，默认为"pil"
            output_type: Optional[str] = "pil",
            # 是否返回字典格式的输出，默认为True
            return_dict: bool = True,

`.\diffusers\pipelines\aura_flow\__init__.py`

# 从 typing 模块导入 TYPE_CHECKING，以支持类型检查
from typing import TYPE_CHECKING

# 从 utils 模块导入所需的工具和常量
from ...utils import (
    DIFFUSERS_SLOW_IMPORT,  # 导入标识符，用于判断慢导入
    OptionalDependencyNotAvailable,  # 导入自定义异常，用于处理可选依赖未找到的情况
    _LazyModule,  # 导入懒加载模块的类
    get_objects_from_module,  # 导入从模块获取对象的函数
    is_torch_available,  # 导入检查 PyTorch 是否可用的函数
    is_transformers_available,  # 导入检查 Transformers 是否可用的函数
)

# Package bootstrap: exposes `AuraFlowPipeline` when both `torch` and `transformers`
# are installed, and placeholder ("dummy") objects otherwise.

# Placeholder objects collected when the optional dependencies are missing.
_dummy_objects = {}
# Maps submodule name -> list of public names it provides.
_import_structure = {}

# Decide at import time whether the real pipeline can be offered.
try:
    # Raise if EITHER transformers or torch is unavailable (both are required).
    if not (is_transformers_available() and is_torch_available()):
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    # Fall back to the dummy-object module so that importing this package does not fail.
    from ...utils import dummy_torch_and_transformers_objects  # noqa F403

    # Collect every object of the dummy module into `_dummy_objects`.
    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
else:
    # Dependencies are present: register the real pipeline in the import structure.
    _import_structure["pipeline_aura_flow"] = ["AuraFlowPipeline"]

# Under static type checking, or when slow imports are requested, import eagerly.
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    try:
        # Same dependency check as above: either one missing triggers the fallback.
        if not (is_transformers_available() and is_torch_available()):
            raise OptionalDependencyNotAvailable()

    except OptionalDependencyNotAvailable:
        # Pull in all dummy objects in place of the real classes.
        from ...utils.dummy_torch_and_transformers_objects import *
    else:
        from .pipeline_aura_flow import AuraFlowPipeline

# Otherwise replace this module with a `_LazyModule` built from `_import_structure`.
else:
    import sys

    # NOTE(review): `_LazyModule` presumably defers importing the listed submodules until
    # their names are first accessed - confirm against its definition in `...utils`.
    sys.modules[__name__] = _LazyModule(
        __name__,  # module name
        globals()["__file__"],  # path of this module's file
        _import_structure,  # submodule -> exported names
        module_spec=__spec__,  # original module spec
    )

    # Attach every dummy object to the (replaced) module so the names still resolve.
    for name, value in _dummy_objects.items():
        setattr(sys.modules[__name__], name, value)

`.\diffusers\pipelines\auto_pipeline.py`

# 指定文件编码为 UTF-8
# coding=utf-8
# 版权声明，指明代码版权所有者为 HuggingFace Inc. 团队
# Copyright 2024 The HuggingFace Inc. team.
#
# 指明该文件遵循 Apache License 2.0，用户需遵循该许可证的条款
# Licensed under the Apache License, Version 2.0 (the "License");
# 用户不得在不遵循许可证的情况下使用此文件
# you may not use this file except in compliance with the License.
# 用户可以在以下地址获取许可证副本
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 如果没有适用的法律规定或书面协议，软件在 "AS IS" 基础上分发，不提供任何保证
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# 不提供任何明示或暗示的保证或条件
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 指明许可证的详细信息，包括权限和限制
# See the License for the specific language governing permissions and
# limitations under the License.

# 从 collections 模块导入有序字典类
from collections import OrderedDict

# 从 huggingface_hub.utils 导入参数验证工具
from huggingface_hub.utils import validate_hf_hub_args

# 从 configuration_utils 导入配置混合器
from ..configuration_utils import ConfigMixin
# Import the helper that checks whether the optional `sentencepiece` library is installed
from ..utils import is_sentencepiece_available
# 从 aura_flow 模块导入 AuraFlow 管道类
from .aura_flow import AuraFlowPipeline
# 从 controlnet 模块导入多种稳定扩散控制网络管道
from .controlnet import (
    StableDiffusionControlNetImg2ImgPipeline,
    StableDiffusionControlNetInpaintPipeline,
    StableDiffusionControlNetPipeline,
    StableDiffusionXLControlNetImg2ImgPipeline,
    StableDiffusionXLControlNetInpaintPipeline,
    StableDiffusionXLControlNetPipeline,
)
# 从 deepfloyd_if 模块导入多种图像处理管道
from .deepfloyd_if import IFImg2ImgPipeline, IFInpaintingPipeline, IFPipeline
# 从 flux 模块导入 Flux 管道类
from .flux import FluxPipeline
# 从 hunyuandit 模块导入 HunyuanDiT 管道类
from .hunyuandit import HunyuanDiTPipeline
# 从 kandinsky 模块导入多种 Kandinsky 管道
from .kandinsky import (
    KandinskyCombinedPipeline,
    KandinskyImg2ImgCombinedPipeline,
    KandinskyImg2ImgPipeline,
    KandinskyInpaintCombinedPipeline,
    KandinskyInpaintPipeline,
    KandinskyPipeline,
)
# 从 kandinsky2_2 模块导入多种 Kandinsky V2.2 管道
from .kandinsky2_2 import (
    KandinskyV22CombinedPipeline,
    KandinskyV22Img2ImgCombinedPipeline,
    KandinskyV22Img2ImgPipeline,
    KandinskyV22InpaintCombinedPipeline,
    KandinskyV22InpaintPipeline,
    KandinskyV22Pipeline,
)
# 从 kandinsky3 模块导入 Kandinsky 3 的图像处理管道
from .kandinsky3 import Kandinsky3Img2ImgPipeline, Kandinsky3Pipeline
# 从 latent_consistency_models 模块导入潜在一致性模型管道
from .latent_consistency_models import LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline
# 从 pag 模块导入多种 PAG 管道
from .pag import (
    HunyuanDiTPAGPipeline,
    PixArtSigmaPAGPipeline,
    StableDiffusion3PAGPipeline,
    StableDiffusionControlNetPAGPipeline,
    StableDiffusionPAGPipeline,
    StableDiffusionXLControlNetPAGPipeline,
    StableDiffusionXLPAGImg2ImgPipeline,
    StableDiffusionXLPAGInpaintPipeline,
    StableDiffusionXLPAGPipeline,
)
# 从 pixart_alpha 模块导入 PixArt Alpha 和 Sigma 管道
from .pixart_alpha import PixArtAlphaPipeline, PixArtSigmaPipeline
# 从 stable_cascade 模块导入多种稳定级联管道
from .stable_cascade import StableCascadeCombinedPipeline, StableCascadeDecoderPipeline
# 从 stable_diffusion 模块导入多种稳定扩散管道
from .stable_diffusion import (
    StableDiffusionImg2ImgPipeline,
    StableDiffusionInpaintPipeline,
    StableDiffusionPipeline,
)
# 从 stable_diffusion_3 模块导入多种稳定扩散 3 管道
from .stable_diffusion_3 import (
    StableDiffusion3Img2ImgPipeline,
    StableDiffusion3InpaintPipeline,
    StableDiffusion3Pipeline,
)
# 从 stable_diffusion_xl 模块导入多种稳定扩散 XL 管道
from .stable_diffusion_xl import (
    StableDiffusionXLImg2ImgPipeline,
    StableDiffusionXLInpaintPipeline,
    StableDiffusionXLPipeline,
)
# 从 wuerstchen 模块导入多种 Wuerstchen 管道
from .wuerstchen import WuerstchenCombinedPipeline, WuerstchenDecoderPipeline


# Ordered registry mapping a model-family key to its text-to-image pipeline class.
# Insertion order is preserved by `OrderedDict`; the family keys are shared with the
# image-to-image and inpainting registries defined in this module.
AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
    [
        ("stable-diffusion", StableDiffusionPipeline),
        ("stable-diffusion-xl", StableDiffusionXLPipeline),
        ("stable-diffusion-3", StableDiffusion3Pipeline),
        ("stable-diffusion-3-pag", StableDiffusion3PAGPipeline),
        ("if", IFPipeline),
        ("hunyuan", HunyuanDiTPipeline),
        ("hunyuan-pag", HunyuanDiTPAGPipeline),
        ("kandinsky", KandinskyCombinedPipeline),
        ("kandinsky22", KandinskyV22CombinedPipeline),
        ("kandinsky3", Kandinsky3Pipeline),
        ("stable-diffusion-controlnet", StableDiffusionControlNetPipeline),
        ("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetPipeline),
        ("wuerstchen", WuerstchenCombinedPipeline),
        ("cascade", StableCascadeCombinedPipeline),
        ("lcm", LatentConsistencyModelPipeline),
        ("pixart-alpha", PixArtAlphaPipeline),
        ("pixart-sigma", PixArtSigmaPipeline),
        ("stable-diffusion-pag", StableDiffusionPAGPipeline),
        ("stable-diffusion-controlnet-pag", StableDiffusionControlNetPAGPipeline),
        ("stable-diffusion-xl-pag", StableDiffusionXLPAGPipeline),
        ("stable-diffusion-xl-controlnet-pag", StableDiffusionXLControlNetPAGPipeline),
        ("pixart-sigma-pag", PixArtSigmaPAGPipeline),
        ("auraflow", AuraFlowPipeline),
        ("flux", FluxPipeline),
    ]
)

# Ordered registry mapping a model-family key to its image-to-image pipeline class.
AUTO_IMAGE2IMAGE_PIPELINES_MAPPING = OrderedDict(
    [
        # One entry per model family that has an image-to-image pipeline.
        ("stable-diffusion", StableDiffusionImg2ImgPipeline),
        ("stable-diffusion-xl", StableDiffusionXLImg2ImgPipeline),
        ("stable-diffusion-3", StableDiffusion3Img2ImgPipeline),
        ("if", IFImg2ImgPipeline),
        ("kandinsky", KandinskyImg2ImgCombinedPipeline),
        ("kandinsky22", KandinskyV22Img2ImgCombinedPipeline),
        ("kandinsky3", Kandinsky3Img2ImgPipeline),
        ("stable-diffusion-controlnet", StableDiffusionControlNetImg2ImgPipeline),
        ("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetImg2ImgPipeline),
        ("stable-diffusion-xl-pag", StableDiffusionXLPAGImg2ImgPipeline),
        ("lcm", LatentConsistencyModelImg2ImgPipeline),
    ]
)

# Ordered registry mapping a model-family key to its inpainting pipeline class.
AUTO_INPAINT_PIPELINES_MAPPING = OrderedDict(
    [
        # One entry per model family that has an inpainting pipeline.
        ("stable-diffusion", StableDiffusionInpaintPipeline),
        ("stable-diffusion-xl", StableDiffusionXLInpaintPipeline),
        ("stable-diffusion-3", StableDiffusion3InpaintPipeline),
        ("if", IFInpaintingPipeline),
        ("kandinsky", KandinskyInpaintCombinedPipeline),
        ("kandinsky22", KandinskyV22InpaintCombinedPipeline),
        ("stable-diffusion-controlnet", StableDiffusionControlNetInpaintPipeline),
        ("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetInpaintPipeline),
        ("stable-diffusion-xl-pag", StableDiffusionXLPAGInpaintPipeline),
    ]
)

# Private registry of text-to-image *decoder* pipelines; `_get_connected_pipeline`
# uses it to find the combined pipeline registered under the same family key.
_AUTO_TEXT2IMAGE_DECODER_PIPELINES_MAPPING = OrderedDict(
    [
        ("kandinsky", KandinskyPipeline),
        ("kandinsky22", KandinskyV22Pipeline),
        ("wuerstchen", WuerstchenDecoderPipeline),
        ("cascade", StableCascadeDecoderPipeline),
    ]
)

# Private registry of image-to-image decoder pipelines.
_AUTO_IMAGE2IMAGE_DECODER_PIPELINES_MAPPING = OrderedDict(
    [
        ("kandinsky", KandinskyImg2ImgPipeline),
        ("kandinsky22", KandinskyV22Img2ImgPipeline),
    ]
)

# Private registry of inpainting decoder pipelines.
_AUTO_INPAINT_DECODER_PIPELINES_MAPPING = OrderedDict(
    [
        ("kandinsky", KandinskyInpaintPipeline),
        ("kandinsky22", KandinskyV22InpaintPipeline),
    ]
)

# The Kolors pipelines are registered only when the optional `sentencepiece`
# dependency is installed, so their imports are guarded by the availability check.
if is_sentencepiece_available():
    from .kolors import KolorsImg2ImgPipeline, KolorsPipeline
    from .pag import KolorsPAGPipeline

    # Text-to-image entries.
    AUTO_TEXT2IMAGE_PIPELINES_MAPPING["kolors"] = KolorsPipeline
    AUTO_TEXT2IMAGE_PIPELINES_MAPPING["kolors-pag"] = KolorsPAGPipeline
    # The image-to-image registry must point at the img2img variant. Registering the
    # text-to-image `KolorsPipeline` here would make the image-to-image auto class
    # resolve Kolors checkpoints to a pipeline that is not an image-to-image pipeline.
    AUTO_IMAGE2IMAGE_PIPELINES_MAPPING["kolors"] = KolorsImg2ImgPipeline

# Every registry defined in this module, public task mappings first and the private
# decoder mappings after them. `_get_task_class` scans this list in order to translate a
# pipeline class name back into its model-family key.
SUPPORTED_TASKS_MAPPINGS = [
    AUTO_TEXT2IMAGE_PIPELINES_MAPPING,
    AUTO_IMAGE2IMAGE_PIPELINES_MAPPING,
    AUTO_INPAINT_PIPELINES_MAPPING,
    _AUTO_TEXT2IMAGE_DECODER_PIPELINES_MAPPING,
    _AUTO_IMAGE2IMAGE_DECODER_PIPELINES_MAPPING,
    _AUTO_INPAINT_DECODER_PIPELINES_MAPPING,
]

# 定义函数以获取连接的管道，参数为管道类
def _get_connected_pipeline(pipeline_cls):
    """Return the combined pipeline class that corresponds to a decoder pipeline class.

    Connected pipelines can currently only be loaded from decoder pipelines. The
    decoder registries are checked in order (text-to-image, image-to-image, inpainting);
    for the first one that contains `pipeline_cls`, the class registered under the same
    model family in the matching task registry is looked up without raising.

    Returns:
        The matching task class, or `None` when `pipeline_cls` is not a registered
        decoder pipeline or the task registry has no entry for its model family.
    """
    decoder_to_task_mappings = (
        (_AUTO_TEXT2IMAGE_DECODER_PIPELINES_MAPPING, AUTO_TEXT2IMAGE_PIPELINES_MAPPING),
        (_AUTO_IMAGE2IMAGE_DECODER_PIPELINES_MAPPING, AUTO_IMAGE2IMAGE_PIPELINES_MAPPING),
        (_AUTO_INPAINT_DECODER_PIPELINES_MAPPING, AUTO_INPAINT_PIPELINES_MAPPING),
    )

    for decoder_mapping, task_mapping in decoder_to_task_mappings:
        if pipeline_cls in decoder_mapping.values():
            # The first decoder registry that knows this class decides the result.
            return _get_task_class(task_mapping, pipeline_cls.__name__, throw_error_if_not_exist=False)

    return None
# 根据映射获取任务类，如果不存在则抛出异常（默认抛出）
def _get_task_class(mapping, pipeline_class_name, throw_error_if_not_exist: bool = True):
    """Look up the pipeline class in `mapping` that belongs to the same model family.

    Args:
        mapping: Registry (model-family key -> pipeline class) to pick the result from.
        pipeline_class_name: `__name__` of a pipeline class registered in any of
            `SUPPORTED_TASKS_MAPPINGS`.
        throw_error_if_not_exist: Raise instead of returning `None` when nothing is found.

    Returns:
        The class stored in `mapping` under the model-family key of
        `pipeline_class_name`, or `None` if there is none and raising is disabled.

    Raises:
        ValueError: If no class is found and `throw_error_if_not_exist` is true.
    """
    # First registry entry whose class name matches determines the model-family key.
    model_name = next(
        (
            family
            for task_mapping in SUPPORTED_TASKS_MAPPINGS
            for family, pipeline in task_mapping.items()
            if pipeline.__name__ == pipeline_class_name
        ),
        None,
    )

    task_class = mapping.get(model_name, None) if model_name is not None else None
    if task_class is not None:
        return task_class

    if throw_error_if_not_exist:
        raise ValueError(f"AutoPipeline can't find a pipeline linked to {pipeline_class_name} for {model_name}")

    return None

# 定义一个文本到图像的自动管道类，继承自 ConfigMixin
class AutoPipelineForText2Image(ConfigMixin):
    r"""

    [`AutoPipelineForText2Image`] is a generic pipeline class that instantiates a text-to-image pipeline class. The
    specific underlying pipeline class is automatically selected from either the
    [`~AutoPipelineForText2Image.from_pretrained`] or [`~AutoPipelineForText2Image.from_pipe`] methods.

    This class cannot be instantiated using `__init__()` (throws an error).

    Class attributes:

        - **config_name** (`str`) -- The configuration filename that stores the class and module names of all the
          diffusion pipeline's components.

    """

    # Name of the configuration file (the model index).
    config_name = "model_index.json"

    def __init__(self, *args, **kwargs):
        # Direct construction is deliberately unsupported: always raise and point the
        # caller at the two factory methods.
        raise EnvironmentError(
            f"{self.__class__.__name__} is designed to be instantiated "
            f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)` or "
            f"`{self.__class__.__name__}.from_pipe(pipeline)` methods."
        )

    # NOTE(review): the factory methods are elided from this excerpt. Only their decorators
    # (`@classmethod` + `@validate_hf_hub_args`, and a second `@classmethod`) were left
    # behind, which is a syntax error without a following `def`. They presumably belong to
    # `from_pretrained` and `from_pipe` - restore both methods from the upstream file.
# 定义一个图像到图像的自动管道类，继承自 ConfigMixin
class AutoPipelineForImage2Image(ConfigMixin):
    r"""

    [`AutoPipelineForImage2Image`] is a generic pipeline class that instantiates an image-to-image pipeline class. The
    specific underlying pipeline class is automatically selected from either the
    [`~AutoPipelineForImage2Image.from_pretrained`] or [`~AutoPipelineForImage2Image.from_pipe`] methods.

    This class cannot be instantiated using `__init__()` (throws an error).

    Class attributes:

        - **config_name** (`str`) -- The configuration filename that stores the class and module names of all the
          diffusion pipeline's components.

    """

    # Name of the configuration file (the model index).
    config_name = "model_index.json"

    def __init__(self, *args, **kwargs):
        # Direct construction is deliberately unsupported: always raise and point the
        # caller at the two factory methods.
        raise EnvironmentError(
            f"{self.__class__.__name__} is designed to be instantiated "
            f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)` or "
            f"`{self.__class__.__name__}.from_pipe(pipeline)` methods."
        )

    # NOTE(review): the factory methods are elided from this excerpt. Only their decorators
    # (`@classmethod` + `@validate_hf_hub_args`, and a second `@classmethod`) were left
    # behind, which is a syntax error without a following `def`. They presumably belong to
    # `from_pretrained` and `from_pipe` - restore both methods from the upstream file.
# 定义一个图像修复的自动管道类，继承自 ConfigMixin
class AutoPipelineForInpainting(ConfigMixin):
    r"""

    [`AutoPipelineForInpainting`] is a generic pipeline class that instantiates an inpainting pipeline class. The
    specific underlying pipeline class is automatically selected from either the
    [`~AutoPipelineForInpainting.from_pretrained`] or [`~AutoPipelineForInpainting.from_pipe`] methods.

    This class cannot be instantiated using `__init__()` (throws an error).

    Class attributes:

        - **config_name** (`str`) -- The configuration filename that stores the class and module names of all the
          diffusion pipeline's components.

    """

    # Name of the configuration file (the model index).
    config_name = "model_index.json"

    def __init__(self, *args, **kwargs):
        # Direct construction is deliberately unsupported: always raise and point the
        # caller at the two factory methods.
        raise EnvironmentError(
            f"{self.__class__.__name__} is designed to be instantiated "
            f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)` or "
            f"`{self.__class__.__name__}.from_pipe(pipeline)` methods."
        )

    # NOTE(review): the factory methods are elided from this excerpt. Only their decorators
    # (`@classmethod` + `@validate_hf_hub_args`, and a second `@classmethod`) were left
    # behind, which is a syntax error without a following `def`. They presumably belong to
    # `from_pretrained` and `from_pipe` - restore both methods from the upstream file.

`.\diffusers\pipelines\blip_diffusion\blip_image_processing.py`

# coding=utf-8  # 指定源代码文件的编码为 UTF-8
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.  # 版权声明，标明版权归 HuggingFace Inc. 团队所有
#
# Licensed under the Apache License, Version 2.0 (the "License");  # 指明该文件遵循 Apache 2.0 许可证
# you may not use this file except in compliance with the License.  # 除非遵守许可证，否则不得使用该文件
# You may obtain a copy of the License at  # 说明如何获取许可证
#
#     http://www.apache.org/licenses/LICENSE-2.0  # 许可证的 URL
#
# Unless required by applicable law or agreed to in writing, software  # 除非法律要求或书面同意
# distributed under the License is distributed on an "AS IS" BASIS,  # 根据许可证分发的软件按“原样”提供
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  # 不提供任何形式的保证或条件
# See the License for the specific language governing permissions and  # 查看许可证以了解有关权限的具体语言
# limitations under the License.  # 以及许可证下的限制
"""Image processor class for BLIP."""  # 该模块是 BLIP 图像处理器类的定义

from typing import Dict, List, Optional, Union  # 导入用于类型注解的模块

import numpy as np  # 导入 NumPy 库，通常用于数组和矩阵操作
import torch  # 导入 PyTorch 库，用于深度学习
from transformers.image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict  # 从 transformers 导入图像处理相关的基类和工具函数
from transformers.image_transforms import convert_to_rgb, resize, to_channel_dimension_format  # 导入图像转换函数
from transformers.image_utils import (  # 导入图像工具函数
    OPENAI_CLIP_MEAN,  # OpenAI CLIP 的均值
    OPENAI_CLIP_STD,  # OpenAI CLIP 的标准差
    ChannelDimension,  # 通道维度相关的定义
    ImageInput,  # 图像输入类型定义
    PILImageResampling,  # PIL 图像重采样功能
    infer_channel_dimension_format,  # 推断通道维度格式的函数
    is_scaled_image,  # 判断是否为缩放图像的函数
    make_list_of_images,  # 将图像转换为图像列表的函数
    to_numpy_array,  # 将数据转换为 NumPy 数组的函数
    valid_images,  # 检查有效图像的函数
)
from transformers.utils import TensorType, is_vision_available, logging  # 导入工具函数和类型定义

from diffusers.utils import numpy_to_pil  # 从 diffusers 导入将 NumPy 数组转换为 PIL 图像的函数


if is_vision_available():  # import PIL only when the vision dependencies are reported as available
    import PIL.Image  # PIL image module


logger = logging.get_logger(__name__)  # module-level logger named after this module


# We needed some extra functions on top of the ones in transformers.image_processing_utils.BaseImageProcessor, namely center crop
# Copy-pasted from transformers.models.blip.image_processing_blip.BlipImageProcessor
class BlipImageProcessor(BaseImageProcessor):
    r"""
    Constructs a BLIP image processor.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
            `do_resize` parameter in the `preprocess` method.
        size (`dict`, *optional*, defaults to `{"height": 224, "width": 224}`):
            Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
            method.
        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
            Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
            overridden by the `resample` parameter in the `preprocess` method.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
            `do_rescale` parameter in the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
            overridden by the `rescale_factor` parameter in the `preprocess` method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method.
        image_mean (`float` or `List[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `List[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        do_center_crop (`bool`, *optional*, defaults to `True`):
            Center-crop flag stored on the instance. `preprocess` also accepts a `do_center_crop` argument.
    """

    # List of model input names; it contains only "pixel_values".
    model_input_names = ["pixel_values"]
    # 初始化方法，用于设置类的基本属性
        def __init__(
            self,
            do_resize: bool = True,
            size: Dict[str, int] = None,
            resample: PILImageResampling = PILImageResampling.BICUBIC,
            do_rescale: bool = True,
            rescale_factor: Union[int, float] = 1 / 255,
            do_normalize: bool = True,
            image_mean: Optional[Union[float, List[float]]] = None,
            image_std: Optional[Union[float, List[float]]] = None,
            do_convert_rgb: bool = True,
            do_center_crop: bool = True,
            **kwargs,
        ) -> None:
            """
            Store the preprocessing options on the instance.

            Args:
                do_resize: Whether resizing is enabled.
                size: Target size dictionary; `{"height": 224, "width": 224}` is used when `None`. The value is
                    passed through `get_size_dict(..., default_to_square=True)` before being stored.
                resample: Resampling filter stored for resizing.
                do_rescale: Whether rescaling is enabled.
                rescale_factor: Factor stored for rescaling.
                do_normalize: Whether normalization is enabled.
                image_mean: Normalization mean; `OPENAI_CLIP_MEAN` is used when `None`.
                image_std: Normalization standard deviation; `OPENAI_CLIP_STD` is used when `None`.
                do_convert_rgb: Whether RGB conversion is enabled.
                do_center_crop: Whether center cropping is enabled.
                **kwargs: Forwarded unchanged to the parent constructor.
            """
            super().__init__(**kwargs)

            # Fall back to the 224x224 default before normalising the size dictionary.
            if size is None:
                size = {"height": 224, "width": 224}
            normalized_size = get_size_dict(size, default_to_square=True)

            self.do_resize = do_resize
            self.size = normalized_size
            self.resample = resample
            self.do_rescale = do_rescale
            self.rescale_factor = rescale_factor
            self.do_normalize = do_normalize
            # The CLIP statistics are the defaults when no mean/std is supplied.
            self.image_mean = OPENAI_CLIP_MEAN if image_mean is None else image_mean
            self.image_std = OPENAI_CLIP_STD if image_std is None else image_std
            self.do_convert_rgb = do_convert_rgb
            self.do_center_crop = do_center_crop
    
        # 从 transformers.models.vit.image_processing_vit.ViTImageProcessor.resize 复制而来，重采样方法由 PILImageResampling.BILINEAR 修改为 PILImageResampling.BICUBIC
        def resize(
            self,
            image: np.ndarray,
            size: Dict[str, int],
            resample: PILImageResampling = PILImageResampling.BICUBIC,
            data_format: Optional[Union[str, ChannelDimension]] = None,
            input_data_format: Optional[Union[str, ChannelDimension]] = None,
            **kwargs,
        ) -> np.ndarray:
            """
            Resize an image to `(size["height"], size["width"])`.

            Args:
                image (`np.ndarray`):
                    Image to resize.
                size (`Dict[str, int]`):
                    Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
                resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
                    `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
                data_format (`ChannelDimension` or `str`, *optional*):
                    The channel dimension format for the output image. If unset, the channel dimension format of the
                    input image is used. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                    - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
                input_data_format (`ChannelDimension` or `str`, *optional*):
                    The channel dimension format for the input image. If unset, the channel dimension format is
                    inferred from the input image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                    - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

            Returns:
                `np.ndarray`: The resized image.

            Raises:
                ValueError: If the normalised `size` dictionary lacks the `height` or `width` key.
            """
            size = get_size_dict(size)
            # Both keys are required to build the (height, width) target tuple.
            if not ("height" in size and "width" in size):
                raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")

            target_hw = (size["height"], size["width"])
            # `resize` here is the module-level transform imported by the file, not this method.
            resized = resize(
                image,
                size=target_hw,
                resample=resample,
                data_format=data_format,
                input_data_format=input_data_format,
                **kwargs,
            )
            return resized

    def preprocess(  # NOTE(review): this signature is cut off in this excerpt; the closing parenthesis and body are not shown
        self,  # the instance itself
        images: ImageInput,  # images to process
        do_resize: Optional[bool] = None,  # optional: whether to resize
        size: Optional[Dict[str, int]] = None,  # optional: target size for resizing
        resample: PILImageResampling = None,  # optional: resampling filter
        do_rescale: Optional[bool] = None,  # optional: whether to rescale
        do_center_crop: Optional[bool] = None,  # optional: whether to center crop
        rescale_factor: Optional[float] = None,  # optional: rescale factor
        do_normalize: Optional[bool] = None,  # optional: whether to normalize
        image_mean: Optional[Union[float, List[float]]] = None,  # optional: image mean (presumably for normalization)
        image_std: Optional[Union[float, List[float]]] = None,  # optional: image standard deviation (presumably for normalization)
        return_tensors: Optional[Union[str, TensorType]] = None,  # optional: requested tensor type of the result
        do_convert_rgb: bool = None,  # optional: whether to convert to RGB
        data_format: ChannelDimension = ChannelDimension.FIRST,  # data format, channels-first by default
        input_data_format: Optional[Union[str, ChannelDimension]] = None,  # optional: channel format of the input images
        **kwargs,  # any further keyword arguments
    # 继承自 diffusers.VaeImageProcessor.postprocess 方法
    def postprocess(self, sample: torch.Tensor, output_type: str = "pil"):
        """
        Convert a model output tensor into the requested output format.

        Args:
            sample (`torch.Tensor`):
                Tensor with values nominally in `[-1, 1]`. For the `"np"` and `"pil"` outputs it is permuted with
                `(0, 2, 3, 1)`, so it must be 4-dimensional (assumed `(batch, channels, height, width)`).
            output_type (`str`, *optional*, defaults to `"pil"`):
                One of `"pt"`, `"np"` or `"pil"`.

        Returns:
            The denormalized tensor for `"pt"`, a float32 NumPy array for `"np"`, or the result of
            `numpy_to_pil` for `"pil"`.

        Raises:
            ValueError: If `output_type` is not one of the supported values.
        """
        if output_type not in ["pt", "np", "pil"]:
            raise ValueError(
                f"output_type={output_type} is not supported. Make sure to choose one of ['pt', 'np', or 'pil']"
            )

        # Equivalent to diffusers.VaeImageProcessor.denormalize: map [-1, 1] to [0, 1].
        sample = (sample / 2 + 0.5).clamp(0, 1)
        if output_type == "pt":
            return sample

        # Equivalent to diffusers.VaeImageProcessor.pt_to_numpy.
        # `.detach()` avoids the error `.numpy()` raises for tensors that require grad, and `.float()` avoids
        # the error it raises for dtypes NumPy cannot represent (e.g. bfloat16).
        sample = sample.detach().cpu().permute(0, 2, 3, 1).float().numpy()
        if output_type == "np":
            return sample

        # Output must be PIL at this point.
        sample = numpy_to_pil(sample)
        return sample

`.\diffusers\pipelines\blip_diffusion\modeling_blip2.py`

# 版权信息，表示该代码归 HuggingFace 团队所有
# 
# 根据 Apache 许可证 2.0 版授权；
# 除非遵守许可证，否则不得使用此文件。
# 可以在以下地址获取许可证：
# 
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律或书面协议另有规定，否则根据许可证分发的软件是“按现状”提供的，
# 不提供任何明示或暗示的担保或条件。
# 请参见许可证，以了解管理权限和
# 限制的具体条款。
from typing import Optional, Tuple, Union  # 导入类型提示模块，包含可选、元组和联合类型

import torch  # 导入 PyTorch 库
import torch.utils.checkpoint  # 导入 PyTorch 检查点工具
from torch import nn  # 从 PyTorch 导入神经网络模块
from transformers import BertTokenizer  # 从 transformers 导入 BERT 分词器
from transformers.activations import QuickGELUActivation as QuickGELU  # 导入快速 GELU 激活函数并重命名
from transformers.modeling_outputs import (  # 从 transformers 导入多种模型输出格式
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPooling,
    BaseModelOutputWithPoolingAndCrossAttentions,
)
from transformers.models.blip_2.configuration_blip_2 import Blip2Config, Blip2VisionConfig  # 导入 BLIP-2 配置类
from transformers.models.blip_2.modeling_blip_2 import (  # 从 BLIP-2 导入模型类
    Blip2Encoder,
    Blip2PreTrainedModel,
    Blip2QFormerAttention,
    Blip2QFormerIntermediate,
    Blip2QFormerOutput,
)
from transformers.pytorch_utils import apply_chunking_to_forward  # 导入应用前向分块的工具
from transformers.utils import (  # 从 transformers 导入工具函数
    logging,
    replace_return_docstrings,
)

logger = logging.get_logger(__name__)  # 创建一个日志记录器，以当前模块名为标识

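# There is an implementation of BLIP2 in `transformers`: https://github.com/huggingface/transformers/blob/main/src/transformers/models/blip_2/modeling_blip_2.py.
# It does not support getting multimodal embeddings, though, so this module can be replaced by a future `transformers` version that supports this.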
class Blip2TextEmbeddings(nn.Module):
    """Construct the embeddings from word and position embeddings, optionally prepending query embeddings."""

    def __init__(self, config):
        """
        Args:
            config: Object providing `vocab_size`, `hidden_size`, `pad_token_id`, `max_position_embeddings`,
                `layer_norm_eps`, `hidden_dropout_prob` and, optionally, `position_embedding_type`
                (defaults to `"absolute"`).
        """
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)

        # LayerNorm is deliberately not snake-cased so the attribute matches TensorFlow variable names and
        # TensorFlow checkpoint files can be loaded.
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # Buffer of shape (1, max_position_embeddings) holding 0..max_position_embeddings-1.
        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

        self.config = config

    def forward(
        self,
        input_ids=None,
        position_ids=None,
        query_embeds=None,
        past_key_values_length=0,
    ):
        """
        Embed `input_ids` and/or `query_embeds`, then apply LayerNorm and dropout.

        Args:
            input_ids: Optional token ids; the sequence length is read from dimension 1.
            position_ids: Optional position ids. When `None`, a slice of the `position_ids` buffer starting at
                `past_key_values_length` is used.
            query_embeds: Optional query embeddings. When `input_ids` is given they are repeated along
                dimension 0 to the batch size and concatenated in front of the text embeddings
                (assumes a leading dimension of 1 — TODO confirm with callers). When `input_ids` is `None`
                they are used as the embeddings directly.
            past_key_values_length: Offset into the position id buffer.

        Returns:
            The embeddings after LayerNorm and dropout.

        Raises:
            ValueError: If both `input_ids` and `query_embeds` are `None`.
        """
        if input_ids is None and query_embeds is None:
            raise ValueError("You have to specify at least one of `input_ids` or `query_embeds`")

        seq_length = input_ids.size()[1] if input_ids is not None else 0

        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length].clone()

        if input_ids is not None:
            embeddings = self.word_embeddings(input_ids)
            if self.position_embedding_type == "absolute":
                embeddings = embeddings + self.position_embeddings(position_ids)

            if query_embeds is not None:
                batch_size = embeddings.shape[0]
                query_embeds = query_embeds.repeat(batch_size, 1, 1)
                embeddings = torch.cat((query_embeds, embeddings), dim=1)
        else:
            embeddings = query_embeds

        # Only match the query dtype when queries exist; a text-only call previously failed here on `None`.
        if query_embeds is not None:
            embeddings = embeddings.to(query_embeds.dtype)
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
# 从 transformers.models.blip.modeling_blip.BlipVisionEmbeddings 复制而来，进行了 Blip 到 Blip2 的修改
class Blip2VisionEmbeddings(nn.Module):
    """Turn pixel values into a sequence of patch embeddings with a leading class embedding."""

    def __init__(self, config: Blip2VisionConfig):
        """
        Args:
            config: Vision configuration providing `hidden_size`, `image_size` and `patch_size`.
        """
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        # Learnable class embedding, randomly initialised.
        self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim))

        # Non-overlapping patches: kernel size and stride both equal the patch size; 3 input channels, no bias.
        self.patch_embedding = nn.Conv2d(
            3,
            self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        patches_per_side = self.image_size // self.patch_size
        self.num_patches = patches_per_side**2
        # One extra position for the class embedding.
        self.num_positions = self.num_patches + 1

        # Learnable position embedding, randomly initialised.
        self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """
        Args:
            pixel_values: Input images; dimension 0 is treated as the batch dimension.

        Returns:
            Tensor made of the class embedding followed by the flattened patch embeddings, with the
            position embedding added.
        """
        conv_dtype = self.patch_embedding.weight.dtype

        # Conv output is flattened from dimension 2 onward and transposed so patches form the sequence axis.
        patches = self.patch_embedding(pixel_values.to(dtype=conv_dtype))
        patches = patches.flatten(2).transpose(1, 2)

        cls_tokens = self.class_embedding.expand(pixel_values.shape[0], 1, -1).to(conv_dtype)
        tokens = torch.cat([cls_tokens, patches], dim=1)

        # Slice the position table to the actual sequence length before adding.
        positions = self.position_embedding[:, : tokens.size(1), :].to(conv_dtype)
        return tokens + positions


# Qformer 编码器，接收视觉嵌入和文本输入，以获取多模态嵌入
class Blip2QFormerEncoder(nn.Module):
    # Build the encoder from a configuration object.
    def __init__(self, config):
        # Initialise the nn.Module base class.
        super().__init__()
        # Keep the configuration.
        self.config = config
        # One Blip2QFormerLayer per `config.num_hidden_layers`, each constructed with its layer index.
        self.layer = nn.ModuleList(
            [Blip2QFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        # Gradient checkpointing flag, initialised to False.
        self.gradient_checkpointing = False

    # Forward pass taking hidden states plus optional arguments.
    # NOTE(review): the signature below is cut off in this excerpt; the closing parenthesis and body are not shown.
    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
        query_length=0,
# 构成 Qformer 编码器的各层
class Blip2QFormerLayer(nn.Module):
    """
    Single Q-Former layer: self-attention, optional cross-attention applied to the query positions, and
    separate feed-forward paths for the query positions and the remaining positions.
    """

    def __init__(self, config, layer_idx):
        """
        Args:
            config: Configuration providing `chunk_size_feed_forward` and `cross_attention_frequency`, and
                passed on to the attention / feed-forward sub-modules.
            layer_idx: Index of this layer; cross-attention is created when it is a multiple of
                `config.cross_attention_frequency`.
        """
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        # Dimension along which the feed-forward computation is chunked.
        self.seq_len_dim = 1
        self.attention = Blip2QFormerAttention(config)

        self.layer_idx = layer_idx

        self.has_cross_attention = layer_idx % config.cross_attention_frequency == 0
        if self.has_cross_attention:
            self.crossattention = Blip2QFormerAttention(config, is_cross_attention=True)

        # Feed-forward pair for non-query positions (`intermediate`/`output`) and for query positions
        # (`intermediate_query`/`output_query`).
        self.intermediate = Blip2QFormerIntermediate(config)
        self.intermediate_query = Blip2QFormerIntermediate(config)
        self.output_query = Blip2QFormerOutput(config)
        self.output = Blip2QFormerOutput(config)

    def _chunked_ffn(self, chunk_fn, states):
        """Run `chunk_fn` over `states` through `apply_chunking_to_forward` with this layer's chunk settings."""
        return apply_chunking_to_forward(
            chunk_fn,
            self.chunk_size_feed_forward,
            self.seq_len_dim,
            states,
        )

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
        query_length=0,
    ):
        """
        Args:
            hidden_states: Input hidden states.
            attention_mask: Optional mask passed to self-attention (and to cross-attention).
            head_mask: Optional head mask passed to the attention modules.
            encoder_hidden_states: States attended to by cross-attention; required when this layer has
                cross-attention and `query_length > 0`.
            encoder_attention_mask: Optional mask for the encoder states.
            past_key_value: Optional cache; its first two entries are passed to self-attention.
            output_attentions: Forwarded to the attention modules.
            query_length: Number of leading sequence positions treated as query positions.

        Returns:
            Tuple `(layer_output, *middle attention outputs, present_key_value)`.

        Raises:
            ValueError: If cross-attention is needed but `encoder_hidden_states` is `None`.
        """
        # The cached self-attention key/value pair sits in the first two slots of `past_key_value`.
        cached_self_kv = None if past_key_value is None else past_key_value[:2]
        self_attn_result = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            past_key_value=cached_self_kv,
        )
        attn_hidden = self_attn_result[0]
        # Everything between the first and last entries is carried through to the outputs.
        extras = self_attn_result[1:-1]
        present_key_value = self_attn_result[-1]

        if query_length > 0:
            query_hidden = attn_hidden[:, :query_length, :]

            if self.has_cross_attention:
                if encoder_hidden_states is None:
                    raise ValueError("encoder_hidden_states must be given for cross-attention layers")
                cross_attn_result = self.crossattention(
                    query_hidden,
                    attention_mask,
                    head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    output_attentions=output_attentions,
                )
                query_hidden = cross_attn_result[0]
                extras = extras + cross_attn_result[1:-1]

            layer_output = self._chunked_ffn(self.feed_forward_chunk_query, query_hidden)

            # Positions after the query positions go through the non-query feed-forward path.
            if attn_hidden.shape[1] > query_length:
                text_output = self._chunked_ffn(self.feed_forward_chunk, attn_hidden[:, query_length:, :])
                layer_output = torch.cat([layer_output, text_output], dim=1)
        else:
            layer_output = self._chunked_ffn(self.feed_forward_chunk, attn_hidden)

        return (layer_output,) + extras + (present_key_value,)

    def feed_forward_chunk(self, attention_output):
        """Apply `intermediate` then `output` (which also receives `attention_output`)."""
        expanded = self.intermediate(attention_output)
        return self.output(expanded, attention_output)

    def feed_forward_chunk_query(self, attention_output):
        """Apply `intermediate_query` then `output_query` (which also receives `attention_output`)."""
        expanded = self.intermediate_query(attention_output)
        return self.output_query(expanded, attention_output)
# ProjLayer 用于将多模态 Blip2 嵌入投影到文本编码器中
class ProjLayer(nn.Module):
    """
    Projection block: LayerNorm -> Linear -> QuickGELU -> Linear -> Dropout, with the un-normalized input
    added back as a residual.
    """

    def __init__(self, in_dim, out_dim, hidden_dim, drop_p=0.1, eps=1e-12):
        """
        Args:
            in_dim: Input feature size of the first linear layer.
            out_dim: Output feature size of the second linear layer; also the LayerNorm size.
            hidden_dim: Feature size between the two linear layers.
            drop_p: Dropout probability.
            eps: LayerNorm epsilon.
        """
        super().__init__()

        self.dense1 = nn.Linear(in_dim, hidden_dim)
        self.act_fn = QuickGELU()
        self.dense2 = nn.Linear(hidden_dim, out_dim)
        self.dropout = nn.Dropout(drop_p)

        self.LayerNorm = nn.LayerNorm(out_dim, eps=eps)

    def forward(self, x):
        """Return `dropout(dense2(act_fn(dense1(LayerNorm(x))))) + x`."""
        residual = x

        hidden = self.LayerNorm(x)
        hidden = self.dense1(hidden)
        hidden = self.act_fn(hidden)
        hidden = self.dense2(hidden)
        hidden = self.dropout(hidden)

        return hidden + residual


# 从 transformers.models.blip.modeling_blip.BlipVisionModel 复制并修改 Blip->Blip2, BLIP->BLIP_2
class Blip2VisionModel(Blip2PreTrainedModel):
    """Vision model: patch embeddings, pre-LayerNorm, `Blip2Encoder`, and post-LayerNorm."""

    # Name of the main input of this model.
    main_input_name = "pixel_values"
    # Configuration class associated with this model.
    config_class = Blip2VisionConfig

    def __init__(self, config: Blip2VisionConfig):
        """
        Args:
            config: Vision configuration; `hidden_size` and `layer_norm_eps` are read here and the whole
                object is passed to the embeddings and the encoder.
        """
        super().__init__(config)
        self.config = config
        embed_dim = config.hidden_size
        self.embeddings = Blip2VisionEmbeddings(config)
        self.pre_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.encoder = Blip2Encoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

        self.post_init()

    # The decorator below looks for a line containing only `Returns:` in the docstring and replaces it;
    # that line must therefore stay free of any trailing text.
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Blip2VisionConfig)
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Encode `pixel_values`. Unset flags fall back to the values stored in the model configuration.

        Returns:

        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values)
        hidden_states = self.pre_layernorm(hidden_states)
        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.post_layernorm(last_hidden_state)

        # Pooled output is the first sequence position of the normalized hidden state.
        pooled_output = last_hidden_state[:, 0, :]
        # NOTE(review): `post_layernorm` is applied a second time to the pooled vector. Kept unchanged;
        # confirm against the reference implementation before altering, since weights may depend on it.
        pooled_output = self.post_layernorm(pooled_output)

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

    def get_input_embeddings(self):
        """Return the embeddings module (`self.embeddings`)."""
        return self.embeddings
# Qformer model, used to get multimodal embeddings from the text and image inputs
class Blip2QFormerModel(Blip2PreTrainedModel):
    """ 
    Querying Transformer (Q-Former), used in BLIP-2.
    """

    def __init__(self, config: Blip2Config):
        """
        Build the text embeddings, vision encoder, query tokens, tokenizer, projection layer and Q-Former
        encoder from `config`.

        Args:
            config: Configuration exposing `qformer_config`, `vision_config`, `num_query_tokens` and,
                optionally, `tokenizer` (a name passed to `BertTokenizer.from_pretrained`;
                `"bert-base-uncased"` is used when the attribute is missing or `None`).
        """
        super().__init__(config)
        self.config = config
        qformer_config = config.qformer_config
        hidden_size = qformer_config.hidden_size

        self.embeddings = Blip2TextEmbeddings(qformer_config)
        self.visual_encoder = Blip2VisionModel(config.vision_config)
        # Query tokens start as zeros with shape (1, num_query_tokens, hidden_size).
        self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, hidden_size))

        tokenizer_name = getattr(config, "tokenizer", None)
        if tokenizer_name is None:
            tokenizer_name = "bert-base-uncased"
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name, truncation_side="right")
        # Register "[DEC]" as the beginning-of-sequence token.
        self.tokenizer.add_special_tokens({"bos_token": "[DEC]"})

        self.proj_layer = ProjLayer(
            in_dim=hidden_size,
            out_dim=hidden_size,
            hidden_dim=hidden_size * 4,
            drop_p=0.1,
            eps=1e-12,
        )
        self.encoder = Blip2QFormerEncoder(qformer_config)

        self.post_init()

    def get_input_embeddings(self):
        """Return the word-embedding module held by the text embeddings (`self.embeddings.word_embeddings`)."""
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        """Replace the word-embedding module held by the text embeddings with `value`."""
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        # 遍历每一层和需要剪枝的头
        for layer, heads in heads_to_prune.items():
            # 对指定层的注意力头进行剪枝
            self.encoder.layer[layer].attention.prune_heads(heads)

    def get_extended_attention_mask(
        self,
        attention_mask: torch.Tensor,
        input_shape: Tuple[int],
        device: torch.device,
        has_query: bool = False,
    ) -> torch.Tensor:
        """
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            attention_mask (`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore. Must be 2- or
                3-dimensional.
            input_shape (`Tuple[int]`):
                The shape of the input to the model. Only used in the error message.
            device (`torch.device`):
                The device of the input to the model. Not used by this implementation.
            has_query (`bool`, *optional*, defaults to `False`):
                Not used by this implementation.

        Returns:
            `torch.Tensor` The extended attention mask, cast to `self.dtype`: `0.0` where the input mask is one
            and `-10000.0` where it is zero.

        Raises:
            ValueError: If `attention_mask` is neither 2- nor 3-dimensional.
        """
        mask_rank = attention_mask.dim()
        if mask_rank == 3:
            # A [batch_size, from_seq_length, to_seq_length] mask was supplied; only a head axis is added.
            broadcast_mask = attention_mask[:, None, :, :]
        elif mask_rank == 2:
            # A [batch_size, seq_length] padding mask: add a head axis and a query axis.
            broadcast_mask = attention_mask[:, None, None, :]
        else:
            raise ValueError(
                "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
                    input_shape, attention_mask.shape
                )
            )

        # Cast first (fp16 compatibility), then turn 1/0 into the additive 0.0/-10000.0 form that is added
        # to the raw scores before the softmax.
        broadcast_mask = broadcast_mask.to(dtype=self.dtype)
        additive_mask = (1.0 - broadcast_mask) * -10000.0
        return additive_mask

    def forward(  # forward pass; NOTE(review): this signature is cut off in this excerpt, the body is not shown
        self,  # the current instance
        text_input=None,  # text input, defaults to None
        image_input=None,  # image input, defaults to None
        head_mask=None,  # head mask, defaults to None
        encoder_hidden_states=None,  # encoder hidden states, defaults to None
        encoder_attention_mask=None,  # encoder attention mask, defaults to None
        past_key_values=None,  # past key/values, defaults to None
        use_cache=None,  # cache flag, defaults to None
        output_attentions=None,  # whether to output attentions, defaults to None
        output_hidden_states=None,  # whether to output hidden states, defaults to None
        return_dict=None,  # whether to return a dict-style output, defaults to None

`.\diffusers\pipelines\blip_diffusion\modeling_ctx_clip.py`

# 版权所有 2024 Salesforce.com, inc.
# 版权所有 2024 The HuggingFace Team. 保留所有权利。
#
# 根据 Apache 许可证，第 2.0 版（"许可证"）许可；
# 除非遵循许可证，否则不得使用此文件。
# 您可以在以下网址获取许可证的副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则根据许可证分发的软件
# 是按“原样”基础提供的，不提供任何形式的保证或条件，
# 明示或暗示。有关许可证的特定语言的权限和限制，请参见
# 许可证。
from typing import Optional, Tuple, Union  # 从 typing 模块导入 Optional、Tuple 和 Union 类型提示

import torch  # 导入 PyTorch 库
from torch import nn  # 从 PyTorch 导入神经网络模块
from transformers import CLIPPreTrainedModel  # 从 transformers 导入 CLIP 预训练模型基类
from transformers.modeling_outputs import BaseModelOutputWithPooling  # 导入带池化的基础模型输出
from transformers.models.clip.configuration_clip import CLIPTextConfig  # 导入 CLIP 文本配置
from transformers.models.clip.modeling_clip import CLIPEncoder  # 导入 CLIP 编码器


def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
    """
    Turn a `[bsz, src_len]` keep-mask into an additive `[bsz, 1, tgt_len, src_len]` mask.

    Entries that are 1 in `mask` become 0.0 in the result; entries that are 0
    become `torch.finfo(dtype).min`. When `tgt_len` is omitted the source
    length is used for the target dimension as well.
    """
    batch, source_len = mask.size()
    if tgt_len is None:
        tgt_len = source_len

    # Broadcast the per-key mask across every query position, then cast.
    keep = mask[:, None, None, :].expand(batch, 1, tgt_len, source_len).to(dtype)

    # After inversion, non-zero marks positions that must not be attended to.
    drop = 1.0 - keep
    fill_value = torch.finfo(dtype).min
    return drop.masked_fill(drop.to(torch.bool), fill_value)


# 这是 transformers.models.clip.modeling_clip 中的 CLIPTextModel 的修改版本
# 允许额外输入“上下文嵌入”，即 Qformer 中使用的查询嵌入
# 它们与文本嵌入一起通过 clip 模型，并使用自注意力与之交互
class ContextCLIPTextModel(CLIPPreTrainedModel):
    """
    CLIP text model wrapper that accepts extra "context embeddings".

    All work is delegated to `ContextCLIPTextTransformer`, which receives the
    context embeddings together with the usual text inputs.
    """

    config_class = CLIPTextConfig

    # NOTE(review): presumably read by the transformers loading machinery to keep
    # these layers on a single device — confirm against CLIPPreTrainedModel.
    _no_split_modules = ["CLIPEncoderLayer"]

    def __init__(self, config: CLIPTextConfig):
        """Build the wrapped text transformer from `config` and run `post_init()`."""
        super().__init__(config)
        self.text_model = ContextCLIPTextTransformer(config)
        # Weight initialisation / final processing is handled by the base class hook.
        self.post_init()

    def forward(
        self,
        ctx_embeddings: torch.Tensor = None,
        ctx_begin_pos: list = None,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        """
        Forward every argument unchanged to `self.text_model` and return its result.

        Args:
            ctx_embeddings: Context embeddings to combine with the text embeddings.
            ctx_begin_pos: Per-sample positions at which the context is inserted.
            input_ids: Token ids of the text prompt.
            attention_mask: Optional attention mask for the text model.
            position_ids: Optional explicit position ids.
            output_attentions: Passed through to the text model.
            output_hidden_states: Passed through to the text model.
            return_dict: Passed through to the text model.

        Returns:
            Whatever `self.text_model` returns (a tuple or a
            `BaseModelOutputWithPooling`).
        """
        text_outputs = self.text_model(
            ctx_embeddings=ctx_embeddings,
            ctx_begin_pos=ctx_begin_pos,
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        return text_outputs
# 定义一个名为 ContextCLIPTextTransformer 的类，继承自 nn.Module
class ContextCLIPTextTransformer(nn.Module):
    """
    CLIP text transformer whose input sequence may contain extra context embeddings.

    The embedding layer combines the token embeddings with `ctx_embeddings`
    (inserted at `ctx_begin_pos`); the combined sequence is run through a
    `CLIPEncoder` under a causal mask and then through a final `LayerNorm`.
    """

    def __init__(self, config: CLIPTextConfig):
        """Create the embedding layer, encoder and final layer norm from `config`."""
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size
        self.embeddings = ContextCLIPTextEmbeddings(config)
        self.encoder = CLIPEncoder(config)
        self.final_layer_norm = nn.LayerNorm(embed_dim)

    def forward(
        self,
        ctx_embeddings: torch.Tensor,
        ctx_begin_pos: list,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Encode the text tokens together with the optional context embeddings.

        Args:
            ctx_embeddings: Context embeddings, or `None`. Their size along
                dim 1 is counted as additional sequence positions.
            ctx_begin_pos: Per-sample index in the text sequence at which the
                context embeddings are inserted. Only read when
                `ctx_embeddings` is not `None`.
            input_ids: Token ids of shape `[bsz, text_len]`. Required.
            attention_mask: Optional keep-mask (1 = attend, 0 = masked). A mask
                of text length is extended with ones over the inserted context
                positions; a mask of any other length is used as given.
            position_ids: Optional explicit position ids for the embedding layer.
            output_attentions: Overrides `config.output_attentions` when not `None`.
            output_hidden_states: Overrides `config.output_hidden_states` when not `None`.
            return_dict: Overrides `config.use_return_dict` when not `None`.

        Returns:
            A `BaseModelOutputWithPooling`, or a tuple
            `(last_hidden_state, pooled_output, *encoder_extras)` when
            `return_dict` is false. `pooled_output` is the hidden state at the
            position of the highest token id of each sequence (the EOT token),
            located in the combined text + context sequence.

        Raises:
            ValueError: If `input_ids` is `None`.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is None:
            raise ValueError("You have to specify either input_ids")

        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])

        hidden_states = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            ctx_embeddings=ctx_embeddings,
            ctx_begin_pos=ctx_begin_pos,
        )

        bsz, seq_len = input_shape
        # The encoder sees text tokens plus the inserted context tokens.
        ctx_len = ctx_embeddings.size(1) if ctx_embeddings is not None else 0
        seq_len += ctx_len

        # CLIP's text model uses a causal mask.
        # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
        causal_attention_mask = self._build_causal_attention_mask(bsz, seq_len, hidden_states.dtype).to(
            hidden_states.device
        )

        if attention_mask is not None:
            # A mask that only covers the text tokens cannot be combined with the
            # (longer) causal mask, so grow it to cover the context tokens first.
            # Masks that already have another length are left untouched.
            if ctx_embeddings is not None and attention_mask.shape[-1] == input_shape[-1]:
                attention_mask = self._extend_mask_with_context(attention_mask, ctx_begin_pos, ctx_len)
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            attention_mask = _expand_mask(attention_mask, hidden_states.dtype)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.final_layer_norm(last_hidden_state)

        # The EOT token has the highest id in each sequence. Cast to torch.int
        # for ONNX compatibility: argmax doesn't support int64 inputs with opset 14.
        eot_index = input_ids.to(torch.int).argmax(dim=-1)
        if ctx_embeddings is not None:
            # `eot_index` was computed on the text-only sequence. Context tokens
            # inserted at or before EOT push it `ctx_len` positions to the right
            # in `last_hidden_state`.
            ctx_start = torch.as_tensor(ctx_begin_pos[:bsz], device=eot_index.device)
            eot_index = eot_index + (eot_index >= ctx_start).to(eot_index.dtype) * ctx_len

        pooled_output = last_hidden_state[
            torch.arange(last_hidden_state.shape[0], device=input_ids.device),
            eot_index,
        ]

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

    @staticmethod
    def _extend_mask_with_context(attention_mask, ctx_begin_pos, ctx_len):
        """Insert `ctx_len` ones into each mask row at that row's context start index."""
        ctx_ones = attention_mask.new_ones(ctx_len)
        rows = [
            torch.cat([row[:begin], ctx_ones, row[begin:]], dim=0)
            for row, begin in zip(attention_mask, ctx_begin_pos)
        ]
        return torch.stack(rows, dim=0)

    def _build_causal_attention_mask(self, bsz, seq_len, dtype):
        """
        Build an additive causal mask of shape `[bsz, 1, seq_len, seq_len]`.

        Entries strictly above the diagonal hold `torch.finfo(dtype).min`; the
        diagonal and everything below it are zero.
        """
        mask = torch.empty(bsz, seq_len, seq_len, dtype=dtype)
        mask.fill_(torch.tensor(torch.finfo(dtype).min))
        mask.triu_(1)  # zero the diagonal and the lower triangle
        mask = mask.unsqueeze(1)  # add the broadcastable head dimension
        return mask
# 定义一个名为 ContextCLIPTextEmbeddings 的类，继承自 nn.Module
class ContextCLIPTextEmbeddings(nn.Module):
    """
    Token + position embeddings with optional context embeddings spliced in.

    When `ctx_embeddings` is given (and `inputs_embeds` is not), each sample's
    token embeddings are split at `ctx_begin_pos[i]` and the sample's context
    embeddings are placed between the two halves before position embeddings
    are added.
    """

    def __init__(self, config: CLIPTextConfig):
        """Create the token/position embedding tables and the position-id buffer."""
        super().__init__()
        embed_dim = config.hidden_size

        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)

        # Shape (1, max_position_embeddings); registered as a buffer so it is
        # stored with the module state.
        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))

    def forward(
        self,
        ctx_embeddings: torch.Tensor,
        ctx_begin_pos: list,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """
        Compute the input embeddings for the encoder.

        Args:
            ctx_embeddings: Context embeddings indexed as `[sample, ctx_len, ...]`,
                or `None` for plain text embeddings.
            ctx_begin_pos: Per-sample split index into the token sequence; read
                only when `ctx_embeddings` is not `None`.
            input_ids: Token ids; used when `inputs_embeds` is `None`.
            position_ids: Optional explicit position ids. Defaults to the first
                `text_len + ctx_len` entries of the registered buffer.
            inputs_embeds: Optional precomputed embeddings. When given they are
                used as-is (no context is inserted).

        Returns:
            The (possibly context-extended) input embeddings plus position embeddings.
        """
        ctx_len = 0 if ctx_embeddings is None else ctx_embeddings.shape[1]

        if input_ids is not None:
            text_len = input_ids.shape[-1]
        else:
            text_len = inputs_embeds.shape[-2]
        seq_length = text_len + ctx_len

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

            if ctx_embeddings is not None:
                spliced = []
                for row in range(inputs_embeds.shape[0]):
                    start = ctx_begin_pos[row]
                    # tokens before the split, then the context, then the remaining tokens
                    pieces = [inputs_embeds[row, :start], ctx_embeddings[row], inputs_embeds[row, start:]]
                    spliced.append(torch.cat(pieces, dim=0))
                inputs_embeds = torch.stack(spliced, dim=0)

        return inputs_embeds + self.position_embedding(position_ids)

posted @ 2024-10-22 12:36 绝不原创的飞龙阅读(106) 评论(0) 收藏举报

刷新页面返回顶部

龙哥盟

掠夺·扩张·投机·博弈

diffusers-源码解析-十九-

diffusers 源码解析（十九）

`.\diffusers\pipelines\audioldm2\pipeline_audioldm2.py`

`.\diffusers\pipelines\audioldm2\__init__.py`

`.\diffusers\pipelines\aura_flow\pipeline_aura_flow.py`

`.\diffusers\pipelines\aura_flow\__init__.py`

`.\diffusers\pipelines\auto_pipeline.py`

`.\diffusers\pipelines\blip_diffusion\blip_image_processing.py`

`.\diffusers\pipelines\blip_diffusion\modeling_blip2.py`

`.\diffusers\pipelines\blip_diffusion\modeling_ctx_clip.py`

公告

龙哥盟

掠夺·扩张·投机·博弈

diffusers-源码解析-十九-

diffusers 源码解析（十九）

.\diffusers\pipelines\audioldm2\pipeline_audioldm2.py

.\diffusers\pipelines\audioldm2\__init__.py

.\diffusers\pipelines\aura_flow\pipeline_aura_flow.py

.\diffusers\pipelines\aura_flow\__init__.py

.\diffusers\pipelines\auto_pipeline.py

.\diffusers\pipelines\blip_diffusion\blip_image_processing.py

.\diffusers\pipelines\blip_diffusion\modeling_blip2.py

.\diffusers\pipelines\blip_diffusion\modeling_ctx_clip.py

公告

`.\diffusers\pipelines\audioldm2\pipeline_audioldm2.py`

`.\diffusers\pipelines\audioldm2\__init__.py`

`.\diffusers\pipelines\aura_flow\pipeline_aura_flow.py`

`.\diffusers\pipelines\aura_flow\__init__.py`

`.\diffusers\pipelines\auto_pipeline.py`

`.\diffusers\pipelines\blip_diffusion\blip_image_processing.py`

`.\diffusers\pipelines\blip_diffusion\modeling_blip2.py`

`.\diffusers\pipelines\blip_diffusion\modeling_ctx_clip.py`