diffusers-源码解析-十八-

diffusers 源码解析（十八）

`.\diffusers\pipelines\animatediff\pipeline_animatediff_sparsectrl.py`

# 版权所有 2024 HuggingFace 团队。保留所有权利。
#
# 根据 Apache 许可证，版本 2.0（"许可证"）进行许可；
# 除非遵守该许可证，否则您不得使用此文件。
# 您可以在以下网址获取许可证副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，软件
# 在许可证下分发是按“原样”基础，
# 不提供任何明示或暗示的保证或条件。
# 请参见许可证以了解管理权限和
# 限制的具体语言。

# 导入inspect模块，用于获取对象的信息
import inspect
# 导入类型提示所需的各种类型
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

# 导入numpy库，用于数值计算
import numpy as np
# 导入PIL库，用于图像处理
import PIL
# 导入torch库，用于深度学习计算
import torch
# 从torch.nn.functional导入功能性操作
import torch.nn.functional as F
# 从transformers库导入CLIP相关模型和处理器
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

# 导入自定义图像处理模块
from ...image_processor import PipelineImageInput, VaeImageProcessor
# 导入加载器混合类，用于不同类型的模型加载
from ...loaders import IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
# 导入各种模型，包括自动编码器和UNet
from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel
# 导入稀疏控制网络模型
from ...models.controlnet_sparsectrl import SparseControlNetModel
# 从lora模块导入调整Lora尺度的函数
from ...models.lora import adjust_lora_scale_text_encoder
# 从unet_motion_model模块导入运动适配器
from ...models.unets.unet_motion_model import MotionAdapter
# 导入Karras扩散调度器
from ...schedulers import KarrasDiffusionSchedulers
# 导入一些工具函数
from ...utils import (
    USE_PEFT_BACKEND,  # 用于选择PEFT后端的常量
    logging,           # 导入日志记录模块
    replace_example_docstring,  # 替换示例文档字符串的函数
    scale_lora_layers, # 调整Lora层比例的函数
    unscale_lora_layers, # 反向调整Lora层比例的函数
)
# 导入与Torch相关的实用函数
from ...utils.torch_utils import is_compiled_module, randn_tensor
# 导入视频处理模块
from ...video_processor import VideoProcessor
# 导入FreeInitMixin类
from ..free_init_utils import FreeInitMixin
# 导入扩散管道和稳定扩散的混合类
from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
# 导入动画扩散管道输出类
from .pipeline_output import AnimateDiffPipelineOutput

# 创建日志记录器，用于模块的日志记录
logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

# 示例文档字符串的初始化
EXAMPLE_DOC_STRING = """

# 示例代码，展示如何使用 AnimateDiffSparseControlNetPipeline
    Examples:
        ```py
        # 导入 PyTorch 和相关的 Diffusers 模型
        >>> import torch
        >>> from diffusers import AnimateDiffSparseControlNetPipeline
        >>> from diffusers.models import AutoencoderKL, MotionAdapter, SparseControlNetModel
        >>> from diffusers.schedulers import DPMSolverMultistepScheduler
        >>> from diffusers.utils import export_to_gif, load_image

        # 定义模型和适配器的 ID
        >>> model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
        >>> motion_adapter_id = "guoyww/animatediff-motion-adapter-v1-5-3"
        >>> controlnet_id = "guoyww/animatediff-sparsectrl-scribble"
        >>> lora_adapter_id = "guoyww/animatediff-motion-lora-v1-5-3"
        >>> vae_id = "stabilityai/sd-vae-ft-mse"
        >>> device = "cuda"  # 设置设备为 GPU

        # 从预训练模型加载运动适配器并转移到指定设备
        >>> motion_adapter = MotionAdapter.from_pretrained(motion_adapter_id, torch_dtype=torch.float16).to(device)
        # 从预训练模型加载控制网络并转移到指定设备
        >>> controlnet = SparseControlNetModel.from_pretrained(controlnet_id, torch_dtype=torch.float16).to(device)
        # 从预训练模型加载变换编码器并转移到指定设备
        >>> vae = AutoencoderKL.from_pretrained(vae_id, torch_dtype=torch.float16).to(device)
        # 从预训练模型加载调度器，并设置相关参数
        >>> scheduler = DPMSolverMultistepScheduler.from_pretrained(
        ...     model_id,
        ...     subfolder="scheduler",  # 指定子文件夹
        ...     beta_schedule="linear",  # 设置 beta 调度
        ...     algorithm_type="dpmsolver++",  # 设置算法类型
        ...     use_karras_sigmas=True,  # 使用 Karras sigmas
        ... )
        # 从预训练模型加载动画Diff稀疏控制管道并转移到指定设备
        >>> pipe = AnimateDiffSparseControlNetPipeline.from_pretrained(
        ...     model_id,
        ...     motion_adapter=motion_adapter,
        ...     controlnet=controlnet,
        ...     vae=vae,
        ...     scheduler=scheduler,
        ...     torch_dtype=torch.float16,
        ... ).to(device)
        # 加载 LORA 权重
        >>> pipe.load_lora_weights(lora_adapter_id, adapter_name="motion_lora")
        # 融合 LORA 权重，设置比例为 1.0
        >>> pipe.fuse_lora(lora_scale=1.0)

        # 定义生成图像的提示词
        >>> prompt = "an aerial view of a cyberpunk city, night time, neon lights, masterpiece, high quality"
        # 定义生成图像的负面提示词
        >>> negative_prompt = "low quality, worst quality, letterboxed"

        # 定义条件帧的图像文件列表
        >>> image_files = [
        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-1.png",
        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-2.png",
        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-3.png",
        ... ]
        # 定义条件帧的索引
        >>> condition_frame_indices = [0, 8, 15]
        # 加载条件帧的图像
        >>> conditioning_frames = [load_image(img_file) for img_file in image_files]

        # 生成视频，并设置相关参数
        >>> video = pipe(
        ...     prompt=prompt,
        ...     negative_prompt=negative_prompt,
        ...     num_inference_steps=25,  # 设定推理步数
        ...     conditioning_frames=conditioning_frames,
        ...     controlnet_conditioning_scale=1.0,  # 设置控制网络条件比例
        ...     controlnet_frame_indices=condition_frame_indices,  # 设置控制网络帧索引
        ...     generator=torch.Generator().manual_seed(1337),  # 设置随机种子
        ... ).frames[0]  # 获取生成的视频帧
        # 导出视频为 GIF 格式
        >>> export_to_gif(video, "output.gif")
        ```
"""

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents

def retrieve_latents(
    encoder_output: torch.Tensor,
    generator: Optional[torch.Generator] = None,
    sample_mode: str = "sample",
):
    """Extract latents from an encoder output object.

    Args:
        encoder_output: Object returned by an encoder. It is probed for a
            `latent_dist` attribute first and a `latents` attribute second.
        generator: Optional random generator forwarded to
            `latent_dist.sample` when `sample_mode` is `"sample"`.
        sample_mode: `"sample"` draws from `latent_dist`; `"argmax"` returns
            `latent_dist.mode()`.

    Returns:
        The sampled latents, the distribution mode, or `encoder_output.latents`.

    Raises:
        AttributeError: If none of the branches above applies.
    """
    # A latent distribution is available and a stochastic sample was requested.
    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
        return encoder_output.latent_dist.sample(generator)
    # A latent distribution is available and its mode was requested.
    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
        return encoder_output.latent_dist.mode()
    # Fall back to latents stored directly on the output.
    elif hasattr(encoder_output, "latents"):
        return encoder_output.latents
    else:
        raise AttributeError("Could not access latents of provided encoder_output")

# Pipeline for controlled text-to-video generation.

class AnimateDiffSparseControlNetPipeline(
    DiffusionPipeline,
    StableDiffusionMixin,
    TextualInversionLoaderMixin,
    IPAdapterMixin,
    StableDiffusionLoraLoaderMixin,
    FreeInitMixin,
):
    r"""
    Pipeline for controlled text-to-video generation based on the method described in "SparseCtrl: Adding Sparse
    Controls to Text-to-Video Diffusion Models".

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    The pipeline also inherits the following loading methods:
        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
        text_encoder ([`CLIPTextModel`]):
            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
        tokenizer (`CLIPTokenizer`):
            A [`~transformers.CLIPTokenizer`] to tokenize text.
        unet ([`UNet2DConditionModel`]):
            A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents.
        motion_adapter ([`MotionAdapter`]):
            A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
    """

    # Order in which components are offloaded to the CPU.
    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
    # Components that may be omitted.
    _optional_components = ["feature_extractor", "image_encoder", "motion_adapter"]
    # Tensor names that step-end callbacks may request.
    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
# 初始化类的构造函数
    def __init__(
        self,
        vae: AutoencoderKL,
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        unet: Union[UNet2DConditionModel, UNetMotionModel],
        motion_adapter: MotionAdapter,
        controlnet: SparseControlNetModel,
        scheduler: KarrasDiffusionSchedulers,
        feature_extractor: CLIPImageProcessor = None,
        image_encoder: CLIPVisionModelWithProjection = None,
    ):
        """Register the pipeline components and create the image/video processors.

        Args:
            vae: Autoencoder; its `config.block_out_channels` determines
                `vae_scale_factor`.
            text_encoder: CLIP text model registered as `text_encoder`.
            tokenizer: CLIP tokenizer registered as `tokenizer`.
            unet: Either a `UNetMotionModel`, or a `UNet2DConditionModel` that
                is converted with `UNetMotionModel.from_unet2d`.
            motion_adapter: Passed to `from_unet2d` when `unet` is converted,
                and registered as `motion_adapter`.
            controlnet: Sparse ControlNet model registered as `controlnet`.
            scheduler: Scheduler registered as `scheduler`.
            feature_extractor: Optional CLIP image processor.
            image_encoder: Optional CLIP vision model with projection.
        """
        super().__init__()

        # A plain 2D UNet is wrapped into a motion UNet using the adapter.
        if isinstance(unet, UNet2DConditionModel):
            unet = UNetMotionModel.from_unet2d(unet, motion_adapter)

        self.register_modules(
            vae=vae,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            unet=unet,
            motion_adapter=motion_adapter,
            controlnet=controlnet,
            scheduler=scheduler,
            feature_extractor=feature_extractor,
            image_encoder=image_encoder,
        )

        # Scale factor derived from the number of VAE output-channel entries.
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        # Video processor with resizing disabled.
        self.video_processor = VideoProcessor(do_resize=False, vae_scale_factor=self.vae_scale_factor)
        # Control images are converted to RGB and left un-normalized.
        self.control_image_processor = VaeImageProcessor(
            vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False
        )

    # 从 StableDiffusionPipeline 中复制的编码提示方法，参数中的 num_images_per_prompt 更改为 num_videos_per_prompt
    def encode_prompt(
        # 输入的提示文本
        prompt,
        # 设备信息，用于指定计算的设备（如 CPU 或 GPU）
        device,
        # 每个提示生成的图像数量
        num_images_per_prompt,
        # 是否进行无分类器引导
        do_classifier_free_guidance,
        # 可选的负面提示文本
        negative_prompt=None,
        # 可选的提示嵌入，预计算的文本嵌入
        prompt_embeds: Optional[torch.Tensor] = None,
        # 可选的负面提示嵌入，预计算的负面文本嵌入
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        # 可选的 Lora 缩放因子，用于调整影响力
        lora_scale: Optional[float] = None,
        # 可选的跳过剪辑的层数
        clip_skip: Optional[int] = None,
    # 从 StableDiffusionPipeline 中复制的编码图像方法
# 定义一个编码图像的函数，接收图像、设备、每个提示的图像数量及可选的隐藏状态
    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
        """Encode `image` with `self.image_encoder` and build a matching unconditional encoding.

        Args:
            image: A tensor, or any input accepted by `self.feature_extractor`
                (non-tensor inputs are converted through it).
            device: Device the pixel tensor is moved to.
            num_images_per_prompt: Repeat count applied along dim 0.
            output_hidden_states: When truthy, return the second-to-last
                hidden states instead of `image_embeds`.

        Returns:
            A `(conditional, unconditional)` pair. With hidden states, the
            unconditional part is the encoder output for an all-zero image;
            otherwise it is a zero tensor shaped like the conditional embeds.
        """
        encoder = self.image_encoder
        # Use the dtype of the encoder's first parameter for the input.
        dtype = next(encoder.parameters()).dtype

        if not isinstance(image, torch.Tensor):
            image = self.feature_extractor(image, return_tensors="pt").pixel_values
        image = image.to(device=device, dtype=dtype)

        if not output_hidden_states:
            cond_embeds = encoder(image).image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
            return cond_embeds, torch.zeros_like(cond_embeds)

        def _penultimate_hidden(pixels):
            # Second-to-last hidden state, repeated once per requested image.
            hidden = encoder(pixels, output_hidden_states=True).hidden_states[-2]
            return hidden.repeat_interleave(num_images_per_prompt, dim=0)

        return _penultimate_hidden(image), _penultimate_hidden(torch.zeros_like(image))

    # 从 diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds 复制的函数
    def prepare_ip_adapter_image_embeds(
        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
    ):
        """Build the list of IP-Adapter image embeddings passed to the UNet.

        Args:
            ip_adapter_image: Image, or list of images (one per IP Adapter),
                encoded when `ip_adapter_image_embeds` is None.
            ip_adapter_image_embeds: Optional precomputed embeddings. With
                classifier-free guidance each entry is split in two halves
                along dim 0: negative first, positive second.
            device: Device the resulting tensors are moved to.
            num_images_per_prompt: How many times each embedding is repeated
                along dim 0.
            do_classifier_free_guidance: Whether negative embeddings are
                collected and prepended to the positive ones.

        Returns:
            A list with one tensor per IP Adapter.

        Raises:
            ValueError: If the number of images differs from the number of
                image projection layers on the UNet.
        """
        image_embeds = []
        if do_classifier_free_guidance:
            negative_image_embeds = []

        if ip_adapter_image_embeds is None:
            # Normalize a single image to a one-element list.
            if not isinstance(ip_adapter_image, list):
                ip_adapter_image = [ip_adapter_image]

            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
                raise ValueError(
                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
                )

            for single_ip_adapter_image, image_proj_layer in zip(
                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
            ):
                # Hidden states are requested for every projection layer that is not an ImageProjection.
                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
                single_image_embeds, single_negative_image_embeds = self.encode_image(
                    single_ip_adapter_image, device, 1, output_hidden_state
                )

                image_embeds.append(single_image_embeds[None, :])
                if do_classifier_free_guidance:
                    negative_image_embeds.append(single_negative_image_embeds[None, :])
        else:
            for single_image_embeds in ip_adapter_image_embeds:
                if do_classifier_free_guidance:
                    # Precomputed embeddings carry the negative half first, then the positive half.
                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
                    negative_image_embeds.append(single_negative_image_embeds)
                image_embeds.append(single_image_embeds)

        ip_adapter_image_embeds = []
        for i, single_image_embeds in enumerate(image_embeds):
            # Repeat each embedding once per requested image.
            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
            if do_classifier_free_guidance:
                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
                # Negative embeddings go first, followed by the positive ones.
                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)

            single_image_embeds = single_image_embeds.to(device=device)
            ip_adapter_image_embeds.append(single_image_embeds)

        return ip_adapter_image_embeds

# Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline
def decode_latents(self, latents):
    """Decode 5-D video latents with the VAE and return a float32 video tensor.

    The latents are unpacked as (batch, channels, frames, height, width),
    decoded frame by frame, and returned with frames back on dim 2.
    """
    # Undo the VAE scaling factor.
    inv_scale = 1 / self.vae.config.scaling_factor
    latents = inv_scale * latents

    batch_size, channels, num_frames, height, width = latents.shape
    # Fold the frame axis into the batch axis so the VAE sees a batch of images.
    frame_batch = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)

    decoded = self.vae.decode(frame_batch).sample
    # Split batch and frames apart again, then move channels in front of frames.
    per_video = decoded.reshape(batch_size, num_frames, -1, *decoded.shape[2:])
    video = per_video.permute(0, 2, 1, 3, 4)
    # Always return float32.
    return video.float()
# 从 diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs 复制而来
def prepare_extra_step_kwargs(self, generator, eta):
    """Collect the optional keyword arguments accepted by `self.scheduler.step`.

    Schedulers do not all share the same `step` signature, so `eta` and
    `generator` are forwarded only when `step` declares a parameter of that
    name. eta (η) corresponds to η in the DDIM paper
    (https://arxiv.org/abs/2010.02502) and should be in [0, 1].

    Returns:
        A dict with zero, one or both of the keys `"eta"` and `"generator"`.
    """
    step_params = inspect.signature(self.scheduler.step).parameters
    candidates = (("eta", eta), ("generator", generator))
    return {name: value for name, value in candidates if name in step_params}

def check_inputs(
    self,
    prompt,
    height,
    width,
    negative_prompt=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
    ip_adapter_image=None,
    ip_adapter_image_embeds=None,
    callback_on_step_end_tensor_inputs=None,
    image=None,
    controlnet_conditioning_scale: float = 1.0,
    # 从 diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image 复制而来
# 定义一个检查图像的函数，参数包括图像、提示和提示嵌入
def check_image(self, image, prompt, prompt_embeds):
    """Validate the type of `image` and its batch size against the prompt batch size.

    Raises:
        TypeError: If `image` is not a PIL image, numpy array or torch tensor,
            nor a list whose first element is one of those.
        ValueError: If the image batch size is neither 1 nor equal to the
            prompt batch size.
    """
    accepted_types = (PIL.Image.Image, torch.Tensor, np.ndarray)
    is_single_input = isinstance(image, accepted_types)
    # For lists only the first element is inspected.
    is_list_input = isinstance(image, list) and isinstance(image[0], accepted_types)

    if not (is_single_input or is_list_input):
        raise TypeError(
            f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}"
        )

    # A lone PIL image counts as a batch of one; everything else is measured with len().
    image_batch_size = 1 if isinstance(image, PIL.Image.Image) else len(image)

    # NOTE(review): no branch assigns prompt_batch_size when prompt is neither a str
    # nor a list and prompt_embeds is None; the comparison below would then fail.
    if isinstance(prompt, str):
        prompt_batch_size = 1
    elif isinstance(prompt, list):
        prompt_batch_size = len(prompt)
    elif prompt_embeds is not None:
        prompt_batch_size = prompt_embeds.shape[0]

    # A single image is accepted for any prompt batch size.
    if image_batch_size == 1:
        return

    if image_batch_size != prompt_batch_size:
        raise ValueError(
            f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
        )

# 从 diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents 复制的函数
def prepare_latents(
    self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None
):
    """Create or reuse the initial latents and scale them by the scheduler's `init_noise_sigma`.

    Args:
        batch_size, num_channels_latents, num_frames: Leading latent dims.
        height, width: Pixel sizes; each is floor-divided by
            `self.vae_scale_factor` to get the latent size.
        dtype, device: Used when new noise is drawn; supplied latents are only
            moved to `device`.
        generator: Generator or list of generators forwarded to `randn_tensor`.
        latents: Optional pre-made latents used instead of fresh noise.

    Raises:
        ValueError: If `generator` is a list whose length differs from
            `batch_size`.
    """
    latent_height = height // self.vae_scale_factor
    latent_width = width // self.vae_scale_factor
    shape = (batch_size, num_channels_latents, num_frames, latent_height, latent_width)

    if isinstance(generator, list) and len(generator) != batch_size:
        raise ValueError(
            f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
            f" size of {batch_size}. Make sure the batch size matches the length of the generators."
        )

    if latents is not None:
        # Caller-provided latents are only moved to the target device.
        latents = latents.to(device)
    else:
        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)

    # Scale the initial noise by the sigma the scheduler expects.
    return latents * self.scheduler.init_noise_sigma
# 定义准备图像的函数，接收图像及其尺寸、设备和数据类型作为参数
def prepare_image(self, image, width, height, device, dtype):
    """Preprocess the conditioning frames and return them as a [b, c, f, h, w] tensor.

    When `self.controlnet.use_simplified_condition_embedding` is set, the
    frames are rescaled to [-1, 1], encoded with the VAE and multiplied by
    the VAE scaling factor; otherwise the preprocessed frames are used as-is.
    """
    processed = self.control_image_processor.preprocess(image, height=height, width=width)
    # Add a leading batch axis, then move to the requested device and dtype.
    controlnet_images = processed.unsqueeze(0).to(device, dtype)
    batch_size, num_frames, channels, frame_height, frame_width = controlnet_images.shape

    # TODO: remove the line below; it checks that the control images lie in [0, 1].
    assert controlnet_images.min() >= 0 and controlnet_images.max() <= 1

    if not self.controlnet.use_simplified_condition_embedding:
        # Raw frames are used directly; only the axis order changes.
        return controlnet_images.permute(0, 2, 1, 3, 4)

    # Merge batch and frame axes so the VAE receives a batch of images.
    flat_frames = controlnet_images.reshape(batch_size * num_frames, channels, frame_height, frame_width)
    # Map values from [0, 1] to [-1, 1].
    flat_frames = 2 * flat_frames - 1
    encoded = self.vae.encode(flat_frames)
    latent_frames = retrieve_latents(encoded) * self.vae.config.scaling_factor
    # Restore the (batch, frames) split; the latent channel count is fixed at 4 here.
    latent_frames = latent_frames.reshape(
        batch_size,
        num_frames,
        4,
        frame_height // self.vae_scale_factor,
        frame_width // self.vae_scale_factor,
    )
    return latent_frames.permute(0, 2, 1, 3, 4)

# 定义准备稀疏控制条件的函数，接收条件帧及其它参数
def prepare_sparse_control_conditioning(
    self,
    conditioning_frames: torch.Tensor,
    num_frames: int,
    controlnet_frame_indices: List[int],
    device: torch.device,
    dtype: torch.dtype,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Scatter the conditioning frames into a full-length, zero-filled control tensor.

    Args:
        conditioning_frames: 5-D tensor unpacked as
            (batch, channels, frames, height, width).
        num_frames: Length of the frame axis of the returned tensors.
        controlnet_frame_indices: Frame positions that receive conditioning.
            The i-th index receives the i-th conditioning frame.
        device: Device of the returned tensors.
        dtype: Dtype of the returned tensors.

    Returns:
        `(controlnet_cond, controlnet_cond_mask)`. The first holds the
        conditioning frames at the given indices and zeros elsewhere. The
        second has one channel and is 1 at the given indices, 0 elsewhere.
    """
    # There must be at least as many conditioning frames as target indices.
    assert conditioning_frames.shape[2] >= len(controlnet_frame_indices)

    batch_size, channels, _, height, width = conditioning_frames.shape
    controlnet_cond = torch.zeros((batch_size, channels, num_frames, height, width), dtype=dtype, device=device)
    controlnet_cond_mask = torch.zeros((batch_size, 1, num_frames, height, width), dtype=dtype, device=device)

    # Only the first len(indices) conditioning frames are used.
    controlnet_cond[:, :, controlnet_frame_indices] = conditioning_frames[:, :, : len(controlnet_frame_indices)]
    controlnet_cond_mask[:, :, controlnet_frame_indices] = 1

    return controlnet_cond, controlnet_cond_mask

# Read-only accessor for the guidance scale.
@property
def guidance_scale(self):
    """Return the value stored in `self._guidance_scale`."""
    # NOTE(review): the assignment of `_guidance_scale` is not visible in this excerpt.
    return self._guidance_scale

# Read-only accessor for the CLIP skip setting.
@property
def clip_skip(self):
    """Return the value stored in `self._clip_skip`."""
    return self._clip_skip

# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
# of the Imagen paper. `guidance_scale = 1` corresponds to doing no classifier free guidance.
@property
def do_classifier_free_guidance(self):
    """Return True when the stored guidance scale is greater than 1."""
    return self._guidance_scale > 1

# Read-only accessor for the cross-attention keyword arguments.
@property
def cross_attention_kwargs(self):
    """Return the value stored in `self._cross_attention_kwargs`."""
    return self._cross_attention_kwargs

# Read-only accessor for the number of timesteps.
@property
def num_timesteps(self):
    """Return the value stored in `self._num_timesteps`."""
    return self._num_timesteps

# 在不计算梯度的情况下执行下面的装饰器，确保效率
@torch.no_grad()
# 替换示例文档字符串
@replace_example_docstring(EXAMPLE_DOC_STRING)
# 定义可调用的类方法，接受多个可选参数
    def __call__(
        # 提示文本，可以是字符串或字符串列表，默认为 None
        self,
        prompt: Optional[Union[str, List[str]]] = None,
        # 生成图像的高度，默认为 None
        height: Optional[int] = None,
        # 生成图像的宽度，默认为 None
        width: Optional[int] = None,
        # 每个提示生成的帧数，默认为 16
        num_frames: int = 16,
        # 推理步骤的数量，默认为 50
        num_inference_steps: int = 50,
        # 指导缩放因子，默认为 7.5
        guidance_scale: float = 7.5,
        # 负提示文本，可以是字符串或字符串列表，默认为 None
        negative_prompt: Optional[Union[str, List[str]]] = None,
        # 每个提示生成的视频数量，默认为 1
        num_videos_per_prompt: int = 1,
        # eta 参数，默认为 0.0
        eta: float = 0.0,
        # 随机数生成器，可以是单个或多个生成器，默认为 None
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        # 先前的潜在表示，默认为 None
        latents: Optional[torch.Tensor] = None,
        # 提示的嵌入表示，默认为 None
        prompt_embeds: Optional[torch.Tensor] = None,
        # 负提示的嵌入表示，默认为 None
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        # IP 适配器的图像输入，默认为 None
        ip_adapter_image: Optional[PipelineImageInput] = None,
        # IP 适配器的图像嵌入表示，默认为 None
        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
        # 条件帧的图像输入列表，默认为 None
        conditioning_frames: Optional[List[PipelineImageInput]] = None,
        # 输出类型，默认为 "pil"
        output_type: str = "pil",
        # 是否返回字典格式的结果，默认为 True
        return_dict: bool = True,
        # 交叉注意力的参数，默认为 None
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        # ControlNet 的条件缩放因子，默认为 1.0
        controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
        # ControlNet 的帧索引，默认为 [0]
        controlnet_frame_indices: List[int] = [0],
        # 是否启用猜测模式，默认为 False
        guess_mode: bool = False,
        # 跳过的 CLIP 步数，默认为 None
        clip_skip: Optional[int] = None,
        # 步骤结束时的回调函数，默认为 None
        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
        # 步骤结束时的回调函数的张量输入列表，默认为 ["latents"]
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],


# `.\diffusers\pipelines\animatediff\pipeline_animatediff_video2video.py`

```py
# 版权声明，标明版权持有者和年份
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# 许可声明，说明该文件的使用条件
# Licensed under the Apache License, Version 2.0 (the "License");
# 仅在遵循许可条件下使用此文件
# you may not use this file except in compliance with the License.
# 可以在以下网址获取许可证副本
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律或书面协议另有规定，否则本许可下分发的软件是“按现状”提供的，
# 不提供任何明示或暗示的担保或条件
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 查看许可证以获取关于权限和限制的具体信息
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect  # 导入 inspect 模块，用于检查对象信息
from typing import Any, Callable, Dict, List, Optional, Union  # 导入类型提示所需的类型

import torch  # 导入 PyTorch 库，用于深度学习
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection  # 导入 transformers 库中的 CLIP 相关模型和处理器

from ...image_processor import PipelineImageInput  # 从相对路径导入 PipelineImageInput 类
from ...loaders import IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin  # 导入加载器相关的混合类
from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel  # 导入各种模型类
from ...models.lora import adjust_lora_scale_text_encoder  # 导入调整文本编码器 Lora 缩放的函数
from ...models.unets.unet_motion_model import MotionAdapter  # 从 UNet 运动模型中导入 MotionAdapter 类
from ...schedulers import (  # 从调度器模块导入各种调度器类
    DDIMScheduler,  # 导入 DDIM 调度器
    DPMSolverMultistepScheduler,  # 导入多步 DPM 求解器调度器
    EulerAncestralDiscreteScheduler,  # 导入 Euler 祖先离散调度器
    EulerDiscreteScheduler,  # 导入 Euler 离散调度器
    LMSDiscreteScheduler,  # 导入 LMS 离散调度器
    PNDMScheduler,  # 导入 PNDM 调度器
)
from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers  # 导入实用工具函数和变量
from ...utils.torch_utils import randn_tensor  # 从 torch_utils 导入生成随机张量的函数
from ...video_processor import VideoProcessor  # 导入视频处理器类
from ..free_init_utils import FreeInitMixin  # 从相对路径导入 FreeInitMixin 类
from ..free_noise_utils import AnimateDiffFreeNoiseMixin  # 从相对路径导入 AnimateDiffFreeNoiseMixin 类
from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin  # 导入扩散管道和稳定扩散混合类
from .pipeline_output import AnimateDiffPipelineOutput  # 导入 AnimateDiffPipelineOutput 类

logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器，便于日志记录 # pylint: disable=invalid-name

# Usage example for AnimateDiffVideoToVideoPipeline, kept as a module-level string.
EXAMPLE_DOC_STRING = """

# 示例代码，展示如何使用 AnimateDiffVideoToVideoPipeline 处理视频
    Examples:
        ```py
        # 导入所需的库
        >>> import imageio
        >>> import requests
        >>> import torch
        >>> from diffusers import AnimateDiffVideoToVideoPipeline, DDIMScheduler, MotionAdapter
        >>> from diffusers.utils import export_to_gif
        >>> from io import BytesIO
        >>> from PIL import Image

        # 从预训练模型加载运动适配器
        >>> adapter = MotionAdapter.from_pretrained(
        ...     "guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16
        ... )
        # 从预训练模型加载视频到视频的管道，并将其移动到 GPU
        >>> pipe = AnimateDiffVideoToVideoPipeline.from_pretrained(
        ...     "SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter
        ... ).to("cuda")
        # 设置调度器的参数
        >>> pipe.scheduler = DDIMScheduler(
        ...     beta_schedule="linear", steps_offset=1, clip_sample=False, timestep_spacing="linspace"
        ... )

        # 定义加载视频的函数
        >>> def load_video(file_path: str):
        ...     images = []  # 初始化一个空列表以存储图像帧

        ...     # 检查文件路径是否是 URL
        ...     if file_path.startswith(("http://", "https://")):
        ...         # 如果 file_path 是 URL，发送 GET 请求
        ...         response = requests.get(file_path)
        ...         response.raise_for_status()  # 检查请求是否成功
        ...         content = BytesIO(response.content)  # 将响应内容转为字节流
        ...         vid = imageio.get_reader(content)  # 使用字节流读取视频
        ...     else:
        ...         # 假设是本地文件路径
        ...         vid = imageio.get_reader(file_path)  # 从文件路径读取视频

        ...     # 遍历视频中的每一帧
        ...     for frame in vid:
        ...         pil_image = Image.fromarray(frame)  # 将帧转换为 PIL 图像
        ...         images.append(pil_image)  # 将图像添加到列表中

        ...     return images  # 返回包含所有图像的列表


        # 加载视频并传入 URL
        >>> video = load_video(
        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-vid2vid-input-1.gif"
        ... )
        # 处理视频并生成输出，设置提示语和强度
        >>> output = pipe(
        ...     video=video, prompt="panda playing a guitar, on a boat, in the ocean, high quality", strength=0.5
        ... )
        # 获取处理后的视频帧
        >>> frames = output.frames[0]
        # 将帧导出为 GIF 文件
        >>> export_to_gif(frames, "animation.gif")
        ```

"""  # end of the example docstring

# The function below is a copy of the upstream helper named in the marker.
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents

def retrieve_latents(
    encoder_output: torch.Tensor,
    generator: Optional[torch.Generator] = None,
    sample_mode: str = "sample",
):
    """Return latents taken from `encoder_output`.

    Args:
        encoder_output: Encoder result; checked for `latent_dist`, then for
            `latents`.
        generator: Optional random generator passed to `latent_dist.sample`.
        sample_mode: `"sample"` to sample from `latent_dist`, `"argmax"` to
            take `latent_dist.mode()`.

    Raises:
        AttributeError: If `encoder_output` exposes neither a usable
            `latent_dist` for the requested mode nor `latents`.
    """
    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
        # Draw a sample from the latent distribution.
        return encoder_output.latent_dist.sample(generator)
    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
        # Use the mode of the latent distribution.
        return encoder_output.latent_dist.mode()
    elif hasattr(encoder_output, "latents"):
        # Latents are stored directly on the output.
        return encoder_output.latents
    else:
        raise AttributeError("Could not access latents of provided encoder_output")

# The function below is a copy of the upstream helper named in the marker.
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps

def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    """
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.

    Raises:
        ValueError: If both `timesteps` and `sigmas` are passed, or if the scheduler's `set_timesteps` does not
            accept the custom argument that was passed.
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
    if timesteps is not None:
        # Custom timesteps only work when `set_timesteps` exposes a `timesteps` parameter.
        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accepts_timesteps:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
        timesteps = scheduler.timesteps
        # The step count follows from the schedule the scheduler actually produced.
        num_inference_steps = len(timesteps)
    elif sigmas is not None:
        # Custom sigmas only work when `set_timesteps` exposes a `sigmas` parameter.
        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accept_sigmas:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    else:
        # Neither custom schedule was given: let the scheduler build one from the step count.
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        timesteps = scheduler.timesteps
    return timesteps, num_inference_steps

# Define the AnimateDiffVideoToVideoPipeline class, which inherits from several base classes

# NOTE(review): in this listing the class body (docstring, attributes and methods) sits at column 0;
# presumably the original file indents it under the class header - confirm against upstream.
class AnimateDiffVideoToVideoPipeline(
DiffusionPipeline, # base pipeline class
StableDiffusionMixin, # Stable Diffusion mixin
TextualInversionLoaderMixin, # textual inversion loading
IPAdapterMixin, # IP-Adapter loading
StableDiffusionLoraLoaderMixin, # LoRA loading and saving
FreeInitMixin, # FreeInit mixin
AnimateDiffFreeNoiseMixin, # AnimateDiff FreeNoise mixin
):
r"""

Pipeline for video-to-video generation.

This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).

The pipeline also inherits the following loading methods:
    - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
    - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
    - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
    - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

Args:
    vae ([`AutoencoderKL`]):
        Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
    text_encoder ([`CLIPTextModel`]):
        Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
    tokenizer (`CLIPTokenizer`):
        A [`~transformers.CLIPTokenizer`] to tokenize text.
    unet ([`UNet2DConditionModel`]):
        A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents.
    motion_adapter ([`MotionAdapter`]):
        A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents.
    scheduler ([`SchedulerMixin`]):
        A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
        [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
"""

model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"  # model order used for CPU offloading
_optional_components = ["feature_extractor", "image_encoder", "motion_adapter"]  # components listed as optional
_callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]  # tensor names listed for step-end callbacks

def __init__(
    self,
    vae: AutoencoderKL,
    text_encoder: CLIPTextModel,
    tokenizer: CLIPTokenizer,
    unet: UNet2DConditionModel,
    motion_adapter: MotionAdapter,
    scheduler: Union[
        DDIMScheduler,
        PNDMScheduler,
        LMSDiscreteScheduler,
        EulerDiscreteScheduler,
        EulerAncestralDiscreteScheduler,
        DPMSolverMultistepScheduler,
    ],
    feature_extractor: CLIPImageProcessor = None,
    image_encoder: CLIPVisionModelWithProjection = None,
):
    """Register the pipeline components and derive the VAE scale factor and video processor.

    Args:
        vae: Autoencoder whose `config.block_out_channels` determines `vae_scale_factor`.
        text_encoder: Text encoder to register.
        tokenizer: Tokenizer to register.
        unet: Either a `UNet2DConditionModel` (converted together with `motion_adapter` into a
            `UNetMotionModel`) or an object that is registered unchanged.
        motion_adapter: Motion adapter used for the conversion and registered as a component.
        scheduler: Scheduler to register.
        feature_extractor: Optional image feature extractor to register.
        image_encoder: Optional image encoder to register.
    """
    super().__init__()

    # A plain 2D UNet is combined with the motion adapter before registration.
    if isinstance(unet, UNet2DConditionModel):
        unet = UNetMotionModel.from_unet2d(unet, motion_adapter)

    components = {
        "vae": vae,
        "text_encoder": text_encoder,
        "tokenizer": tokenizer,
        "unet": unet,
        "motion_adapter": motion_adapter,
        "scheduler": scheduler,
        "feature_extractor": feature_extractor,
        "image_encoder": image_encoder,
    }
    self.register_modules(**components)

    # Scale factor is 2 ** (number of VAE block output channel entries - 1).
    exponent = len(self.vae.config.block_out_channels) - 1
    self.vae_scale_factor = 2**exponent
    self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor)

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt
# NOTE(review): this listing stops after the `clip_skip` parameter; the closing of the signature and the
# method body are not shown here.
def encode_prompt(
    self,
    # prompt text
    prompt,
    # target device
    device,
    # number of images to generate per prompt
    num_images_per_prompt,
    # whether to use classifier-free guidance
    do_classifier_free_guidance,
    # optional negative prompt text
    negative_prompt=None,
    # optional pre-computed prompt embeddings
    prompt_embeds: Optional[torch.Tensor] = None,
    # optional pre-computed negative prompt embeddings
    negative_prompt_embeds: Optional[torch.Tensor] = None,
    # optional LoRA scale
    lora_scale: Optional[float] = None,
    # optional number of CLIP layers to skip
    clip_skip: Optional[int] = None,
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
    """Encode an image with `self.image_encoder` and build the matching unconditional counterpart.

    Args:
        image: A tensor, or any other input that `self.feature_extractor` turns into `pixel_values`.
        device: Device the image tensor is moved to before encoding.
        num_images_per_prompt: Repeat count applied along dim 0 via `repeat_interleave`.
        output_hidden_states: When truthy, return the second-to-last hidden states instead of `image_embeds`.

    Returns:
        A `(conditional, unconditional)` pair. With hidden states, the unconditional part is the encoder
        output for an all-zero image; otherwise it is a zero tensor shaped like the image embeddings.
    """
    # The image is cast to the dtype of the encoder's first parameter.
    encoder_dtype = next(self.image_encoder.parameters()).dtype

    if not isinstance(image, torch.Tensor):
        image = self.feature_extractor(image, return_tensors="pt").pixel_values
    image = image.to(device=device, dtype=encoder_dtype)

    if not output_hidden_states:
        cond_embeds = self.image_encoder(image).image_embeds
        cond_embeds = cond_embeds.repeat_interleave(num_images_per_prompt, dim=0)
        return cond_embeds, torch.zeros_like(cond_embeds)

    cond_hidden = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
    cond_hidden = cond_hidden.repeat_interleave(num_images_per_prompt, dim=0)

    # The unconditional branch encodes an all-zero image of the same shape.
    blank_image = torch.zeros_like(image)
    uncond_hidden = self.image_encoder(blank_image, output_hidden_states=True).hidden_states[-2]
    uncond_hidden = uncond_hidden.repeat_interleave(num_images_per_prompt, dim=0)
    return cond_hidden, uncond_hidden

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
def prepare_ip_adapter_image_embeds(
    self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
):
    """Build the per-adapter image embeddings passed on to the UNet.

    Args:
        ip_adapter_image: Image or list of images, used only when `ip_adapter_image_embeds` is `None`.
        ip_adapter_image_embeds: Optional pre-computed embeddings, one entry per adapter. With
            classifier-free guidance each entry is split in two halves along dim 0 (negative first).
        device: Device the resulting embeddings are moved to.
        num_images_per_prompt: How many times each embedding is concatenated along dim 0.
        do_classifier_free_guidance: Whether negative embeddings are prepended to the positive ones.

    Returns:
        A list with one tensor per adapter.

    Raises:
        ValueError: If the number of images differs from the number of image projection layers.
    """
    positive_embeds = []
    negative_embeds = []

    if ip_adapter_image_embeds is not None:
        # Pre-computed embeddings: only split off the negative half when guidance is on.
        for precomputed in ip_adapter_image_embeds:
            if do_classifier_free_guidance:
                negative_half, precomputed = precomputed.chunk(2)
                negative_embeds.append(negative_half)
            positive_embeds.append(precomputed)
    else:
        images = ip_adapter_image if isinstance(ip_adapter_image, list) else [ip_adapter_image]
        projection_layers = self.unet.encoder_hid_proj.image_projection_layers

        if len(images) != len(projection_layers):
            raise ValueError(
                f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(images)} images and {len(projection_layers)} IP Adapters."
            )

        for adapter_image, projection_layer in zip(images, projection_layers):
            # Any layer that is not an `ImageProjection` is fed hidden states instead of image embeds.
            wants_hidden_states = not isinstance(projection_layer, ImageProjection)
            encoded, encoded_negative = self.encode_image(adapter_image, device, 1, wants_hidden_states)

            positive_embeds.append(encoded[None, :])
            if do_classifier_free_guidance:
                negative_embeds.append(encoded_negative[None, :])

    prepared = []
    for index, embeds in enumerate(positive_embeds):
        repeated = torch.cat([embeds] * num_images_per_prompt, dim=0)
        if do_classifier_free_guidance:
            repeated_negative = torch.cat([negative_embeds[index]] * num_images_per_prompt, dim=0)
            # Negative embeddings come first, positive ones second.
            repeated = torch.cat([repeated_negative, repeated], dim=0)
        prepared.append(repeated.to(device=device))

    return prepared

def encode_video(self, video, generator, decode_chunk_size: int = 16) -> torch.Tensor:
    """Encode `video` with the VAE in chunks and concatenate the resulting latents.

    Args:
        video: Sliceable batch of frames; it is split along its first dimension.
        generator: Random generator forwarded to `retrieve_latents`.
        decode_chunk_size: Number of frames passed to `self.vae.encode` per call.

    Returns:
        The latents of all chunks concatenated with `torch.cat`.
    """
    chunk_starts = range(0, len(video), decode_chunk_size)
    encoded_chunks = [
        retrieve_latents(self.vae.encode(video[start : start + decode_chunk_size]), generator=generator)
        for start in chunk_starts
    ]
    return torch.cat(encoded_chunks)

# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.AnimateDiffPipeline.decode_latents
def decode_latents(self, latents, decode_chunk_size: int = 16):
    """Decode video latents into frames with the VAE, `decode_chunk_size` frames at a time.

    Args:
        latents: 5-D tensor unpacked as (batch, channels, frames, height, width).
        decode_chunk_size: Number of frames passed to `self.vae.decode` per call.

    Returns:
        A float32 tensor laid out as (batch, channels, frames, height, width).
    """
    # Undo the VAE scaling factor.
    inverse_scale = 1 / self.vae.config.scaling_factor
    latents = inverse_scale * latents

    batch_size, channels, num_frames, height, width = latents.shape
    # Fold the frame axis into the batch axis so every frame is decoded as an image.
    frame_latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)

    decoded_chunks = [
        self.vae.decode(frame_latents[start : start + decode_chunk_size]).sample
        for start in range(0, frame_latents.shape[0], decode_chunk_size)
    ]
    frames = torch.cat(decoded_chunks)

    # Split batch and frame axes again, then move channels ahead of frames.
    video = frames[None, :].reshape((batch_size, num_frames, -1) + frames.shape[2:]).permute(0, 2, 1, 3, 4)
    # Always cast to float32: cheap, and keeps the result compatible with bfloat16 inputs.
    return video.float()

# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
def prepare_extra_step_kwargs(self, generator, eta):
    """Collect the optional keyword arguments that `self.scheduler.step` accepts.

    Not all schedulers share the same `step` signature, so `eta` and `generator` are only
    forwarded when the signature lists them. eta (η) corresponds to η in the DDIM paper
    (https://arxiv.org/abs/2010.02502) and should be between [0, 1].

    Args:
        generator: Value forwarded as `generator` when supported.
        eta: Value forwarded as `eta` when supported.

    Returns:
        A dict containing `"eta"` and/or `"generator"`, in that order, or an empty dict.
    """
    step_parameters = set(inspect.signature(self.scheduler.step).parameters.keys())
    candidates = (("eta", eta), ("generator", generator))
    return {name: value for name, value in candidates if name in step_parameters}

# Validate the inputs
# NOTE(review): this listing stops after the last parameter below; the closing of the signature and the
# method body are not shown here.
def check_inputs(
    self,
    prompt,
    strength,
    height,
    width,
    video=None,
    latents=None,
    negative_prompt=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
    ip_adapter_image=None,
    ip_adapter_image_embeds=None,
    callback_on_step_end_tensor_inputs=None,
def get_timesteps(self, num_inference_steps, timesteps, strength, device):
    """Trim the timestep schedule according to `strength`.

    Args:
        num_inference_steps: Total number of inference steps in the schedule.
        timesteps: Sliceable timestep schedule.
        strength: Multiplied by `num_inference_steps` to obtain the number of steps to keep
            (capped at `num_inference_steps`).
        device: Not used by this method.

    Returns:
        A `(timesteps, num_steps)` tuple: the tail of the schedule that will be run and the
        number of inference steps that remain.
    """
    steps_to_run = min(int(num_inference_steps * strength), num_inference_steps)
    first_step = max(num_inference_steps - steps_to_run, 0)

    # The slice offset is multiplied by the scheduler's `order`.
    remaining = timesteps[first_step * self.scheduler.order :]
    return remaining, num_inference_steps - first_step

# Prepare the latents
# NOTE(review): this listing stops after the last parameter below; the closing of the signature and the
# method body are not shown here.
def prepare_latents(
    self,
    video,
    height,
    width,
    num_channels_latents,
    batch_size,
    timestep,
    dtype,
    device,
    generator,
    latents=None,
    decode_chunk_size: int = 16,
# Read-only accessor for the guidance scale
@property
def guidance_scale(self):
    """Return the value stored in `self._guidance_scale`."""
    return self._guidance_scale

@property
# Read-only accessor for `clip_skip`
def clip_skip(self):
    """Return the value stored in `self._clip_skip`."""
    return self._clip_skip

# here `guidance_scale` is defined analogous to the guidance weight `w` of equation (2) of the Imagen paper;
# `guidance_scale = 1` corresponds to doing no classifier-free guidance.
@property
def do_classifier_free_guidance(self):
    """Return `True` when `self._guidance_scale` is greater than 1."""
    return self._guidance_scale > 1

@property
def cross_attention_kwargs(self):
    """Return the value stored in `self._cross_attention_kwargs`."""
    return self._cross_attention_kwargs

@property
def num_timesteps(self):
    """Return the value stored in `self._num_timesteps`."""
    return self._num_timesteps

# Run without gradient tracking
# NOTE(review): in this listing the signature has no `self` parameter and stops after `decode_chunk_size`;
# the closing of the signature and the method body are not shown here.
@torch.no_grad()
def __call__(
    # Call parameters: input video, prompt, output height and width, ...
    video: List[List[PipelineImageInput]] = None,
    prompt: Optional[Union[str, List[str]]] = None,
    height: Optional[int] = None,
    width: Optional[int] = None,
    # Number of inference steps, 50 by default
    num_inference_steps: int = 50,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    # Guidance scale, 7.5 by default
    guidance_scale: float = 7.5,
    # Strength, 0.8 by default
    strength: float = 0.8,
    negative_prompt: Optional[Union[str, List[str]]] = None,
    # Number of videos generated per prompt, 1 by default
    num_videos_per_prompt: Optional[int] = 1,
    # eta, 0.0 by default
    eta: float = 0.0,
    # Random generator; a single generator or a list of generators
    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
    latents: Optional[torch.Tensor] = None,
    prompt_embeds: Optional[torch.Tensor] = None,
    negative_prompt_embeds: Optional[torch.Tensor] = None,
    ip_adapter_image: Optional[PipelineImageInput] = None,
    ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
    # Output type, "pil" by default
    output_type: Optional[str] = "pil",
    # Whether to return a dict-style output, True by default
    return_dict: bool = True,
    # Optional keyword arguments for cross attention
    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    # Optional `clip_skip` value
    clip_skip: Optional[int] = None,
    # Callback invoked at the end of each step
    callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
    # Names of the tensors handed to the step-end callback, ["latents"] by default
    # NOTE(review): this default is a mutable list shared across calls.
    callback_on_step_end_tensor_inputs: List[str] = ["latents"],
    # Chunk size used when decoding, 16 by default
    decode_chunk_size: int = 16,


# `.\diffusers\pipelines\animatediff\pipeline_output.py`

```py
# 从数据类模块导入数据类装饰器
from dataclasses import dataclass
# 导入用于类型提示的 List 和 Union
from typing import List, Union

# 导入 NumPy 库
import numpy as np
# 导入图像处理库 PIL
import PIL.Image
# 导入 PyTorch 库
import torch

# 从上级目录导入 BaseOutput 类
from ...utils import BaseOutput

# AnimateDiffPipelineOutput dataclass, a subclass of BaseOutput
@dataclass
class AnimateDiffPipelineOutput(BaseOutput):
    r"""
    Output class for AnimateDiff pipelines.

    Args:
        frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
            List of video outputs - it can be a nested list of length `batch_size`, with each sub-list containing
            a denoised PIL image sequence of length `num_frames`. It can also be a NumPy array or Torch tensor of
            shape `(batch_size, num_frames, channels, height, width)`.
    """

    # Generated frames; may be any of the container types named in the annotation
    frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]]

`.\diffusers\pipelines\animatediff\__init__.py`

# 从 typing 模块导入 TYPE_CHECKING，用于静态类型检查
from typing import TYPE_CHECKING

# 从上级目录的 utils 模块导入多个工具和常量
from ...utils import (
    DIFFUSERS_SLOW_IMPORT,  # 用于判断是否需要慢速导入
    OptionalDependencyNotAvailable,  # 可选依赖未满足时的异常
    _LazyModule,  # 用于懒加载模块的工具
    get_objects_from_module,  # 从模块中获取对象的工具
    is_torch_available,  # 检查 PyTorch 是否可用
    is_transformers_available,  # 检查 Transformers 库是否可用
)

# Placeholder objects that are attached to the module when optional dependencies are missing
_dummy_objects = {}
# Import structure handed to the lazy module; `pipeline_output` is registered unconditionally
_import_structure = {"pipeline_output": ["AnimateDiffPipelineOutput"]}

# Check whether the optional dependencies are available
try:
    # Raise unless both Transformers and Torch are available
    if not (is_transformers_available() and is_torch_available()):
        raise OptionalDependencyNotAvailable()
# An optional dependency is missing
except OptionalDependencyNotAvailable:
    # Import the module that holds the placeholder objects
    from ...utils import dummy_torch_and_transformers_objects

    # Collect the placeholder objects from that module
    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
else:
    # Dependencies are available: register the AnimateDiff pipelines in the import structure
    _import_structure["pipeline_animatediff"] = ["AnimateDiffPipeline"]
    _import_structure["pipeline_animatediff_controlnet"] = ["AnimateDiffControlNetPipeline"]
    _import_structure["pipeline_animatediff_sdxl"] = ["AnimateDiffSDXLPipeline"]
    _import_structure["pipeline_animatediff_sparsectrl"] = ["AnimateDiffSparseControlNetPipeline"]
    _import_structure["pipeline_animatediff_video2video"] = ["AnimateDiffVideoToVideoPipeline"]

# Eager imports, used for type checking or when slow import is requested
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    try:
        # Same dependency check as above
        if not (is_transformers_available() and is_torch_available()):
            raise OptionalDependencyNotAvailable()
    # An optional dependency is missing
    except OptionalDependencyNotAvailable:
        # Import every placeholder object from the dummy module
        from ...utils.dummy_torch_and_transformers_objects import *

    else:
        # Dependencies are available: import the real pipeline classes
        from .pipeline_animatediff import AnimateDiffPipeline
        from .pipeline_animatediff_controlnet import AnimateDiffControlNetPipeline
        from .pipeline_animatediff_sdxl import AnimateDiffSDXLPipeline
        from .pipeline_animatediff_sparsectrl import AnimateDiffSparseControlNetPipeline
        from .pipeline_animatediff_video2video import AnimateDiffVideoToVideoPipeline
        from .pipeline_output import AnimateDiffPipelineOutput

# Otherwise fall back to lazy loading
else:
    import sys

    # Replace this module in `sys.modules` with a lazy module
    sys.modules[__name__] = _LazyModule(
        __name__,
        globals()["__file__"],
        _import_structure,  # the import structure defined above
        module_spec=__spec__,  # this module's spec
    )
    # Attach the placeholder objects to the replaced module
    for name, value in _dummy_objects.items():
        setattr(sys.modules[__name__], name, value)

`.\diffusers\pipelines\audioldm\pipeline_audioldm.py`

# 版权声明，标识该文件的版权归 HuggingFace 团队所有
# 
# 根据 Apache 许可证第 2.0 版（“许可证”）进行许可；
# 除非遵守许可证，否则您不得使用此文件。
# 您可以在以下位置获取许可证副本：
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# 除非适用法律或书面同意，否则根据许可证分发的软件是以“按现状”基础提供，
# 不提供任何形式的明示或暗示的担保或条件。
# 有关许可证下特定语言管理权限和限制的详细信息，请参见许可证。

import inspect  # 导入 inspect 模块，用于获取对象的内部信息
from typing import Any, Callable, Dict, List, Optional, Union  # 从 typing 模块导入类型注解，支持类型提示

import numpy as np  # 导入 numpy 库，通常用于数组和数值计算
import torch  # 导入 PyTorch 库，主要用于深度学习
import torch.nn.functional as F  # 导入 PyTorch 的神经网络功能模块，简化函数调用
from transformers import ClapTextModelWithProjection, RobertaTokenizer, RobertaTokenizerFast, SpeechT5HifiGan  # 从 transformers 库导入特定的模型和标记器

from ...models import AutoencoderKL, UNet2DConditionModel  # 从相对路径导入 AutoencoderKL 和 UNet2DConditionModel 模型
from ...schedulers import KarrasDiffusionSchedulers  # 从相对路径导入 KarrasDiffusionSchedulers，用于调度
from ...utils import logging, replace_example_docstring  # 从相对路径导入工具函数，提供日志和文档字符串替换功能
from ...utils.torch_utils import randn_tensor  # 从相对路径导入 randn_tensor 函数，生成随机张量
from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline, StableDiffusionMixin  # 从上一级导入音频管道输出和 DiffusionPipeline 相关类

logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器，便于记录日志信息

# Usage example passed to the `replace_example_docstring` decorator on `__call__`.
# The string content is emitted at runtime, so it must contain only the example itself:
# annotation text inside the literal and a "```py" closing fence would both end up in the docs.
EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> from diffusers import AudioLDMPipeline
        >>> import torch
        >>> import scipy

        >>> repo_id = "cvssp/audioldm-s-full-v2"
        >>> pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
        >>> pipe = pipe.to("cuda")

        >>> prompt = "Techno music with a strong, upbeat tempo and high melodic riffs"
        >>> audio = pipe(prompt, num_inference_steps=10, audio_length_in_s=5.0).audios[0]

        >>> # save the audio sample as a .wav file
        >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio)
        ```
"""


class AudioLDMPipeline(DiffusionPipeline, StableDiffusionMixin):  # inherits from DiffusionPipeline and StableDiffusionMixin
    r"""
    Pipeline for text-to-audio generation using AudioLDM.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
        text_encoder ([`~transformers.ClapTextModelWithProjection`]):
            Frozen text-encoder (`ClapTextModelWithProjection`, specifically the
            [laion/clap-htsat-unfused](https://huggingface.co/laion/clap-htsat-unfused) variant).
        tokenizer ([`PreTrainedTokenizer`]):
            A `RobertaTokenizer` to tokenize text.
        unet ([`UNet2DConditionModel`]):
            A `UNet2DConditionModel` to denoise the encoded audio latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded audio latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
        vocoder ([`~transformers.SpeechT5HifiGan`]):
            Vocoder of class `SpeechT5HifiGan`.
    """

    # Model order used for CPU offloading
    model_cpu_offload_seq = "text_encoder->unet->vae"

    def __init__(
        self,
        vae: AutoencoderKL,
        text_encoder: ClapTextModelWithProjection,
        tokenizer: Union[RobertaTokenizer, RobertaTokenizerFast],
        unet: UNet2DConditionModel,
        scheduler: KarrasDiffusionSchedulers,
        vocoder: SpeechT5HifiGan,
    ):
        """Register the pipeline components and derive the VAE scale factor.

        Args:
            vae: Autoencoder whose `config.block_out_channels` determines `vae_scale_factor`.
            text_encoder: Text encoder to register.
            tokenizer: Tokenizer to register.
            unet: UNet to register.
            scheduler: Scheduler to register.
            vocoder: Vocoder to register.
        """
        super().__init__()

        components = {
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "unet": unet,
            "scheduler": scheduler,
            "vocoder": vocoder,
        }
        self.register_modules(**components)

        # Scale factor is 2 ** (number of VAE block output channel entries - 1).
        exponent = len(self.vae.config.block_out_channels) - 1
        self.vae_scale_factor = 2**exponent

    # NOTE(review): this listing stops after `negative_prompt_embeds`; the closing of the signature and the
    # method body are not shown here.
    def _encode_prompt(
        # Parameters for encoding the prompt
        self,
        prompt,
        device,
        num_waveforms_per_prompt,
        do_classifier_free_guidance,
        negative_prompt=None,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
    def decode_latents(self, latents):
        """Undo the VAE scaling factor and decode the latents.

        Args:
            latents: Latents to decode; multiplied by `1 / self.vae.config.scaling_factor` first.

        Returns:
            The `sample` attribute of `self.vae.decode(...)`, used downstream as the mel spectrogram.
        """
        unscaled = (1 / self.vae.config.scaling_factor) * latents
        return self.vae.decode(unscaled).sample

    def mel_spectrogram_to_waveform(self, mel_spectrogram):
        """Turn a mel spectrogram into a waveform with the vocoder.

        Args:
            mel_spectrogram: Tensor passed to `self.vocoder`; a 4-D input has dim 1 squeezed first.

        Returns:
            The vocoder output moved to the CPU and cast to float32.
        """
        vocoder_input = mel_spectrogram.squeeze(1) if mel_spectrogram.dim() == 4 else mel_spectrogram
        # Always cast to float32: cheap, and keeps the result compatible with bfloat16.
        return self.vocoder(vocoder_input).cpu().float()

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
    def prepare_extra_step_kwargs(self, generator, eta):
        """Collect the optional keyword arguments that `self.scheduler.step` accepts.

        Not all schedulers share the same `step` signature, so `eta` and `generator` are only
        forwarded when the signature lists them. eta (η) corresponds to η in the DDIM paper
        (https://arxiv.org/abs/2010.02502) and should be between [0, 1].

        Args:
            generator: Value forwarded as `generator` when supported.
            eta: Value forwarded as `eta` when supported.

        Returns:
            A dict containing `"eta"` and/or `"generator"`, or an empty dict.
        """
        step_parameters = set(inspect.signature(self.scheduler.step).parameters.keys())

        extra_step_kwargs = {}
        if "eta" in step_parameters:
            extra_step_kwargs["eta"] = eta
        if "generator" in step_parameters:
            extra_step_kwargs["generator"] = generator
        return extra_step_kwargs
    
        # 定义一个方法，检查输入参数的有效性
        def check_inputs(
            self,
            prompt,  # 输入的提示文本
            audio_length_in_s,  # 音频长度（以秒为单位）
            vocoder_upsample_factor,  # 声码器上采样因子
            callback_steps,  # 回调步骤
            negative_prompt=None,  # 可选的负向提示文本
            prompt_embeds=None,  # 可选的提示嵌入
            negative_prompt_embeds=None,  # 可选的负向提示嵌入
    ):
        # 计算最小音频长度，以秒为单位，基于重采样因子和 VAE 缩放因子
        min_audio_length_in_s = vocoder_upsample_factor * self.vae_scale_factor
        # 检查输入音频长度是否小于最小音频长度
        if audio_length_in_s < min_audio_length_in_s:
            # 抛出值错误，说明音频长度必须大于或等于最小值
            raise ValueError(
                f"`audio_length_in_s` has to be a positive value greater than or equal to {min_audio_length_in_s}, but "
                f"is {audio_length_in_s}."
            )

        # 检查频率 bins 数是否能被 VAE 缩放因子整除
        if self.vocoder.config.model_in_dim % self.vae_scale_factor != 0:
            # 抛出值错误，说明频率 bins 数必须能被 VAE 缩放因子整除
            raise ValueError(
                f"The number of frequency bins in the vocoder's log-mel spectrogram has to be divisible by the "
                f"VAE scale factor, but got {self.vocoder.config.model_in_dim} bins and a scale factor of "
                f"{self.vae_scale_factor}."
            )

        # 检查回调步骤是否为正整数
        if (callback_steps is None) or (
            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
        ):
            # 抛出值错误，说明回调步骤必须是正整数
            raise ValueError(
                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
                f" {type(callback_steps)}."
            )

        # 检查是否同时提供了 `prompt` 和 `prompt_embeds`
        if prompt is not None and prompt_embeds is not None:
            # 抛出值错误，说明只能提供一个
            raise ValueError(
                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
                " only forward one of the two."
            )
        # 检查是否两者都未提供
        elif prompt is None and prompt_embeds is None:
            # 抛出值错误，说明至少需要提供一个
            raise ValueError(
                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
            )
        # 检查 `prompt` 类型是否有效
        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
            # 抛出值错误，说明 `prompt` 必须是字符串或列表
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

        # 检查是否同时提供了 `negative_prompt` 和 `negative_prompt_embeds`
        if negative_prompt is not None and negative_prompt_embeds is not None:
            # 抛出值错误，说明只能提供一个
            raise ValueError(
                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
            )

        # 检查 `prompt_embeds` 和 `negative_prompt_embeds` 的形状是否匹配
        if prompt_embeds is not None and negative_prompt_embeds is not None:
            if prompt_embeds.shape != negative_prompt_embeds.shape:
                # 抛出值错误，说明两者形状必须相同
                raise ValueError(
                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                    f" {negative_prompt_embeds.shape}."
                )

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents with width->self.vocoder.config.model_in_dim
    def prepare_latents(self, batch_size, num_channels_latents, height, dtype, device, generator, latents=None):
        """Create (or reuse) the initial latents and scale them for the scheduler.

        Args:
            batch_size: First dimension of the latent shape.
            num_channels_latents: Second dimension of the latent shape.
            height: Divided by `self.vae_scale_factor` to obtain the third dimension.
            dtype: Dtype of newly generated latents.
            device: Device for new latents, or the device given latents are moved to.
            generator: A generator or a list of generators; a list must have `batch_size` entries.
            latents: Optional existing latents; when given, no random latents are generated.

        Returns:
            The latents multiplied by `self.scheduler.init_noise_sigma`.

        Raises:
            ValueError: If `generator` is a list whose length differs from `batch_size`.
        """
        # The last dimension comes from the vocoder's `model_in_dim` instead of an image width.
        shape = (
            batch_size,
            num_channels_latents,
            int(height) // self.vae_scale_factor,
            int(self.vocoder.config.model_in_dim) // self.vae_scale_factor,
        )
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        if latents is None:
            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        else:
            latents = latents.to(device)

        # scale the initial noise by the standard deviation required by the scheduler
        latents = latents * self.scheduler.init_noise_sigma
        return latents
    
        # Run without gradient tracking
        # NOTE(review): in this listing the signature has no `self` parameter and stops after `output_type`;
        # the closing of the signature and the method body are not shown here.
        @torch.no_grad()
        # Insert the example docstring
        @replace_example_docstring(EXAMPLE_DOC_STRING)
        def __call__(
            # Prompt: a string or a list of strings
            prompt: Union[str, List[str]] = None,
            # Audio length in seconds
            audio_length_in_s: Optional[float] = None,
            # Number of inference steps
            num_inference_steps: int = 10,
            # Guidance scale
            guidance_scale: float = 2.5,
            # Negative prompt: a string or a list of strings
            negative_prompt: Optional[Union[str, List[str]]] = None,
            # Number of waveforms generated per prompt
            num_waveforms_per_prompt: Optional[int] = 1,
            # eta parameter
            eta: float = 0.0,
            # Random generator; a single generator or a list of generators
            generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
            # Latents
            latents: Optional[torch.Tensor] = None,
            # Prompt embeddings
            prompt_embeds: Optional[torch.Tensor] = None,
            # Negative prompt embeddings
            negative_prompt_embeds: Optional[torch.Tensor] = None,
            # Whether to return a dict-style result
            return_dict: bool = True,
            # Callback; per its annotation it receives two ints and a tensor
            callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
            # Callback frequency in steps
            callback_steps: Optional[int] = 1,
            # Keyword arguments for cross attention
            cross_attention_kwargs: Optional[Dict[str, Any]] = None,
            # Output type, "np" by default
            output_type: Optional[str] = "np",

`.\diffusers\pipelines\audioldm\__init__.py`

# `TYPE_CHECKING` selects the eager-import branch further down
from typing import TYPE_CHECKING

# Flags and helpers from the shared utils package
from ...utils import (
    DIFFUSERS_SLOW_IMPORT,  # flag that also selects the eager-import branch
    OptionalDependencyNotAvailable,  # raised below when the dependency check fails
    _LazyModule,  # module wrapper installed into `sys.modules` below
    is_torch_available,  # availability check for PyTorch
    is_transformers_available,  # availability check for Transformers
    is_transformers_version,  # version comparison for Transformers
)

# Placeholder objects that are attached to the lazy module at the end of this file
_dummy_objects = {}
# Maps a submodule name to the list of public names it provides
_import_structure = {}

try:
    # Require Transformers and Torch to be available, with Transformers >= 4.27.0
    if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")):
        # Hand control to the `except` branch below
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    # Dependencies are not satisfied: import the placeholder class instead
    from ...utils.dummy_torch_and_transformers_objects import (
        AudioLDMPipeline,  # placeholder pipeline class
    )

    # Register the placeholder so it can be exposed on the module later
    _dummy_objects.update({"AudioLDMPipeline": AudioLDMPipeline})
else:
    # Dependencies are satisfied: record where the real class lives
    _import_structure["pipeline_audioldm"] = ["AudioLDMPipeline"]

# Eager imports when type checking or when slow import is requested
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    try:
        # Same dependency check as above
        if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")):
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        # Fall back to the placeholder class
        from ...utils.dummy_torch_and_transformers_objects import (
            AudioLDMPipeline,  # placeholder pipeline class
        )

    else:
        # Import the real class directly
        from .pipeline_audioldm import AudioLDMPipeline
else:
    # Otherwise defer the imports through a lazy module
    import sys

    # Replace this module's entry in `sys.modules` with the lazy wrapper
    sys.modules[__name__] = _LazyModule(
        __name__,
        globals()["__file__"],
        _import_structure,  # submodule name -> exported names
        module_spec=__spec__,
    )

    # Attach the placeholder objects to the (now replaced) module entry
    for name, value in _dummy_objects.items():
        setattr(sys.modules[__name__], name, value)

`.\diffusers\pipelines\audioldm2\modeling_audioldm2.py`

# 版权信息，说明此文件的版权归 HuggingFace 团队所有
# 
# 根据 Apache License 2.0（“许可证”）许可；
# 你只能在遵循许可证的情况下使用此文件。
# 你可以在以下地址获取许可证副本：
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# 除非适用法律或书面协议另有规定，软件
# 根据许可证分发是“按原样”提供的，
# 不附带任何形式的保证或条件，无论是明示还是暗示的。
# 请参阅许可证了解特定语言的权限和
# 限制。

# 从 dataclasses 模块导入 dataclass 装饰器，用于简化类的创建
from dataclasses import dataclass
# 从 typing 模块导入类型注释工具
from typing import Any, Dict, List, Optional, Tuple, Union

# 导入 PyTorch 库
import torch
# 导入 nn 模块以构建神经网络
import torch.nn as nn
# 导入 checkpoint 功能以支持模型检查点
import torch.utils.checkpoint

# 从配置工具模块导入 ConfigMixin 和 register_to_config
from ...configuration_utils import ConfigMixin, register_to_config
# 从加载器模块导入 UNet2DConditionLoadersMixin
from ...loaders import UNet2DConditionLoadersMixin
# 从激活函数模块导入 get_activation 函数
from ...models.activations import get_activation
# 从注意力处理器模块导入各种注意力处理器
from ...models.attention_processor import (
    ADDED_KV_ATTENTION_PROCESSORS,  # 额外键值注意力处理器
    CROSS_ATTENTION_PROCESSORS,      # 交叉注意力处理器
    AttentionProcessor,               # 注意力处理器基类
    AttnAddedKVProcessor,            # 额外键值注意力处理器
    AttnProcessor,                   # 注意力处理器
)
# 从嵌入模块导入时间步嵌入和时间步类
from ...models.embeddings import (
    TimestepEmbedding,  # 时间步嵌入
    Timesteps,         # 时间步类
)
# 从建模工具模块导入 ModelMixin
from ...models.modeling_utils import ModelMixin
# 从 ResNet 模块导入下采样、ResNet 块和上采样类
from ...models.resnet import Downsample2D, ResnetBlock2D, Upsample2D
# 从 2D 转换器模块导入 Transformer2DModel
from ...models.transformers.transformer_2d import Transformer2DModel
# 从 UNet 2D 块模块导入下块和上块类
from ...models.unets.unet_2d_blocks import DownBlock2D, UpBlock2D
# 从 UNet 2D 条件模块导入 UNet2DConditionOutput
from ...models.unets.unet_2d_condition import UNet2DConditionOutput
# 从工具模块导入 BaseOutput、is_torch_version 和 logging
from ...utils import BaseOutput, is_torch_version, logging

# 创建一个日志记录器，使用当前模块的名称
logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

# Wrap a sequence of hidden states (and its mask, if any) with SOS / EOS tokens
def add_special_tokens(hidden_states, attention_mask, sos_token, eos_token):
    """Prepend `sos_token` and append `eos_token` along dim 1 of `hidden_states`.

    Both tokens are expanded to `(batch_size, 1, -1)`, where `batch_size` is
    `hidden_states.shape[0]`. When `attention_mask` is given it is widened by one
    column of ones on each side so the new tokens are attended to; when it is
    `None`, `None` is returned unchanged.

    Returns:
        A `(hidden_states, attention_mask)` tuple with the extended tensors.
    """
    num_sequences = hidden_states.shape[0]

    # One copy of each special token per sequence in the batch
    leading = sos_token.expand(num_sequences, 1, -1)
    trailing = eos_token.expand(num_sequences, 1, -1)
    extended_states = torch.concat([leading, hidden_states, trailing], dim=1)

    if attention_mask is None:
        return extended_states, attention_mask

    # `new_ones` keeps the dtype/device of the incoming mask
    ones_column = attention_mask.new_ones((num_sequences, 1))
    extended_mask = torch.concat([ones_column, attention_mask, ones_column], dim=-1)
    return extended_states, extended_mask

# Output container for the AudioLDM2 projection model
@dataclass
class AudioLDM2ProjectionModelOutput(BaseOutput):
    """
    Class for AudioLDM2 projection layer's outputs.

    Args:
        hidden_states (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden states for each of the text encoders.
        attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices.
    """

    # The docstring above is now properly terminated so the two fields below are
    # real dataclass fields instead of text inside an unterminated string.
    hidden_states: torch.Tensor
    attention_mask: Optional[torch.LongTensor] = None


# Projection model that maps both text-encoder outputs into the language-model dimension
class AudioLDM2ProjectionModel(ModelMixin, ConfigMixin):
    """
    A simple linear projection model that maps two text embeddings to a shared latent space. It also inserts
    learned embedding vectors at the start and end of each text embedding sequence. Every name ending in `_1`
    belongs to the second text encoder; the others belong to the first.

    Args:
        text_encoder_dim (`int`):
            Dimensionality of the text embeddings from the first text encoder (CLAP).
        text_encoder_1_dim (`int`):
            Dimensionality of the text embeddings from the second text encoder (T5 or VITS).
        langauge_model_dim (`int`):
            Dimensionality of the text embeddings from the language model (GPT2). The misspelled parameter
            name is kept because it is part of the registered config interface.
        use_learned_position_embedding (*optional*):
            When not `None`, a learnable positional embedding of shape
            `(1, text_encoder_1_dim, max_seq_length)` is created and added to `hidden_states_1`.
        max_seq_length (`int`, *optional*):
            Last dimension of the learnable positional embedding; only read when
            `use_learned_position_embedding` is not `None`.
    """

    @register_to_config
    def __init__(
        self,
        text_encoder_dim,
        text_encoder_1_dim,
        langauge_model_dim,
        use_learned_position_embedding=None,
        max_seq_length=None,
    ):
        super().__init__()
        # One linear projection per text encoder, both targeting the language-model dimension
        self.projection = nn.Linear(text_encoder_dim, langauge_model_dim)
        self.projection_1 = nn.Linear(text_encoder_1_dim, langauge_model_dim)

        # Learnable SOS / EOS embeddings for the first text encoder
        self.sos_embed = nn.Parameter(torch.ones(langauge_model_dim))
        self.eos_embed = nn.Parameter(torch.ones(langauge_model_dim))

        # Learnable SOS / EOS embeddings for the second text encoder
        self.sos_embed_1 = nn.Parameter(torch.ones(langauge_model_dim))
        self.eos_embed_1 = nn.Parameter(torch.ones(langauge_model_dim))

        self.use_learned_position_embedding = use_learned_position_embedding

        # Learnable positional embedding for the second encoder (VITS), zero-initialised
        if self.use_learned_position_embedding is not None:
            self.learnable_positional_embedding = torch.nn.Parameter(
                torch.zeros((1, text_encoder_1_dim, max_seq_length))
            )

    def forward(
        self,
        hidden_states: Optional[torch.Tensor] = None,
        hidden_states_1: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        attention_mask_1: Optional[torch.LongTensor] = None,
    ):
        """
        Project both hidden-state sequences, wrap each in SOS / EOS tokens and concatenate them along dim 1.

        Returns:
            `AudioLDM2ProjectionModelOutput` holding the concatenated hidden states and the concatenated
            attention mask. The mask is `None` only when both input masks are `None`; if exactly one is
            given, the missing one is replaced by an all-ones mask.
        """
        hidden_states = self.projection(hidden_states)
        hidden_states, attention_mask = add_special_tokens(
            hidden_states, attention_mask, sos_token=self.sos_embed, eos_token=self.eos_embed
        )

        # Add the learned positional embedding to the second encoder's states.
        # NOTE(review): the permutes assume `hidden_states_1` is (batch, seq, dim) - TODO confirm.
        if self.use_learned_position_embedding is not None:
            hidden_states_1 = (hidden_states_1.permute(0, 2, 1) + self.learnable_positional_embedding).permute(0, 2, 1)

        hidden_states_1 = self.projection_1(hidden_states_1)
        hidden_states_1, attention_mask_1 = add_special_tokens(
            hidden_states_1, attention_mask_1, sos_token=self.sos_embed_1, eos_token=self.eos_embed_1
        )

        # Build a missing mask BEFORE concatenating the hidden states, so that each mask matches the
        # (batch, sequence) shape of its own sequence. `new_ones` needs a size, hence `.shape[:2]`
        # rather than a slice of the tensor itself.
        if attention_mask is None and attention_mask_1 is not None:
            attention_mask = attention_mask_1.new_ones(hidden_states.shape[:2])
        elif attention_mask is not None and attention_mask_1 is None:
            attention_mask_1 = attention_mask.new_ones(hidden_states_1.shape[:2])

        # Concatenate the two projected text encodings along the sequence dimension
        hidden_states = torch.cat([hidden_states, hidden_states_1], dim=1)

        # Concatenate the masks the same way; with no masks at all the output mask stays `None`
        if attention_mask is not None and attention_mask_1 is not None:
            attention_mask = torch.cat([attention_mask, attention_mask_1], dim=-1)
        else:
            attention_mask = None

        return AudioLDM2ProjectionModelOutput(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
        )


# 定义一个条件 2D UNet 模型，继承自多个混入类
class AudioLDM2UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
    r"""
    一个条件 2D UNet 模型，它接收一个带噪声的样本、条件状态和时间步，并返回一个样本
    形状的输出。与普通的 [`UNet2DConditionModel`] 相比，此变体可选择性地在每个 Transformer 块中包含额外的
    自注意力层，以及多个交叉注意力层。它还允许最多使用两个交叉注意力嵌入，即 `encoder_hidden_states` 和 `encoder_hidden_states_1`。

    此模型继承自 [`ModelMixin`]。请查看父类文档以获取为所有模型实现的通用方法
    （如下载或保存）。
    """

    # 支持梯度检查点
    _supports_gradient_checkpointing = True

    # 将初始化方法注册到配置中
    @register_to_config
    def __init__(
        # 样本大小的可选参数
        sample_size: Optional[int] = None,
        # 输入通道数，默认为 4
        in_channels: int = 4,
        # 输出通道数，默认为 4
        out_channels: int = 4,
        # 控制正弦和余弦的翻转，默认为 True
        flip_sin_to_cos: bool = True,
        # 频移量，默认为 0
        freq_shift: int = 0,
        # 各层块类型的元组，指定了下采样块的类型
        down_block_types: Tuple[str] = (
            "CrossAttnDownBlock2D",
            "CrossAttnDownBlock2D",
            "CrossAttnDownBlock2D",
            "DownBlock2D",
        ),
        # 中间块的类型，默认为 "UNetMidBlock2DCrossAttn"
        mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
        # 各层块类型的元组，指定了上采样块的类型
        up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
        # 仅使用交叉注意力的布尔值或元组，默认为 False
        only_cross_attention: Union[bool, Tuple[bool]] = False,
        # 各块的输出通道数的元组
        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
        # 每个块的层数，默认为 2
        layers_per_block: Union[int, Tuple[int]] = 2,
        # 下采样填充，默认为 1
        downsample_padding: int = 1,
        # 中间块的缩放因子，默认为 1
        mid_block_scale_factor: float = 1,
        # 激活函数类型，默认为 "silu"
        act_fn: str = "silu",
        # 规范化的组数，默认为 32
        norm_num_groups: Optional[int] = 32,
        # 规范化的 epsilon 值，默认为 1e-5
        norm_eps: float = 1e-5,
        # 交叉注意力的维度，可以是整数或元组，默认为 1280
        cross_attention_dim: Union[int, Tuple[int]] = 1280,
        # 每个块的 Transformer 层数，可以是整数或元组，默认为 1
        transformer_layers_per_block: Union[int, Tuple[int]] = 1,
        # 注意力头的维度，可以是整数或元组，默认为 8
        attention_head_dim: Union[int, Tuple[int]] = 8,
        # 注意力头的数量，可以是整数或元组，默认为 None
        num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
        # 使用线性投影的布尔值，默认为 False
        use_linear_projection: bool = False,
        # 类嵌入类型的可选字符串
        class_embed_type: Optional[str] = None,
        # 类嵌入数量的可选整数
        num_class_embeds: Optional[int] = None,
        # 上溢注意力的布尔值，默认为 False
        upcast_attention: bool = False,
        # ResNet 时间缩放偏移，默认为 "default"
        resnet_time_scale_shift: str = "default",
        # 时间嵌入类型，默认为 "positional"
        time_embedding_type: str = "positional",
        # 时间嵌入维度的可选整数
        time_embedding_dim: Optional[int] = None,
        # 时间嵌入激活函数的可选字符串
        time_embedding_act_fn: Optional[str] = None,
        # 时间步后激活的可选字符串
        timestep_post_act: Optional[str] = None,
        # 时间条件投影维度的可选整数
        time_cond_proj_dim: Optional[int] = None,
        # 输入卷积核大小，默认为 3
        conv_in_kernel: int = 3,
        # 输出卷积核大小，默认为 3
        conv_out_kernel: int = 3,
        # 投影类嵌入输入维度的可选整数
        projection_class_embeddings_input_dim: Optional[int] = None,
        # 类嵌入是否连接的布尔值，默认为 False
        class_embeddings_concat: bool = False,
    @property
    # Behaves like diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
    def attn_processors(self) -> Dict[str, AttentionProcessor]:
        r"""
        Collect every attention processor found in the module tree.

        Returns:
            `dict` of attention processors: maps `"<module path>.processor"` to the value returned by that
            module's `get_processor()`, for each descendant module that has a `get_processor` attribute.
        """
        collected = {}

        # Explicit stack instead of recursion; children are pushed in reverse so that modules are
        # visited in the same pre-order (parent first, then children in declaration order).
        pending = list(self.named_children())
        pending.reverse()

        while pending:
            path, module = pending.pop()

            if hasattr(module, "get_processor"):
                collected[f"{path}.processor"] = module.get_processor()

            children = [(f"{path}.{child_name}", child) for child_name, child in module.named_children()]
            pending.extend(reversed(children))

        return collected

    
    # Behaves like diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
        r"""
        Sets the attention processor to use to compute attention.

        Parameters:
            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                Either a single processor that is set on every module exposing `set_processor`, or a dict
                keyed by `"<module path>.processor"`. Dict entries are popped as they are consumed, so the
                passed dict is mutated.

        Raises:
            ValueError: if a dict is passed whose length differs from the number of entries in
                `self.attn_processors`.
        """
        expected_count = len(self.attn_processors)
        is_mapping = isinstance(processor, dict)

        if is_mapping and len(processor) != expected_count:
            raise ValueError(
                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
                f" number of attention layers: {expected_count}. Please make sure to pass {expected_count} processor classes."
            )

        # Pre-order walk over all descendants using an explicit stack (children pushed in reverse)
        pending = list(self.named_children())
        pending.reverse()

        while pending:
            path, module = pending.pop()

            if hasattr(module, "set_processor"):
                chosen = processor.pop(f"{path}.processor") if is_mapping else processor
                module.set_processor(chosen)

            children = [(f"{path}.{child_name}", child) for child_name, child in module.named_children()]
            pending.extend(reversed(children))

    
    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
    def set_default_attn_processor(self):
        """
        Disables custom attention processors and sets the default attention implementation.

        Raises:
            ValueError: if the current processors are neither all members of
                `ADDED_KV_ATTENTION_PROCESSORS` nor all members of `CROSS_ATTENTION_PROCESSORS`.
        """
        # All processors are added-KV processors -> use the added-KV default
        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
            processor = AttnAddedKVProcessor()
        # All processors are cross-attention processors -> use the plain default
        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
            processor = AttnProcessor()
        else:
            raise ValueError(
                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
            )

        self.set_attn_processor(processor)

    # NOTE(review): a "Copied from ...UNet2DConditionModel.set_attention_slice" marker appeared here
    # without any method body; confirm whether `set_attention_slice` should be restored.

    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel._set_gradient_checkpointing
    # This method was previously indented one level too deep, which made it a dead nested function
    # inside `set_default_attn_processor`; it is now defined at method level.
    def _set_gradient_checkpointing(self, module, value=False):
        """Set `module.gradient_checkpointing` to `value` if the module has that attribute."""
        if hasattr(module, "gradient_checkpointing"):
            module.gradient_checkpointing = value

    
        # 定义前向传播方法，接受多个参数
        def forward(
            self,
            sample: torch.Tensor,
            timestep: Union[torch.Tensor, float, int],
            encoder_hidden_states: torch.Tensor,
            class_labels: Optional[torch.Tensor] = None,
            timestep_cond: Optional[torch.Tensor] = None,
            attention_mask: Optional[torch.Tensor] = None,
            cross_attention_kwargs: Optional[Dict[str, Any]] = None,
            encoder_attention_mask: Optional[torch.Tensor] = None,
            return_dict: bool = True,
            encoder_hidden_states_1: Optional[torch.Tensor] = None,
            encoder_attention_mask_1: Optional[torch.Tensor] = None,
# Factory that builds a down-sampling block from its type name
def get_down_block(
    down_block_type,
    num_layers,
    in_channels,
    out_channels,
    temb_channels,
    add_downsample,
    resnet_eps,
    resnet_act_fn,
    transformer_layers_per_block=1,
    num_attention_heads=None,
    resnet_groups=None,
    cross_attention_dim=None,
    downsample_padding=None,
    use_linear_projection=False,
    only_cross_attention=False,
    upcast_attention=False,
    resnet_time_scale_shift="default",
):
    """Instantiate the down block named by `down_block_type`.

    A leading `"UNetRes"` prefix on the type name is stripped before matching. Supported names are
    `"DownBlock2D"` and `"CrossAttnDownBlock2D"`; the attention-related arguments are only forwarded
    to the latter.

    Raises:
        ValueError: if `"CrossAttnDownBlock2D"` is requested with `cross_attention_dim=None`, or if
            the type name is not one of the supported names.
    """
    prefix = "UNetRes"
    if down_block_type.startswith(prefix):
        down_block_type = down_block_type[len(prefix) :]

    # Keyword arguments accepted by both block types
    shared_kwargs = {
        "num_layers": num_layers,
        "in_channels": in_channels,
        "out_channels": out_channels,
        "temb_channels": temb_channels,
        "add_downsample": add_downsample,
        "resnet_eps": resnet_eps,
        "resnet_act_fn": resnet_act_fn,
        "resnet_groups": resnet_groups,
        "downsample_padding": downsample_padding,
        "resnet_time_scale_shift": resnet_time_scale_shift,
    }

    if down_block_type == "DownBlock2D":
        return DownBlock2D(**shared_kwargs)

    if down_block_type == "CrossAttnDownBlock2D":
        if cross_attention_dim is None:
            raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock2D")
        return CrossAttnDownBlock2D(
            transformer_layers_per_block=transformer_layers_per_block,
            cross_attention_dim=cross_attention_dim,
            num_attention_heads=num_attention_heads,
            use_linear_projection=use_linear_projection,
            only_cross_attention=only_cross_attention,
            upcast_attention=upcast_attention,
            **shared_kwargs,
        )

    raise ValueError(f"{down_block_type} does not exist.")


# Factory that builds an up-sampling block from its type name
def get_up_block(
    up_block_type,
    num_layers,
    in_channels,
    out_channels,
    prev_output_channel,
    temb_channels,
    add_upsample,
    resnet_eps,
    resnet_act_fn,
    transformer_layers_per_block=1,
    num_attention_heads=None,
    resnet_groups=None,
    cross_attention_dim=None,
    use_linear_projection=False,
    only_cross_attention=False,
    upcast_attention=False,
    resnet_time_scale_shift="default",
):
    """Instantiate the up block named by `up_block_type`.

    A leading `"UNetRes"` prefix on the type name is stripped before matching. Supported names are
    `"UpBlock2D"` and `"CrossAttnUpBlock2D"`; the attention-related arguments are only forwarded to
    the latter.

    Raises:
        ValueError: if `"CrossAttnUpBlock2D"` is requested with `cross_attention_dim=None`, or if the
            type name is not one of the supported names.
    """
    # Drop the "UNetRes" prefix (7 characters) when present
    up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type

    # The dispatch below was previously indented one level deeper than the statement above, which
    # is an IndentationError; it now sits at function-body level.
    if up_block_type == "UpBlock2D":
        return UpBlock2D(
            num_layers=num_layers,
            in_channels=in_channels,
            out_channels=out_channels,
            prev_output_channel=prev_output_channel,
            temb_channels=temb_channels,
            add_upsample=add_upsample,
            resnet_eps=resnet_eps,
            resnet_act_fn=resnet_act_fn,
            resnet_groups=resnet_groups,
            resnet_time_scale_shift=resnet_time_scale_shift,
        )
    elif up_block_type == "CrossAttnUpBlock2D":
        # Validate before constructing so that the caller gets a clear error
        if cross_attention_dim is None:
            raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock2D")
        return CrossAttnUpBlock2D(
            num_layers=num_layers,
            transformer_layers_per_block=transformer_layers_per_block,
            in_channels=in_channels,
            out_channels=out_channels,
            prev_output_channel=prev_output_channel,
            temb_channels=temb_channels,
            add_upsample=add_upsample,
            resnet_eps=resnet_eps,
            resnet_act_fn=resnet_act_fn,
            resnet_groups=resnet_groups,
            cross_attention_dim=cross_attention_dim,
            num_attention_heads=num_attention_heads,
            use_linear_projection=use_linear_projection,
            only_cross_attention=only_cross_attention,
            upcast_attention=upcast_attention,
            resnet_time_scale_shift=resnet_time_scale_shift,
        )
    # No supported type matched
    raise ValueError(f"{up_block_type} does not exist.")


# 定义一个二维交叉注意力下采样块，继承自 nn.Module
class CrossAttnDownBlock2D(nn.Module):
    # Build the ResNet / transformer stack and the optional downsampler
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        temb_channels: int,
        dropout: float = 0.0,
        num_layers: int = 1,
        transformer_layers_per_block: int = 1,
        resnet_eps: float = 1e-6,
        resnet_time_scale_shift: str = "default",
        resnet_act_fn: str = "swish",
        resnet_groups: int = 32,
        resnet_pre_norm: bool = True,
        num_attention_heads=1,
        cross_attention_dim=1280,
        output_scale_factor=1.0,
        downsample_padding=1,
        add_downsample=True,
        use_linear_projection=False,
        only_cross_attention=False,
        upcast_attention=False,
    ):
        """
        Create `num_layers` ResNet blocks, each followed by one `Transformer2DModel` per entry of
        `cross_attention_dim`, plus a single `Downsample2D` when `add_downsample` is true.

        An integer `cross_attention_dim` is wrapped into a one-element tuple. An entry of `None`
        makes the corresponding transformer use `double_self_attention=True`.

        Raises:
            ValueError: if `cross_attention_dim` is a list or tuple with more than 4 entries.
        """
        super().__init__()

        self.has_cross_attention = True
        self.num_attention_heads = num_attention_heads

        # Normalise the cross-attention dims to a sequence and enforce the upper bound
        if isinstance(cross_attention_dim, int):
            cross_attention_dim = (cross_attention_dim,)
        too_many_dims = isinstance(cross_attention_dim, (list, tuple)) and len(cross_attention_dim) > 4
        if too_many_dims:
            raise ValueError(
                "Only up to 4 cross-attention layers are supported. Ensure that the length of cross-attention "
                f"dims is less than or equal to 4. Got cross-attention dims {cross_attention_dim} of length {len(cross_attention_dim)}"
            )
        self.cross_attention_dim = cross_attention_dim

        resnet_blocks = []
        attention_blocks = []

        for layer_idx in range(num_layers):
            # Only the first ResNet block sees the incoming channel count
            layer_in_channels = in_channels if layer_idx == 0 else out_channels
            resnet_blocks.append(
                ResnetBlock2D(
                    in_channels=layer_in_channels,
                    out_channels=out_channels,
                    temb_channels=temb_channels,
                    eps=resnet_eps,
                    groups=resnet_groups,
                    dropout=dropout,
                    time_embedding_norm=resnet_time_scale_shift,
                    non_linearity=resnet_act_fn,
                    output_scale_factor=output_scale_factor,
                    pre_norm=resnet_pre_norm,
                )
            )
            # One transformer per cross-attention dim, created right after the layer's ResNet block
            for attn_dim in cross_attention_dim:
                attention_blocks.append(
                    Transformer2DModel(
                        num_attention_heads,
                        out_channels // num_attention_heads,
                        in_channels=out_channels,
                        num_layers=transformer_layers_per_block,
                        cross_attention_dim=attn_dim,
                        norm_num_groups=resnet_groups,
                        use_linear_projection=use_linear_projection,
                        only_cross_attention=only_cross_attention,
                        upcast_attention=upcast_attention,
                        double_self_attention=attn_dim is None,
                    )
                )

        # Registration order (attentions first, then resnets) is kept as in the original
        self.attentions = nn.ModuleList(attention_blocks)
        self.resnets = nn.ModuleList(resnet_blocks)

        if add_downsample:
            downsample_op = Downsample2D(
                out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
            )
            self.downsamplers = nn.ModuleList([downsample_op])
        else:
            self.downsamplers = None

        # Gradient checkpointing is off by default
        self.gradient_checkpointing = False
    # 定义一个前向传播的方法，接受多个输入参数
        def forward(
            self,
            # 输入的隐藏状态张量
            hidden_states: torch.Tensor,
            # 可选的时间嵌入张量
            temb: Optional[torch.Tensor] = None,
            # 可选的编码器隐藏状态张量
            encoder_hidden_states: Optional[torch.Tensor] = None,
            # 可选的注意力掩码张量
            attention_mask: Optional[torch.Tensor] = None,
            # 可选的交叉注意力参数字典
            cross_attention_kwargs: Optional[Dict[str, Any]] = None,
            # 可选的编码器注意力掩码张量
            encoder_attention_mask: Optional[torch.Tensor] = None,
            # 可选的第二个编码器隐藏状态张量
            encoder_hidden_states_1: Optional[torch.Tensor] = None,
            # 可选的第二个编码器注意力掩码张量
            encoder_attention_mask_1: Optional[torch.Tensor] = None,
# 定义一个继承自 nn.Module 的 UNetMidBlock2DCrossAttn 类
class UNetMidBlock2DCrossAttn(nn.Module):
    # 初始化方法，设置各类参数
    def __init__(
        self,
        in_channels: int,
        temb_channels: int,
        dropout: float = 0.0,
        num_layers: int = 1,
        transformer_layers_per_block: int = 1,
        resnet_eps: float = 1e-6,
        resnet_time_scale_shift: str = "default",
        resnet_act_fn: str = "swish",
        resnet_groups: int = 32,
        resnet_pre_norm: bool = True,
        num_attention_heads=1,
        output_scale_factor=1.0,
        cross_attention_dim=1280,
        use_linear_projection=False,
        upcast_attention=False,
    ):
        """Assemble the ResNet and transformer sub-modules of the mid block.

        The block ends up holding ``num_layers + 1`` ``ResnetBlock2D`` modules
        and ``num_layers * len(cross_attention_dim)`` ``Transformer2DModel``
        modules. Every sub-module is created with ``in_channels`` as its
        channel width.

        Args:
            in_channels: Input and output width of every ResNet block, and
                ``in_channels`` of every transformer.
            temb_channels: Forwarded to every ResNet block.
            dropout: Forwarded to every ResNet block.
            num_layers: Number of (transformers, ResNet block) rounds appended
                after the initial ResNet block.
            transformer_layers_per_block: ``num_layers`` argument of every
                transformer.
            resnet_eps: ``eps`` argument of every ResNet block.
            resnet_time_scale_shift: ``time_embedding_norm`` argument of every
                ResNet block.
            resnet_act_fn: ``non_linearity`` argument of every ResNet block.
            resnet_groups: ``groups`` of every ResNet block and
                ``norm_num_groups`` of every transformer. ``None`` selects
                ``min(in_channels // 4, 32)``.
            resnet_pre_norm: ``pre_norm`` argument of every ResNet block.
            num_attention_heads: First positional argument of every
                transformer; ``in_channels // num_attention_heads`` is passed
                as the second (presumably the per-head width -- confirm
                against ``Transformer2DModel``).
            output_scale_factor: Forwarded to every ResNet block.
            cross_attention_dim: An int is wrapped into a 1-tuple. Each entry
                gets its own transformer in every round; a ``None`` entry
                switches that transformer to ``double_self_attention=True``.
            use_linear_projection: Forwarded to every transformer.
            upcast_attention: Forwarded to every transformer.

        Raises:
            ValueError: If ``cross_attention_dim`` is a list or tuple with
                more than 4 entries.
        """
        super().__init__()

        self.has_cross_attention = True
        self.num_attention_heads = num_attention_heads

        # Fall back to a group count derived from the channel width.
        if resnet_groups is None:
            resnet_groups = min(in_channels // 4, 32)

        # Normalize a single int so the code below can always iterate.
        if isinstance(cross_attention_dim, int):
            cross_attention_dim = (cross_attention_dim,)
        too_many_dims = isinstance(cross_attention_dim, (list, tuple)) and len(cross_attention_dim) > 4
        if too_many_dims:
            raise ValueError(
                "Only up to 4 cross-attention layers are supported. Ensure that the length of cross-attention "
                f"dims is less than or equal to 4. Got cross-attention dims {cross_attention_dim} of length {len(cross_attention_dim)}"
            )
        self.cross_attention_dim = cross_attention_dim

        def build_resnet():
            # Every ResNet block in this mid block shares the same configuration.
            return ResnetBlock2D(
                in_channels=in_channels,
                out_channels=in_channels,
                temb_channels=temb_channels,
                eps=resnet_eps,
                groups=resnet_groups,
                dropout=dropout,
                time_embedding_norm=resnet_time_scale_shift,
                non_linearity=resnet_act_fn,
                output_scale_factor=output_scale_factor,
                pre_norm=resnet_pre_norm,
            )

        def build_transformer(context_dim):
            # Transformers differ only in their cross-attention width; a
            # missing width (None) enables double self-attention instead.
            return Transformer2DModel(
                num_attention_heads,
                in_channels // num_attention_heads,
                in_channels=in_channels,
                num_layers=transformer_layers_per_block,
                cross_attention_dim=context_dim,
                norm_num_groups=resnet_groups,
                use_linear_projection=use_linear_projection,
                upcast_attention=upcast_attention,
                double_self_attention=context_dim is None,
            )

        # The block always starts with one ResNet block.
        resnet_blocks = [build_resnet()]
        transformer_blocks = []

        # Construction order is kept as: all transformers of a round first,
        # then that round's ResNet block.
        for _ in range(num_layers):
            transformer_blocks.extend(build_transformer(context_dim) for context_dim in cross_attention_dim)
            resnet_blocks.append(build_resnet())

        # Register the sub-modules (attentions before resnets).
        self.attentions = nn.ModuleList(transformer_blocks)
        self.resnets = nn.ModuleList(resnet_blocks)

        # Gradient checkpointing is off by default.
        self.gradient_checkpointing = False
    # 定义前向传播函数，接收多个输入参数
        def forward(
            self,
            # 输入的隐藏状态，类型为 torch.Tensor
            hidden_states: torch.Tensor,
            # 可选的时间嵌入，类型为 torch.Tensor
            temb: Optional[torch.Tensor] = None,
            # 可选的编码器隐藏状态，类型为 torch.Tensor
            encoder_hidden_states: Optional[torch.Tensor] = None,
            # 可选的注意力掩码，类型为 torch.Tensor
            attention_mask: Optional[torch.Tensor] = None,
            # 可选的交叉注意力参数字典
            cross_attention_kwargs: Optional[Dict[str, Any]] = None,
            # 可选的编码器注意力掩码，类型为 torch.Tensor
            encoder_attention_mask: Optional[torch.Tensor] = None,
            # 可选的第二组编码器隐藏状态，类型为 torch.Tensor
            encoder_hidden_states_1: Optional[torch.Tensor] = None,
            # 可选的第二组编码器注意力掩码，类型为 torch.Tensor
            encoder_attention_mask_1: Optional[torch.Tensor] = None,
# 定义一个名为 CrossAttnUpBlock2D 的类，继承自 nn.Module
class CrossAttnUpBlock2D(nn.Module):
    # 初始化方法，定义该类的构造函数
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        prev_output_channel: int,
        temb_channels: int,
        dropout: float = 0.0,
        num_layers: int = 1,
        transformer_layers_per_block: int = 1,
        resnet_eps: float = 1e-6,
        resnet_time_scale_shift: str = "default",
        resnet_act_fn: str = "swish",
        resnet_groups: int = 32,
        resnet_pre_norm: bool = True,
        num_attention_heads=1,
        cross_attention_dim=1280,
        output_scale_factor=1.0,
        add_upsample=True,
        use_linear_projection=False,
        only_cross_attention=False,
        upcast_attention=False,
    ):
        """Assemble the ResNet, transformer and upsampling sub-modules.

        For each of the ``num_layers`` layers one ``ResnetBlock2D`` is
        created, followed by one ``Transformer2DModel`` per entry of
        ``cross_attention_dim``. An optional ``Upsample2D`` is added last.

        Args:
            in_channels: Added to the ResNet input width on the last layer
                only (the other layers add ``out_channels``). Presumably the
                width of a concatenated skip tensor -- the forward pass is
                not visible here, confirm there.
            out_channels: Output width of every ResNet block, ``in_channels``
                of every transformer, and channel count of the upsampler.
            prev_output_channel: Base input width of the first layer's ResNet
                block; later layers use ``out_channels``.
            temb_channels: Forwarded to every ResNet block.
            dropout: Forwarded to every ResNet block.
            num_layers: Number of (ResNet block, transformers) layers.
            transformer_layers_per_block: ``num_layers`` argument of every
                transformer.
            resnet_eps: ``eps`` argument of every ResNet block.
            resnet_time_scale_shift: ``time_embedding_norm`` argument of every
                ResNet block.
            resnet_act_fn: ``non_linearity`` argument of every ResNet block.
            resnet_groups: ``groups`` of every ResNet block and
                ``norm_num_groups`` of every transformer.
            resnet_pre_norm: ``pre_norm`` argument of every ResNet block.
            num_attention_heads: First positional argument of every
                transformer; ``out_channels // num_attention_heads`` is passed
                as the second (presumably the per-head width -- confirm
                against ``Transformer2DModel``).
            cross_attention_dim: An int is wrapped into a 1-tuple. Each entry
                gets its own transformer in every layer; a ``None`` entry
                switches that transformer to ``double_self_attention=True``.
            output_scale_factor: Forwarded to every ResNet block.
            add_upsample: When truthy, ``self.upsamplers`` holds a single
                ``Upsample2D``; otherwise it is ``None``.
            use_linear_projection: Forwarded to every transformer.
            only_cross_attention: Forwarded to every transformer.
            upcast_attention: Forwarded to every transformer.

        Raises:
            ValueError: If ``cross_attention_dim`` is a list or tuple with
                more than 4 entries.
        """
        super().__init__()

        resnet_blocks = []
        transformer_blocks = []

        self.has_cross_attention = True
        self.num_attention_heads = num_attention_heads

        # Normalize a single int so the code below can always iterate.
        if isinstance(cross_attention_dim, int):
            cross_attention_dim = (cross_attention_dim,)
        too_many_dims = isinstance(cross_attention_dim, (list, tuple)) and len(cross_attention_dim) > 4
        if too_many_dims:
            raise ValueError(
                "Only up to 4 cross-attention layers are supported. Ensure that the length of cross-attention "
                f"dims is less than or equal to 4. Got cross-attention dims {cross_attention_dim} of length {len(cross_attention_dim)}"
            )
        self.cross_attention_dim = cross_attention_dim

        def build_transformer(context_dim):
            # Transformers differ only in their cross-attention width; a
            # missing width (None) enables double self-attention instead.
            return Transformer2DModel(
                num_attention_heads,
                out_channels // num_attention_heads,
                in_channels=out_channels,
                num_layers=transformer_layers_per_block,
                cross_attention_dim=context_dim,
                norm_num_groups=resnet_groups,
                use_linear_projection=use_linear_projection,
                only_cross_attention=only_cross_attention,
                upcast_attention=upcast_attention,
                double_self_attention=context_dim is None,
            )

        final_layer = num_layers - 1
        for layer_idx in range(num_layers):
            # Extra width added to the ResNet input: in_channels on the last
            # layer, out_channels on all earlier ones.
            if layer_idx == final_layer:
                extra_width = in_channels
            else:
                extra_width = out_channels

            # Base input width: the previous block's output for the first
            # layer, this block's own output width afterwards.
            if layer_idx == 0:
                base_width = prev_output_channel
            else:
                base_width = out_channels

            # Construction order is kept as: the layer's ResNet block first,
            # then its transformers.
            resnet_blocks.append(
                ResnetBlock2D(
                    in_channels=base_width + extra_width,
                    out_channels=out_channels,
                    temb_channels=temb_channels,
                    eps=resnet_eps,
                    groups=resnet_groups,
                    dropout=dropout,
                    time_embedding_norm=resnet_time_scale_shift,
                    non_linearity=resnet_act_fn,
                    output_scale_factor=output_scale_factor,
                    pre_norm=resnet_pre_norm,
                )
            )
            transformer_blocks.extend(build_transformer(context_dim) for context_dim in cross_attention_dim)

        # Register the sub-modules (attentions before resnets).
        self.attentions = nn.ModuleList(transformer_blocks)
        self.resnets = nn.ModuleList(resnet_blocks)

        # The upsampler is only built when requested.
        self.upsamplers = (
            nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
            if add_upsample
            else None
        )

        # Gradient checkpointing is off by default.
        self.gradient_checkpointing = False
    # 定义一个前向传播函数
        def forward(
            # 输入参数：当前隐藏状态的张量
            self,
            hidden_states: torch.Tensor,
            # 输入参数：包含之前隐藏状态的元组
            res_hidden_states_tuple: Tuple[torch.Tensor, ...],
            # 可选参数：时间嵌入的张量
            temb: Optional[torch.Tensor] = None,
            # 可选参数：编码器的隐藏状态张量
            encoder_hidden_states: Optional[torch.Tensor] = None,
            # 可选参数：交叉注意力的关键字参数字典
            cross_attention_kwargs: Optional[Dict[str, Any]] = None,
            # 可选参数：上采样的目标大小
            upsample_size: Optional[int] = None,
            # 可选参数：注意力掩码的张量
            attention_mask: Optional[torch.Tensor] = None,
            # 可选参数：编码器的注意力掩码张量
            encoder_attention_mask: Optional[torch.Tensor] = None,
            # 可选参数：编码器隐藏状态的另一个张量
            encoder_hidden_states_1: Optional[torch.Tensor] = None,
            # 可选参数：编码器隐藏状态的注意力掩码张量
            encoder_attention_mask_1: Optional[torch.Tensor] = None,

posted @ 2024-10-22 12:36 绝不原创的飞龙 阅读(84) 评论(0) 收藏 举报

刷新页面 返回顶部

龙哥盟

掠夺·扩张·投机·博弈

diffusers-源码解析-十八-

diffusers 源码解析（十八）

`.\diffusers\pipelines\animatediff\pipeline_animatediff_sparsectrl.py`

从 diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents 复制的函数

用于受控文本到视频生成的管道

""" # 开始多行字符串注释，通常用于文档说明

Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents # 表示该函数是从指定模块复制的

Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps # 表示该函数是从指定模块复制的

定义一个名为 AnimateDiffVideoToVideoPipeline 的类，继承自多个基类

`.\diffusers\pipelines\animatediff\__init__.py`

`.\diffusers\pipelines\audioldm\pipeline_audioldm.py`

`.\diffusers\pipelines\audioldm\__init__.py`

`.\diffusers\pipelines\audioldm2\modeling_audioldm2.py`

公告

龙哥盟

掠夺·扩张·投机·博弈

diffusers-源码解析-十八-

diffusers 源码解析（十八）

.\diffusers\pipelines\animatediff\pipeline_animatediff_sparsectrl.py

从 diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents 复制的函数

用于受控文本到视频生成的管道

""" # 开始多行字符串注释，通常用于文档说明

Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents # 表示该函数是从指定模块复制的

Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps # 表示该函数是从指定模块复制的

定义一个名为 AnimateDiffVideoToVideoPipeline 的类，继承自多个基类

.\diffusers\pipelines\animatediff\__init__.py

.\diffusers\pipelines\audioldm\pipeline_audioldm.py

.\diffusers\pipelines\audioldm\__init__.py

.\diffusers\pipelines\audioldm2\modeling_audioldm2.py

公告

`.\diffusers\pipelines\animatediff\pipeline_animatediff_sparsectrl.py`

`.\diffusers\pipelines\animatediff\__init__.py`

`.\diffusers\pipelines\audioldm\pipeline_audioldm.py`

`.\diffusers\pipelines\audioldm\__init__.py`

`.\diffusers\pipelines\audioldm2\modeling_audioldm2.py`