diffusers source code walkthrough (49)
.\diffusers\pipelines\stable_diffusion_diffedit\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
DIFFUSERS_SLOW_IMPORT,
OptionalDependencyNotAvailable,
_LazyModule,
get_objects_from_module,
is_torch_available,
is_transformers_available,
)
_dummy_objects = {}
_import_structure = {}
try:
if not (is_transformers_available() and is_torch_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from ...utils import dummy_torch_and_transformers_objects
_dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
else:
_import_structure["pipeline_stable_diffusion_diffedit"] = ["StableDiffusionDiffEditPipeline"]
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
try:
if not (is_transformers_available() and is_torch_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from ...utils.dummy_torch_and_transformers_objects import *
else:
from .pipeline_stable_diffusion_diffedit import StableDiffusionDiffEditPipeline
else:
import sys
sys.modules[__name__] = _LazyModule(
__name__,
globals()["__file__"],
_import_structure,
module_spec=__spec__,
)
for name, value in _dummy_objects.items():
setattr(sys.modules[__name__], name, value)
.\diffusers\pipelines\stable_diffusion_gligen\pipeline_stable_diffusion_gligen.py
import inspect
import warnings
from typing import Any, Callable, Dict, List, Optional, Union
import PIL.Image
import torch
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
from ...image_processor import VaeImageProcessor
from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
from ...models.attention import GatedSelfAttentionDense
from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import (
USE_PEFT_BACKEND,
deprecate,
logging,
replace_example_docstring,
scale_lora_layers,
unscale_lora_layers,
)
from ...utils.torch_utils import randn_tensor
from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from ..stable_diffusion import StableDiffusionPipelineOutput
from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
logger = logging.get_logger(__name__)
EXAMPLE_DOC_STRING = """ # 定义示例文档字符串的多行字符串
Examples:
```py
>>> import torch # 导入 PyTorch 库
>>> from diffusers import StableDiffusionGLIGENPipeline # 从 diffusers 库导入 StableDiffusionGLIGENPipeline 类
>>> from diffusers.utils import load_image # 从 diffusers.utils 导入 load_image 函数
>>> # 在由边界框定义的区域插入由文本描述的对象
>>> pipe = StableDiffusionGLIGENPipeline.from_pretrained( # 从预训练模型加载 StableDiffusionGLIGENPipeline
... "masterful/gligen-1-4-inpainting-text-box", variant="fp16", torch_dtype=torch.float16 # 指定模型名称和数据类型
... )
>>> pipe = pipe.to("cuda") # 将模型移动到 GPU 上
>>> input_image = load_image( # 从指定 URL 加载输入图像
... "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/gligen/livingroom_modern.png" # 图像 URL
... )
>>> prompt = "a birthday cake" # 定义生成图像的文本提示
>>> boxes = [[0.2676, 0.6088, 0.4773, 0.7183]] # 定义边界框的位置
>>> phrases = ["a birthday cake"] # 定义要插入的对象描述
>>> images = pipe( # 调用管道生成图像
... prompt=prompt, # 传入文本提示
... gligen_phrases=phrases, # 传入要插入的描述
... gligen_inpaint_image=input_image, # 传入需要修复的图像
... gligen_boxes=boxes, # 传入边界框
... gligen_scheduled_sampling_beta=1, # 设定计划采样的 beta 值
... output_type="pil", # 输出类型设为 PIL 图像
... num_inference_steps=50, # 设定推理步骤数量
... ).images # 获取生成的图像列表
>>> images[0].save("./gligen-1-4-inpainting-text-box.jpg") # 将生成的第一张图像保存为 JPEG 文件
>>> # 生成由提示描述的图像,并在由边界框定义的区域插入由文本描述的对象
>>> pipe = StableDiffusionGLIGENPipeline.from_pretrained( # 从预训练模型加载另一个 StableDiffusionGLIGENPipeline
... "masterful/gligen-1-4-generation-text-box", variant="fp16", torch_dtype=torch.float16 # 指定新模型名称和数据类型
... )
>>> pipe = pipe.to("cuda") # 将模型移动到 GPU 上
>>> prompt = "a waterfall and a modern high speed train running through the tunnel in a beautiful forest with fall foliage" # 定义新的生成图像的文本提示
>>> boxes = [[0.1387, 0.2051, 0.4277, 0.7090], [0.4980, 0.4355, 0.8516, 0.7266]] # 定义多个边界框的位置
>>> phrases = ["a waterfall", "a modern high speed train running through the tunnel"] # 定义要插入的多个对象描述
>>> images = pipe( # 调用管道生成图像
... prompt=prompt, # 传入新的文本提示
... gligen_phrases=phrases, # 传入新的要插入的描述
... gligen_boxes=boxes, # 传入新的边界框
... gligen_scheduled_sampling_beta=1, # 设定计划采样的 beta 值
... output_type="pil", # 输出类型设为 PIL 图像
... num_inference_steps=50, # 设定推理步骤数量
... ).images # 获取生成的图像列表
>>> images[0].save("./gligen-1-4-generation-text-box.jpg") # 将生成的第一张图像保存为 JPEG 文件
```py
# Define the StableDiffusionGLIGENPipeline class, inheriting from DiffusionPipeline and StableDiffusionMixin
class StableDiffusionGLIGENPipeline(DiffusionPipeline, StableDiffusionMixin):
r"""
Pipeline for text-to-image generation using Stable Diffusion with Grounded-Language-to-Image Generation (GLIGEN).
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular device, etc.).
Args:
vae ([`AutoencoderKL`]):
Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
text_encoder ([`~transformers.CLIPTextModel`]):
Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
tokenizer ([`~transformers.CLIPTokenizer`]):
A `CLIPTokenizer` to tokenize text.
unet ([`UNet2DConditionModel`]):
A `UNet2DConditionModel` to denoise the encoded image latents.
scheduler ([`SchedulerMixin`]):
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
safety_checker ([`StableDiffusionSafetyChecker`]):
Classification module that estimates whether generated images could be considered offensive or harmful.
Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details about a model's potential harms.
feature_extractor ([`~transformers.CLIPImageProcessor`]):
A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
"""
# List of optional components: the safety checker and the feature extractor
_optional_components = ["safety_checker", "feature_extractor"]
# Order in which model components are offloaded to the CPU
model_cpu_offload_seq = "text_encoder->unet->vae"
# Components excluded from CPU offload; the safety checker is never offloaded
_exclude_from_cpu_offload = ["safety_checker"]
# Initializer; accepts the components that make up the pipeline
def __init__(
self,
# VAE model responsible for encoding and decoding images
vae: AutoencoderKL,
# Text encoder used to process the input text
text_encoder: CLIPTextModel,
# Tokenizer used to tokenize the text
tokenizer: CLIPTokenizer,
# UNet model used to denoise the image latents
unet: UNet2DConditionModel,
# Scheduler that controls the denoising process
scheduler: KarrasDiffusionSchedulers,
# Safety checker that estimates whether generated images are harmful
safety_checker: StableDiffusionSafetyChecker,
# Feature extractor that processes generated images for the safety checker
feature_extractor: CLIPImageProcessor,
# Whether a safety checker is required; defaults to True
requires_safety_checker: bool = True,
):
# Initialize the parent class
super().__init__()
# If the safety checker is disabled while one is required, log a warning
if safety_checker is None and requires_safety_checker:
logger.warning(
f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
" that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
" results in services or applications open to the public. Both the diffusers team and Hugging Face"
" strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
" it only for use-cases that involve analyzing network behavior or auditing its results. For more"
" information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
)
# A feature extractor must be defined when the safety checker is used; otherwise raise an error
if safety_checker is not None and feature_extractor is None:
raise ValueError(
"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
" checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
)
# Register the model components: VAE, text encoder, tokenizer, UNet, scheduler, safety checker, and feature extractor
self.register_modules(
vae=vae,
text_encoder=text_encoder,
tokenizer=tokenizer,
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
feature_extractor=feature_extractor,
)
# Compute the VAE scale factor
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
# Create the image processor, converting inputs to RGB
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
# Record the safety-checker requirement in the pipeline config
self.register_to_config(requires_safety_checker=requires_safety_checker)
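# Quick illustrative aside (not part of the pipeline source, values assumed for illustration): the standard
# Stable Diffusion 1.x VAE has four entries in `block_out_channels`, so the scale factor computed above is
# 2 ** (4 - 1) = 8, meaning a 512x512 image corresponds to a 64x64 latent grid.
block_out_channels = (128, 256, 512, 512)  # typical SD 1.x VAE config
vae_scale_factor = 2 ** (len(block_out_channels) - 1)  # -> 8
assert vae_scale_factor == 8 and 512 // vae_scale_factor == 64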
# Prompt-encoding helper copied from StableDiffusionPipeline
def _encode_prompt(
self,
prompt,
device,
num_images_per_prompt,
do_classifier_free_guidance,
negative_prompt=None,
prompt_embeds: Optional[torch.Tensor] = None,
negative_prompt_embeds: Optional[torch.Tensor] = None,
lora_scale: Optional[float] = None,
**kwargs,
):
# Deprecation message telling users this method will be removed and to use the new one instead
deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
# Call deprecate to record the deprecation of this method
deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
# Call encode_prompt to obtain the tuple of prompt embeddings
prompt_embeds_tuple = self.encode_prompt(
prompt=prompt, # input prompt
device=device, # target device (e.g. CPU or GPU)
num_images_per_prompt=num_images_per_prompt, # number of images to generate per prompt
do_classifier_free_guidance=do_classifier_free_guidance, # whether to use classifier-free guidance
negative_prompt=negative_prompt, # negative prompt
prompt_embeds=prompt_embeds, # prompt embeddings
negative_prompt_embeds=negative_prompt_embeds, # negative prompt embeddings
lora_scale=lora_scale, # LoRA scale
**kwargs, # additional keyword arguments
)
# Concatenate the two tensors of the tuple for backwards compatibility
prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
# Return the concatenated prompt embeddings
return prompt_embeds
# encode_prompt method copied from StableDiffusionPipeline
def encode_prompt(
self,
prompt, # input prompt
device, # target device (e.g. CPU or GPU)
num_images_per_prompt, # number of images to generate per prompt
do_classifier_free_guidance, # whether to use classifier-free guidance
negative_prompt=None, # negative prompt (optional)
prompt_embeds: Optional[torch.Tensor] = None, # prompt embeddings (optional)
negative_prompt_embeds: Optional[torch.Tensor] = None, # negative prompt embeddings (optional)
lora_scale: Optional[float] = None, # LoRA scale (optional)
clip_skip: Optional[int] = None, # number of CLIP layers to skip (optional)
# run_safety_checker method copied from StableDiffusionPipeline
def run_safety_checker(self, image, device, dtype):
# Check whether a safety checker is present
if self.safety_checker is None:
has_nsfw_concept = None # if not, the NSFW flag is None
else:
# If the input is a tensor, post-process it into PIL images
if torch.is_tensor(image):
feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
else:
# Otherwise convert the numpy input into PIL images
feature_extractor_input = self.image_processor.numpy_to_pil(image)
# Extract features and move them to the target device
safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
# Run the safety checker to test the images for NSFW content
image, has_nsfw_concept = self.safety_checker(
images=image, clip_input=safety_checker_input.pixel_values.to(dtype) # pass the images and the CLIP features
)
# Return the processed images and the NSFW flags
return image, has_nsfw_concept
# Prepare extra keyword arguments for the scheduler step
def prepare_extra_step_kwargs(self, generator, eta):
# Prepare extra kwargs for the scheduler step, since not all schedulers share the same signature
# eta (η) is only used by DDIMScheduler and is ignored by other schedulers
# eta corresponds to η in the DDIM paper: https://arxiv.org/abs/2010.02502
# it should be in the range [0, 1]
# Check whether the scheduler's step method accepts an eta argument
accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
# Dictionary holding the extra step arguments
extra_step_kwargs = {}
# If the scheduler accepts eta, add it to the extra arguments
if accepts_eta:
extra_step_kwargs["eta"] = eta
# Check whether the scheduler's step method accepts a generator argument
accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
# If the scheduler accepts a generator, add it to the extra arguments
if accepts_generator:
extra_step_kwargs["generator"] = generator
# Return the prepared extra step arguments
return extra_step_kwargs
# Validate the pipeline inputs
def check_inputs(
self,
prompt, # text prompt used for generation
height, # height of the generated image
width, # width of the generated image
callback_steps, # frequency of callback invocations
gligen_phrases, # phrases describing the objects to ground
gligen_boxes, # bounding boxes for the grounded objects
negative_prompt=None, # optional negative prompt constraining generation
prompt_embeds=None, # optional precomputed prompt embeddings
negative_prompt_embeds=None, # optional precomputed negative prompt embeddings
):
# height and width must both be divisible by 8; otherwise raise a ValueError
if height % 8 != 0 or width % 8 != 0:
raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
# callback_steps must be a positive integer; otherwise raise a ValueError
if (callback_steps is None) or (
callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
f" {type(callback_steps)}."
)
# prompt and prompt_embeds must not both be provided; otherwise raise a ValueError
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
" only forward one of the two."
)
# at least one of prompt and prompt_embeds must be provided; otherwise raise a ValueError
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
# prompt must be a string or a list; otherwise raise a ValueError
elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
# negative_prompt and negative_prompt_embeds must not both be provided; otherwise raise a ValueError
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
)
# if both prompt_embeds and negative_prompt_embeds are provided, their shapes must match
if prompt_embeds is not None and negative_prompt_embeds is not None:
if prompt_embeds.shape != negative_prompt_embeds.shape:
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
f" {negative_prompt_embeds.shape}."
)
# gligen_phrases and gligen_boxes must have the same length; otherwise raise a ValueError
if len(gligen_phrases) != len(gligen_boxes):
raise ValueError(
"length of `gligen_phrases` and `gligen_boxes` has to be same, but"
f" got: `gligen_phrases` {len(gligen_phrases)} != `gligen_boxes` {len(gligen_boxes)}"
)
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
# Prepare the latents: create random noise of the requested shape, or reuse latents passed by the caller
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
# Latent shape: batch size, channel count, and height/width divided by the VAE scale factor
shape = (
batch_size,
num_channels_latents,
int(height) // self.vae_scale_factor,
int(width) // self.vae_scale_factor,
)
# If a list of generators is passed, its length must match the batch size; otherwise raise an error
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
)
# If no latents are provided, sample new random latents
if latents is None:
latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
else:
# Otherwise move the provided latents to the target device
latents = latents.to(device)
# Scale the initial noise by the standard deviation required by the scheduler
latents = latents * self.scheduler.init_noise_sigma
# Return the prepared latents
return latents
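# Quick illustrative aside (not part of the pipeline source): with the default vae_scale_factor of 8,
# prepare_latents turns a requested 512x512 image into a (batch, 4, 64, 64) noise tensor, sampled with the
# same randn_tensor helper used above.
import torch
from diffusers.utils.torch_utils import randn_tensor
demo_latents = randn_tensor((1, 4, 512 // 8, 512 // 8), generator=torch.Generator().manual_seed(0))
print(demo_latents.shape)  # torch.Size([1, 4, 64, 64])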
# Enable or disable the fused (gated) self-attention modules
def enable_fuser(self, enabled=True):
# Iterate over the UNet modules
for module in self.unet.modules():
# If a module is a GatedSelfAttentionDense, set its enabled flag
if type(module) is GatedSelfAttentionDense:
module.enabled = enabled
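# Quick illustrative aside (not part of the pipeline source): `enable_fuser` toggles the GatedSelfAttentionDense
# layers that inject the grounding tokens. The denoising loop (not shown in this excerpt) presumably uses
# `gligen_scheduled_sampling_beta` to decide for how many of the early steps grounding stays enabled, roughly:
num_inference_steps = 50
gligen_scheduled_sampling_beta = 0.3
# grounding would then be active for about the first beta-fraction of steps (15 of 50 here)
num_grounding_steps = int(gligen_scheduled_sampling_beta * num_inference_steps)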
# Build an inpainting mask from the given list of boxes
def draw_inpaint_mask_from_boxes(self, boxes, size):
# Start from a mask of ones with the same size as the image
inpaint_mask = torch.ones(size[0], size[1])
# For each box, zero out the corresponding region of the mask
for box in boxes:
x0, x1 = box[0] * size[0], box[2] * size[0] # left and right box boundaries
y0, y1 = box[1] * size[1], box[3] * size[1] # top and bottom box boundaries
inpaint_mask[int(y0) : int(y1), int(x0) : int(x1)] = 0 # set the region inside the box to 0
# Return the inpainting mask
return inpaint_mask
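# Quick illustrative aside (not part of the pipeline source): a normalized box [x0, y0, x1, y1] is scaled to
# the given grid size and the covered region is zeroed, so 1 marks "keep" and 0 marks "inpaint here".
import torch

def demo_mask(boxes, size):
    mask = torch.ones(size[0], size[1])
    for box in boxes:
        x0, x1 = box[0] * size[0], box[2] * size[0]
        y0, y1 = box[1] * size[1], box[3] * size[1]
        mask[int(y0) : int(y1), int(x0) : int(x1)] = 0
    return mask

m = demo_mask([[0.25, 0.25, 0.75, 0.75]], (8, 8))  # the central 4x4 block of an 8x8 mask becomes 0
print(m.sum())  # tensor(48.) -> 64 cells minus the 16 zeroed ones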
# Crop the image to the given width and height
def crop(self, im, new_width, new_height):
# Get the current width and height of the image
width, height = im.size
# Compute the left, top, right, and bottom crop boundaries
left = (width - new_width) / 2
top = (height - new_height) / 2
right = (width + new_width) / 2
bottom = (height + new_height) / 2
# Return the cropped image
return im.crop((left, top, right, bottom))
# Center-crop the image and resize it to the target size
def target_size_center_crop(self, im, new_hw):
# Get the current width and height of the image
width, height = im.size
# If width and height differ, center-crop to a square first
if width != height:
im = self.crop(im, min(height, width), min(height, width))
# Resize to the target size using high-quality LANCZOS resampling
return im.resize((new_hw, new_hw), PIL.Image.LANCZOS)
# Decorator that disables gradient computation to save memory
@torch.no_grad()
# Substitute the example docstring
@replace_example_docstring(EXAMPLE_DOC_STRING)
# __call__ lets the pipeline instance be invoked like a function
def __call__(
self,
# Prompt: a single string or a list of strings
prompt: Union[str, List[str]] = None,
# Image height, optional
height: Optional[int] = None,
# Image width, optional
width: Optional[int] = None,
# Number of inference steps, defaults to 50
num_inference_steps: int = 50,
# Guidance scale, defaults to 7.5
guidance_scale: float = 7.5,
# GLIGEN scheduled-sampling beta, defaults to 0.3
gligen_scheduled_sampling_beta: float = 0.3,
# GLIGEN grounding phrases, optional list of strings
gligen_phrases: List[str] = None,
# GLIGEN bounding boxes, optional list of float lists
gligen_boxes: List[List[float]] = None,
# GLIGEN inpainting image, optional PIL.Image.Image
gligen_inpaint_image: Optional[PIL.Image.Image] = None,
# Negative prompt, optional string or list of strings
negative_prompt: Optional[Union[str, List[str]]] = None,
# Number of images per prompt, defaults to 1
num_images_per_prompt: Optional[int] = 1,
# Sampling eta, defaults to 0.0
eta: float = 0.0,
# Random number generator(s), optional single generator or list of generators
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
# Latent tensor, optional
latents: Optional[torch.Tensor] = None,
# Prompt embeddings, optional
prompt_embeds: Optional[torch.Tensor] = None,
# Negative prompt embeddings, optional
negative_prompt_embeds: Optional[torch.Tensor] = None,
# Output type, defaults to "pil"
output_type: Optional[str] = "pil",
# Whether to return results as a dict, defaults to True
return_dict: bool = True,
# Optional callback invoked during the denoising loop
callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
# How many steps between callback invocations, defaults to 1
callback_steps: int = 1,
# Extra cross-attention kwargs, optional
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
# Optional clip-skip value
clip_skip: Optional[int] = None,
.\diffusers\pipelines\stable_diffusion_gligen\pipeline_stable_diffusion_gligen_text_image.py
import inspect
import warnings
from typing import Any, Callable, Dict, List, Optional, Union
import PIL.Image
import torch
from transformers import (
CLIPImageProcessor,
CLIPProcessor,
CLIPTextModel,
CLIPTokenizer,
CLIPVisionModelWithProjection,
)
from ...image_processor import VaeImageProcessor
from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
from ...models.attention import GatedSelfAttentionDense
from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import USE_PEFT_BACKEND, logging, replace_example_docstring, scale_lora_layers, unscale_lora_layers
from ...utils.torch_utils import randn_tensor
from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from ..stable_diffusion import StableDiffusionPipelineOutput
from ..stable_diffusion.clip_image_project_model import CLIPImageProjection
from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
logger = logging.get_logger(__name__)
EXAMPLE_DOC_STRING = """ # 示例文档字符串,可能用于说明使用方式或示例
"""
class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline, StableDiffusionMixin):
r""" # 类的文档字符串,说明该类的功能
使用 Stable Diffusion 进行文本到图像生成的管道,结合基于语言的图像生成(GLIGEN)。
此模型继承自 [`DiffusionPipeline`]。有关库为所有管道实现的通用方法(例如下载或保存、在特定设备上运行等),请查看超类文档。
# 定义参数列表及其说明
Args:
vae ([`AutoencoderKL`]):
用于将图像编码和解码为潜在表示的变分自编码器 (VAE) 模型。
text_encoder ([`~transformers.CLIPTextModel`]):
冻结的文本编码器 ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14))。
tokenizer ([`~transformers.CLIPTokenizer`]):
用于对文本进行分词的 `CLIPTokenizer`。
processor ([`~transformers.CLIPProcessor`]):
用于处理参考图像的 `CLIPProcessor`。
image_encoder ([`transformers.CLIPVisionModelWithProjection`]):
冻结的图像编码器 ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14))。
image_project ([`CLIPImageProjection`]):
用于将图像嵌入投影到短语嵌入空间的 `CLIPImageProjection`。
unet ([`UNet2DConditionModel`]):
用于去噪编码图像潜在表示的 `UNet2DConditionModel`。
scheduler ([`SchedulerMixin`]):
与 `unet` 结合使用以去噪编码图像潜在表示的调度器。可以是
[`DDIMScheduler`], [`LMSDiscreteScheduler`] 或 [`PNDMScheduler`] 中的一个。
safety_checker ([`StableDiffusionSafetyChecker`]):
分类模块,用于评估生成的图像是否可能被视为冒犯或有害。
请参阅 [模型卡](https://huggingface.co/runwayml/stable-diffusion-v1-5) 获取有关模型潜在危害的更多细节。
feature_extractor ([`~transformers.CLIPImageProcessor`]):
用于从生成图像中提取特征的 `CLIPImageProcessor`;用于作为 `safety_checker` 的输入。
"""
model_cpu_offload_seq = "text_encoder->unet->vae"
_optional_components = ["safety_checker", "feature_extractor"]
_exclude_from_cpu_offload = ["safety_checker"]
def __init__(
self,
vae: AutoencoderKL,
text_encoder: CLIPTextModel,
tokenizer: CLIPTokenizer,
processor: CLIPProcessor,
image_encoder: CLIPVisionModelWithProjection,
image_project: CLIPImageProjection,
unet: UNet2DConditionModel,
scheduler: KarrasDiffusionSchedulers,
safety_checker: StableDiffusionSafetyChecker,
feature_extractor: CLIPImageProcessor,
requires_safety_checker: bool = True,
):
super().__init__()
if safety_checker is None and requires_safety_checker:
logger.warning(
f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
" that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
" results in services or applications open to the public. Both the diffusers team and Hugging Face"
" strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
" it only for use-cases that involve analyzing network behavior or auditing its results. For more"
" information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
)
if safety_checker is not None and feature_extractor is None:
raise ValueError(
"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
" checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
)
self.register_modules(
vae=vae,
text_encoder=text_encoder,
tokenizer=tokenizer,
image_encoder=image_encoder,
processor=processor,
image_project=image_project,
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
feature_extractor=feature_extractor,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
self.register_to_config(requires_safety_checker=requires_safety_checker)
def encode_prompt(
self,
prompt,
device,
num_images_per_prompt,
do_classifier_free_guidance,
negative_prompt=None,
prompt_embeds: Optional[torch.Tensor] = None,
negative_prompt_embeds: Optional[torch.Tensor] = None,
lora_scale: Optional[float] = None,
clip_skip: Optional[int] = None,
def run_safety_checker(self, image, device, dtype):
if self.safety_checker is None:
has_nsfw_concept = None
else:
if torch.is_tensor(image):
feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
else:
feature_extractor_input = self.image_processor.numpy_to_pil(image)
safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
image, has_nsfw_concept = self.safety_checker(
images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
)
return image, has_nsfw_concept
def prepare_extra_step_kwargs(self, generator, eta):
accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
def check_inputs(
self,
prompt,
height,
width,
callback_steps,
negative_prompt=None,
prompt_embeds=None,
negative_prompt_embeds=None,
callback_on_step_end_tensor_inputs=None,
):
if height % 8 != 0 or width % 8 != 0:
raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
f" {type(callback_steps)}."
)
if callback_on_step_end_tensor_inputs is not None and not all(
k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
):
raise ValueError(
f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
)
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
" only forward one of the two."
)
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
)
if prompt_embeds is not None and negative_prompt_embeds is not None:
if prompt_embeds.shape != negative_prompt_embeds.shape:
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
f" {negative_prompt_embeds.shape}."
)
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
shape = (
batch_size,
num_channels_latents,
int(height) // self.vae_scale_factor,
int(width) // self.vae_scale_factor,
)
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
)
if latents is None:
latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
else:
latents = latents.to(device)
latents = latents * self.scheduler.init_noise_sigma
return latents
def enable_fuser(self, enabled=True):
for module in self.unet.modules():
if type(module) is GatedSelfAttentionDense:
module.enabled = enabled
def draw_inpaint_mask_from_boxes(self, boxes, size):
"""
Create an inpainting mask based on given boxes. This function generates an inpainting mask using the provided
boxes to mark regions that need to be inpainted.
"""
inpaint_mask = torch.ones(size[0], size[1])
for box in boxes:
x0, x1 = box[0] * size[0], box[2] * size[0]
y0, y1 = box[1] * size[1], box[3] * size[1]
inpaint_mask[int(y0) : int(y1), int(x0) : int(x1)] = 0
return inpaint_mask
def crop(self, im, new_width, new_height):
"""
Crop the input image to the specified dimensions.
"""
width, height = im.size
left = (width - new_width) / 2
top = (height - new_height) / 2
right = (width + new_width) / 2
bottom = (height + new_height) / 2
return im.crop((left, top, right, bottom))
def target_size_center_crop(self, im, new_hw):
"""
Crop and resize the image to the target size while keeping the center.
"""
width, height = im.size
if width != height:
im = self.crop(im, min(height, width), min(height, width))
return im.resize((new_hw, new_hw), PIL.Image.LANCZOS)
def complete_mask(self, has_mask, max_objs, device):
mask = torch.ones(1, max_objs).type(self.text_encoder.dtype).to(device)
if has_mask is None:
return mask
if isinstance(has_mask, int):
return mask * has_mask
else:
for idx, value in enumerate(has_mask):
mask[0, idx] = value
return mask
def get_clip_feature(self, input, normalize_constant, device, is_image=False):
if is_image:
if input is None:
return None
inputs = self.processor(images=[input], return_tensors="pt").to(device)
inputs["pixel_values"] = inputs["pixel_values"].to(self.image_encoder.dtype)
outputs = self.image_encoder(**inputs)
feature = outputs.image_embeds
feature = self.image_project(feature).squeeze(0)
feature = (feature / feature.norm()) * normalize_constant
feature = feature.unsqueeze(0)
else:
if input is None:
return None
inputs = self.tokenizer(input, return_tensors="pt", padding=True).to(device)
outputs = self.text_encoder(**inputs)
feature = outputs.pooler_output
return feature
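# Quick illustrative aside (not part of the pipeline source): the image branch above L2-normalizes the
# projected embedding and rescales it to `normalize_constant` (28.7 by default via gligen_normalize_constant
# in __call__ below), presumably so its magnitude matches the un-normalized CLIP text pooler outputs.
import torch
feature = torch.randn(768)      # stand-in for a projected CLIP image embedding
normalize_constant = 28.7       # default gligen_normalize_constant
feature = (feature / feature.norm()) * normalize_constant
print(round(feature.norm().item(), 1))  # 28.7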
def get_cross_attention_kwargs_with_grounded(
self,
hidden_size,
gligen_phrases,
gligen_images,
gligen_boxes,
input_phrases_mask,
input_images_mask,
repeat_batch,
normalize_constant,
max_objs,
device,
):
"""
Prepare the cross-attention kwargs containing information about the grounded inputs (boxes, masks, image embeddings, phrase embeddings).
"""
phrases, images = gligen_phrases, gligen_images
images = [None] * len(phrases) if images is None else images
phrases = [None] * len(images) if phrases is None else phrases
boxes = torch.zeros(max_objs, 4, device=device, dtype=self.text_encoder.dtype)
masks = torch.zeros(max_objs, device=device, dtype=self.text_encoder.dtype)
phrases_masks = torch.zeros(max_objs, device=device, dtype=self.text_encoder.dtype)
image_masks = torch.zeros(max_objs, device=device, dtype=self.text_encoder.dtype)
phrases_embeddings = torch.zeros(max_objs, hidden_size, device=device, dtype=self.text_encoder.dtype)
image_embeddings = torch.zeros(max_objs, hidden_size, device=device, dtype=self.text_encoder.dtype)
text_features = []
image_features = []
for phrase, image in zip(phrases, images):
text_features.append(self.get_clip_feature(phrase, normalize_constant, device, is_image=False))
image_features.append(self.get_clip_feature(image, normalize_constant, device, is_image=True))
for idx, (box, text_feature, image_feature) in enumerate(zip(gligen_boxes, text_features, image_features)):
boxes[idx] = torch.tensor(box)
masks[idx] = 1
if text_feature is not None:
phrases_embeddings[idx] = text_feature
phrases_masks[idx] = 1
if image_feature is not None:
image_embeddings[idx] = image_feature
image_masks[idx] = 1
input_phrases_mask = self.complete_mask(input_phrases_mask, max_objs, device)
phrases_masks = phrases_masks.unsqueeze(0).repeat(repeat_batch, 1) * input_phrases_mask
input_images_mask = self.complete_mask(input_images_mask, max_objs, device)
image_masks = image_masks.unsqueeze(0).repeat(repeat_batch, 1) * input_images_mask
boxes = boxes.unsqueeze(0).repeat(repeat_batch, 1, 1)
masks = masks.unsqueeze(0).repeat(repeat_batch, 1)
phrases_embeddings = phrases_embeddings.unsqueeze(0).repeat(repeat_batch, 1, 1)
image_embeddings = image_embeddings.unsqueeze(0).repeat(repeat_batch, 1, 1)
out = {
"boxes": boxes,
"masks": masks,
"phrases_masks": phrases_masks,
"image_masks": image_masks,
"phrases_embeddings": phrases_embeddings,
"image_embeddings": image_embeddings,
}
return out
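# Quick illustrative aside (not part of the pipeline source): every entry of the returned dict is padded to
# `max_objs` slots and tiled `repeat_batch` times; slots beyond the number of provided boxes stay all-zero,
# which is how unused object slots are masked out. For example:
import torch
max_objs, repeat_batch, hidden_size = 30, 2, 768
boxes = torch.zeros(max_objs, 4).unsqueeze(0).repeat(repeat_batch, 1, 1)
phrases_embeddings = torch.zeros(max_objs, hidden_size).unsqueeze(0).repeat(repeat_batch, 1, 1)
print(boxes.shape, phrases_embeddings.shape)  # torch.Size([2, 30, 4]) torch.Size([2, 30, 768])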
def get_cross_attention_kwargs_without_grounded(self, hidden_size, repeat_batch, max_objs, device):
"""
Prepare cross-attention kwargs without any grounded-input information (boxes, masks, image embeddings, phrase embeddings are all zero tensors).
"""
boxes = torch.zeros(max_objs, 4, device=device, dtype=self.text_encoder.dtype)
masks = torch.zeros(max_objs, device=device, dtype=self.text_encoder.dtype)
phrases_masks = torch.zeros(max_objs, device=device, dtype=self.text_encoder.dtype)
image_masks = torch.zeros(max_objs, device=device, dtype=self.text_encoder.dtype)
phrases_embeddings = torch.zeros(max_objs, hidden_size, device=device, dtype=self.text_encoder.dtype)
image_embeddings = torch.zeros(max_objs, hidden_size, device=device, dtype=self.text_encoder.dtype)
out = {
"boxes": boxes.unsqueeze(0).repeat(repeat_batch, 1, 1),
"masks": masks.unsqueeze(0).repeat(repeat_batch, 1),
"phrases_masks": phrases_masks.unsqueeze(0).repeat(repeat_batch, 1),
"image_masks": image_masks.unsqueeze(0).repeat(repeat_batch, 1),
"phrases_embeddings": phrases_embeddings.unsqueeze(0).repeat(repeat_batch, 1, 1),
"image_embeddings": image_embeddings.unsqueeze(0).repeat(repeat_batch, 1, 1),
}
return out
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
self,
prompt: Union[str, List[str]] = None,
height: Optional[int] = None,
width: Optional[int] = None,
num_inference_steps: int = 50,
guidance_scale: float = 7.5,
gligen_scheduled_sampling_beta: float = 0.3,
gligen_phrases: List[str] = None,
gligen_images: List[PIL.Image.Image] = None,
input_phrases_mask: Union[int, List[int]] = None,
input_images_mask: Union[int, List[int]] = None,
gligen_boxes: List[List[float]] = None,
gligen_inpaint_image: Optional[PIL.Image.Image] = None,
negative_prompt: Optional[Union[str, List[str]]] = None,
num_images_per_prompt: Optional[int] = 1,
eta: float = 0.0,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
latents: Optional[torch.Tensor] = None,
prompt_embeds: Optional[torch.Tensor] = None,
negative_prompt_embeds: Optional[torch.Tensor] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
callback_steps: int = 1,
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
gligen_normalize_constant: float = 28.7,
clip_skip: int = None,
.\diffusers\pipelines\stable_diffusion_gligen\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
DIFFUSERS_SLOW_IMPORT,
OptionalDependencyNotAvailable,
_LazyModule,
get_objects_from_module,
is_torch_available,
is_transformers_available,
)
_dummy_objects = {}
_import_structure = {}
try:
if not (is_transformers_available() and is_torch_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from ...utils import dummy_torch_and_transformers_objects
_dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
else:
_import_structure["pipeline_stable_diffusion_gligen"] = ["StableDiffusionGLIGENPipeline"]
_import_structure["pipeline_stable_diffusion_gligen_text_image"] = ["StableDiffusionGLIGENTextImagePipeline"]
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
try:
if not (is_transformers_available() and is_torch_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from ...utils.dummy_torch_and_transformers_objects import *
else:
from .pipeline_stable_diffusion_gligen import StableDiffusionGLIGENPipeline
from .pipeline_stable_diffusion_gligen_text_image import StableDiffusionGLIGENTextImagePipeline
else:
import sys
sys.modules[__name__] = _LazyModule(
__name__,
globals()["__file__"],
_import_structure,
module_spec=__spec__,
)
for name, value in _dummy_objects.items():
setattr(sys.modules[__name__], name, value)
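# Quick illustrative aside (not part of the package source): thanks to the _LazyModule installed above,
# importing this subpackage is cheap; the heavy pipeline modules are only imported when an attribute is
# first accessed (or eagerly when type checking / DIFFUSERS_SLOW_IMPORT is active).
import diffusers.pipelines.stable_diffusion_gligen as gligen_pkg
pipeline_cls = gligen_pkg.StableDiffusionGLIGENPipeline  # this attribute access triggers the real import
print(pipeline_cls.__name__)  # StableDiffusionGLIGENPipeline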
.\diffusers\pipelines\stable_diffusion_k_diffusion\pipeline_stable_diffusion_k_diffusion.py
import importlib
import inspect
from typing import Callable, List, Optional, Union
import torch
from k_diffusion.external import CompVisDenoiser, CompVisVDenoiser
from k_diffusion.sampling import BrownianTreeNoiseSampler, get_sigmas_karras
from ...image_processor import VaeImageProcessor
from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import LMSDiscreteScheduler
from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
from ...utils.torch_utils import randn_tensor
from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from ..stable_diffusion import StableDiffusionPipelineOutput
logger = logging.get_logger(__name__)
class ModelWrapper:
def __init__(self, model, alphas_cumprod):
self.model = model
self.alphas_cumprod = alphas_cumprod
def apply_model(self, *args, **kwargs):
if len(args) == 3:
encoder_hidden_states = args[-1]
args = args[:2]
if kwargs.get("cond", None) is not None:
encoder_hidden_states = kwargs.pop("cond")
return self.model(*args, encoder_hidden_states=encoder_hidden_states, **kwargs).sample
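# Quick illustrative aside (not part of the pipeline source): ModelWrapper adapts diffusers'
# UNet2DConditionModel (which takes `encoder_hidden_states=`) to the CompVis-style
# `apply_model(x, t, cond=...)` interface that k_diffusion.external.CompVisDenoiser expects.
# A toy stand-in (hypothetical _ToyUNet) showing the call-shape translation:
import torch

class _ToyUNetOutput:
    def __init__(self, sample):
        self.sample = sample

class _ToyUNet:
    def __call__(self, x, t, encoder_hidden_states=None):
        # a real UNet2DConditionModel would denoise here; we just echo the latents
        return _ToyUNetOutput(x)

wrapped = ModelWrapper(_ToyUNet(), alphas_cumprod=torch.linspace(0.99, 0.01, 1000))
out = wrapped.apply_model(torch.randn(1, 4, 8, 8), torch.tensor([10]), cond=torch.randn(1, 77, 768))
print(out.shape)  # torch.Size([1, 4, 8, 8])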
class StableDiffusionKDiffusionPipeline(
DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin
):
r""" # 文档字符串,描述此类的功能
用于文本到图像生成的管道,使用稳定扩散模型。
该模型继承自 [`DiffusionPipeline`]. 查看超类文档以获取库为所有管道实现的通用方法(例如下载或保存、在特定设备上运行等)。
此管道还继承以下加载方法:
- [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] 用于加载文本反演嵌入
- [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] 用于加载 LoRA 权重
- [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] 用于保存 LoRA 权重
<Tip warning={true}>
这是一个实验性管道,未来可能会发生变化。
</Tip>
# 文档字符串,描述参数的含义
Args:
vae ([`AutoencoderKL`]): # 变分自编码器模型,用于对图像进行编码和解码,转换为潜在表示
Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
text_encoder ([`CLIPTextModel`]): # 冻结的文本编码器,稳定扩散使用 CLIP 的文本部分
Frozen text-encoder. Stable Diffusion uses the text portion of
[CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel),具体为
[clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) 变体。
tokenizer (`CLIPTokenizer`): # CLIP 的分词器,负责将文本转换为模型可接受的输入格式
Tokenizer of class
[CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
unet ([`UNet2DConditionModel`]): # 条件 U-Net 结构,用于对编码的图像潜在表示去噪
Conditional U-Net architecture to denoise the encoded image latents.
scheduler ([`SchedulerMixin`]): # 用于与 U-Net 结合的调度器,帮助去噪图像潜在表示
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
safety_checker ([`StableDiffusionSafetyChecker`]): # 分类模块,评估生成的图像是否可能具有攻击性或有害
Classification module that estimates whether generated images could be considered offensive or harmful.
Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
feature_extractor ([`CLIPImageProcessor`]): # 模型从生成的图像中提取特征,以作为 `safety_checker` 的输入
Model that extracts features from generated images to be used as inputs for the `safety_checker`.
"""
model_cpu_offload_seq = "text_encoder->unet->vae"
_optional_components = ["safety_checker", "feature_extractor"]
_exclude_from_cpu_offload = ["safety_checker"]
def __init__(
self,
vae,
text_encoder,
tokenizer,
unet,
scheduler,
safety_checker,
feature_extractor,
requires_safety_checker: bool = True,
):
super().__init__()
logger.info(
f"{self.__class__} is an experimntal pipeline and is likely to change in the future. We recommend to use"
" this pipeline for fast experimentation / iteration if needed, but advice to rely on existing pipelines"
" as defined in https://huggingface.co/docs/diffusers/api/schedulers#implemented-schedulers for"
" production settings."
)
scheduler = LMSDiscreteScheduler.from_config(scheduler.config)
self.register_modules(
vae=vae,
text_encoder=text_encoder,
tokenizer=tokenizer,
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
feature_extractor=feature_extractor,
)
self.register_to_config(requires_safety_checker=requires_safety_checker)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
model = ModelWrapper(unet, scheduler.alphas_cumprod)
if scheduler.config.prediction_type == "v_prediction":
self.k_diffusion_model = CompVisVDenoiser(model)
else:
self.k_diffusion_model = CompVisDenoiser(model)
def set_scheduler(self, scheduler_type: str):
library = importlib.import_module("k_diffusion")
sampling = getattr(library, "sampling")
try:
self.sampler = getattr(sampling, scheduler_type)
except Exception:
valid_samplers = []
for s in dir(sampling):
if "sample_" in s:
valid_samplers.append(s)
raise ValueError(f"Invalid scheduler type {scheduler_type}. Please choose one of {valid_samplers}.")
def _encode_prompt(
self,
prompt,
device,
num_images_per_prompt,
do_classifier_free_guidance,
negative_prompt=None,
prompt_embeds: Optional[torch.Tensor] = None,
negative_prompt_embeds: Optional[torch.Tensor] = None,
lora_scale: Optional[float] = None,
**kwargs,
):
deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
prompt_embeds_tuple = self.encode_prompt(
prompt=prompt,
device=device,
num_images_per_prompt=num_images_per_prompt,
do_classifier_free_guidance=do_classifier_free_guidance,
negative_prompt=negative_prompt,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
lora_scale=lora_scale,
**kwargs,
)
prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
return prompt_embeds
def encode_prompt(
self,
prompt,
device,
num_images_per_prompt,
do_classifier_free_guidance,
negative_prompt=None,
prompt_embeds: Optional[torch.Tensor] = None,
negative_prompt_embeds: Optional[torch.Tensor] = None,
lora_scale: Optional[float] = None,
clip_skip: Optional[int] = None,
def run_safety_checker(self, image, device, dtype):
if self.safety_checker is None:
has_nsfw_concept = None
else:
if torch.is_tensor(image):
feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
else:
feature_extractor_input = self.image_processor.numpy_to_pil(image)
safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
image, has_nsfw_concept = self.safety_checker(
images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
)
return image, has_nsfw_concept
def decode_latents(self, latents):
deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
return image
def check_inputs(
self,
prompt,
height,
width,
callback_steps,
negative_prompt=None,
prompt_embeds=None,
negative_prompt_embeds=None,
callback_on_step_end_tensor_inputs=None,
):
if height % 8 != 0 or width % 8 != 0:
raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
f" {type(callback_steps)}."
)
if callback_on_step_end_tensor_inputs is not None and not all(
k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
):
raise ValueError(
f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
)
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
" only forward one of the two."
)
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
)
if prompt_embeds is not None and negative_prompt_embeds is not None:
if prompt_embeds.shape != negative_prompt_embeds.shape:
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
f" {negative_prompt_embeds.shape}."
)
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
shape = (
batch_size,
num_channels_latents,
int(height) // self.vae_scale_factor,
int(width) // self.vae_scale_factor,
)
if latents is None:
latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
else:
if latents.shape != shape:
raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
latents = latents.to(device)
return latents
@torch.no_grad()
def __call__(
self,
prompt: Union[str, List[str]] = None,
height: Optional[int] = None,
width: Optional[int] = None,
num_inference_steps: int = 50,
guidance_scale: float = 7.5,
negative_prompt: Optional[Union[str, List[str]]] = None,
num_images_per_prompt: Optional[int] = 1,
eta: float = 0.0,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
latents: Optional[torch.Tensor] = None,
prompt_embeds: Optional[torch.Tensor] = None,
negative_prompt_embeds: Optional[torch.Tensor] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
callback_steps: int = 1,
use_karras_sigmas: Optional[bool] = False,
noise_sampler_seed: Optional[int] = None,
clip_skip: int = None,
.\diffusers\pipelines\stable_diffusion_k_diffusion\pipeline_stable_diffusion_xl_k_diffusion.py
import importlib
import inspect
from typing import List, Optional, Tuple, Union
import torch
from k_diffusion.external import CompVisDenoiser, CompVisVDenoiser
from k_diffusion.sampling import BrownianTreeNoiseSampler, get_sigmas_karras
from transformers import (
CLIPTextModel,
CLIPTextModelWithProjection,
CLIPTokenizer,
)
from ...image_processor import VaeImageProcessor
from ...loaders import (
FromSingleFileMixin,
IPAdapterMixin,
StableDiffusionXLLoraLoaderMixin,
TextualInversionLoaderMixin,
)
from ...models import AutoencoderKL, UNet2DConditionModel
from ...models.attention_processor import (
AttnProcessor2_0,
FusedAttnProcessor2_0,
XFormersAttnProcessor,
)
from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import KarrasDiffusionSchedulers, LMSDiscreteScheduler
from ...utils import (
USE_PEFT_BACKEND,
logging,
replace_example_docstring,
scale_lora_layers,
unscale_lora_layers,
)
from ...utils.torch_utils import randn_tensor
from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
logger = logging.get_logger(__name__)
EXAMPLE_DOC_STRING = """ # 示例文档字符串,演示用法
Examples:
```py
>>> import torch # 导入 PyTorch 库
>>> from diffusers import StableDiffusionXLKDiffusionPipeline # 导入扩散管道
>>> pipe = StableDiffusionXLKDiffusionPipeline.from_pretrained( # 从预训练模型加载管道
... "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 # 设置模型和数据类型
... )
>>> pipe = pipe.to("cuda") # 将管道移动到 GPU
>>> pipe.set_scheduler("sample_dpmpp_2m_sde") # 设置调度器
>>> prompt = "a photo of an astronaut riding a horse on mars" # 定义生成图像的提示
>>> image = pipe(prompt).images[0] # 生成图像并获取第一张
```py
"""
class ModelWrapper:
def __init__(self, model, alphas_cumprod):
self.model = model
self.alphas_cumprod = alphas_cumprod
def apply_model(self, *args, **kwargs):
if len(args) == 3:
encoder_hidden_states = args[-1]
args = args[:2]
if kwargs.get("cond", None) is not None:
encoder_hidden_states = kwargs.pop("cond")
return self.model(*args, encoder_hidden_states=encoder_hidden_states, **kwargs).sample
class StableDiffusionXLKDiffusionPipeline(
DiffusionPipeline,
StableDiffusionMixin,
FromSingleFileMixin,
StableDiffusionXLLoraLoaderMixin,
TextualInversionLoaderMixin,
IPAdapterMixin,
):
r"""
Pipeline for text-to-image generation using Stable Diffusion XL and k-diffusion.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
library implements for all pipelines (such as downloading or saving, running on a particular device, etc.).
The pipeline also inherits the following loading methods:
- [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
- [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
- [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
- [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
- [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP adapters
Args:
vae ([`AutoencoderKL`]):
Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
text_encoder ([`CLIPTextModel`]):
Frozen text-encoder. Stable Diffusion XL uses the text portion of
[CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
text_encoder_2 ([`CLIPTextModelWithProjection`]):
Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
[CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
specifically the
[laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
variant.
tokenizer (`CLIPTokenizer`):
Tokenizer of class
[CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
tokenizer_2 (`CLIPTokenizer`):
Second Tokenizer of class
[CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
unet ([`UNet2DConditionModel`]):
Conditional U-Net architecture to denoise the encoded image latents.
scheduler ([`SchedulerMixin`]):
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
`stabilityai/stable-diffusion-xl-base-1-0`.
"""
model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
_optional_components = [
"tokenizer",
"tokenizer_2",
"text_encoder",
"text_encoder_2",
"feature_extractor",
]
def __init__(
self,
vae: AutoencoderKL,
text_encoder: CLIPTextModel,
text_encoder_2: CLIPTextModelWithProjection,
tokenizer: CLIPTokenizer,
tokenizer_2: CLIPTokenizer,
unet: UNet2DConditionModel,
scheduler: KarrasDiffusionSchedulers,
force_zeros_for_empty_prompt: bool = True,
):
super().__init__()
scheduler = LMSDiscreteScheduler.from_config(scheduler.config)
self.register_modules(
vae=vae,
text_encoder=text_encoder,
text_encoder_2=text_encoder_2,
tokenizer=tokenizer,
tokenizer_2=tokenizer_2,
unet=unet,
scheduler=scheduler,
)
self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.default_sample_size = self.unet.config.sample_size
model = ModelWrapper(unet, scheduler.alphas_cumprod)
if scheduler.config.prediction_type == "v_prediction":
self.k_diffusion_model = CompVisVDenoiser(model)
else:
self.k_diffusion_model = CompVisDenoiser(model)
def set_scheduler(self, scheduler_type: str):
library = importlib.import_module("k_diffusion")
sampling = getattr(library, "sampling")
try:
self.sampler = getattr(sampling, scheduler_type)
except Exception:
valid_samplers = []
for s in dir(sampling):
if "sample_" in s:
valid_samplers.append(s)
raise ValueError(f"Invalid scheduler type {scheduler_type}. Please choose one of {valid_samplers}.")
def encode_prompt(
self,
prompt: str,
prompt_2: Optional[str] = None,
device: Optional[torch.device] = None,
num_images_per_prompt: int = 1,
do_classifier_free_guidance: bool = True,
negative_prompt: Optional[str] = None,
negative_prompt_2: Optional[str] = None,
prompt_embeds: Optional[torch.Tensor] = None,
negative_prompt_embeds: Optional[torch.Tensor] = None,
pooled_prompt_embeds: Optional[torch.Tensor] = None,
negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
lora_scale: Optional[float] = None,
clip_skip: Optional[int] = None,
def check_inputs(
self,
prompt,
prompt_2,
height,
width,
negative_prompt=None,
negative_prompt_2=None,
prompt_embeds=None,
negative_prompt_embeds=None,
pooled_prompt_embeds=None,
negative_pooled_prompt_embeds=None,
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
shape = (
batch_size,
num_channels_latents,
int(height) // self.vae_scale_factor,
int(width) // self.vae_scale_factor,
)
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
)
if latents is None:
latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
else:
latents = latents.to(device)
return latents
def _get_add_time_ids(
self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None
):
add_time_ids = list(original_size + crops_coords_top_left + target_size)
passed_add_embed_dim = (
self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
)
expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
if expected_add_embed_dim != passed_add_embed_dim:
raise ValueError(
f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
)
add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
return add_time_ids
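# Quick illustrative aside (not part of the pipeline source, SDXL base values assumed): `addition_time_embed_dim`
# is 256 and `text_encoder_2.config.projection_dim` is 1280, so the six micro-conditioning ids
# (original_size + crops_coords_top_left + target_size) give 6 * 256 + 1280 = 2816, which must match
# `unet.add_embedding.linear_1.in_features`.
original_size, crops_coords_top_left, target_size = (1024, 1024), (0, 0), (1024, 1024)
add_time_ids = list(original_size + crops_coords_top_left + target_size)  # [1024, 1024, 0, 0, 1024, 1024]
passed_add_embed_dim = 256 * len(add_time_ids) + 1280  # 2816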
def upcast_vae(self):
dtype = self.vae.dtype
self.vae.to(dtype=torch.float32)
use_torch_2_0_or_xformers = isinstance(
self.vae.decoder.mid_block.attentions[0].processor,
(
AttnProcessor2_0,
XFormersAttnProcessor,
FusedAttnProcessor2_0,
),
)
if use_torch_2_0_or_xformers:
self.vae.post_quant_conv.to(dtype)
self.vae.decoder.conv_in.to(dtype)
self.vae.decoder.mid_block.to(dtype)
@property
def guidance_scale(self):
return self._guidance_scale
@property
def clip_skip(self):
return self._clip_skip
@property
def do_classifier_free_guidance(self):
return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
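# Quick illustrative aside (not part of the pipeline source): `do_classifier_free_guidance` is True whenever
# guidance_scale > 1, and the denoising loop (not shown in this excerpt) then blends the unconditional and
# text-conditioned noise predictions in the usual classifier-free-guidance way:
import torch
guidance_scale = 5.0
noise_pred_uncond = torch.zeros(1, 4, 128, 128)  # stand-in for the unconditional prediction
noise_pred_text = torch.ones(1, 4, 128, 128)     # stand-in for the text-conditioned prediction
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)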
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
self,
prompt: Union[str, List[str]] = None,
prompt_2: Optional[Union[str, List[str]]] = None,
height: Optional[int] = None,
width: Optional[int] = None,
num_inference_steps: int = 50,
guidance_scale: float = 5.0,
negative_prompt: Optional[Union[str, List[str]]] = None,
negative_prompt_2: Optional[Union[str, List[str]]] = None,
num_images_per_prompt: Optional[int] = 1,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
latents: Optional[torch.Tensor] = None,
prompt_embeds: Optional[torch.Tensor] = None,
negative_prompt_embeds: Optional[torch.Tensor] = None,
pooled_prompt_embeds: Optional[torch.Tensor] = None,
negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
original_size: Optional[Tuple[int, int]] = None,
crops_coords_top_left: Tuple[int, int] = (0, 0),
target_size: Optional[Tuple[int, int]] = None,
negative_original_size: Optional[Tuple[int, int]] = None,
negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
negative_target_size: Optional[Tuple[int, int]] = None,
use_karras_sigmas: Optional[bool] = False,
noise_sampler_seed: Optional[int] = None,
clip_skip: Optional[int] = None,
.\diffusers\pipelines\stable_diffusion_k_diffusion\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
DIFFUSERS_SLOW_IMPORT,
OptionalDependencyNotAvailable,
_LazyModule,
get_objects_from_module,
is_k_diffusion_available,
is_k_diffusion_version,
is_torch_available,
is_transformers_available,
)
_dummy_objects = {}
_import_structure = {}
try:
if not (
is_transformers_available()
and is_torch_available()
and is_k_diffusion_available()
and is_k_diffusion_version(">=", "0.0.12")
):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from ...utils import dummy_torch_and_transformers_and_k_diffusion_objects
_dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_and_k_diffusion_objects))
else:
_import_structure["pipeline_stable_diffusion_k_diffusion"] = ["StableDiffusionKDiffusionPipeline"]
_import_structure["pipeline_stable_diffusion_xl_k_diffusion"] = ["StableDiffusionXLKDiffusionPipeline"]
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
try:
if not (
is_transformers_available()
and is_torch_available()
and is_k_diffusion_available()
and is_k_diffusion_version(">=", "0.0.12")
):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from ...utils.dummy_torch_and_transformers_and_k_diffusion_objects import *
else:
from .pipeline_stable_diffusion_k_diffusion import StableDiffusionKDiffusionPipeline
from .pipeline_stable_diffusion_xl_k_diffusion import StableDiffusionXLKDiffusionPipeline
else:
import sys
sys.modules[__name__] = _LazyModule(
__name__,
globals()["__file__"],
_import_structure,
module_spec=__spec__,
)
for name, value in _dummy_objects.items():
setattr(sys.modules[__name__], name, value)
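The `_LazyModule` indirection above keeps `import diffusers.pipelines.stable_diffusion_k_diffusion` cheap: the heavy pipeline modules are only imported when one of the listed names is first accessed, and if torch, transformers, or k-diffusion (>= 0.0.12) is missing, placeholder "dummy" objects are exposed instead. A small sketch of that behaviour (assuming diffusers is installed):

```python
import diffusers.pipelines.stable_diffusion_k_diffusion as sdk

# Nothing heavy has been imported yet; attribute access triggers the real import
# (or returns the dummy object if the optional dependencies are unavailable).
cls = sdk.StableDiffusionKDiffusionPipeline
print(cls.__module__)
```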
.\diffusers\pipelines\stable_diffusion_ldm3d\pipeline_stable_diffusion_ldm3d.py
import inspect
from dataclasses import dataclass
from typing import Any, Callable, Dict, List, Optional, Union
import numpy as np
import PIL.Image
import torch
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
from ...image_processor import PipelineImageInput, VaeImageProcessorLDM3D
from ...loaders import FromSingleFileMixin, IPAdapterMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import (
USE_PEFT_BACKEND,
BaseOutput,
deprecate,
logging,
replace_example_docstring,
scale_lora_layers,
unscale_lora_layers,
)
from ...utils.torch_utils import randn_tensor
from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
logger = logging.get_logger(__name__)
EXAMPLE_DOC_STRING = """ # example docstring showing how to use StableDiffusionLDM3DPipeline
Examples:
```python
>>> from diffusers import StableDiffusionLDM3DPipeline # import StableDiffusionLDM3DPipeline from diffusers
>>> pipe = StableDiffusionLDM3DPipeline.from_pretrained("Intel/ldm3d-4c") # load the pipeline from a pretrained checkpoint
>>> pipe = pipe.to("cuda") # move the pipeline to the GPU
>>> prompt = "a photo of an astronaut riding a horse on mars" # prompt describing the image to generate
>>> output = pipe(prompt) # run the pipeline on the prompt
>>> rgb_image, depth_image = output.rgb, output.depth # extract the RGB and depth images from the output
>>> rgb_image[0].save("astronaut_ldm3d_rgb.jpg") # save the RGB image
>>> depth_image[0].save("astronaut_ldm3d_depth.png") # save the depth image
```
"""
def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
"""
Rescale `noise_cfg` according to `guidance_rescale`. Based on the findings of [Common Diffusion Noise Schedules and
Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4.
"""
std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
return noise_cfg
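In the pipeline, `rescale_noise_cfg` is applied right after the classifier-free-guidance combination whenever `guidance_rescale > 0`. A minimal sketch with random tensors, assuming the `rescale_noise_cfg` defined above is in scope (the variable names are illustrative, not the pipeline's internals):

```python
import torch

noise_pred_uncond = torch.randn(2, 4, 64, 64)
noise_pred_text = torch.randn(2, 4, 64, 64)
guidance_scale = 7.5

# Standard CFG combination; it tends to inflate the per-sample std.
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# guidance_rescale interpolates between the std-rescaled and the plain CFG prediction.
noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=0.7)
print(noise_pred.shape)  # torch.Size([2, 4, 64, 64])
```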
def retrieve_timesteps(
scheduler,
num_inference_steps: Optional[int] = None,
device: Optional[Union[str, torch.device]] = None,
timesteps: Optional[List[int]] = None,
sigmas: Optional[List[float]] = None,
**kwargs,
):
"""
Calls the scheduler's `set_timesteps` method and retrieves the timesteps from the scheduler afterwards. Handles custom timesteps.
Any kwargs are passed to `scheduler.set_timesteps`.
Args:
scheduler (`SchedulerMixin`):
The scheduler to get timesteps from.
num_inference_steps (`int`):
The number of diffusion steps used when generating samples. If used, `timesteps` must be `None`.
device (`str` or `torch.device`, *optional*):
The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
timesteps (`List[int]`, *optional*):
Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
`num_inference_steps` and `sigmas` must be `None`.
sigmas (`List[float]`, *optional*):
Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
`num_inference_steps` and `timesteps` must be `None`.
Returns:
`Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the second element is the number of inference steps.
"""
if timesteps is not None and sigmas is not None:
raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
if timesteps is not None:
accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
if not accepts_timesteps:
raise ValueError(
f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
f" timestep schedules. Please check whether you are using the correct scheduler."
)
scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
timesteps = scheduler.timesteps
num_inference_steps = len(timesteps)
elif sigmas is not None:
accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
if not accept_sigmas:
raise ValueError(
f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
f" sigmas schedules. Please check whether you are using the correct scheduler."
)
scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
timesteps = scheduler.timesteps
num_inference_steps = len(timesteps)
else:
scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
timesteps = scheduler.timesteps
return timesteps, num_inference_steps
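A short usage sketch for `retrieve_timesteps`, assuming the function above is in scope and diffusers is installed; the scheduler uses its default config:

```python
from diffusers import DDIMScheduler

scheduler = DDIMScheduler()  # default 1000 training timesteps

# Default path: derive the schedule from num_inference_steps.
timesteps, num_inference_steps = retrieve_timesteps(scheduler, num_inference_steps=30, device="cpu")
print(len(timesteps), num_inference_steps)  # 30 30

# Custom `timesteps` / `sigmas` are only accepted when the scheduler's
# `set_timesteps` signature exposes those parameters; otherwise a ValueError is raised.
```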
@dataclass
class LDM3DPipelineOutput(BaseOutput):
"""
Output class for Stable Diffusion pipelines.
Args:
rgb (`List[PIL.Image.Image]` or `np.ndarray`)
List of denoised PIL images of length `batch_size` or a NumPy array of shape `(batch_size, height, width,
num_channels)`.
depth (`List[PIL.Image.Image]` or `np.ndarray`)
List of denoised PIL depth images of length `batch_size` or a NumPy array of shape `(batch_size, height, width,
num_channels)`.
nsfw_content_detected (`List[bool]`)
List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content, or `None` if safety checking could not be performed.
"""
rgb: Union[List[PIL.Image.Image], np.ndarray]
depth: Union[List[PIL.Image.Image], np.ndarray]
nsfw_content_detected: Optional[List[bool]]
class StableDiffusionLDM3DPipeline(
DiffusionPipeline,
StableDiffusionMixin,
TextualInversionLoaderMixin,
IPAdapterMixin,
StableDiffusionLoraLoaderMixin,
FromSingleFileMixin,
):
r"""
Pipeline for text-to-image and 3D generation using LDM3D.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular device, etc.).
The pipeline also inherits the following loading methods:
- [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
- [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
- [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
- [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
- [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
Args:
vae ([`AutoencoderKL`]):
Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
text_encoder ([`~transformers.CLIPTextModel`]):
Frozen text encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
tokenizer ([`~transformers.CLIPTokenizer`]):
A `CLIPTokenizer` to tokenize text.
unet ([`UNet2DConditionModel`]):
A `UNet2DConditionModel` to denoise the encoded image latents.
scheduler ([`SchedulerMixin`]):
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
safety_checker ([`StableDiffusionSafetyChecker`]):
Classification module that estimates whether generated images could be considered offensive or harmful.
Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details about a model's potential harms.
feature_extractor ([`~transformers.CLIPImageProcessor`]):
A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
"""
model_cpu_offload_seq = "text_encoder->unet->vae"
_optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
_exclude_from_cpu_offload = ["safety_checker"]
_callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
def __init__(
self,
vae: AutoencoderKL,
text_encoder: CLIPTextModel,
tokenizer: CLIPTokenizer,
unet: UNet2DConditionModel,
scheduler: KarrasDiffusionSchedulers,
safety_checker: StableDiffusionSafetyChecker,
feature_extractor: CLIPImageProcessor,
image_encoder: Optional[CLIPVisionModelWithProjection],
requires_safety_checker: bool = True,
):
super().__init__()
if safety_checker is None and requires_safety_checker:
logger.warning(
f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
" that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
" results in services or applications open to the public. Both the diffusers team and Hugging Face"
" strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
" it only for use-cases that involve analyzing network behavior or auditing its results. For more"
" information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
)
if safety_checker is not None and feature_extractor is None:
raise ValueError(
"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
" checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
)
self.register_modules(
vae=vae,
text_encoder=text_encoder,
tokenizer=tokenizer,
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
feature_extractor=feature_extractor,
image_encoder=image_encoder,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.image_processor = VaeImageProcessorLDM3D(vae_scale_factor=self.vae_scale_factor)
self.register_to_config(requires_safety_checker=requires_safety_checker)
def _encode_prompt(
self,
prompt,
device,
num_images_per_prompt,
do_classifier_free_guidance,
negative_prompt=None,
prompt_embeds: Optional[torch.Tensor] = None,
negative_prompt_embeds: Optional[torch.Tensor] = None,
lora_scale: Optional[float] = None,
**kwargs,
):
deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
prompt_embeds_tuple = self.encode_prompt(
prompt=prompt,
device=device,
num_images_per_prompt=num_images_per_prompt,
do_classifier_free_guidance=do_classifier_free_guidance,
negative_prompt=negative_prompt,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
lora_scale=lora_scale,
**kwargs,
)
prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
return prompt_embeds
def encode_prompt(
self,
prompt,
device,
num_images_per_prompt,
do_classifier_free_guidance,
negative_prompt=None,
prompt_embeds: Optional[torch.Tensor] = None,
negative_prompt_embeds: Optional[torch.Tensor] = None,
lora_scale: Optional[float] = None,
clip_skip: Optional[int] = None,
def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
dtype = next(self.image_encoder.parameters()).dtype
if not isinstance(image, torch.Tensor):
image = self.feature_extractor(image, return_tensors="pt").pixel_values
image = image.to(device=device, dtype=dtype)
if output_hidden_states:
image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
uncond_image_enc_hidden_states = self.image_encoder(
torch.zeros_like(image), output_hidden_states=True
).hidden_states[-2]
uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
num_images_per_prompt, dim=0
)
return image_enc_hidden_states, uncond_image_enc_hidden_states
else:
image_embeds = self.image_encoder(image).image_embeds
image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
uncond_image_embeds = torch.zeros_like(image_embeds)
return image_embeds, uncond_image_embeds
def prepare_ip_adapter_image_embeds(
self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
):
image_embeds = []
if do_classifier_free_guidance:
negative_image_embeds = []
if ip_adapter_image_embeds is None:
if not isinstance(ip_adapter_image, list):
ip_adapter_image = [ip_adapter_image]
if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
raise ValueError(
f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
)
for single_ip_adapter_image, image_proj_layer in zip(
ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
):
output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
single_image_embeds, single_negative_image_embeds = self.encode_image(
single_ip_adapter_image, device, 1, output_hidden_state
)
image_embeds.append(single_image_embeds[None, :])
if do_classifier_free_guidance:
negative_image_embeds.append(single_negative_image_embeds[None, :])
else:
for single_image_embeds in ip_adapter_image_embeds:
if do_classifier_free_guidance:
single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
negative_image_embeds.append(single_negative_image_embeds)
image_embeds.append(single_image_embeds)
ip_adapter_image_embeds = []
for i, single_image_embeds in enumerate(image_embeds):
single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
if do_classifier_free_guidance:
single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
single_image_embeds = single_image_embeds.to(device=device)
ip_adapter_image_embeds.append(single_image_embeds)
return ip_adapter_image_embeds
def run_safety_checker(self, image, device, dtype):
if self.safety_checker is None:
has_nsfw_concept = None
else:
if torch.is_tensor(image):
feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
else:
feature_extractor_input = self.image_processor.numpy_to_pil(image)
rgb_feature_extractor_input = feature_extractor_input[0]
safety_checker_input = self.feature_extractor(rgb_feature_extractor_input, return_tensors="pt").to(device)
image, has_nsfw_concept = self.safety_checker(
images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
)
return image, has_nsfw_concept
def prepare_extra_step_kwargs(self, generator, eta):
accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
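`prepare_extra_step_kwargs` only forwards `eta` and `generator` to `scheduler.step` when the scheduler actually accepts them; `eta` corresponds to the DDIM paper's η and is ignored by other schedulers. The same signature check in isolation (assumes diffusers is installed):

```python
import inspect
from diffusers import DDIMScheduler, EulerDiscreteScheduler

for scheduler in (DDIMScheduler(), EulerDiscreteScheduler()):
    params = set(inspect.signature(scheduler.step).parameters)
    print(type(scheduler).__name__, "eta" in params, "generator" in params)
# DDIMScheduler accepts both; EulerDiscreteScheduler has `generator` but no `eta`.
```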
def check_inputs(
self,
prompt,
height,
width,
callback_steps,
negative_prompt=None,
prompt_embeds=None,
negative_prompt_embeds=None,
ip_adapter_image=None,
ip_adapter_image_embeds=None,
callback_on_step_end_tensor_inputs=None,
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
shape = (
batch_size,
num_channels_latents,
int(height) // self.vae_scale_factor,
int(width) // self.vae_scale_factor,
)
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
)
if latents is None:
latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
else:
latents = latents.to(device)
latents = latents * self.scheduler.init_noise_sigma
return latents
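A quick check of the latent-shape arithmetic in `prepare_latents`: with the standard SD VAE (`len(block_out_channels) == 4`), `vae_scale_factor` is `2 ** 3 = 8`, so a 512x512 image maps to 64x64 latents. The values below are illustrative:

```python
import torch

batch_size, num_channels_latents = 1, 4
height = width = 512
vae_scale_factor = 8  # 2 ** (len(vae.config.block_out_channels) - 1) for the standard SD VAE

shape = (
    batch_size,
    num_channels_latents,
    int(height) // vae_scale_factor,
    int(width) // vae_scale_factor,
)
latents = torch.randn(shape)
print(latents.shape)  # torch.Size([1, 4, 64, 64])
```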
def get_guidance_scale_embedding(
self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
) -> torch.Tensor:
"""
See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
Args:
w (`torch.Tensor`):
Generate embedding vectors with the specified guidance scale to subsequently enrich timestep embeddings.
embedding_dim (`int`, *optional*, defaults to 512):
Dimension of the embeddings to generate.
dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
Data type of the generated embeddings.
Returns:
`torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
"""
assert len(w.shape) == 1
w = w * 1000.0
half_dim = embedding_dim // 2
emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
emb = w.to(dtype)[:, None] * emb[None, :]
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
if embedding_dim % 2 == 1:
emb = torch.nn.functional.pad(emb, (0, 1))
assert emb.shape == (w.shape[0], embedding_dim)
return emb
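The guidance-scale embedding is only used when `unet.config.time_cond_proj_dim` is set (e.g., for guidance-distilled, LCM-style UNets), which is also why `do_classifier_free_guidance` below is disabled in that case. Since the method never touches `self`, its math can be exercised directly; the `embedding_dim` value here is illustrative, and passing `None` for `self` works only because the method ignores it:

```python
import torch
from diffusers import StableDiffusionLDM3DPipeline

w = torch.tensor([7.5, 1.0])  # one guidance scale per sample in the batch
emb = StableDiffusionLDM3DPipeline.get_guidance_scale_embedding(None, w, embedding_dim=256)
print(emb.shape)  # torch.Size([2, 256])
```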
@property
def guidance_scale(self):
return self._guidance_scale
@property
def guidance_rescale(self):
return self._guidance_rescale
@property
def clip_skip(self):
return self._clip_skip
@property
def do_classifier_free_guidance(self):
return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
@property
def cross_attention_kwargs(self):
return self._cross_attention_kwargs
@property
def num_timesteps(self):
return self._num_timesteps
@property
def interrupt(self):
return self._interrupt
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
self,
prompt: Union[str, List[str]] = None,
height: Optional[int] = None,
width: Optional[int] = None,
num_inference_steps: int = 49,
timesteps: List[int] = None,
sigmas: List[float] = None,
guidance_scale: float = 5.0,
negative_prompt: Optional[Union[str, List[str]]] = None,
num_images_per_prompt: Optional[int] = 1,
eta: float = 0.0,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
latents: Optional[torch.Tensor] = None,
prompt_embeds: Optional[torch.Tensor] = None,
negative_prompt_embeds: Optional[torch.Tensor] = None,
ip_adapter_image: Optional[PipelineImageInput] = None,
ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
guidance_rescale: float = 0.0,
clip_skip: Optional[int] = None,
callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
**kwargs,