diffusers Source Code Analysis (48)
.\diffusers\pipelines\stable_diffusion_3\pipeline_stable_diffusion_3_img2img.py
import inspect
from typing import Callable, Dict, List, Optional, Union
import PIL.Image
import torch
from transformers import (
CLIPTextModelWithProjection,
CLIPTokenizer,
T5EncoderModel,
T5TokenizerFast,
)
from ...image_processor import PipelineImageInput, VaeImageProcessor
from ...loaders import SD3LoraLoaderMixin
from ...models.autoencoders import AutoencoderKL
from ...models.transformers import SD3Transformer2DModel
from ...schedulers import FlowMatchEulerDiscreteScheduler
from ...utils import (
USE_PEFT_BACKEND,
is_torch_xla_available,
logging,
replace_example_docstring,
scale_lora_layers,
unscale_lora_layers,
)
from ...utils.torch_utils import randn_tensor
from ..pipeline_utils import DiffusionPipeline
from .pipeline_output import StableDiffusion3PipelineOutput
if is_torch_xla_available():
import torch_xla.core.xla_model as xm
XLA_AVAILABLE = True
else:
XLA_AVAILABLE = False
logger = logging.get_logger(__name__)
EXAMPLE_DOC_STRING = """
Examples:
```py
>>> import torch
>>> from diffusers import AutoPipelineForImage2Image
>>> from diffusers.utils import load_image
>>> device = "cuda"  # set the device to CUDA
>>> model_id_or_path = "stabilityai/stable-diffusion-3-medium-diffusers"  # model repo id or local path
>>> pipe = AutoPipelineForImage2Image.from_pretrained(model_id_or_path, torch_dtype=torch.float16)  # load the pipeline from pretrained weights
>>> pipe = pipe.to(device)  # move the pipeline to the target device
>>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"  # URL of the input image
>>> init_image = load_image(url).resize((1024, 1024))  # load the image and resize it
>>> prompt = "cat wizard, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney, 8k"  # text prompt for generation
>>> images = pipe(prompt=prompt, image=init_image, strength=0.95, guidance_scale=7.5).images[0]  # run image-to-image generation
```
"""
def retrieve_latents(
encoder_output: torch.Tensor,
generator: Optional[torch.Generator] = None,
sample_mode: str = "sample"
):
if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
return encoder_output.latent_dist.sample(generator)
elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
return encoder_output.latent_dist.mode()
elif hasattr(encoder_output, "latents"):
return encoder_output.latents
else:
raise AttributeError("Could not access latents of provided encoder_output")
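# --- Illustrative sketch (not part of the diffusers source) ---------------------------
# `vae.encode(...)` normally returns an output object exposing a `latent_dist` (a
# diagonal Gaussian), so `retrieve_latents` either samples it, takes its mode, or falls
# back to a plain `.latents` attribute. The stub below is an assumption used purely to
# show the dispatch; shapes are illustrative. `torch` is imported at the top of this module.
class _StubDist:
    def sample(self, generator=None):
        return torch.zeros(1, 16, 8, 8)

    def mode(self):
        return torch.ones(1, 16, 8, 8)


class _StubEncoderOutput:
    latent_dist = _StubDist()


print(retrieve_latents(_StubEncoderOutput()).mean())                        # tensor(0.) -- sampled latents
print(retrieve_latents(_StubEncoderOutput(), sample_mode="argmax").mean())  # tensor(1.) -- distribution mode
# ---------------------------------------------------------------------------------------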
def retrieve_timesteps(
scheduler,
num_inference_steps: Optional[int] = None,
device: Optional[Union[str, torch.device]] = None,
timesteps: Optional[List[int]] = None,
sigmas: Optional[List[float]] = None,
**kwargs,
):
"""
调用调度器的 `set_timesteps` 方法,并在调用后从调度器中检索时间步。处理
自定义时间步。所有关键字参数将传递给 `scheduler.set_timesteps`。
参数:
scheduler (`SchedulerMixin`):
用于获取时间步的调度器。
num_inference_steps (`int`):
生成样本时使用的扩散步骤数。如果使用,则 `timesteps`
必须为 `None`。
device (`str` 或 `torch.device`, *可选*):
要将时间步移动到的设备。如果为 `None`,则时间步不移动。
timesteps (`List[int]`, *可选*):
用于覆盖调度器的时间步间距策略的自定义时间步。如果传递 `timesteps`,
`num_inference_steps` 和 `sigmas` 必须为 `None`。
sigmas (`List[float]`, *可选*):
用于覆盖调度器的时间步间距策略的自定义 sigma。如果传递 `sigmas`,
`num_inference_steps` 和 `timesteps` 必须为 `None`。
返回:
`Tuple[torch.Tensor, int]`: 一个元组,第一个元素是来自调度器的时间步调度,
第二个元素是推断步骤的数量。
"""
if timesteps is not None and sigmas is not None:
raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
if timesteps is not None:
accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
if not accepts_timesteps:
raise ValueError(
f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
f" timestep schedules. Please check whether you are using the correct scheduler."
)
scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
timesteps = scheduler.timesteps
num_inference_steps = len(timesteps)
elif sigmas is not None:
accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
if not accept_sigmas:
raise ValueError(
f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
f" sigmas schedules. Please check whether you are using the correct scheduler."
)
scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
timesteps = scheduler.timesteps
num_inference_steps = len(timesteps)
else:
scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
timesteps = scheduler.timesteps
return timesteps, num_inference_steps
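# --- Illustrative sketch (not part of the diffusers source) ---------------------------
# A minimal stub scheduler (an assumption, not a real diffusers class) showing how
# `retrieve_timesteps` dispatches: custom `sigmas` are only forwarded when the
# scheduler's `set_timesteps` signature accepts them, and `num_inference_steps` is then
# derived from the length of the resulting schedule.
class _StubScheduler:
    def __init__(self):
        self.timesteps = torch.empty(0)

    def set_timesteps(self, num_inference_steps=None, device=None, sigmas=None):
        if sigmas is not None:
            self.timesteps = torch.tensor(sigmas) * 1000.0
        else:
            self.timesteps = torch.linspace(1000.0, 0.0, num_inference_steps)


_sched = _StubScheduler()
print(retrieve_timesteps(_sched, num_inference_steps=4))          # default spacing, 4 steps
print(retrieve_timesteps(_sched, sigmas=[1.0, 0.75, 0.5, 0.25]))  # custom sigmas, 4 steps derived
# ---------------------------------------------------------------------------------------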
class StableDiffusion3Img2ImgPipeline(DiffusionPipeline):
r"""
Args:
transformer ([`SD3Transformer2DModel`]):
Conditional Transformer (MMDiT) 结构,用于对编码后的图像潜变量进行去噪。
scheduler ([`FlowMatchEulerDiscreteScheduler`]):
一个调度器,结合 `transformer` 用于对编码后的图像潜变量进行去噪。
vae ([`AutoencoderKL`]):
变分自编码器 (VAE) 模型,用于在潜在表示和图像之间进行编码和解码。
text_encoder ([`CLIPTextModelWithProjection`]):
[CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
特别是 [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) 变体,
并添加了一个投影层,该层使用对角矩阵初始化,维度为 `hidden_size`。
text_encoder_2 ([`CLIPTextModelWithProjection`]):
[CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
特别是
[laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
变体。
text_encoder_3 ([`T5EncoderModel`]):
冻结的文本编码器。Stable Diffusion 3 使用
[T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel),特别是
[t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) 变体。
tokenizer (`CLIPTokenizer`):
类
[CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer) 的标记器。
tokenizer_2 (`CLIPTokenizer`):
类
[CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer) 的第二个标记器。
tokenizer_3 (`T5TokenizerFast`):
类
[T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer) 的标记器。
"""
model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->transformer->vae"
_optional_components = []
_callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "negative_pooled_prompt_embeds"]
def __init__(
self,
transformer: SD3Transformer2DModel,
scheduler: FlowMatchEulerDiscreteScheduler,
vae: AutoencoderKL,
text_encoder: CLIPTextModelWithProjection,
tokenizer: CLIPTokenizer,
text_encoder_2: CLIPTextModelWithProjection,
tokenizer_2: CLIPTokenizer,
text_encoder_3: T5EncoderModel,
tokenizer_3: T5TokenizerFast,
):
super().__init__()
self.register_modules(
vae=vae,
text_encoder=text_encoder,
text_encoder_2=text_encoder_2,
text_encoder_3=text_encoder_3,
tokenizer=tokenizer,
tokenizer_2=tokenizer_2,
tokenizer_3=tokenizer_3,
transformer=transformer,
scheduler=scheduler,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.image_processor = VaeImageProcessor(
vae_scale_factor=self.vae_scale_factor, vae_latent_channels=self.vae.config.latent_channels
)
self.tokenizer_max_length = self.tokenizer.model_max_length
self.default_sample_size = self.transformer.config.sample_size
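# --- Illustrative arithmetic (not part of the diffusers source) -----------------------
# Assuming the published SD3 VAE config with four entries in `block_out_channels`, the
# spatial compression is 2 ** (4 - 1) = 8, so a 1024x1024 input maps to a 128x128 latent
# grid (with 16 latent channels for SD3; both values are assumptions, read from the
# config at runtime rather than hard-coded here).
_vae_scale_factor = 2 ** (4 - 1)
print(_vae_scale_factor, 1024 // _vae_scale_factor)  # 8 128
# ---------------------------------------------------------------------------------------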
def _get_t5_prompt_embeds(
self,
prompt: Union[str, List[str]] = None,
num_images_per_prompt: int = 1,
max_sequence_length: int = 256,
device: Optional[torch.device] = None,
dtype: Optional[torch.dtype] = None,
):
device = device or self._execution_device
dtype = dtype or self.text_encoder.dtype
prompt = [prompt] if isinstance(prompt, str) else prompt
batch_size = len(prompt)
if self.text_encoder_3 is None:
return torch.zeros(
(
batch_size * num_images_per_prompt,
self.tokenizer_max_length,
self.transformer.config.joint_attention_dim,
),
device=device,
dtype=dtype,
)
text_inputs = self.tokenizer_3(
prompt,
padding="max_length",
max_length=max_sequence_length,
truncation=True,
add_special_tokens=True,
return_tensors="pt",
)
text_input_ids = text_inputs.input_ids
untruncated_ids = self.tokenizer_3(prompt, padding="longest", return_tensors="pt").input_ids
if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
removed_text = self.tokenizer_3.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
logger.warning(
"The following part of your input was truncated because `max_sequence_length` is set to "
f" {max_sequence_length} tokens: {removed_text}"
)
prompt_embeds = self.text_encoder_3(text_input_ids.to(device))[0]
dtype = self.text_encoder_3.dtype
prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
_, seq_len, _ = prompt_embeds.shape
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
return prompt_embeds
def _get_clip_prompt_embeds(
self,
prompt: Union[str, List[str]],
num_images_per_prompt: int = 1,
device: Optional[torch.device] = None,
clip_skip: Optional[int] = None,
clip_model_index: int = 0,
):
device = device or self._execution_device
clip_tokenizers = [self.tokenizer, self.tokenizer_2]
clip_text_encoders = [self.text_encoder, self.text_encoder_2]
tokenizer = clip_tokenizers[clip_model_index]
text_encoder = clip_text_encoders[clip_model_index]
prompt = [prompt] if isinstance(prompt, str) else prompt
batch_size = len(prompt)
text_inputs = tokenizer(
prompt,
padding="max_length",
max_length=self.tokenizer_max_length,
truncation=True,
return_tensors="pt",
)
text_input_ids = text_inputs.input_ids
untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
removed_text = tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
f" {self.tokenizer_max_length} tokens: {removed_text}"
)
prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
pooled_prompt_embeds = prompt_embeds[0]
if clip_skip is None:
prompt_embeds = prompt_embeds.hidden_states[-2]
else:
prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
_, seq_len, _ = prompt_embeds.shape
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1)
pooled_prompt_embeds = pooled_prompt_embeds.view(batch_size * num_images_per_prompt, -1)
return prompt_embeds, pooled_prompt_embeds
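# --- Illustrative sketch (not part of the diffusers source) ---------------------------
# The `clip_skip` indexing above selects `hidden_states[-(clip_skip + 2)]`, i.e. the
# penultimate hidden state when `clip_skip` is None. The toy list below only makes the
# indexing concrete; the real tuple holds one tensor per encoder layer plus the embeddings.
_hidden_states = [f"layer_{i}" for i in range(13)]  # illustrative: embeddings + 12 layers
print(_hidden_states[-2])        # default (clip_skip=None): 'layer_11'
print(_hidden_states[-(2 + 2)])  # clip_skip=2 skips two more layers: 'layer_9'
# ---------------------------------------------------------------------------------------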
def encode_prompt(
self,
prompt: Union[str, List[str]],
prompt_2: Union[str, List[str]],
prompt_3: Union[str, List[str]],
device: Optional[torch.device] = None,
num_images_per_prompt: int = 1,
do_classifier_free_guidance: bool = True,
negative_prompt: Optional[Union[str, List[str]]] = None,
negative_prompt_2: Optional[Union[str, List[str]]] = None,
negative_prompt_3: Optional[Union[str, List[str]]] = None,
prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
clip_skip: Optional[int] = None,
max_sequence_length: int = 256,
lora_scale: Optional[float] = None,
def check_inputs(
self,
prompt,
prompt_2,
prompt_3,
strength,
negative_prompt=None,
negative_prompt_2=None,
negative_prompt_3=None,
prompt_embeds=None,
negative_prompt_embeds=None,
pooled_prompt_embeds=None,
negative_pooled_prompt_embeds=None,
callback_on_step_end_tensor_inputs=None,
max_sequence_length=None,
def get_timesteps(self, num_inference_steps, strength, device):
init_timestep = min(num_inference_steps * strength, num_inference_steps)
t_start = int(max(num_inference_steps - init_timestep, 0))
timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
if hasattr(self.scheduler, "set_begin_index"):
self.scheduler.set_begin_index(t_start * self.scheduler.order)
return timesteps, num_inference_steps - t_start
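# --- Illustrative arithmetic (not part of the diffusers source) -----------------------
# `strength` decides how much of the schedule is skipped: with 50 inference steps and
# strength=0.6, init_timestep = 30 and t_start = 20, so only the final 30 timesteps are
# run, starting from a partially noised version of the input image.
_num_inference_steps, _strength = 50, 0.6
_init_timestep = min(_num_inference_steps * _strength, _num_inference_steps)  # 30.0
_t_start = int(max(_num_inference_steps - _init_timestep, 0))                 # 20
print(_t_start, _num_inference_steps - _t_start)  # 20 30
# ---------------------------------------------------------------------------------------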
def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
raise ValueError(
f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
)
image = image.to(device=device, dtype=dtype)
batch_size = batch_size * num_images_per_prompt
if image.shape[1] == self.vae.config.latent_channels:
init_latents = image
else:
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
)
elif isinstance(generator, list):
init_latents = [
retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
for i in range(batch_size)
]
init_latents = torch.cat(init_latents, dim=0)
else:
init_latents = retrieve_latents(self.vae.encode(image), generator=generator)
init_latents = (init_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0:
additional_image_per_prompt = batch_size // init_latents.shape[0]
init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
raise ValueError(
f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
)
else:
init_latents = torch.cat([init_latents], dim=0)
shape = init_latents.shape
noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
init_latents = self.scheduler.scale_noise(init_latents, timestep, noise)
latents = init_latents.to(device=device, dtype=dtype)
return latents
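# --- Illustrative sketch (not part of the diffusers source) ---------------------------
# The `(latents - shift_factor) * scaling_factor` normalization above is undone on the
# decode path as `latents / scaling_factor + shift_factor`. The numeric values below are
# assumptions for illustration only; the pipeline reads them from `vae.config`.
_shift_factor, _scaling_factor = 0.0609, 1.5305
_z = 0.42                                             # one latent value, for illustration
_normalized = (_z - _shift_factor) * _scaling_factor
_restored = _normalized / _scaling_factor + _shift_factor
print(round(_restored, 4))  # 0.42 -- the round trip is lossless
# ---------------------------------------------------------------------------------------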
@property
def guidance_scale(self):
return self._guidance_scale
@property
def clip_skip(self):
return self._clip_skip
@property
def do_classifier_free_guidance(self):
return self._guidance_scale > 1
@property
def num_timesteps(self):
return self._num_timesteps
@property
def interrupt(self):
return self._interrupt
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
self,
prompt: Union[str, List[str]] = None,
prompt_2: Optional[Union[str, List[str]]] = None,
prompt_3: Optional[Union[str, List[str]]] = None,
image: PipelineImageInput = None,
strength: float = 0.6,
num_inference_steps: int = 50,
timesteps: List[int] = None,
guidance_scale: float = 7.0,
negative_prompt: Optional[Union[str, List[str]]] = None,
negative_prompt_2: Optional[Union[str, List[str]]] = None,
negative_prompt_3: Optional[Union[str, List[str]]] = None,
num_images_per_prompt: Optional[int] = 1,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
latents: Optional[torch.FloatTensor] = None,
prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
clip_skip: Optional[int] = None,
callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
max_sequence_length: int = 256,
.\diffusers\pipelines\stable_diffusion_3\pipeline_stable_diffusion_3_inpaint.py
import inspect
from typing import Callable, Dict, List, Optional, Union
import torch
from transformers import (
CLIPTextModelWithProjection,
CLIPTokenizer,
T5EncoderModel,
T5TokenizerFast,
)
from ...callbacks import MultiPipelineCallbacks, PipelineCallback
from ...image_processor import PipelineImageInput, VaeImageProcessor
from ...loaders import SD3LoraLoaderMixin
from ...models.autoencoders import AutoencoderKL
from ...models.transformers import SD3Transformer2DModel
from ...schedulers import FlowMatchEulerDiscreteScheduler
from ...utils import (
USE_PEFT_BACKEND,
is_torch_xla_available,
logging,
replace_example_docstring,
scale_lora_layers,
unscale_lora_layers,
)
from ...utils.torch_utils import randn_tensor
from ..pipeline_utils import DiffusionPipeline
from .pipeline_output import StableDiffusion3PipelineOutput
if is_torch_xla_available():
import torch_xla.core.xla_model as xm
XLA_AVAILABLE = True
else:
XLA_AVAILABLE = False
logger = logging.get_logger(__name__)
EXAMPLE_DOC_STRING = """
Examples:
```py
>>> import torch  # import PyTorch
>>> from diffusers import StableDiffusion3InpaintPipeline  # import the inpainting pipeline class
>>> from diffusers.utils import load_image  # import the image-loading utility
>>> pipe = StableDiffusion3InpaintPipeline.from_pretrained(  # load the pipeline from pretrained weights
...     "stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16  # model name and dtype
... )
>>> pipe.to("cuda")  # move the pipeline to the CUDA device
>>> prompt = "Face of a yellow cat, high resolution, sitting on a park bench"  # prompt for generation
>>> img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"  # source image URL
>>> mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"  # mask image URL
>>> source = load_image(img_url)  # load the source image
>>> mask = load_image(mask_url)  # load the mask image
>>> image = pipe(prompt=prompt, image=source, mask_image=mask).images[0]  # run inpainting
>>> image.save("sd3_inpainting.png")  # save the generated image
```
"""
def retrieve_latents(
encoder_output: torch.Tensor,
generator: Optional[torch.Generator] = None,
sample_mode: str = "sample"
):
if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
return encoder_output.latent_dist.sample(generator)
elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
return encoder_output.latent_dist.mode()
elif hasattr(encoder_output, "latents"):
return encoder_output.latents
else:
raise AttributeError("Could not access latents of provided encoder_output")
def retrieve_timesteps(
scheduler,
num_inference_steps: Optional[int] = None,
device: Optional[Union[str, torch.device]] = None,
timesteps: Optional[List[int]] = None,
sigmas: Optional[List[float]] = None,
**kwargs,
):
"""
调用调度器的 `set_timesteps` 方法,并在调用后从调度器获取时间步。处理
自定义时间步。任何 kwargs 将被传递给 `scheduler.set_timesteps`。
参数:
scheduler (`SchedulerMixin`):
用于获取时间步的调度器。
num_inference_steps (`int`):
用于生成样本的扩散步骤数。如果使用,则 `timesteps` 必须为 `None`。
device (`str` 或 `torch.device`,*可选*):
时间步应移动到的设备。如果为 `None`,则时间步不会移动。
timesteps (`List[int]`,*可选*):
自定义时间步,用于覆盖调度器的时间步间隔策略。如果传递 `timesteps`,
则 `num_inference_steps` 和 `sigmas` 必须为 `None`。
sigmas (`List[float]`,*可选*):
自定义 sigma,用于覆盖调度器的时间步间隔策略。如果传递 `sigmas`,
则 `num_inference_steps` 和 `timesteps` 必须为 `None`。
返回:
`Tuple[torch.Tensor, int]`:一个元组,第一个元素是调度器的时间步调度,
第二个元素是推理步骤的数量。
"""
if timesteps is not None and sigmas is not None:
raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
if timesteps is not None:
accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
if not accepts_timesteps:
raise ValueError(
f"当前调度器类 {scheduler.__class__} 的 `set_timesteps` 不支持自定义"
f" 时间步调度。请检查您是否使用了正确的调度器。"
)
scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
timesteps = scheduler.timesteps
num_inference_steps = len(timesteps)
elif sigmas is not None:
accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
if not accept_sigmas:
raise ValueError(
f"当前调度器类 {scheduler.__class__} 的 `set_timesteps` 不支持自定义"
f" sigma 调度。请检查您是否使用了正确的调度器。"
)
scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
timesteps = scheduler.timesteps
num_inference_steps = len(timesteps)
else:
scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
timesteps = scheduler.timesteps
return timesteps, num_inference_steps
class StableDiffusion3InpaintPipeline(DiffusionPipeline):
r"""
Args:
transformer ([`SD3Transformer2DModel`]):
条件变换器(MMDiT)架构,用于对编码后的图像潜变量进行去噪。
scheduler ([`FlowMatchEulerDiscreteScheduler`]):
用于与 `transformer` 结合使用的调度器,用于去噪编码后的图像潜变量。
vae ([`AutoencoderKL`]):
变分自编码器(VAE)模型,用于将图像编码为潜在表示并解码。
text_encoder ([`CLIPTextModelWithProjection`]):
[CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
特别是 [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) 变体,
具有额外的投影层,该层用具有 `hidden_size` 维度的对角矩阵初始化。
text_encoder_2 ([`CLIPTextModelWithProjection`]):
[CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
特别是 [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k) 变体。
text_encoder_3 ([`T5EncoderModel`]):
冻结的文本编码器。Stable Diffusion 3 使用 [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel),
特别是 [t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) 变体。
tokenizer (`CLIPTokenizer`):
类 [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer) 的分词器。
tokenizer_2 (`CLIPTokenizer`):
第二个类 [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer) 的分词器。
tokenizer_3 (`T5TokenizerFast`):
类 [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer) 的分词器。
"""
model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->transformer->vae"
_optional_components = []
_callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "negative_pooled_prompt_embeds"]
def __init__(
self,
transformer: SD3Transformer2DModel,
scheduler: FlowMatchEulerDiscreteScheduler,
vae: AutoencoderKL,
text_encoder: CLIPTextModelWithProjection,
tokenizer: CLIPTokenizer,
text_encoder_2: CLIPTextModelWithProjection,
tokenizer_2: CLIPTokenizer,
text_encoder_3: T5EncoderModel,
tokenizer_3: T5TokenizerFast,
):
super().__init__()
self.register_modules(
vae=vae,
text_encoder=text_encoder,
text_encoder_2=text_encoder_2,
text_encoder_3=text_encoder_3,
tokenizer=tokenizer,
tokenizer_2=tokenizer_2,
tokenizer_3=tokenizer_3,
transformer=transformer,
scheduler=scheduler,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.image_processor = VaeImageProcessor(
vae_scale_factor=self.vae_scale_factor, vae_latent_channels=self.vae.config.latent_channels
)
self.mask_processor = VaeImageProcessor(
vae_scale_factor=self.vae_scale_factor,
vae_latent_channels=self.vae.config.latent_channels,
do_normalize=False,
do_binarize=True,
do_convert_grayscale=True,
)
self.tokenizer_max_length = self.tokenizer.model_max_length
self.default_sample_size = self.transformer.config.sample_size
def _get_t5_prompt_embeds(
self,
prompt: Union[str, List[str]] = None,
num_images_per_prompt: int = 1,
max_sequence_length: int = 256,
device: Optional[torch.device] = None,
dtype: Optional[torch.dtype] = None,
):
device = device or self._execution_device
dtype = dtype or self.text_encoder.dtype
prompt = [prompt] if isinstance(prompt, str) else prompt
batch_size = len(prompt)
if self.text_encoder_3 is None:
return torch.zeros(
(
batch_size * num_images_per_prompt,
self.tokenizer_max_length,
self.transformer.config.joint_attention_dim,
),
device=device,
dtype=dtype,
)
text_inputs = self.tokenizer_3(
prompt,
padding="max_length",
max_length=max_sequence_length,
truncation=True,
add_special_tokens=True,
return_tensors="pt",
)
text_input_ids = text_inputs.input_ids
untruncated_ids = self.tokenizer_3(prompt, padding="longest", return_tensors="pt").input_ids
if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
removed_text = self.tokenizer_3.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
logger.warning(
"The following part of your input was truncated because `max_sequence_length` is set to "
f" {max_sequence_length} tokens: {removed_text}"
)
prompt_embeds = self.text_encoder_3(text_input_ids.to(device))[0]
dtype = self.text_encoder_3.dtype
prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
_, seq_len, _ = prompt_embeds.shape
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
return prompt_embeds
def _get_clip_prompt_embeds(
self,
prompt: Union[str, List[str]],
num_images_per_prompt: int = 1,
device: Optional[torch.device] = None,
clip_skip: Optional[int] = None,
clip_model_index: int = 0,
):
device = device or self._execution_device
clip_tokenizers = [self.tokenizer, self.tokenizer_2]
clip_text_encoders = [self.text_encoder, self.text_encoder_2]
tokenizer = clip_tokenizers[clip_model_index]
text_encoder = clip_text_encoders[clip_model_index]
prompt = [prompt] if isinstance(prompt, str) else prompt
batch_size = len(prompt)
text_inputs = tokenizer(
prompt,
padding="max_length",
max_length=self.tokenizer_max_length,
truncation=True,
return_tensors="pt",
)
text_input_ids = text_inputs.input_ids
untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
removed_text = tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
logger.warning(
"The following part of your input was truncated because CLIP can only handle sequences up to"
f" {self.tokenizer_max_length} tokens: {removed_text}"
)
prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
pooled_prompt_embeds = prompt_embeds[0]
if clip_skip is None:
prompt_embeds = prompt_embeds.hidden_states[-2]
else:
prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
_, seq_len, _ = prompt_embeds.shape
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1)
pooled_prompt_embeds = pooled_prompt_embeds.view(batch_size * num_images_per_prompt, -1)
return prompt_embeds, pooled_prompt_embeds
def encode_prompt(
self,
prompt: Union[str, List[str]],
prompt_2: Union[str, List[str]],
prompt_3: Union[str, List[str]],
device: Optional[torch.device] = None,
num_images_per_prompt: int = 1,
do_classifier_free_guidance: bool = True,
negative_prompt: Optional[Union[str, List[str]]] = None,
negative_prompt_2: Optional[Union[str, List[str]]] = None,
negative_prompt_3: Optional[Union[str, List[str]]] = None,
prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
clip_skip: Optional[int] = None,
max_sequence_length: int = 256,
lora_scale: Optional[float] = None,
def check_inputs(
self,
prompt,
prompt_2,
prompt_3,
strength,
negative_prompt=None,
negative_prompt_2=None,
negative_prompt_3=None,
prompt_embeds=None,
negative_prompt_embeds=None,
pooled_prompt_embeds=None,
negative_pooled_prompt_embeds=None,
callback_on_step_end_tensor_inputs=None,
max_sequence_length=None,
def get_timesteps(self, num_inference_steps, strength, device):
init_timestep = min(num_inference_steps * strength, num_inference_steps)
t_start = int(max(num_inference_steps - init_timestep, 0))
timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
if hasattr(self.scheduler, "set_begin_index"):
self.scheduler.set_begin_index(t_start * self.scheduler.order)
return timesteps, num_inference_steps - t_start
def prepare_latents(
self,
batch_size,
num_channels_latents,
height,
width,
dtype,
device,
generator,
latents=None,
image=None,
timestep=None,
is_strength_max=True,
return_noise=False,
return_image_latents=False,
):
shape = (
batch_size,
num_channels_latents,
int(height) // self.vae_scale_factor,
int(width) // self.vae_scale_factor,
)
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
)
if (image is None or timestep is None) and not is_strength_max:
raise ValueError(
"Since strength < 1. initial latents are to be initialised as a combination of Image + Noise."
"However, either the image or the noise timestep has not been provided."
)
if return_image_latents or (latents is None and not is_strength_max):
image = image.to(device=device, dtype=dtype)
if image.shape[1] == 16:
image_latents = image
else:
image_latents = self._encode_vae_image(image=image, generator=generator)
image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1)
if latents is None:
noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
latents = noise if is_strength_max else self.scheduler.scale_noise(image_latents, timestep, noise)
else:
noise = latents.to(device)
latents = noise
outputs = (latents,)
if return_noise:
outputs += (noise,)
if return_image_latents:
outputs += (image_latents,)
return outputs
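# --- Illustrative sketch (not part of the diffusers source) ---------------------------
# Sketch of the branch above under a flow-matching reading of `scale_noise` (an
# assumption for illustration, not a quote of the scheduler implementation): the noised
# latent is a linear mix x_t = (1 - sigma_t) * x_0 + sigma_t * noise, and when
# strength == 1.0 (`is_strength_max`) the pipeline skips the mix and starts from pure noise.
_x0 = torch.zeros(1, 16, 4, 4)        # clean image latents (illustrative shape)
_noise = torch.randn(1, 16, 4, 4)
_sigma_t = 0.6                         # a mid-schedule noise level
_x_t = (1.0 - _sigma_t) * _x0 + _sigma_t * _noise
print(float(_x_t.std()))               # roughly 0.6: a blend that still carries noise
# ---------------------------------------------------------------------------------------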
def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
if isinstance(generator, list):
image_latents = [
retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
for i in range(image.shape[0])
]
image_latents = torch.cat(image_latents, dim=0)
else:
image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
return image_latents
def prepare_mask_latents(
self,
mask,
masked_image,
batch_size,
num_images_per_prompt,
height,
width,
dtype,
device,
generator,
do_classifier_free_guidance,
):
mask = torch.nn.functional.interpolate(
mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor)
)
mask = mask.to(device=device, dtype=dtype)
batch_size = batch_size * num_images_per_prompt
masked_image = masked_image.to(device=device, dtype=dtype)
if masked_image.shape[1] == 16:
masked_image_latents = masked_image
else:
masked_image_latents = retrieve_latents(self.vae.encode(masked_image), generator=generator)
masked_image_latents = (masked_image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
if mask.shape[0] < batch_size:
if not batch_size % mask.shape[0] == 0:
raise ValueError(
"The passed mask and the required batch size don't match. Masks are supposed to be duplicated to"
f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number"
" of masks that you pass is divisible by the total requested batch size."
)
mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1)
if masked_image_latents.shape[0] < batch_size:
if not batch_size % masked_image_latents.shape[0] == 0:
raise ValueError(
"The passed images and the required batch size don't match. Images are supposed to be duplicated"
f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed."
" Make sure the number of images that you pass is divisible by the total requested batch size."
)
masked_image_latents = masked_image_latents.repeat(batch_size // masked_image_latents.shape[0], 1, 1, 1)
mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
masked_image_latents = (
torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
)
masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
return mask, masked_image_latents
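# --- Illustrative sketch (not part of the diffusers source) ---------------------------
# The pixel-space mask above is interpolated (nearest-neighbour by default) down to the
# latent grid. Assuming a VAE scale factor of 8, a 1024x1024 mask becomes 128x128.
_mask = torch.rand(1, 1, 1024, 1024).round()  # binary mask in pixel space (illustrative)
_latent_mask = torch.nn.functional.interpolate(_mask, size=(1024 // 8, 1024 // 8))
print(_latent_mask.shape)  # torch.Size([1, 1, 128, 128])
# ---------------------------------------------------------------------------------------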
@property
def guidance_scale(self):
return self._guidance_scale
@property
def clip_skip(self):
return self._clip_skip
@property
def do_classifier_free_guidance(self):
return self._guidance_scale > 1
@property
def num_timesteps(self):
return self._num_timesteps
@property
def interrupt(self):
return self._interrupt
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
self,
prompt: Union[str, List[str]] = None,
prompt_2: Optional[Union[str, List[str]]] = None,
prompt_3: Optional[Union[str, List[str]]] = None,
image: PipelineImageInput = None,
mask_image: PipelineImageInput = None,
masked_image_latents: PipelineImageInput = None,
height: int = None,
width: int = None,
padding_mask_crop: Optional[int] = None,
strength: float = 0.6,
num_inference_steps: int = 50,
timesteps: List[int] = None,
guidance_scale: float = 7.0,
negative_prompt: Optional[Union[str, List[str]]] = None,
negative_prompt_2: Optional[Union[str, List[str]]] = None,
negative_prompt_3: Optional[Union[str, List[str]]] = None,
num_images_per_prompt: Optional[int] = 1,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
latents: Optional[torch.Tensor] = None,
prompt_embeds: Optional[torch.Tensor] = None,
negative_prompt_embeds: Optional[torch.Tensor] = None,
pooled_prompt_embeds: Optional[torch.Tensor] = None,
negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
clip_skip: Optional[int] = None,
callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
max_sequence_length: int = 256,
.\diffusers\pipelines\stable_diffusion_3\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
DIFFUSERS_SLOW_IMPORT,
OptionalDependencyNotAvailable,
_LazyModule,
get_objects_from_module,
is_flax_available,
is_torch_available,
is_transformers_available,
)
_dummy_objects = {}
_additional_imports = {}
_import_structure = {"pipeline_output": ["StableDiffusion3PipelineOutput"]}
try:
if not (is_transformers_available() and is_torch_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from ...utils import dummy_torch_and_transformers_objects
_dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
else:
_import_structure["pipeline_stable_diffusion_3"] = ["StableDiffusion3Pipeline"]
_import_structure["pipeline_stable_diffusion_3_img2img"] = ["StableDiffusion3Img2ImgPipeline"]
_import_structure["pipeline_stable_diffusion_3_inpaint"] = ["StableDiffusion3InpaintPipeline"]
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
try:
if not (is_transformers_available() and is_torch_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from ...utils.dummy_torch_and_transformers_objects import *
else:
from .pipeline_stable_diffusion_3 import StableDiffusion3Pipeline
from .pipeline_stable_diffusion_3_img2img import StableDiffusion3Img2ImgPipeline
from .pipeline_stable_diffusion_3_inpaint import StableDiffusion3InpaintPipeline
else:
import sys
sys.modules[__name__] = _LazyModule(
__name__,
globals()["__file__"],
_import_structure,
module_spec=__spec__,
)
for name, value in _dummy_objects.items():
setattr(sys.modules[__name__], name, value)
for name, value in _additional_imports.items():
setattr(sys.modules[__name__], name, value)
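# --- Illustrative note (not part of the diffusers source) -----------------------------
# What the `_LazyModule` wiring above buys: importing the subpackage is cheap, and the
# heavy pipeline module is only imported when one of its attributes is first accessed.
# A sketch of the intended usage from user code (assuming torch/transformers are installed):
#
#   import diffusers.pipelines.stable_diffusion_3 as sd3
#   pipe_cls = sd3.StableDiffusion3Img2ImgPipeline  # triggers the real import lazily
#   print(pipe_cls.__name__)                        # 'StableDiffusion3Img2ImgPipeline'
# ---------------------------------------------------------------------------------------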
.\diffusers\pipelines\stable_diffusion_attend_and_excite\pipeline_stable_diffusion_attend_and_excite.py
import inspect
import math
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import numpy as np
import torch
from torch.nn import functional as F
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
from ...image_processor import VaeImageProcessor
from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
from ...models.attention_processor import Attention
from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import (
USE_PEFT_BACKEND,
deprecate,
logging,
replace_example_docstring,
scale_lora_layers,
unscale_lora_layers,
)
from ...utils.torch_utils import randn_tensor
from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from ..stable_diffusion import StableDiffusionPipelineOutput
from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
logger = logging.get_logger(__name__)
EXAMPLE_DOC_STRING = """
Examples:
```py
>>> import torch
>>> from diffusers import StableDiffusionAttendAndExcitePipeline
>>> pipe = StableDiffusionAttendAndExcitePipeline.from_pretrained(
... "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16
... ).to("cuda")
>>> prompt = "a cat and a frog"
>>> # use the get_indices utility to find the indices of the tokens you want to alter
>>> pipe.get_indices(prompt)
{0: '<|startoftext|>', 1: 'a</w>', 2: 'cat</w>', 3: 'and</w>', 4: 'a</w>', 5: 'frog</w>', 6: '<|endoftext|>'}
>>> token_indices = [2, 5]
>>> seed = 6141
>>> generator = torch.Generator("cuda").manual_seed(seed)
>>> images = pipe(
... prompt=prompt,
... token_indices=token_indices,
... guidance_scale=7.5,
... generator=generator,
... num_inference_steps=50,
... max_iter_to_alter=25,
... ).images
>>> image = images[0]
>>> image.save(f"../images/{prompt}_{seed}.png")
```
"""
class AttentionStore:
@staticmethod
def get_empty_store():
return {"down": [], "mid": [], "up": []}
def __call__(self, attn, is_cross: bool, place_in_unet: str):
if self.cur_att_layer >= 0 and is_cross:
if attn.shape[1] == np.prod(self.attn_res):
self.step_store[place_in_unet].append(attn)
self.cur_att_layer += 1
if self.cur_att_layer == self.num_att_layers:
self.cur_att_layer = 0
self.between_steps()
def between_steps(self):
self.attention_store = self.step_store
self.step_store = self.get_empty_store()
def get_average_attention(self):
average_attention = self.attention_store
return average_attention
def aggregate_attention(self, from_where: List[str]) -> torch.Tensor:
"""在指定的分辨率下聚合不同层和头部的注意力。"""
out = []
attention_maps = self.get_average_attention()
for location in from_where:
for item in attention_maps[location]:
cross_maps = item.reshape(-1, self.attn_res[0], self.attn_res[1], item.shape[-1])
out.append(cross_maps)
out = torch.cat(out, dim=0)
out = out.sum(0) / out.shape[0]
return out
def reset(self):
self.cur_att_layer = 0
self.step_store = self.get_empty_store()
self.attention_store = {}
def __init__(self, attn_res):
"""
初始化一个空的 AttentionStore :param step_index: 用于可视化扩散过程中的特定步骤
"""
self.num_att_layers = -1
self.cur_att_layer = 0
self.step_store = self.get_empty_store()
self.attention_store = {}
self.curr_step_index = 0
self.attn_res = attn_res
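# --- Illustrative sketch (not part of the diffusers source) ---------------------------
# Feeding fake cross-attention maps through the AttentionStore above shows the shape of
# the aggregated map: one attn_res x attn_res heat-map per text token. The numbers are
# illustrative (8 = batch * heads, 77 = CLIP sequence length).
_store = AttentionStore(attn_res=(16, 16))
_store.num_att_layers = 2                   # pretend two cross-attention layers were registered
_fake_attn = torch.rand(8, 16 * 16, 77)     # (batch * heads, H * W, text tokens)
_store(_fake_attn, True, "down")
_store(_fake_attn, True, "up")              # second call flushes step_store into attention_store
print(_store.aggregate_attention(from_where=("up", "down", "mid")).shape)  # torch.Size([16, 16, 77])
# ---------------------------------------------------------------------------------------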
class AttendExciteAttnProcessor:
def __init__(self, attnstore, place_in_unet):
super().__init__()
self.attnstore = attnstore
self.place_in_unet = place_in_unet
def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None):
batch_size, sequence_length, _ = hidden_states.shape
attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
query = attn.to_q(hidden_states)
is_cross = encoder_hidden_states is not None
encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
key = attn.to_k(encoder_hidden_states)
value = attn.to_v(encoder_hidden_states)
query = attn.head_to_batch_dim(query)
key = attn.head_to_batch_dim(key)
value = attn.head_to_batch_dim(value)
attention_probs = attn.get_attention_scores(query, key, attention_mask)
if attention_probs.requires_grad:
self.attnstore(attention_probs, is_cross, self.place_in_unet)
hidden_states = torch.bmm(attention_probs, value)
hidden_states = attn.batch_to_head_dim(hidden_states)
hidden_states = attn.to_out[0](hidden_states)
hidden_states = attn.to_out[1](hidden_states)
return hidden_states
class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin):
r"""
使用 Stable Diffusion 和 Attend-and-Excite 进行文本到图像生成的管道。
该模型继承自 [`DiffusionPipeline`]。请查看超类文档,以获取所有管道实现的通用方法
(下载、保存、在特定设备上运行等)。
该管道还继承以下加载方法:
- [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] 用于加载文本反演嵌入
# 定义函数参数的说明文档,描述各参数的作用
Args:
vae ([`AutoencoderKL`]):
# Variational Auto-Encoder (VAE) 模型,用于将图像编码和解码为潜在表示
Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
text_encoder ([`~transformers.CLIPTextModel`]):
# 冻结的文本编码器,使用 CLIP 模型进行文本特征提取
Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
tokenizer ([`~transformers.CLIPTokenizer`]):
# 用于将文本进行分词的 CLIPTokenizer
A `CLIPTokenizer` to tokenize text.
unet ([`UNet2DConditionModel`]):
# UNet 模型,用于对编码后的图像潜在特征进行去噪处理
A `UNet2DConditionModel` to denoise the encoded image latents.
scheduler ([`SchedulerMixin`]):
# 调度器,用于与 UNet 结合去噪编码后的图像潜在特征,可以是 DDIM、LMS 或 PNDM 调度器
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
safety_checker ([`StableDiffusionSafetyChecker`]):
# 分类模块,用于评估生成的图像是否可能被认为是冒犯性或有害的
Classification module that estimates whether generated images could be considered offensive or harmful.
# 参见模型卡以获取有关模型潜在危害的更多详细信息
Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
about a model's potential harms.
feature_extractor ([`~transformers.CLIPImageProcessor`]):
# CLIP 图像处理器,用于从生成的图像中提取特征;这些特征作为输入提供给安全检查器
A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
"""
model_cpu_offload_seq = "text_encoder->unet->vae"
_optional_components = ["safety_checker", "feature_extractor"]
_exclude_from_cpu_offload = ["safety_checker"]
def __init__(
self,
vae: AutoencoderKL,
text_encoder: CLIPTextModel,
tokenizer: CLIPTokenizer,
unet: UNet2DConditionModel,
scheduler: KarrasDiffusionSchedulers,
safety_checker: StableDiffusionSafetyChecker,
feature_extractor: CLIPImageProcessor,
requires_safety_checker: bool = True,
):
super().__init__()
if safety_checker is None and requires_safety_checker:
logger.warning(
f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
" that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
" results in services or applications open to the public. Both the diffusers team and Hugging Face"
" strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
" it only for use-cases that involve analyzing network behavior or auditing its results. For more"
" information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
)
if safety_checker is not None and feature_extractor is None:
raise ValueError(
"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
" checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
)
self.register_modules(
vae=vae,
text_encoder=text_encoder,
tokenizer=tokenizer,
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
feature_extractor=feature_extractor,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.register_to_config(requires_safety_checker=requires_safety_checker)
def _encode_prompt(
self,
prompt,
device,
num_images_per_prompt,
do_classifier_free_guidance,
negative_prompt=None,
prompt_embeds: Optional[torch.Tensor] = None,
negative_prompt_embeds: Optional[torch.Tensor] = None,
lora_scale: Optional[float] = None,
**kwargs,
):
deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
prompt_embeds_tuple = self.encode_prompt(
prompt=prompt,
device=device,
num_images_per_prompt=num_images_per_prompt,
do_classifier_free_guidance=do_classifier_free_guidance,
negative_prompt=negative_prompt,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
lora_scale=lora_scale,
**kwargs,
)
prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
return prompt_embeds
def encode_prompt(
self,
prompt,
device,
num_images_per_prompt,
do_classifier_free_guidance,
negative_prompt=None,
prompt_embeds: Optional[torch.Tensor] = None,
negative_prompt_embeds: Optional[torch.Tensor] = None,
lora_scale: Optional[float] = None,
clip_skip: Optional[int] = None,
def run_safety_checker(self, image, device, dtype):
if self.safety_checker is None:
has_nsfw_concept = None
else:
if torch.is_tensor(image):
feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
else:
feature_extractor_input = self.image_processor.numpy_to_pil(image)
safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
image, has_nsfw_concept = self.safety_checker(
images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
)
return image, has_nsfw_concept
def decode_latents(self, latents):
deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
return image
def prepare_extra_step_kwargs(self, generator, eta):
accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
def check_inputs(
self,
prompt,
indices,
height,
width,
callback_steps,
negative_prompt=None,
prompt_embeds=None,
negative_prompt_embeds=None,
):
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
shape = (
batch_size,
num_channels_latents,
int(height) // self.vae_scale_factor,
int(width) // self.vae_scale_factor,
)
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
)
if latents is None:
latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
else:
latents = latents.to(device)
latents = latents * self.scheduler.init_noise_sigma
return latents
@staticmethod
def _compute_max_attention_per_index(
attention_maps: torch.Tensor,
indices: List[int],
) -> List[torch.Tensor]:
"""计算我们希望改变的每个 token 的最大注意力值。"""
attention_for_text = attention_maps[:, :, 1:-1]
attention_for_text *= 100
attention_for_text = torch.nn.functional.softmax(attention_for_text, dim=-1)
indices = [index - 1 for index in indices]
max_indices_list = []
for i in indices:
image = attention_for_text[:, :, i]
smoothing = GaussianSmoothing().to(attention_maps.device)
input = F.pad(image.unsqueeze(0).unsqueeze(0), (1, 1, 1, 1), mode="reflect")
image = smoothing(input).squeeze(0).squeeze(0)
max_indices_list.append(image.max())
return max_indices_list
def _aggregate_and_get_max_attention_per_token(
self,
indices: List[int],
):
"""聚合每个 token 的注意力,并计算每个 token 的最大激活值。"""
attention_maps = self.attention_store.aggregate_attention(
from_where=("up", "down", "mid"),
)
max_attention_per_index = self._compute_max_attention_per_index(
attention_maps=attention_maps,
indices=indices,
)
return max_attention_per_index
@staticmethod
def _compute_loss(max_attention_per_index: List[torch.Tensor]) -> torch.Tensor:
"""使用每个 token 的最大注意力值计算 attend-and-excite 损失。"""
losses = [max(0, 1.0 - curr_max) for curr_max in max_attention_per_index]
loss = max(losses)
return loss
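# --- Illustrative arithmetic (not part of the diffusers source) -----------------------
# With per-token maxima of 0.2 and 0.9, the per-token losses are 0.8 and 0.1 and the
# overall loss is 0.8: the most-neglected token drives the latent update.
_max_attention_per_index = [torch.tensor(0.2), torch.tensor(0.9)]
_losses = [max(0, 1.0 - m) for m in _max_attention_per_index]
print(max(_losses))  # tensor(0.8000)
# ---------------------------------------------------------------------------------------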
@staticmethod
def _update_latent(latents: torch.Tensor, loss: torch.Tensor, step_size: float) -> torch.Tensor:
"""根据计算出的损失更新潜在变量。"""
grad_cond = torch.autograd.grad(loss.requires_grad_(True), [latents], retain_graph=True)[0]
latents = latents - step_size * grad_cond
return latents
def _perform_iterative_refinement_step(
self,
latents: torch.Tensor,
indices: List[int],
loss: torch.Tensor,
threshold: float,
text_embeddings: torch.Tensor,
step_size: float,
t: int,
max_refinement_steps: int = 20,
):
"""
执行论文中引入的迭代潜在优化。我们根据损失目标持续更新潜在代码,直到所有令牌达到给定的阈值。
"""
iteration = 0
target_loss = max(0, 1.0 - threshold)
while loss > target_loss:
iteration += 1
latents = latents.clone().detach().requires_grad_(True)
self.unet(latents, t, encoder_hidden_states=text_embeddings).sample
self.unet.zero_grad()
max_attention_per_index = self._aggregate_and_get_max_attention_per_token(
indices=indices,
)
loss = self._compute_loss(max_attention_per_index)
if loss != 0:
latents = self._update_latent(latents, loss, step_size)
logger.info(f"\t Try {iteration}. loss: {loss}")
if iteration >= max_refinement_steps:
logger.info(f"\t Exceeded max number of iterations ({max_refinement_steps})! ")
break
latents = latents.clone().detach().requires_grad_(True)
_ = self.unet(latents, t, encoder_hidden_states=text_embeddings).sample
self.unet.zero_grad()
max_attention_per_index = self._aggregate_and_get_max_attention_per_token(
indices=indices,
)
loss = self._compute_loss(max_attention_per_index)
logger.info(f"\t Finished with loss of: {loss}")
return loss, latents, max_attention_per_index
def register_attention_control(self):
attn_procs = {}
cross_att_count = 0
for name in self.unet.attn_processors.keys():
if name.startswith("mid_block"):
place_in_unet = "mid"
elif name.startswith("up_blocks"):
place_in_unet = "up"
elif name.startswith("down_blocks"):
place_in_unet = "down"
else:
continue
cross_att_count += 1
attn_procs[name] = AttendExciteAttnProcessor(attnstore=self.attention_store, place_in_unet=place_in_unet)
self.unet.set_attn_processor(attn_procs)
self.attention_store.num_att_layers = cross_att_count
def get_indices(self, prompt: str) -> Dict[str, int]:
"""用于列出要更改的令牌的索引的实用函数"""
ids = self.tokenizer(prompt).input_ids
indices = {i: tok for tok, i in zip(self.tokenizer.convert_ids_to_tokens(ids), range(len(ids)))}
return indices
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
self,
prompt: Union[str, List[str]],
token_indices: Union[List[int], List[List[int]]],
height: Optional[int] = None,
width: Optional[int] = None,
num_inference_steps: int = 50,
guidance_scale: float = 7.5,
negative_prompt: Optional[Union[str, List[str]]] = None,
num_images_per_prompt: int = 1,
eta: float = 0.0,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
latents: Optional[torch.Tensor] = None,
prompt_embeds: Optional[torch.Tensor] = None,
negative_prompt_embeds: Optional[torch.Tensor] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
callback_steps: int = 1,
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
max_iter_to_alter: int = 25,
thresholds: dict = {0: 0.05, 10: 0.5, 20: 0.8},
scale_factor: int = 20,
attn_res: Optional[Tuple[int]] = (16, 16),
clip_skip: Optional[int] = None,
class GaussianSmoothing(torch.nn.Module):
"""
参数:
对 1D、2D 或 3D 张量应用高斯平滑。每个通道分别使用深度卷积进行过滤。
channels (int, sequence): 输入张量的通道数。输出将具有相同数量的通道。
kernel_size (int, sequence): 高斯核的大小。 sigma (float, sequence): 高斯核的标准差。
dim (int, optional): 数据的维度数量。默认值为 2(空间)。
"""
def __init__(
self,
channels: int = 1,
kernel_size: int = 3,
sigma: float = 0.5,
dim: int = 2,
):
super().__init__()
if isinstance(kernel_size, int):
kernel_size = [kernel_size] * dim
if isinstance(sigma, float):
sigma = [sigma] * dim
kernel = 1
meshgrids = torch.meshgrid([torch.arange(size, dtype=torch.float32) for size in kernel_size])
for size, std, mgrid in zip(kernel_size, sigma, meshgrids):
mean = (size - 1) / 2
kernel *= 1 / (std * math.sqrt(2 * math.pi)) * torch.exp(-(((mgrid - mean) / (2 * std)) ** 2))
kernel = kernel / torch.sum(kernel)
kernel = kernel.view(1, 1, *kernel.size())
kernel = kernel.repeat(channels, *[1] * (kernel.dim() - 1))
self.register_buffer("weight", kernel)
self.groups = channels
if dim == 1:
self.conv = F.conv1d
elif dim == 2:
self.conv = F.conv2d
elif dim == 3:
self.conv = F.conv3d
else:
raise RuntimeError("Only 1, 2 and 3 dimensions are supported. Received {}.".format(dim))
def forward(self, input):
"""
参数:
对输入应用高斯滤波。
input (torch.Tensor): 需要应用高斯滤波的输入。
返回:
filtered (torch.Tensor): 滤波后的输出。
"""
return self.conv(input, weight=self.weight.to(input.dtype), groups=self.groups)
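# --- Illustrative usage (not part of the diffusers source) ----------------------------
# Smoothing a single-channel 16x16 attention map with the GaussianSmoothing module above,
# mirroring how _compute_max_attention_per_index pads with mode="reflect" so the spatial
# size is preserved through the 3x3 kernel. `F` is torch.nn.functional, imported above.
_smoother = GaussianSmoothing(channels=1, kernel_size=3, sigma=0.5, dim=2)
_attn_map = torch.rand(1, 1, 16, 16)                       # (batch, channels, H, W)
_padded = F.pad(_attn_map, (1, 1, 1, 1), mode="reflect")   # pad by 1 on each side
print(_smoother(_padded).shape)                            # torch.Size([1, 1, 16, 16])
# ---------------------------------------------------------------------------------------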
.\diffusers\pipelines\stable_diffusion_attend_and_excite\__init__.py
from typing import TYPE_CHECKING
from ...utils import (
DIFFUSERS_SLOW_IMPORT,
OptionalDependencyNotAvailable,
_LazyModule,
get_objects_from_module,
is_torch_available,
is_transformers_available,
)
_dummy_objects = {}
_import_structure = {}
try:
if not (is_transformers_available() and is_torch_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from ...utils import dummy_torch_and_transformers_objects
_dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
else:
_import_structure["pipeline_stable_diffusion_attend_and_excite"] = ["StableDiffusionAttendAndExcitePipeline"]
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
try:
if not (is_transformers_available() and is_torch_available()):
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from ...utils.dummy_torch_and_transformers_objects import *
else:
from .pipeline_stable_diffusion_attend_and_excite import StableDiffusionAttendAndExcitePipeline
else:
import sys
sys.modules[__name__] = _LazyModule(
__name__,
globals()["__file__"],
_import_structure,
module_spec=__spec__,
)
for name, value in _dummy_objects.items():
setattr(sys.modules[__name__], name, value)
.\diffusers\pipelines\stable_diffusion_diffedit\pipeline_stable_diffusion_diffedit.py
import inspect
from dataclasses import dataclass
from typing import Any, Callable, Dict, List, Optional, Union
import numpy as np
import PIL.Image
import torch
from packaging import version
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
from ...configuration_utils import FrozenDict
from ...image_processor import VaeImageProcessor
from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import DDIMInverseScheduler, KarrasDiffusionSchedulers
from ...utils import (
PIL_INTERPOLATION,
USE_PEFT_BACKEND,
BaseOutput,
deprecate,
logging,
replace_example_docstring,
scale_lora_layers,
unscale_lora_layers,
)
from ...utils.torch_utils import randn_tensor
from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from ..stable_diffusion import StableDiffusionPipelineOutput
from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
logger = logging.get_logger(__name__)
@dataclass
class DiffEditInversionPipelineOutput(BaseOutput):
"""
Stable Diffusion 管道的输出类。
参数:
latents (`torch.Tensor`)
反转的潜变量张量
images (`List[PIL.Image.Image]` 或 `np.ndarray`)
一个 PIL 图像的列表,长度为 `num_timesteps * batch_size` 或形状为 `(num_timesteps,
batch_size, height, width, num_channels)` 的 numpy 数组。PIL 图像或 numpy 数组表示
扩散管道的去噪图像。
"""
latents: torch.Tensor
images: Union[List[PIL.Image.Image], np.ndarray]
EXAMPLE_DOC_STRING = """
```py
>>> import PIL  # PIL for image handling
>>> import requests  # requests for HTTP downloads
>>> import torch  # PyTorch
>>> from io import BytesIO  # BytesIO to wrap the downloaded bytes
>>> from diffusers import DDIMInverseScheduler, DDIMScheduler, StableDiffusionDiffEditPipeline  # pipeline and the schedulers it uses
>>> def download_image(url):  # helper that downloads an image from a URL
...     response = requests.get(url)  # fetch the URL
...     return PIL.Image.open(BytesIO(response.content)).convert("RGB")  # open the bytes as an RGB image
>>> img_url = "https://github.com/Xiang-cd/DiffEdit-stable-diffusion/raw/main/assets/origin.png"  # image URL
>>> init_image = download_image(img_url).resize((768, 768))  # download and resize to 768x768
>>> pipeline = StableDiffusionDiffEditPipeline.from_pretrained(  # load the DiffEdit pipeline from pretrained weights
...     "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16  # model name and dtype
... )
>>> pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)  # use a DDIM scheduler
>>> pipeline.inverse_scheduler = DDIMInverseScheduler.from_config(pipeline.scheduler.config)  # and its DDIM inverse scheduler
>>> pipeline.enable_model_cpu_offload()  # enable CPU offload to save memory
>>> mask_prompt = "A bowl of fruits"  # prompt describing the source content
>>> prompt = "A bowl of pears"  # prompt describing the desired edit
>>> mask_image = pipeline.generate_mask(image=init_image, source_prompt=prompt, target_prompt=mask_prompt)  # generate the edit mask
>>> image_latents = pipeline.invert(image=init_image, prompt=mask_prompt).latents  # invert the image into latents
>>> image = pipeline(prompt=prompt, mask_image=mask_image, image_latents=image_latents).images[0]  # generate the edited image
```
"""
EXAMPLE_INVERT_DOC_STRING = """
```py
>>> import PIL  # PIL for image handling
>>> import requests  # requests for HTTP downloads
>>> import torch  # PyTorch
>>> from io import BytesIO  # BytesIO to wrap the downloaded bytes
>>> from diffusers import DDIMScheduler, DDIMInverseScheduler, StableDiffusionDiffEditPipeline
>>> def download_image(url):  # download an image from a URL
...     response = requests.get(url)  # fetch the URL
...     return PIL.Image.open(BytesIO(response.content)).convert("RGB")  # open the bytes as an RGB PIL image
>>> img_url = "https://github.com/Xiang-cd/DiffEdit-stable-diffusion/raw/main/assets/origin.png"  # source image URL
>>> init_image = download_image(img_url).resize((768, 768))  # download and resize to 768x768
>>> pipeline = StableDiffusionDiffEditPipeline.from_pretrained(  # load the pretrained DiffEdit pipeline
...     "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16  # model id, half-precision weights
... )
>>> pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)  # use a DDIM scheduler for sampling
>>> pipeline.inverse_scheduler = DDIMInverseScheduler.from_config(pipeline.scheduler.config)  # DDIM inverse scheduler for inversion
>>> pipeline.enable_model_cpu_offload()  # offload submodules to CPU to save memory
>>> prompt = "A bowl of fruits"  # prompt describing the image content
>>> inverted_latents = pipeline.invert(image=init_image, prompt=prompt).latents  # invert the image into latents
```py
"""
def auto_corr_loss(hidden_states, generator=None):
reg_loss = 0.0
for i in range(hidden_states.shape[0]):
for j in range(hidden_states.shape[1]):
noise = hidden_states[i : i + 1, j : j + 1, :, :]
while True:
roll_amount = torch.randint(noise.shape[2] // 2, (1,), generator=generator).item()
reg_loss += (noise * torch.roll(noise, shifts=roll_amount, dims=2)).mean() ** 2
reg_loss += (noise * torch.roll(noise, shifts=roll_amount, dims=3)).mean() ** 2
if noise.shape[2] <= 8:
break
noise = torch.nn.functional.avg_pool2d(noise, kernel_size=2)
return reg_loss
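`auto_corr_loss` penalizes spatial auto-correlation in a predicted noise map: at each pyramid level it rolls every single-channel map by a random offset along height and width, adds the squared mean of the product, then halves the resolution with average pooling until the map is 8 pixels or smaller; for ideal white noise each term is close to zero. A minimal sketch of the regularization step that `invert` performs with it (an illustrative loop using the `auto_corr_loss` defined above, not the pipeline's exact code):
```py
import torch

# Hypothetical UNet noise prediction during inversion: (batch, channels, height, width).
noise_pred = torch.randn(1, 4, 64, 64, requires_grad=True)

lambda_auto_corr = 20.0  # mirrors the `lambda_auto_corr` argument of `invert`
loss = lambda_auto_corr * auto_corr_loss(noise_pred)

# Take a gradient step on the noise prediction itself, as regularized inversion does.
grad = torch.autograd.grad(loss, noise_pred)[0]
noise_pred = (noise_pred - grad).detach()
```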
def kl_divergence(hidden_states):
return hidden_states.var() + hidden_states.mean() ** 2 - 1 - torch.log(hidden_states.var() + 1e-7)
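`kl_divergence` is, up to a factor of 1/2 and the `1e-7` stabilizer inside the log, the closed-form KL divergence between `N(mean, var)` of the noise map and a standard normal, `KL = 0.5 * (var + mean**2 - 1 - log(var))`; minimizing it keeps the inverted latents approximately Gaussian. A quick numerical check, assuming the `kl_divergence` helper defined above:
```py
import torch

x = torch.randn(100_000) * 1.3 + 0.2  # hypothetical noise map with mean 0.2, std 1.3
mu, var = x.mean(), x.var()

# Closed form: KL(N(mu, var) || N(0, 1)) = 0.5 * (var + mu**2 - 1 - log(var)).
kl_closed_form = 0.5 * (var + mu**2 - 1 - torch.log(var))

# The helper returns twice this quantity (modulo the 1e-7 stabilizer).
assert torch.allclose(kl_divergence(x), 2 * kl_closed_form, atol=1e-4)
```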
def preprocess(image):
deprecation_message = "The preprocess method is deprecated and will be removed in diffusers 1.0.0. Please use VaeImageProcessor.preprocess(...) instead"
deprecate("preprocess", "1.0.0", deprecation_message, standard_warn=False)
if isinstance(image, torch.Tensor):
return image
elif isinstance(image, PIL.Image.Image):
image = [image]
if isinstance(image[0], PIL.Image.Image):
w, h = image[0].size
w, h = (x - x % 8 for x in (w, h))
image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image]
image = np.concatenate(image, axis=0)
image = np.array(image).astype(np.float32) / 255.0
image = image.transpose(0, 3, 1, 2)
image = 2.0 * image - 1.0
image = torch.from_numpy(image)
elif isinstance(image[0], torch.Tensor):
image = torch.cat(image, dim=0)
return image
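The deprecated `preprocess` helper accepts a tensor, a PIL image, or a list of either; PIL inputs are snapped down to dimensions divisible by 8, rescaled from [0, 255] to [-1, 1], and stacked into a `(batch, 3, H, W)` tensor. A small sketch, assuming the `preprocess` defined above (it emits a deprecation warning when called):
```py
import numpy as np
import PIL.Image

# Hypothetical 515x513 RGB image (height x width); both 513 and 515 snap down to 512.
image = PIL.Image.fromarray(np.random.randint(0, 256, (515, 513, 3), dtype=np.uint8))

tensor = preprocess(image)
print(tensor.shape)                              # torch.Size([1, 3, 512, 512])
print(tensor.min().item(), tensor.max().item())  # values lie in [-1.0, 1.0]
```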
def preprocess_mask(mask, batch_size: int = 1):
if not isinstance(mask, torch.Tensor):
if isinstance(mask, (PIL.Image.Image, np.ndarray)):
mask = [mask]
if isinstance(mask, list):
if isinstance(mask[0], PIL.Image.Image):
mask = [np.array(m.convert("L")).astype(np.float32) / 255.0 for m in mask]
if isinstance(mask[0], np.ndarray):
mask = np.stack(mask, axis=0) if mask[0].ndim < 3 else np.concatenate(mask, axis=0)
mask = torch.from_numpy(mask)
elif isinstance(mask[0], torch.Tensor):
mask = torch.stack(mask, dim=0) if mask[0].ndim < 3 else torch.cat(mask, dim=0)
if mask.ndim == 2:
mask = mask.unsqueeze(0).unsqueeze(0)
if mask.ndim == 3:
if mask.shape[0] == 1:
mask = mask.unsqueeze(0)
else:
mask = mask.unsqueeze(1)
if batch_size > 1:
if mask.shape[0] == 1:
mask = torch.cat([mask] * batch_size)
elif mask.shape[0] > 1 and mask.shape[0] != batch_size:
raise ValueError(
f"`mask_image` with batch size {mask.shape[0]} cannot be broadcasted to batch size {batch_size} "
f"inferred by prompt inputs"
)
if mask.shape[1] != 1:
raise ValueError(f"`mask_image` must have 1 channel, but has {mask.shape[1]} channels")
if mask.min() < 0 or mask.max() > 1:
raise ValueError("`mask_image` should be in [0, 1] range")
mask[mask < 0.5] = 0
mask[mask >= 0.5] = 1
return mask
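`preprocess_mask` normalizes a mask given as a PIL image, numpy array, or tensor into a binary `(batch, 1, H, W)` tensor: grayscale values are scaled to [0, 1], broadcast to the prompt batch size, and hard-thresholded at 0.5. A short sketch, assuming the `preprocess_mask` defined above:
```py
import numpy as np
import PIL.Image

# Hypothetical grayscale mask with a bright square in the middle.
mask_array = np.zeros((96, 96), dtype=np.uint8)
mask_array[32:64, 32:64] = 255
mask_pil = PIL.Image.fromarray(mask_array, mode="L")

mask = preprocess_mask(mask_pil, batch_size=2)
print(mask.shape)     # torch.Size([2, 1, 96, 96])
print(mask.unique())  # tensor([0., 1.]) -- binarized at the 0.5 threshold
```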
class StableDiffusionDiffEditPipeline(
DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin
):
r"""
<Tip warning={true}>
This is an experimental feature!
</Tip>
Pipeline for text-guided image inpainting using Stable Diffusion and DiffEdit.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).
The pipeline also inherits the following loading and saving methods:
- [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
- [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
- [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
Args:
vae ([`AutoencoderKL`]):
Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
text_encoder ([`~transformers.CLIPTextModel`]):
Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
tokenizer ([`~transformers.CLIPTokenizer`]):
A `CLIPTokenizer` to tokenize text.
unet ([`UNet2DConditionModel`]):
A `UNet2DConditionModel` to denoise the encoded image latents.
scheduler ([`SchedulerMixin`]):
A scheduler to be used in combination with `unet` to denoise the encoded image latents.
inverse_scheduler ([`DDIMInverseScheduler`]):
A scheduler to be used in combination with `unet` to fill in the unmasked part of the input latents.
safety_checker ([`StableDiffusionSafetyChecker`]):
Classification module that estimates whether generated images could be considered offensive or harmful.
Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
about a model's potential harms.
feature_extractor ([`~transformers.CLIPImageProcessor`]):
A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
"""
model_cpu_offload_seq = "text_encoder->unet->vae"
_optional_components = ["safety_checker", "feature_extractor", "inverse_scheduler"]
_exclude_from_cpu_offload = ["safety_checker"]
def __init__(
self,
vae: AutoencoderKL,
text_encoder: CLIPTextModel,
tokenizer: CLIPTokenizer,
unet: UNet2DConditionModel,
scheduler: KarrasDiffusionSchedulers,
safety_checker: StableDiffusionSafetyChecker,
feature_extractor: CLIPImageProcessor,
inverse_scheduler: DDIMInverseScheduler,
requires_safety_checker: bool = True,
):
def _encode_prompt(
self,
prompt,
device,
num_images_per_prompt,
do_classifier_free_guidance,
negative_prompt=None,
prompt_embeds: Optional[torch.Tensor] = None,
negative_prompt_embeds: Optional[torch.Tensor] = None,
lora_scale: Optional[float] = None,
**kwargs,
):
deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
prompt_embeds_tuple = self.encode_prompt(
prompt=prompt,
device=device,
num_images_per_prompt=num_images_per_prompt,
do_classifier_free_guidance=do_classifier_free_guidance,
negative_prompt=negative_prompt,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
lora_scale=lora_scale,
**kwargs,
)
prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
return prompt_embeds
def encode_prompt(
self,
prompt,
device,
num_images_per_prompt,
do_classifier_free_guidance,
negative_prompt=None,
prompt_embeds: Optional[torch.Tensor] = None,
negative_prompt_embeds: Optional[torch.Tensor] = None,
lora_scale: Optional[float] = None,
clip_skip: Optional[int] = None,
):
def run_safety_checker(self, image, device, dtype):
if self.safety_checker is None:
has_nsfw_concept = None
else:
if torch.is_tensor(image):
feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
else:
feature_extractor_input = self.image_processor.numpy_to_pil(image)
safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
image, has_nsfw_concept = self.safety_checker(
images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
)
return image, has_nsfw_concept
def prepare_extra_step_kwargs(self, generator, eta):
accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
extra_step_kwargs = {}
if accepts_eta:
extra_step_kwargs["eta"] = eta
accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
if accepts_generator:
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
def decode_latents(self, latents):
deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clamp(0, 1)
image = image.cpu().permute(0, 2, 3, 1).float().numpy()
return image
def check_inputs(
self,
prompt,
strength,
callback_steps,
negative_prompt=None,
prompt_embeds=None,
negative_prompt_embeds=None,
):
if (strength is None) or (strength is not None and (strength < 0 or strength > 1)):
raise ValueError(
f"The value of `strength` should in [0.0, 1.0] but is, but is {strength} of type {type(strength)}."
)
if (callback_steps is None) or (
callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
):
raise ValueError(
f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
f" {type(callback_steps)}."
)
if prompt is not None and prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
" only forward one of the two."
)
elif prompt is None and prompt_embeds is None:
raise ValueError(
"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
)
elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
)
if prompt_embeds is not None and negative_prompt_embeds is not None:
if prompt_embeds.shape != negative_prompt_embeds.shape:
raise ValueError(
"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
f" {negative_prompt_embeds.shape}."
)
def check_source_inputs(
self,
source_prompt=None,
source_negative_prompt=None,
source_prompt_embeds=None,
source_negative_prompt_embeds=None,
):
if source_prompt is not None and source_prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `source_prompt`: {source_prompt} and `source_prompt_embeds`: {source_prompt_embeds}."
" Please make sure to only forward one of the two."
)
elif source_prompt is None and source_prompt_embeds is None:
raise ValueError(
"Provide either `source_image` or `source_prompt_embeds`. Cannot leave all both of the arguments undefined."
)
elif source_prompt is not None and (
not isinstance(source_prompt, str) and not isinstance(source_prompt, list)
):
raise ValueError(f"`source_prompt` has to be of type `str` or `list` but is {type(source_prompt)}")
if source_negative_prompt is not None and source_negative_prompt_embeds is not None:
raise ValueError(
f"Cannot forward both `source_negative_prompt`: {source_negative_prompt} and `source_negative_prompt_embeds`:"
f" {source_negative_prompt_embeds}. Please make sure to only forward one of the two."
)
if source_prompt_embeds is not None and source_negative_prompt_embeds is not None:
if source_prompt_embeds.shape != source_negative_prompt_embeds.shape:
raise ValueError(
"`source_prompt_embeds` and `source_negative_prompt_embeds` must have the same shape when passed"
f" directly, but got: `source_prompt_embeds` {source_prompt_embeds.shape} !="
f" `source_negative_prompt_embeds` {source_negative_prompt_embeds.shape}."
)
def get_timesteps(self, num_inference_steps, strength, device):
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
t_start = max(num_inference_steps - init_timestep, 0)
timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
return timesteps, num_inference_steps - t_start
def get_inverse_timesteps(self, num_inference_steps, strength, device):
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
t_start = max(num_inference_steps - init_timestep, 0)
if t_start == 0:
return self.inverse_scheduler.timesteps, num_inference_steps
timesteps = self.inverse_scheduler.timesteps[:-t_start]
return timesteps, num_inference_steps - t_start
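`get_timesteps` keeps only the tail of the denoising schedule, while `get_inverse_timesteps` keeps the head of the inversion schedule, so `strength` controls how much of each trajectory is actually run. A worked example of the arithmetic (no scheduler required):
```py
# With num_inference_steps=50 and strength=0.8:
num_inference_steps, strength = 50, 0.8

init_timestep = min(int(num_inference_steps * strength), num_inference_steps)  # 40
t_start = max(num_inference_steps - init_timestep, 0)                          # 10

# Denoising runs the last 40 entries: scheduler.timesteps[t_start * scheduler.order:].
# Inversion drops the last 10 entries: inverse_scheduler.timesteps[:-t_start].
print(init_timestep, t_start)  # 40 10
```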
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
shape = (
batch_size,
num_channels_latents,
int(height) // self.vae_scale_factor,
int(width) // self.vae_scale_factor,
)
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
)
if latents is None:
latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
else:
latents = latents.to(device)
latents = latents * self.scheduler.init_noise_sigma
return latents
def prepare_image_latents(self, image, batch_size, dtype, device, generator=None):
if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)):
raise ValueError(
f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
)
image = image.to(device=device, dtype=dtype)
if image.shape[1] == 4:
latents = image
else:
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
)
if isinstance(generator, list):
latents = [
self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
]
latents = torch.cat(latents, dim=0)
else:
latents = self.vae.encode(image).latent_dist.sample(generator)
latents = self.vae.config.scaling_factor * latents
if batch_size != latents.shape[0]:
if batch_size % latents.shape[0] == 0:
deprecation_message = (
f"You have passed {batch_size} text prompts (`prompt`), but only {latents.shape[0]} initial"
" images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
" that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
" your script to pass as many initial images as text prompts to suppress this warning."
)
deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
additional_latents_per_image = batch_size // latents.shape[0]
latents = torch.cat([latents] * additional_latents_per_image, dim=0)
else:
raise ValueError(
f"Cannot duplicate `image` of batch size {latents.shape[0]} to {batch_size} text prompts."
)
else:
latents = torch.cat([latents], dim=0)
return latents
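`prepare_image_latents` passes 4-channel inputs through as latents, otherwise encodes the image with the VAE and scales by `vae.config.scaling_factor`; when fewer images than prompts are supplied it tiles the latents (with a deprecation warning) as long as the prompt count is a multiple of the image count. A hedged sketch of just that duplication rule, with hypothetical shapes:
```py
import torch

# Hypothetical encoded latents for 1 image, requested batch of 3 prompts.
latents = torch.randn(1, 4, 64, 64)
batch_size = 3

if batch_size % latents.shape[0] == 0:
    # Tile the single image latent so every prompt gets an initial latent.
    latents = torch.cat([latents] * (batch_size // latents.shape[0]), dim=0)

print(latents.shape)  # torch.Size([3, 4, 64, 64])
```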
def get_epsilon(self, model_output: torch.Tensor, sample: torch.Tensor, timestep: int):
pred_type = self.inverse_scheduler.config.prediction_type
alpha_prod_t = self.inverse_scheduler.alphas_cumprod[timestep]
beta_prod_t = 1 - alpha_prod_t
if pred_type == "epsilon":
return model_output
elif pred_type == "sample":
return (sample - alpha_prod_t ** (0.5) * model_output) / beta_prod_t ** (0.5)
elif pred_type == "v_prediction":
return (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample
else:
raise ValueError(
f"prediction_type given as {pred_type} must be one of `epsilon`, `sample`, or `v_prediction`"
)
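`get_epsilon` converts the model output into an epsilon (noise) prediction regardless of the scheduler's `prediction_type`, using the forward relation `x_t = sqrt(alpha_t) * x_0 + sqrt(1 - alpha_t) * eps`: for `"sample"` the model predicts `x_0`, so `eps = (x_t - sqrt(alpha_t) * x_0) / sqrt(1 - alpha_t)`; for `"v_prediction"` it predicts `v = sqrt(alpha_t) * eps - sqrt(1 - alpha_t) * x_0`, which rearranges to `eps = sqrt(alpha_t) * v + sqrt(1 - alpha_t) * x_t`. A quick numerical check of both conversions:
```py
import torch

alpha_prod_t = torch.tensor(0.7)
beta_prod_t = 1 - alpha_prod_t

x0 = torch.randn(4)   # hypothetical clean sample
eps = torch.randn(4)  # hypothetical true noise
sample = alpha_prod_t**0.5 * x0 + beta_prod_t**0.5 * eps  # x_t

# "sample" prediction type: the model outputs x0.
eps_from_sample = (sample - alpha_prod_t**0.5 * x0) / beta_prod_t**0.5
# "v_prediction": the model outputs v = sqrt(alpha) * eps - sqrt(1 - alpha) * x0.
v = alpha_prod_t**0.5 * eps - beta_prod_t**0.5 * x0
eps_from_v = alpha_prod_t**0.5 * v + beta_prod_t**0.5 * sample

assert torch.allclose(eps_from_sample, eps, atol=1e-5)
assert torch.allclose(eps_from_v, eps, atol=1e-5)
```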
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def generate_mask(
self,
image: Union[torch.Tensor, PIL.Image.Image] = None,
target_prompt: Optional[Union[str, List[str]]] = None,
target_negative_prompt: Optional[Union[str, List[str]]] = None,
target_prompt_embeds: Optional[torch.Tensor] = None,
target_negative_prompt_embeds: Optional[torch.Tensor] = None,
source_prompt: Optional[Union[str, List[str]]] = None,
source_negative_prompt: Optional[Union[str, List[str]]] = None,
source_prompt_embeds: Optional[torch.Tensor] = None,
source_negative_prompt_embeds: Optional[torch.Tensor] = None,
num_maps_per_mask: Optional[int] = 10,
mask_encode_strength: Optional[float] = 0.5,
mask_thresholding_ratio: Optional[float] = 3.0,
num_inference_steps: int = 50,
guidance_scale: float = 7.5,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
output_type: Optional[str] = "np",
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
):
@torch.no_grad()
@replace_example_docstring(EXAMPLE_INVERT_DOC_STRING)
def invert(
self,
prompt: Optional[Union[str, List[str]]] = None,
image: Union[torch.Tensor, PIL.Image.Image] = None,
num_inference_steps: int = 50,
inpaint_strength: float = 0.8,
guidance_scale: float = 7.5,
negative_prompt: Optional[Union[str, List[str]]] = None,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
prompt_embeds: Optional[torch.Tensor] = None,
negative_prompt_embeds: Optional[torch.Tensor] = None,
decode_latents: bool = False,
output_type: Optional[str] = "pil",
return_dict: bool = True,
callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
callback_steps: Optional[int] = 1,
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
lambda_auto_corr: float = 20.0,
lambda_kl: float = 20.0,
num_reg_steps: int = 0,
num_auto_corr_rolls: int = 5,
):
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
self,
prompt: Optional[Union[str, List[str]]] = None,
mask_image: Union[torch.Tensor, PIL.Image.Image] = None,
image_latents: Union[torch.Tensor, PIL.Image.Image] = None,
inpaint_strength: Optional[float] = 0.8,
num_inference_steps: int = 50,
guidance_scale: float = 7.5,
negative_prompt: Optional[Union[str, List[str]]] = None,
num_images_per_prompt: Optional[int] = 1,
eta: float = 0.0,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
latents: Optional[torch.Tensor] = None,
prompt_embeds: Optional[torch.Tensor] = None,
negative_prompt_embeds: Optional[torch.Tensor] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
callback_steps: int = 1,
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
clip_skip: int = None,
):