From ee76208109d5ac0c8d51ee056a11f23de07fa52c Mon Sep 17 00:00:00 2001 From: Bahjat Kawar Date: Fri, 17 Mar 2023 09:30:07 +0200 Subject: [PATCH 01/14] TIME first commit --- docs/source/en/_toctree.yml | 2 + .../stable_diffusion/model_editing.mdx | 59 ++ .../pipelines/stable_diffusion/overview.mdx | 1 + docs/source/en/index.mdx | 3 +- src/diffusers/__init__.py | 1 + src/diffusers/pipelines/__init__.py | 1 + .../pipelines/stable_diffusion/__init__.py | 1 + ...pipeline_stable_diffusion_model_editing.py | 764 ++++++++++++++++++ .../dummy_torch_and_transformers_objects.py | 15 + 9 files changed, 846 insertions(+), 1 deletion(-) create mode 100644 docs/source/en/api/pipelines/stable_diffusion/model_editing.mdx create mode 100644 src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 09012a5c693d..4c7683ff5bbc 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -154,6 +154,8 @@ title: Score SDE VE - local: api/pipelines/semantic_stable_diffusion title: Semantic Guidance + - local: api/pipelines/stable_diffusion/model_editing + title: Text-to-Image Model Editing - sections: - local: api/pipelines/stable_diffusion/overview title: Overview diff --git a/docs/source/en/api/pipelines/stable_diffusion/model_editing.mdx b/docs/source/en/api/pipelines/stable_diffusion/model_editing.mdx new file mode 100644 index 000000000000..6758af059e7e --- /dev/null +++ b/docs/source/en/api/pipelines/stable_diffusion/model_editing.mdx @@ -0,0 +1,59 @@ + + +# Editing Implicit Assumptions in Text-to-Image Diffusion Models + +## Overview + +[Editing Implicit Assumptions in Text-to-Image Diffusion Models](https://arxiv.org/abs/2303.08084) by Hadas Orgad*, Bahjat Kawar*, and Yonatan Belinkov. + +The abstract of the paper is the following: + +*Text-to-image diffusion models often make implicit assumptions about the world when generating images.
While some assumptions are useful (e.g., the sky is blue), they can also be outdated, incorrect, or reflective of social biases present in the training data. Thus, there is a need to control these assumptions without requiring explicit user input or costly re-training. In this work, we aim to edit a given implicit assumption in a pre-trained diffusion model. Our Text-to-Image Model Editing method, TIME for short, receives a pair of inputs: a "source" under-specified prompt for which the model makes an implicit assumption (e.g., "a pack of roses"), and a "destination" prompt that describes the same setting, but with a specified desired attribute (e.g., "a pack of blue roses"). TIME then updates the model's cross-attention layers, as these layers assign visual meaning to textual tokens. We edit the projection matrices in these layers such that the source prompt is projected close to the destination prompt. Our method is highly efficient, as it modifies a mere 2.2% of the model's parameters in under one second. To evaluate model editing approaches, we introduce TIMED (TIME Dataset), containing 147 source and destination prompt pairs from various domains. Our experiments (using Stable Diffusion) show that TIME is successful in model editing, generalizes well for related prompts unseen during editing, and imposes minimal effect on unrelated generations. + +Resources: + +* [Project Page](https://time-diffusion.github.io/). +* [Paper](https://arxiv.org/abs/2303.08084). +* [Original Code](https://github.com/bahjat-kawar/time-diffusion). +* [Demo](https://huggingface.co/spaces/bahjat-kawar/time-diffusion). 
+ +## Available Pipelines: + +| Pipeline | Tasks | Demo | +|---|---|:---:| +| [StableDiffusionModelEditingPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py) | *Text-to-Image Model Editing* | [🤗 Space](https://huggingface.co/spaces/bahjat-kawar/time-diffusion) | + +## Usage example + +```python +import torch +from diffusers import StableDiffusionModelEditingPipeline + +model_ckpt = "CompVis/stable-diffusion-v1-4" +pipe = StableDiffusionModelEditingPipeline.from_pretrained(model_ckpt) + +pipe = pipe.to("cuda") + +source_prompt = "A pack of roses" +destination_prompt = "A pack of blue roses" +pipe.edit_model(source_prompt, destination_prompt) + +prompt = "A field of roses" +image = pipe(prompt).images[0] +image.save("field_of_roses.png") +``` + +## StableDiffusionModelEditingPipeline +[[autodoc]] StableDiffusionModelEditingPipeline + - __call__ + - all diff --git a/docs/source/en/api/pipelines/stable_diffusion/overview.mdx b/docs/source/en/api/pipelines/stable_diffusion/overview.mdx index 160fa0d2ebce..70731fd294b9 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/overview.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/overview.mdx @@ -35,6 +35,7 @@ For more details about how Stable Diffusion works and how it differs from the ba | [StableDiffusionInstructPix2PixPipeline](./pix2pix) | **Experimental** – *Text-Based Image Editing * | | [InstructPix2Pix: Learning to Follow Image Editing Instructions](https://huggingface.co/spaces/timbrooks/instruct-pix2pix) | [StableDiffusionAttendAndExcitePipeline](./attend_and_excite) | **Experimental** – *Text-to-Image Generation * | | [Attend-and-Excite: Attention-Based Semantic Guidance for Text-to-Image Diffusion Models](https://huggingface.co/spaces/AttendAndExcite/Attend-and-Excite) | [StableDiffusionPix2PixZeroPipeline](./pix2pix_zero) | **Experimental** – *Text-Based Image Editing * | | [Zero-shot Image-to-Image
Translation](https://arxiv.org/abs/2302.03027) +| [StableDiffusionModelEditingPipeline](./model_editing) | **Experimental** – *Text-to-Image Model Editing * | | [Editing Implicit Assumptions in Text-to-Image Diffusion Models](https://arxiv.org/abs/2303.08084) diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 59c4d595cc8b..3c77e9110305 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -88,4 +88,5 @@ The library has three main components: | [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Text-to-Image Generation | | [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Image Variations Generation | | [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Dual Image and Text Guided Generation | -| [vq_diffusion](./api/pipelines/vq_diffusion) | [Vector Quantized Diffusion Model for Text-to-Image Synthesis](https://arxiv.org/abs/2111.14822) | Text-to-Image Generation | \ No newline at end of file +| [vq_diffusion](./api/pipelines/vq_diffusion) | [Vector Quantized Diffusion Model for Text-to-Image Synthesis](https://arxiv.org/abs/2111.14822) | Text-to-Image Generation | +| [time_diffusion](./api/pipelines/stable_diffusion/model_editing) | [Editing Implicit Assumptions in Text-to-Image Diffusion Models](https://arxiv.org/abs/2303.08084) | Text-to-Image Model Editing | \ No newline at end of file diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index f480b4100907..13137e015602 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -122,6 +122,7 @@ StableDiffusionInpaintPipelineLegacy, StableDiffusionInstructPix2PixPipeline,
StableDiffusionLatentUpscalePipeline, + StableDiffusionModelEditingPipeline, StableDiffusionPanoramaPipeline, StableDiffusionPipeline, StableDiffusionPipelineSafe, diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 5b6c729f80be..3ab4ad1a7892 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -56,6 +56,7 @@ StableDiffusionInpaintPipelineLegacy, StableDiffusionInstructPix2PixPipeline, StableDiffusionLatentUpscalePipeline, + StableDiffusionModelEditingPipeline, StableDiffusionPanoramaPipeline, StableDiffusionPipeline, StableDiffusionPix2PixZeroPipeline, diff --git a/src/diffusers/pipelines/stable_diffusion/__init__.py b/src/diffusers/pipelines/stable_diffusion/__init__.py index 54ec4dabc73e..22ed322bef9c 100644 --- a/src/diffusers/pipelines/stable_diffusion/__init__.py +++ b/src/diffusers/pipelines/stable_diffusion/__init__.py @@ -51,6 +51,7 @@ class StableDiffusionPipelineOutput(BaseOutput): from .pipeline_stable_diffusion_inpaint_legacy import StableDiffusionInpaintPipelineLegacy from .pipeline_stable_diffusion_instruct_pix2pix import StableDiffusionInstructPix2PixPipeline from .pipeline_stable_diffusion_latent_upscale import StableDiffusionLatentUpscalePipeline + from .pipeline_stable_diffusion_model_editing import StableDiffusionModelEditingPipeline from .pipeline_stable_diffusion_panorama import StableDiffusionPanoramaPipeline from .pipeline_stable_diffusion_sag import StableDiffusionSAGPipeline from .pipeline_stable_diffusion_upscale import StableDiffusionUpscalePipeline diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py new file mode 100644 index 000000000000..cc64b55a70d6 --- /dev/null +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py @@ -0,0 +1,764 @@ +# Copyright 2023 MultiDiffusion Authors and The 
HuggingFace Team. All rights reserved." +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import torch +from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer + +import copy + +from ...models import AutoencoderKL, UNet2DConditionModel +from ...schedulers import DDIMScheduler, PNDMScheduler +from ...utils import is_accelerate_available, is_accelerate_version, logging, randn_tensor, replace_example_docstring +from ..pipeline_utils import DiffusionPipeline +from . import StableDiffusionPipelineOutput +from .safety_checker import StableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import StableDiffusionModelEditingPipeline + + >>> model_ckpt = "CompVis/stable-diffusion-v1-4" + >>> pipe = StableDiffusionModelEditingPipeline.from_pretrained(model_ckpt) + + >>> pipe = pipe.to("cuda") + + >>> source_prompt = "A pack of roses" + >>> destination_prompt = "A pack of blue roses" + >>> pipe.edit_model(source_prompt, destination_prompt) + + >>> prompt = "A field of roses" + >>> image = pipe(prompt).images[0] + ``` +""" + + +class StableDiffusionModelEditingPipeline(DiffusionPipeline): + r""" + Pipeline for text-to-image model editing using "Editing Implicit Assumptions in Text-to-Image Diffusion Models". 
+ + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.). + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. The original work + on Multi Diffsion used the [`DDIMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. + feature_extractor ([`CLIPFeatureExtractor`]): + Model that extracts features from generated images to be used as inputs for the `safety_checker`. + with_to_k ([`bool`]): + Whether to edit the key projection matrices along wiht the value projection matrices. + with_augs ([`bool`]): + Whether to apply textual augmentations while editing the text-to-image model. 
+ """ + _optional_components = ["safety_checker", "feature_extractor"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: DDIMScheduler, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + requires_safety_checker: bool = True, + with_to_k: bool = True, + with_augs: bool = True, + ): + super().__init__() + + if isinstance(scheduler, PNDMScheduler): + logger.error("PNDMScheduler for this pipeline is currently not supported.") + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." 
+ ) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + self.with_to_k = with_to_k + self.with_augs = with_augs + + # get cross-attention layers + ca_layers = [] + def append_ca(net_): + if net_.__class__.__name__ == 'CrossAttention': + ca_layers.append(net_) + elif hasattr(net_, 'children'): + for net__ in net_.children(): + append_ca(net__) + for net in self.unet.named_children(): + if "down" in net[0]: + append_ca(net[1]) + elif "up" in net[0]: + append_ca(net[1]) + elif "mid" in net[0]: + append_ca(net[1]) + + # get projection matrices + self.ca_clip_layers = [l for l in ca_layers if l.to_v.in_features == 768] + self.projection_matrices = [l.to_v for l in self.ca_clip_layers] + self.og_matrices = [copy.deepcopy(l.to_v) for l in self.ca_clip_layers] + if self.with_to_k: + self.projection_matrices = self.projection_matrices + [l.to_k for l in self.ca_clip_layers] + self.og_matrices = self.og_matrices + [copy.deepcopy(l.to_k) for l in self.ca_clip_layers] + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing + def enable_vae_slicing(self): + r""" + Enable sliced VAE decoding. + + When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several + steps. This is useful to save some memory and allow larger batch sizes. + """ + self.vae.enable_slicing() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing + def disable_vae_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to + computing decoding in one step. 
+ """ + self.vae.disable_slicing() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, + text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a + `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. + Note that offloading happens on a submodule basis. Memory savings are higher than with + `enable_model_cpu_offload`, but performance is lower. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"): + from accelerate import cpu_offload + else: + raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: + cpu_offload(cpu_offloaded_model, device) + + if self.safety_checker is not None: + cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. 
+ """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. 
+ """ + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to 
`prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. 
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + else: + has_nsfw_concept = None + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents).sample + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. 
Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + @torch.no_grad() + def edit_model(self, source_prompt, destination_prompt, lamb=0.1, restart_params=True): + # Apply model editing via closed-form solution (see Eq.
5 in the TIME paper https://arxiv.org/abs/2303.08084) + + # restart LDM parameters: restore copies of the original projection layers so edits do not accumulate + if restart_params: + num_ca_clip_layers = len(self.ca_clip_layers) + for idx_, l in enumerate(self.ca_clip_layers): + l.to_v = copy.deepcopy(self.og_matrices[idx_]) + self.projection_matrices[idx_] = l.to_v + if self.with_to_k: + l.to_k = copy.deepcopy(self.og_matrices[num_ca_clip_layers + idx_]) + self.projection_matrices[num_ca_clip_layers + idx_] = l.to_k + + # set up sentences (self.with_augs adds prompt-template augmentations of both prompts) + old_texts = [source_prompt] + new_texts = [destination_prompt] + if self.with_augs: + base = old_texts[0] if old_texts[0][0:1] != "A" else "a" + old_texts[0][1:] + old_texts.append("A photo of " + base) + old_texts.append("An image of " + base) + old_texts.append("A picture of " + base) + base = new_texts[0] if new_texts[0][0:1] != "A" else "a" + new_texts[0][1:] + new_texts.append("A photo of " + base) + new_texts.append("An image of " + base) + new_texts.append("A picture of " + base) + + # prepare input k* and v* + old_embs, new_embs = [], [] + for old_text, new_text in zip(old_texts, new_texts): + text_input = self.tokenizer( + [old_text, new_text], + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_embeddings = self.text_encoder(text_input.input_ids.to(self.device))[0] + old_emb, new_emb = text_embeddings + old_embs.append(old_emb) + new_embs.append(new_emb) + + # identify corresponding destinations for each token in old_emb + idxs_replaces = [] + for old_text, new_text in zip(old_texts, new_texts): + tokens_a = self.tokenizer(old_text).input_ids + tokens_b = self.tokenizer(new_text).input_ids + tokens_a = [self.tokenizer.encode("a ")[1] if self.tokenizer.decode(t) == 'an' else t for t in tokens_a] + tokens_b = [self.tokenizer.encode("a ")[1] if self.tokenizer.decode(t) == 'an' else t for t in tokens_b] + num_orig_tokens = len(tokens_a) + idxs_replace = [] + j = 0 + for i in range(num_orig_tokens): + curr_token = tokens_a[i] + while
tokens_b[j] != curr_token: + j += 1 + idxs_replace.append(j) + j += 1 + while j < 77: + idxs_replace.append(j) + j += 1 + while len(idxs_replace) < 77: + idxs_replace.append(76) + idxs_replaces.append(idxs_replace) + + # prepare batch: for each pair of sentences, old context and new values + contexts, valuess = [], [] + for old_emb, new_emb, idxs_replace in zip(old_embs, new_embs, idxs_replaces): + context = old_emb.detach() + values = [] + with torch.no_grad(): + for layer in self.projection_matrices: + values.append(layer(new_emb[idxs_replace]).detach()) + contexts.append(context) + valuess.append(values) + + # edit the model + for layer_num in range(len(self.projection_matrices)): + #mat1 = \lambda W + \sum{v k^T} + mat1 = lamb * self.projection_matrices[layer_num].weight + + #mat2 = \lambda I + \sum{k k^T} + mat2 = lamb * torch.eye(self.projection_matrices[layer_num].weight.shape[1], device = self.projection_matrices[layer_num].weight.device) + + #aggregate sums for mat1, mat2 + for context, values in zip(contexts, valuess): + context_vector = context.reshape(context.shape[0], context.shape[1], 1) + context_vector_T = context.reshape(context.shape[0], 1, context.shape[1]) + value_vector = values[layer_num].reshape(values[layer_num].shape[0], values[layer_num].shape[1], 1) + for_mat1 = (value_vector @ context_vector_T).sum(dim=0) + for_mat2 = (context_vector @ context_vector_T).sum(dim=0) + mat1 += for_mat1 + mat2 += for_mat2 + + #update projection matrix + self.projection_matrices[layer_num].weight = torch.nn.Parameter(mat1 @ torch.inverse(mat2)) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.__call__ + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]]
= None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds` + instead. + height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. 
+ Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than or equal to `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will be generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. 
The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). + + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`. + When returning a tuple, the first element is a list with the generated images, and the second element is a + list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, according to the `safety_checker`. + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds + ) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. 
Encode input prompt + prompt_embeds = self._encode_prompt( + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Prepare latent variables + num_channels_latents = self.unet.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + ).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + 
callback(i, t, latents) + + if output_type == "latent": + image = latents + has_nsfw_concept = None + elif output_type == "pil": + # 8. Post-processing + image = self.decode_latents(latents) + + # 9. Run safety checker + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # 10. Convert to PIL + image = self.numpy_to_pil(image) + else: + # 8. Post-processing + image = self.decode_latents(latents) + + # 9. Run safety checker + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) \ No newline at end of file diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index 1b0f812ad16c..9d6c073a0372 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -450,3 +450,18 @@ def from_config(cls, *args, **kwargs): @classmethod def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) + + +class StableDiffusionModelEditingPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) From 7b8ebd2384586d39a550e0993be0ff70ac98a9df Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Mon, 20 Mar 2023 08:36:46 +0530 Subject: [PATCH 02/14] styling. 
--- docs/source/en/_toctree.yml | 4 +- ...pipeline_stable_diffusion_model_editing.py | 52 ++++++++++--------- 2 files changed, 30 insertions(+), 26 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 4c7683ff5bbc..a2b0c63858a1 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -154,8 +154,6 @@ title: Score SDE VE - local: api/pipelines/semantic_stable_diffusion title: Semantic Guidance - - local: api/pipelines/time_diffusion - title: Text-to-Image Model Editing - sections: - local: api/pipelines/stable_diffusion/overview title: Overview @@ -192,6 +190,8 @@ title: Stable unCLIP - local: api/pipelines/stochastic_karras_ve title: Stochastic Karras VE + - local: api/pipelines/time_diffusion + title: Text-to-Image Model Editing - local: api/pipelines/unclip title: UnCLIP - local: api/pipelines/latent_diffusion_uncond diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py index cc64b55a70d6..d80152d91eba 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py @@ -11,14 +11,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import copy import inspect from typing import Any, Callable, Dict, List, Optional, Union import torch from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer -import copy - from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import DDIMScheduler, PNDMScheduler from ...utils import is_accelerate_available, is_accelerate_version, logging, randn_tensor, replace_example_docstring @@ -134,19 +133,21 @@ def __init__( # get cross-attention layers ca_layers = [] + def append_ca(net_): - if net_.__class__.__name__ == 'CrossAttention': + if net_.__class__.__name__ == "CrossAttention": ca_layers.append(net_) - elif hasattr(net_, 'children'): + elif hasattr(net_, "children"): for net__ in net_.children(): append_ca(net__) + for net in self.unet.named_children(): - if "down" in net[0]: - append_ca(net[1]) - elif "up" in net[0]: - append_ca(net[1]) - elif "mid" in net[0]: - append_ca(net[1]) + if "down" in net[0]: + append_ca(net[1]) + elif "up" in net[0]: + append_ca(net[1]) + elif "mid" in net[0]: + append_ca(net[1]) # get projection matrices self.ca_clip_layers = [l for l in ca_layers if l.to_v.in_features == 768] @@ -475,11 +476,11 @@ def edit_model(self, source_prompt, destination_prompt, lamb=0.1, restart_params if self.with_to_k: l.to_k = copy.deepcopy(self.og_matrices[num_ca_clip_layers + idx_]) self.projection_matrices[num_ca_clip_layers + idx_] = l.to_k - + # set up sentences old_texts = [source_prompt] new_texts = [destination_prompt] - if with_augs: + if self.with_augs: base = old_texts[0] if old_texts[0][0:1] != "A" else "a" + old_texts[0][1:] old_texts.append("A photo of " + base) old_texts.append("An image of " + base) @@ -488,7 +489,7 @@ def edit_model(self, source_prompt, destination_prompt, lamb=0.1, restart_params new_texts.append("A photo of " + base) new_texts.append("An image of " + base) new_texts.append("A picture of " + base) - + # prepare input k* and v* old_embs, new_embs = [], [] for old_text, new_text in 
zip(old_texts, new_texts): @@ -503,14 +504,14 @@ def edit_model(self, source_prompt, destination_prompt, lamb=0.1, restart_params old_emb, new_emb = text_embeddings old_embs.append(old_emb) new_embs.append(new_emb) - + # identify corresponding destinations for each token in old_emb idxs_replaces = [] for old_text, new_text in zip(old_texts, new_texts): tokens_a = self.tokenizer(old_text).input_ids tokens_b = self.tokenizer(new_text).input_ids - tokens_a = [self.tokenizer.encode("a ")[1] if self.tokenizer.decode(t) == 'an' else t for t in tokens_a] - tokens_b = [self.tokenizer.encode("a ")[1] if self.tokenizer.decode(t) == 'an' else t for t in tokens_b] + tokens_a = [self.tokenizer.encode("a ")[1] if self.tokenizer.decode(t) == "an" else t for t in tokens_a] + tokens_b = [self.tokenizer.encode("a ")[1] if self.tokenizer.decode(t) == "an" else t for t in tokens_b] num_orig_tokens = len(tokens_a) idxs_replace = [] j = 0 @@ -526,7 +527,7 @@ def edit_model(self, source_prompt, destination_prompt, lamb=0.1, restart_params while len(idxs_replace) < 77: idxs_replace.append(76) idxs_replaces.append(idxs_replace) - + # prepare batch: for each pair of setences, old context and new values contexts, valuess = [], [] for old_emb, new_emb, idxs_replace in zip(old_embs, new_embs, idxs_replaces): @@ -537,16 +538,19 @@ def edit_model(self, source_prompt, destination_prompt, lamb=0.1, restart_params values.append(layer(new_emb[idxs_replace]).detach()) contexts.append(context) valuess.append(values) - + # edit the model for layer_num in range(len(self.projection_matrices)): - #mat1 = \lambda W + \sum{v k^T} + # mat1 = \lambda W + \sum{v k^T} mat1 = lamb * self.projection_matrices[layer_num].weight - #mat2 = \lambda I + \sum{k k^T} - mat2 = lamb * torch.eye(self.projection_matrices[layer_num].weight.shape[1], device = self.projection_matrices[layer_num].weight.device) + # mat2 = \lambda I + \sum{k k^T} + mat2 = lamb * torch.eye( + self.projection_matrices[layer_num].weight.shape[1], + 
device=self.projection_matrices[layer_num].weight.device, + ) - #aggregate sums for mat1, mat2 + # aggregate sums for mat1, mat2 for context, values in zip(contexts, valuess): context_vector = context.reshape(context.shape[0], context.shape[1], 1) context_vector_T = context.reshape(context.shape[0], 1, context.shape[1]) @@ -556,7 +560,7 @@ def edit_model(self, source_prompt, destination_prompt, lamb=0.1, restart_params mat1 += for_mat1 mat2 += for_mat2 - #update projection matrix + # update projection matrix self.projection_matrices[layer_num].weight = torch.nn.Parameter(mat1 @ torch.inverse(mat2)) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.__call__ @@ -761,4 +765,4 @@ def __call__( if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) \ No newline at end of file + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) From 16c985517ed767559780048ef0d740b3734c9f0d Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Mon, 20 Mar 2023 08:45:57 +0530 Subject: [PATCH 03/14] styling 2. 
--- ...pipeline_stable_diffusion_model_editing.py | 3 +- .../dummy_torch_and_transformers_objects.py | 30 +++++++++---------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py index d80152d91eba..0595b1a9c70a 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py @@ -20,7 +20,7 @@ from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import DDIMScheduler, PNDMScheduler -from ...utils import is_accelerate_available, is_accelerate_version, logging, randn_tensor, replace_example_docstring +from ...utils import is_accelerate_available, is_accelerate_version, logging, randn_tensor from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker @@ -565,7 +565,6 @@ def edit_model(self, source_prompt, destination_prompt, lamb=0.1, restart_params # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.__call__ @torch.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, prompt: Union[str, List[str]] = None, diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index 9d6c073a0372..5bb9067d6944 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -227,6 +227,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class StableDiffusionModelEditingPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, 
["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class StableDiffusionPanoramaPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] @@ -450,18 +465,3 @@ def from_config(cls, *args, **kwargs): @classmethod def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) - - -class StableDiffusionModelEditingPipeline(metaclass=DummyObject): - _backends = ["torch", "transformers"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch", "transformers"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["torch", "transformers"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["torch", "transformers"]) From 9b5e5ab43fed5ba4d28f4ee48564dd6193c8ba25 Mon Sep 17 00:00:00 2001 From: Bahjat Kawar Date: Thu, 23 Mar 2023 10:43:05 +0200 Subject: [PATCH 04/14] fixes; tests --- docs/source/en/_toctree.yml | 2 + ...pipeline_stable_diffusion_model_editing.py | 1 - .../test_stable_diffusion_model_editing.py | 294 ++++++++++++++++++ 3 files changed, 296 insertions(+), 1 deletion(-) create mode 100644 tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index a2b0c63858a1..55d7be027425 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -183,6 +183,8 @@ title: MultiDiffusion Panorama - local: api/pipelines/stable_diffusion/controlnet title: Text-to-Image Generation with ControlNet Conditioning + - local: api/pipelines/stable_diffusion/model_editing + title: Text-to-Image Model Editing title: Stable Diffusion - local: api/pipelines/stable_diffusion_2 title: Stable Diffusion 2 diff --git 
a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py index 0595b1a9c70a..d99ef143ebc3 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py @@ -563,7 +563,6 @@ def edit_model(self, source_prompt, destination_prompt, lamb=0.1, restart_params # update projection matrix self.projection_matrices[layer_num].weight = torch.nn.Parameter(mat1 @ torch.inverse(mat2)) - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.__call__ @torch.no_grad() def __call__( self, diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py new file mode 100644 index 000000000000..371d9c90f045 --- /dev/null +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py @@ -0,0 +1,294 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import gc +import unittest + +import numpy as np +import torch +from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer + +from diffusers import ( + AutoencoderKL, + DDIMScheduler, + EulerAncestralDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + StableDiffusionModelEditingPipeline, + UNet2DConditionModel, +) +from diffusers.utils import slow, torch_device +from diffusers.utils.testing_utils import require_torch_gpu, skip_mps + +from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS +from ...test_pipelines_common import PipelineTesterMixin + + +torch.backends.cuda.matmul.allow_tf32 = False + + +@skip_mps +class StableDiffusionModelEditingPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = StableDiffusionModelEditingPipeline + params = TEXT_TO_IMAGE_PARAMS + batch_params = TEXT_TO_IMAGE_BATCH_PARAMS + + def get_dummy_components(self): + torch.manual_seed(0) + unet = UNet2DConditionModel( + block_out_channels=(32, 64), + layers_per_block=2, + sample_size=32, + in_channels=4, + out_channels=4, + down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), + up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), + cross_attention_dim=32, + ) + scheduler = DDIMScheduler() + torch.manual_seed(0) + vae = AutoencoderKL( + block_out_channels=[32, 64], + in_channels=3, + out_channels=3, + down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], + up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], + latent_channels=4, + ) + torch.manual_seed(0) + text_encoder_config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + ) + text_encoder = CLIPTextModel(text_encoder_config) + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + + components = { + "unet": unet, + "scheduler": scheduler, + "vae": vae, + 
"text_encoder": text_encoder, + "tokenizer": tokenizer, + "safety_checker": None, + "feature_extractor": None, + } + return components + + def get_dummy_inputs(self, device, seed=0): + generator = torch.manual_seed(seed) + inputs = { + "prompt": "A field of roses", + "generator": generator, + # Setting height and width to None to prevent OOMs on CPU. + "height": None, + "width": None, + "num_inference_steps": 2, + "guidance_scale": 6.0, + "output_type": "numpy", + } + return inputs + + def test_stable_diffusion_model_editing_default_case(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + sd_pipe = StableDiffusionModelEditingPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + image = sd_pipe(**inputs).images + image_slice = image[0, -3:, -3:, -1] + assert image.shape == (1, 64, 64, 3) + + expected_slice = np.array( + [0.5217179, 0.50658035, 0.5003239, 0.41109088, 0.3595158, 0.46607107, 0.5323504, 0.5335255, 0.49187922] + ) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + def test_stable_diffusion_model_editing_negative_prompt(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + sd_pipe = StableDiffusionModelEditingPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + negative_prompt = "french fries" + output = sd_pipe(**inputs, negative_prompt=negative_prompt) + image = output.images + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 64, 64, 3) + + expected_slice = np.array( + [0.546259, 0.5108156, 0.50897664, 0.41931948, 0.3748669, 0.4669299, 0.5427151, 0.54561913, 0.49353] + ) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + def 
test_stable_diffusion_model_editing_euler(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + components["scheduler"] = EulerAncestralDiscreteScheduler( + beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear" + ) + sd_pipe = StableDiffusionModelEditingPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + image = sd_pipe(**inputs).images + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 64, 64, 3) + + expected_slice = np.array( + [0.47106352, 0.53579676, 0.45798016, 0.514294, 0.56856745, 0.4788605, 0.54380214, 0.5046455, 0.50404465] + ) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + def test_stable_diffusion_model_editing_pndm(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + components["scheduler"] = PNDMScheduler() + sd_pipe = StableDiffusionModelEditingPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + # the pipeline does not expect pndm so test if it raises error. 
+ with self.assertRaises(ValueError): + _ = sd_pipe(**inputs).images + + +@slow +@require_torch_gpu +class StableDiffusionModelEditingSlowTests(unittest.TestCase): + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def get_inputs(self, seed=0): + generator = torch.manual_seed(seed) + inputs = { + "prompt": "A field of roses", + "generator": generator, + "num_inference_steps": 3, + "guidance_scale": 7.5, + "output_type": "numpy", + } + return inputs + + def test_stable_diffusion_model_editing_default(self): + model_ckpt = "CompVis/stable-diffusion-v1-4" + pipe = StableDiffusionModelEditingPipeline.from_pretrained(model_ckpt, safety_checker=None) + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + pipe.enable_attention_slicing() + + inputs = self.get_inputs() + image = pipe(**inputs).images + image_slice = image[0, -3:, -3:, -1].flatten() + + assert image.shape == (1, 512, 512, 3) + + expected_slice = np.array( + [ + 0.6749496, + 0.6386453, + 0.51443267, + 0.66094905, + 0.61921215, + 0.5491332, + 0.5744417, + 0.58075106, + 0.5174658 + ] + ) + + assert np.abs(expected_slice - image_slice).max() < 1e-2 + + # make sure image changes after editing + pipe.edit_model("A pack of roses", "A pack of blue roses") + + image = pipe(**inputs).images + image_slice = image[0, -3:, -3:, -1].flatten() + + assert image.shape == (1, 512, 512, 3) + + assert np.abs(expected_slice - image_slice).max() > 1e-1 + + def test_stable_diffusion_model_editing_k_lms(self): + pipe = StableDiffusionModelEditingPipeline.from_pretrained( + "CompVis/stable-diffusion-v1-4", safety_checker=None + ) + pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + pipe.enable_attention_slicing() + + inputs = self.get_inputs() + image = pipe(**inputs).images + image_slice = image[0, -3:, -3:, -1].flatten() + + assert image.shape == (1, 512, 512, 3) + + expected_slice = 
np.array( + [ + [ + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + ] + ] + ) + + assert np.abs(expected_slice - image_slice).max() < 1e-3 + + def test_stable_diffusion_model_editing_pipeline_with_sequential_cpu_offloading(self): + torch.cuda.empty_cache() + torch.cuda.reset_max_memory_allocated() + torch.cuda.reset_peak_memory_stats() + + model_ckpt = "CompVis/stable-diffusion-v1-4" + scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler") + pipe = StableDiffusionModelEditingPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + pipe.enable_attention_slicing(1) + pipe.enable_sequential_cpu_offload() + + inputs = self.get_inputs() + _ = pipe(**inputs) + + mem_bytes = torch.cuda.max_memory_allocated() + # make sure that less than 5.2 GB is allocated + assert mem_bytes < 5.2 * 10**9 From 3e8185147b0a5eb8414142d9128b7177870953d6 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 23 Mar 2023 17:00:17 +0530 Subject: [PATCH 05/14] apply styling and doc fix. --- .../pipelines/stable_diffusion/model_editing.mdx | 2 +- .../test_stable_diffusion_model_editing.py | 16 ++++------------ 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/docs/source/en/api/pipelines/stable_diffusion/model_editing.mdx b/docs/source/en/api/pipelines/stable_diffusion/model_editing.mdx index 6758af059e7e..598449381b41 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/model_editing.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/model_editing.mdx @@ -18,7 +18,7 @@ specific language governing permissions and limitations under the License. The abstract of the paper is the following: -*Text-to-image diffusion models often make implicit assumptions about the world when generating images. 
While some assumptions are useful (e.g., the sky is blue), they can also be outdated, incorrect, or reflective of social biases present in the training data. Thus, there is a need to control these assumptions without requiring explicit user input or costly re-training. In this work, we aim to edit a given implicit assumption in a pre-trained diffusion model. Our Text-to-Image Model Editing method, TIME for short, receives a pair of inputs: a "source" under-specified prompt for which the model makes an implicit assumption (e.g., "a pack of roses"), and a "destination" prompt that describes the same setting, but with a specified desired attribute (e.g., "a pack of blue roses"). TIME then updates the model's cross-attention layers, as these layers assign visual meaning to textual tokens. We edit the projection matrices in these layers such that the source prompt is projected close to the destination prompt. Our method is highly efficient, as it modifies a mere 2.2% of the model's parameters in under one second. To evaluate model editing approaches, we introduce TIMED (TIME Dataset), containing 147 source and destination prompt pairs from various domains. Our experiments (using Stable Diffusion) show that TIME is successful in model editing, generalizes well for related prompts unseen during editing, and imposes minimal effect on unrelated generations. +*Text-to-image diffusion models often make implicit assumptions about the world when generating images. While some assumptions are useful (e.g., the sky is blue), they can also be outdated, incorrect, or reflective of social biases present in the training data. Thus, there is a need to control these assumptions without requiring explicit user input or costly re-training. In this work, we aim to edit a given implicit assumption in a pre-trained diffusion model. 
Our Text-to-Image Model Editing method, TIME for short, receives a pair of inputs: a "source" under-specified prompt for which the model makes an implicit assumption (e.g., "a pack of roses"), and a "destination" prompt that describes the same setting, but with a specified desired attribute (e.g., "a pack of blue roses"). TIME then updates the model's cross-attention layers, as these layers assign visual meaning to textual tokens. We edit the projection matrices in these layers such that the source prompt is projected close to the destination prompt. Our method is highly efficient, as it modifies a mere 2.2% of the model's parameters in under one second. To evaluate model editing approaches, we introduce TIMED (TIME Dataset), containing 147 source and destination prompt pairs from various domains. Our experiments (using Stable Diffusion) show that TIME is successful in model editing, generalizes well for related prompts unseen during editing, and imposes minimal effect on unrelated generations.* Resources: diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py index 371d9c90f045..b42911aa64cb 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py @@ -215,17 +215,7 @@ def test_stable_diffusion_model_editing_default(self): assert image.shape == (1, 512, 512, 3) expected_slice = np.array( - [ - 0.6749496, - 0.6386453, - 0.51443267, - 0.66094905, - 0.61921215, - 0.5491332, - 0.5744417, - 0.58075106, - 0.5174658 - ] + [0.6749496, 0.6386453, 0.51443267, 0.66094905, 0.61921215, 0.5491332, 0.5744417, 0.58075106, 0.5174658] ) assert np.abs(expected_slice - image_slice).max() < 1e-2 @@ -280,7 +270,9 @@ def test_stable_diffusion_model_editing_pipeline_with_sequential_cpu_offloading( model_ckpt = "CompVis/stable-diffusion-v1-4" scheduler = 
DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler") - pipe = StableDiffusionModelEditingPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None) + pipe = StableDiffusionModelEditingPipeline.from_pretrained( + model_ckpt, scheduler=scheduler, safety_checker=None + ) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) pipe.enable_attention_slicing(1) From a05b6a9004d5d7c781a78ee624e625ff9ab9d0cc Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 23 Mar 2023 17:09:58 +0530 Subject: [PATCH 06/14] remove sups. --- docs/source/en/api/pipelines/stable_diffusion/model_editing.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/stable_diffusion/model_editing.mdx b/docs/source/en/api/pipelines/stable_diffusion/model_editing.mdx index 598449381b41..a54cc266d46d 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/model_editing.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/model_editing.mdx @@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License. ## Overview -[Editing Implicit Assumptions in Text-to-Image Diffusion Models](https://arxiv.org/abs/2303.08084) by Hadas Orgad*, Bahjat Kawar*, and Yonatan Belinkov. +[Editing Implicit Assumptions in Text-to-Image Diffusion Models](https://arxiv.org/abs/2303.08084) by Hadas Orgad, Bahjat Kawar, and Yonatan Belinkov. 
The abstract of the paper is the following: From 61ed87e7998e8a512fef9749f2812c6b7e706408 Mon Sep 17 00:00:00 2001 From: Bahjat Kawar Date: Thu, 23 Mar 2023 19:16:04 +0200 Subject: [PATCH 07/14] fixes --- diff.txt | 119 ++++++++++++++++++ .../stable_diffusion/model_editing.mdx | 2 + ...pipeline_stable_diffusion_model_editing.py | 11 +- .../test_stable_diffusion_model_editing.py | 37 +----- 4 files changed, 130 insertions(+), 39 deletions(-) create mode 100644 diff.txt diff --git a/diff.txt b/diff.txt new file mode 100644 index 000000000000..d2fa12bd0248 --- /dev/null +++ b/diff.txt @@ -0,0 +1,119 @@ +diff --git a/docs/source/en/api/pipelines/stable_diffusion/model_editing.mdx b/docs/source/en/api/pipelines/stable_diffusion/model_editing.mdx +index 6758af05..a8855713 100644 +--- a/docs/source/en/api/pipelines/stable_diffusion/model_editing.mdx ++++ b/docs/source/en/api/pipelines/stable_diffusion/model_editing.mdx +@@ -33,6 +33,8 @@ Resources: + |---|---|:---:| + | [StableDiffusionModelEditingPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py) | *Text-to-Image Model Editing* | [🤗 Space](https://huggingface.co/spaces/bahjat-kawar/time-diffusion)) | + ++This pipeline enables editing the diffusion model weights, such that its assumptions on a given concept are changed. The resulting change is expected to take effect in all prompt generations pertaining to the edited concept. 
++ + ## Usage example + + ```python +diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py +index d99ef143..04abf069 100644 +--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py ++++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py +@@ -1,4 +1,4 @@ +-# Copyright 2023 MultiDiffusion Authors and The HuggingFace Team. All rights reserved." ++# Copyright 2023 TIME Authors and The HuggingFace Team. All rights reserved." + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at +@@ -20,6 +20,7 @@ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer + + from ...models import AutoencoderKL, UNet2DConditionModel + from ...schedulers import DDIMScheduler, PNDMScheduler ++from ...schedulers.scheduling_utils import SchedulerMixin + from ...utils import is_accelerate_available, is_accelerate_version, logging, randn_tensor + from ..pipeline_utils import DiffusionPipeline + from . import StableDiffusionPipelineOutput +@@ -68,8 +69,7 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline): + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): +- A scheduler to be used in combination with `unet` to denoise the encoded image latents. The original work +- on Multi Diffsion used the [`DDIMScheduler`]. ++ A scheduler to be used in combination with `unet` to denoise the encoded image latents. 
+ safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. +@@ -88,7 +88,7 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline): + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, +- scheduler: DDIMScheduler, ++ scheduler: SchedulerMixin, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + requires_safety_checker: bool = True, +@@ -141,6 +141,7 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline): + for net__ in net_.children(): + append_ca(net__) + ++ # recursively find all cross-attention layers in unet + for net in self.unet.named_children(): + if "down" in net[0]: + append_ca(net[1]) +@@ -466,6 +467,8 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline): + @torch.no_grad() + def edit_model(self, source_prompt, destination_prompt, lamb=0.1, restart_params=True): + # Apply model editing via closed-form solution (see Eq. 5 in the TIME paper https://arxiv.org/abs/2303.08084) ++ # When `restart_params` is True (default), the model parameters restart to their pre-trained version. ++ # This is done to avoid edit compounding. When it is False, edits accumulate (behavior not studied in paper). 
+ + # restart LDM parameters + if restart_params: +diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py +index 371d9c90..2c650791 100644 +--- a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py ++++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py +@@ -240,39 +240,6 @@ class StableDiffusionModelEditingSlowTests(unittest.TestCase): + + assert np.abs(expected_slice - image_slice).max() > 1e-1 + +- def test_stable_diffusion_model_editing_k_lms(self): +- pipe = StableDiffusionModelEditingPipeline.from_pretrained( +- "CompVis/stable-diffusion-v1-4", safety_checker=None +- ) +- pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) +- pipe.to(torch_device) +- pipe.set_progress_bar_config(disable=None) +- pipe.enable_attention_slicing() +- +- inputs = self.get_inputs() +- image = pipe(**inputs).images +- image_slice = image[0, -3:, -3:, -1].flatten() +- +- assert image.shape == (1, 512, 512, 3) +- +- expected_slice = np.array( +- [ +- [ +- 0.0, +- 0.0, +- 0.0, +- 0.0, +- 0.0, +- 0.0, +- 0.0, +- 0.0, +- 0.0, +- ] +- ] +- ) +- +- assert np.abs(expected_slice - image_slice).max() < 1e-3 +- + def test_stable_diffusion_model_editing_pipeline_with_sequential_cpu_offloading(self): + torch.cuda.empty_cache() + torch.cuda.reset_max_memory_allocated() +@@ -290,5 +257,5 @@ class StableDiffusionModelEditingSlowTests(unittest.TestCase): + _ = pipe(**inputs) + + mem_bytes = torch.cuda.max_memory_allocated() +- # make sure that less than 5.2 GB is allocated +- assert mem_bytes < 5.2 * 10**9 ++ # make sure that less than 4.4 GB is allocated ++ assert mem_bytes < 4.4 * 10**9 diff --git a/docs/source/en/api/pipelines/stable_diffusion/model_editing.mdx b/docs/source/en/api/pipelines/stable_diffusion/model_editing.mdx index 6758af059e7e..a8855713c2d9 100644 --- 
a/docs/source/en/api/pipelines/stable_diffusion/model_editing.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/model_editing.mdx @@ -33,6 +33,8 @@ Resources: |---|---|:---:| | [StableDiffusionModelEditingPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py) | *Text-to-Image Model Editing* | [🤗 Space](https://huggingface.co/spaces/bahjat-kawar/time-diffusion)) | +This pipeline enables editing the diffusion model weights, such that its assumptions on a given concept are changed. The resulting change is expected to take effect in all prompt generations pertaining to the edited concept. + ## Usage example ```python diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py index d99ef143ebc3..04abf069f09c 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py @@ -1,4 +1,4 @@ -# Copyright 2023 MultiDiffusion Authors and The HuggingFace Team. All rights reserved." +# Copyright 2023 TIME Authors and The HuggingFace Team. All rights reserved." # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -20,6 +20,7 @@ from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import DDIMScheduler, PNDMScheduler +from ...schedulers.scheduling_utils import SchedulerMixin from ...utils import is_accelerate_available, is_accelerate_version, logging, randn_tensor from ..pipeline_utils import DiffusionPipeline from . 
import StableDiffusionPipelineOutput @@ -68,8 +69,7 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline): [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image latents. The original work - on Multi Diffsion used the [`DDIMScheduler`]. + A scheduler to be used in combination with `unet` to denoise the encoded image latents. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. @@ -88,7 +88,7 @@ def __init__( text_encoder: CLIPTextModel, tokenizer: CLIPTokenizer, unet: UNet2DConditionModel, - scheduler: DDIMScheduler, + scheduler: SchedulerMixin, safety_checker: StableDiffusionSafetyChecker, feature_extractor: CLIPFeatureExtractor, requires_safety_checker: bool = True, @@ -141,6 +141,7 @@ def append_ca(net_): for net__ in net_.children(): append_ca(net__) + # recursively find all cross-attention layers in unet for net in self.unet.named_children(): if "down" in net[0]: append_ca(net[1]) @@ -466,6 +467,8 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype @torch.no_grad() def edit_model(self, source_prompt, destination_prompt, lamb=0.1, restart_params=True): # Apply model editing via closed-form solution (see Eq. 5 in the TIME paper https://arxiv.org/abs/2303.08084) + # When `restart_params` is True (default), the model parameters restart to their pre-trained version. + # This is done to avoid edit compounding. When it is False, edits accumulate (behavior not studied in paper). 
# restart LDM parameters if restart_params: diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py index 371d9c90f045..2c650791e100 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py @@ -240,39 +240,6 @@ def test_stable_diffusion_model_editing_default(self): assert np.abs(expected_slice - image_slice).max() > 1e-1 - def test_stable_diffusion_model_editing_k_lms(self): - pipe = StableDiffusionModelEditingPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None - ) - pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - - expected_slice = np.array( - [ - [ - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - ] - ] - ) - - assert np.abs(expected_slice - image_slice).max() < 1e-3 - def test_stable_diffusion_model_editing_pipeline_with_sequential_cpu_offloading(self): torch.cuda.empty_cache() torch.cuda.reset_max_memory_allocated() @@ -290,5 +257,5 @@ def test_stable_diffusion_model_editing_pipeline_with_sequential_cpu_offloading( _ = pipe(**inputs) mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 5.2 GB is allocated - assert mem_bytes < 5.2 * 10**9 + # make sure that less than 4.4 GB is allocated + assert mem_bytes < 4.4 * 10**9 From 0094901f03c073ef2f37d4b8c56ed91f9632c9f2 Mon Sep 17 00:00:00 2001 From: Bahjat Kawar Date: Thu, 23 Mar 2023 19:17:44 +0200 Subject: [PATCH 08/14] remove temp file --- diff.txt | 119 ------------------------------------------------------- 1 file changed, 119 deletions(-) delete mode 100644 
diff.txt diff --git a/diff.txt b/diff.txt deleted file mode 100644 index d2fa12bd0248..000000000000 --- a/diff.txt +++ /dev/null @@ -1,119 +0,0 @@ -diff --git a/docs/source/en/api/pipelines/stable_diffusion/model_editing.mdx b/docs/source/en/api/pipelines/stable_diffusion/model_editing.mdx -index 6758af05..a8855713 100644 ---- a/docs/source/en/api/pipelines/stable_diffusion/model_editing.mdx -+++ b/docs/source/en/api/pipelines/stable_diffusion/model_editing.mdx -@@ -33,6 +33,8 @@ Resources: - |---|---|:---:| - | [StableDiffusionModelEditingPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py) | *Text-to-Image Model Editing* | [🤗 Space](https://huggingface.co/spaces/bahjat-kawar/time-diffusion)) | - -+This pipeline enables editing the diffusion model weights, such that its assumptions on a given concept are changed. The resulting change is expected to take effect in all prompt generations pertaining to the edited concept. -+ - ## Usage example - - ```python -diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py -index d99ef143..04abf069 100644 ---- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py -+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py -@@ -1,4 +1,4 @@ --# Copyright 2023 MultiDiffusion Authors and The HuggingFace Team. All rights reserved." -+# Copyright 2023 TIME Authors and The HuggingFace Team. All rights reserved." - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. 
- # You may obtain a copy of the License at -@@ -20,6 +20,7 @@ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer - - from ...models import AutoencoderKL, UNet2DConditionModel - from ...schedulers import DDIMScheduler, PNDMScheduler -+from ...schedulers.scheduling_utils import SchedulerMixin - from ...utils import is_accelerate_available, is_accelerate_version, logging, randn_tensor - from ..pipeline_utils import DiffusionPipeline - from . import StableDiffusionPipelineOutput -@@ -68,8 +69,7 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline): - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): -- A scheduler to be used in combination with `unet` to denoise the encoded image latents. The original work -- on Multi Diffsion used the [`DDIMScheduler`]. -+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. - safety_checker ([`StableDiffusionSafetyChecker`]): - Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. 
-@@ -88,7 +88,7 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline): - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, -- scheduler: DDIMScheduler, -+ scheduler: SchedulerMixin, - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, - requires_safety_checker: bool = True, -@@ -141,6 +141,7 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline): - for net__ in net_.children(): - append_ca(net__) - -+ # recursively find all cross-attention layers in unet - for net in self.unet.named_children(): - if "down" in net[0]: - append_ca(net[1]) -@@ -466,6 +467,8 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline): - @torch.no_grad() - def edit_model(self, source_prompt, destination_prompt, lamb=0.1, restart_params=True): - # Apply model editing via closed-form solution (see Eq. 5 in the TIME paper https://arxiv.org/abs/2303.08084) -+ # When `restart_params` is True (default), the model parameters restart to their pre-trained version. -+ # This is done to avoid edit compounding. When it is False, edits accumulate (behavior not studied in paper). 
- - # restart LDM parameters - if restart_params: -diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py -index 371d9c90..2c650791 100644 ---- a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py -+++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py -@@ -240,39 +240,6 @@ class StableDiffusionModelEditingSlowTests(unittest.TestCase): - - assert np.abs(expected_slice - image_slice).max() > 1e-1 - -- def test_stable_diffusion_model_editing_k_lms(self): -- pipe = StableDiffusionModelEditingPipeline.from_pretrained( -- "CompVis/stable-diffusion-v1-4", safety_checker=None -- ) -- pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) -- pipe.to(torch_device) -- pipe.set_progress_bar_config(disable=None) -- pipe.enable_attention_slicing() -- -- inputs = self.get_inputs() -- image = pipe(**inputs).images -- image_slice = image[0, -3:, -3:, -1].flatten() -- -- assert image.shape == (1, 512, 512, 3) -- -- expected_slice = np.array( -- [ -- [ -- 0.0, -- 0.0, -- 0.0, -- 0.0, -- 0.0, -- 0.0, -- 0.0, -- 0.0, -- 0.0, -- ] -- ] -- ) -- -- assert np.abs(expected_slice - image_slice).max() < 1e-3 -- - def test_stable_diffusion_model_editing_pipeline_with_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() -@@ -290,5 +257,5 @@ class StableDiffusionModelEditingSlowTests(unittest.TestCase): - _ = pipe(**inputs) - - mem_bytes = torch.cuda.max_memory_allocated() -- # make sure that less than 5.2 GB is allocated -- assert mem_bytes < 5.2 * 10**9 -+ # make sure that less than 4.4 GB is allocated -+ assert mem_bytes < 4.4 * 10**9 From 0f431089d155f3df24697f4221a3b392175fd42c Mon Sep 17 00:00:00 2001 From: Bahjat Kawar Date: Thu, 23 Mar 2023 19:22:56 +0200 Subject: [PATCH 09/14] move augmentations to const --- .../pipeline_stable_diffusion_model_editing.py | 12 ++++++------ 1 
file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py index 04abf069f09c..97fab6aa6ad9 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py @@ -29,6 +29,8 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name +augs_const = ["A photo of ", "An image of ", "A picture of "] + EXAMPLE_DOC_STRING = """ Examples: ```py @@ -485,13 +487,11 @@ def edit_model(self, source_prompt, destination_prompt, lamb=0.1, restart_params new_texts = [destination_prompt] if self.with_augs: base = old_texts[0] if old_texts[0][0:1] != "A" else "a" + old_texts[0][1:] - old_texts.append("A photo of " + base) - old_texts.append("An image of " + base) - old_texts.append("A picture of " + base) + for aug in augs_const: + old_texts.append(aug + base) base = new_texts[0] if new_texts[0][0:1] != "A" else "a" + new_texts[0][1:] - new_texts.append("A photo of " + base) - new_texts.append("An image of " + base) - new_texts.append("A picture of " + base) + for aug in augs_const: + new_texts.append(aug + base) # prepare input k* and v* old_embs, new_embs = [], [] From 0be85905219ac4556caf734b27339cfd5e8025a0 Mon Sep 17 00:00:00 2001 From: Bahjat Kawar Date: Thu, 23 Mar 2023 19:28:42 +0200 Subject: [PATCH 10/14] added doc entry --- docs/source/en/index.mdx | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 85094864a318..d020eb5d7d17 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -76,6 +76,7 @@ The library has three main components: | [stable_diffusion_self_attention_guidance](./api/pipelines/stable_diffusion/self_attention_guidance) | [Improving Sample Quality of Diffusion Models Using Self-Attention 
Guidance](https://arxiv.org/abs/2210.00939) | Text-to-Image Generation | | [stable_diffusion_image_variation](./stable_diffusion/image_variation) | [Stable Diffusion Image Variations](https://github.com/LambdaLabsML/lambda-diffusers#stable-diffusion-image-variations) | Image-to-Image Generation | | [stable_diffusion_latent_upscale](./stable_diffusion/latent_upscale) | [Stable Diffusion Latent Upscaler](https://twitter.com/StabilityAI/status/1590531958815064065) | Text-Guided Super Resolution Image-to-Image | +| [stable_diffusion_model_editing](./api/pipelines/stable_diffusion/model_editing) | [Editing Implicit Assumptions in Text-to-Image Diffusion Models](https://time-diffusion.github.io/) | Text-to-Image Model Editing | | [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [Stable Diffusion 2](https://stability.ai/blog/stable-diffusion-v2-release) | Text-to-Image Generation | | [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [Stable Diffusion 2](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Image Inpainting | | [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [Depth-Conditional Stable Diffusion](https://github.com/Stability-AI/stablediffusion#depth-conditional-stable-diffusion) | Depth-to-Image Generation | From d0854fc53269d37e6c0eb118b7dab5d03f54faea Mon Sep 17 00:00:00 2001 From: Bahjat Kawar Date: Thu, 23 Mar 2023 19:30:41 +0200 Subject: [PATCH 11/14] code quality --- .../stable_diffusion/pipeline_stable_diffusion_model_editing.py | 2 +- .../stable_diffusion/test_stable_diffusion_model_editing.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py index 97fab6aa6ad9..ef4b658cc336 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py +++ 
b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py @@ -19,7 +19,7 @@ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer from ...models import AutoencoderKL, UNet2DConditionModel -from ...schedulers import DDIMScheduler, PNDMScheduler +from ...schedulers import PNDMScheduler from ...schedulers.scheduling_utils import SchedulerMixin from ...utils import is_accelerate_available, is_accelerate_version, logging, randn_tensor from ..pipeline_utils import DiffusionPipeline diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py index d52c237d999e..2d9b1e54ee6e 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py @@ -24,7 +24,6 @@ AutoencoderKL, DDIMScheduler, EulerAncestralDiscreteScheduler, - LMSDiscreteScheduler, PNDMScheduler, StableDiffusionModelEditingPipeline, UNet2DConditionModel, From c4c69b00bafa5542a2478c24c4038d315d2a6d8b Mon Sep 17 00:00:00 2001 From: Bahjat Kawar Date: Fri, 24 Mar 2023 09:44:31 +0300 Subject: [PATCH 12/14] customize augmentations --- ...pipeline_stable_diffusion_model_editing.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py index ef4b658cc336..7e7d94bd99ad 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py @@ -29,7 +29,7 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -augs_const = ["A photo of ", "An image of ", "A picture of "] +AUGS_CONST = ["A photo of ", "An image of ", "A picture of "] EXAMPLE_DOC_STRING 
= """ Examples: @@ -79,8 +79,8 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline): Model that extracts features from generated images to be used as inputs for the `safety_checker`. with_to_k ([`bool`]): Whether to edit the key projection matrices along wiht the value projection matrices. - with_augs ([`bool`]): - Whether to apply textual augmentations while editing the text-to-image model. + with_augs ([`list`]): + Textual augmentations to apply while editing the text-to-image model. Set to empty list for no augmentations. """ _optional_components = ["safety_checker", "feature_extractor"] @@ -95,7 +95,7 @@ def __init__( feature_extractor: CLIPFeatureExtractor, requires_safety_checker: bool = True, with_to_k: bool = True, - with_augs: bool = True, + with_augs: list = AUGS_CONST, ): super().__init__() @@ -485,13 +485,13 @@ def edit_model(self, source_prompt, destination_prompt, lamb=0.1, restart_params # set up sentences old_texts = [source_prompt] new_texts = [destination_prompt] - if self.with_augs: - base = old_texts[0] if old_texts[0][0:1] != "A" else "a" + old_texts[0][1:] - for aug in augs_const: - old_texts.append(aug + base) - base = new_texts[0] if new_texts[0][0:1] != "A" else "a" + new_texts[0][1:] - for aug in augs_const: - new_texts.append(aug + base) + #add augmentations + base = old_texts[0] if old_texts[0][0:1] != "A" else "a" + old_texts[0][1:] + for aug in self.with_augs: + old_texts.append(aug + base) + base = new_texts[0] if new_texts[0][0:1] != "A" else "a" + new_texts[0][1:] + for aug in self.with_augs: + new_texts.append(aug + base) # prepare input k* and v* old_embs, new_embs = [], [] From c44938144b9d0929591611b6e5aa6c9e2de3692c Mon Sep 17 00:00:00 2001 From: Bahjat Kawar Date: Fri, 24 Mar 2023 09:46:20 +0300 Subject: [PATCH 13/14] quality --- .../stable_diffusion/pipeline_stable_diffusion_model_editing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py index 7e7d94bd99ad..5a0c7c227670 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py @@ -485,7 +485,7 @@ def edit_model(self, source_prompt, destination_prompt, lamb=0.1, restart_params # set up sentences old_texts = [source_prompt] new_texts = [destination_prompt] - #add augmentations + # add augmentations base = old_texts[0] if old_texts[0][0:1] != "A" else "a" + old_texts[0][1:] for aug in self.with_augs: old_texts.append(aug + base) From 40d352383c45d6f7c32ca4a30264955e664fae8e Mon Sep 17 00:00:00 2001 From: Bahjat Kawar Date: Fri, 24 Mar 2023 09:49:44 +0300 Subject: [PATCH 14/14] quality --- .../stable_diffusion/pipeline_stable_diffusion_model_editing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py index 5a0c7c227670..5cb3348eff5d 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py @@ -80,7 +80,7 @@ class StableDiffusionModelEditingPipeline(DiffusionPipeline): with_to_k ([`bool`]): Whether to edit the key projection matrices along wiht the value projection matrices. with_augs ([`list`]): - Textual augmentations to apply while editing the text-to-image model. Set to empty list for no augmentations. + Textual augmentations to apply while editing the text-to-image model. Set to [] for no augmentations. """ _optional_components = ["safety_checker", "feature_extractor"]