From 1af72fa94f99224477cfd6c7c49d966b3146715b Mon Sep 17 00:00:00 2001 From: Mark Rich Date: Sun, 23 Oct 2022 19:36:04 -0700 Subject: [PATCH 01/10] initial commit to add imagic to stable diffusion community pipelines --- examples/community/README.md | 67 +++- examples/community/imagic_stable_diffusion.py | 350 ++++++++++++++++++ src/diffusers/pipeline_utils.py | 2 +- .../pipeline_stable_diffusion.py | 5 +- 4 files changed, 421 insertions(+), 3 deletions(-) create mode 100644 examples/community/imagic_stable_diffusion.py diff --git a/examples/community/README.md b/examples/community/README.md index 2ef84291fc7d..d1dcbf7a2ce2 100644 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -16,8 +16,12 @@ If a community doesn't work as expected, please open an issue and ping the autho | Speech to Image | Using automatic-speech-recognition to transcribe text and Stable Diffusion to generate images | [Speech to Image](#speech-to-image) | - | [Mikail Duzenli](https://github.com/MikailINTech) | Wild Card Stable Diffusion | Stable Diffusion Pipeline that supports prompts that contain wildcard terms (indicated by surrounding double underscores), with values instantiated randomly from a corresponding txt file or a dictionary of possible values | [Wildcard Stable Diffusion](#wildcard-stable-diffusion) | - | [Shyam Sudhakaran](https://github.com/shyamsn97) | | Composable Stable Diffusion| Stable Diffusion Pipeline that supports prompts that contain "|" in prompts (as an AND condition) and weights (separated by "|" as well) to positively / negatively weight prompts. | [Composable Stable Diffusion](#composable-stable-diffusion) | - | [Mark Rich](https://github.com/MarkRich) | +<<<<<<< HEAD | Seed Resizing Stable Diffusion| Stable Diffusion Pipeline that supports resizing an image and retaining the concepts of the 512 by 512 generation. | [Seed Resizing](#seed-resizing) | - | [Mark Rich](https://github.com/MarkRich) | +======= +| Imagic Stable Diffusion | Stable Diffusion Pipeline that enables writing a text prompt to edit an existing image| [Imagic Stable Diffusion](#imagic-stable-diffusion) | - | [Mark Rich](https://github.com/MarkRich) | +>>>>>>> initial commit to add imagic to stable diffusion community pipelines To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly. @@ -373,6 +377,7 @@ for i in range(4): for i, img in enumerate(images): img.save(f"./composable_diffusion/image_{i}.png") ``` +<<<<<<< HEAD ### Seed Resizing Test seed resizing. Originally generate an image in 512 by 512, then generate image with same seed at 512 by 592 using seed resizing. Finally, generate 512 by 592 using original stable diffusion pipeline. @@ -380,6 +385,23 @@ Test seed resizing. Originally generate an image in 512 by 512, then generate im import torch as th import numpy as np from diffusers import DiffusionPipeline +======= + +### Imagic Stable Diffusion +Allows you to edit an image using stable diffusion. 
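+The pipeline first optimizes the text embedding to reconstruct the input image, then fine-tunes the diffusion model on that embedding; at generation time the original and optimized embeddings are interpolated via the `alpha` argument (see the paper: https://arxiv.org/pdf/2210.09276.pdf).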
+ + +```python + + +import torch as th +import numpy as np +import requests +from PIL import Image +from io import BytesIO +import torch +from diffusers import DiffusionPipeline, DDIMScheduler +>>>>>>> initial commit to add imagic to stable diffusion community pipelines has_cuda = th.cuda.is_available() device = th.device('cpu' if not has_cuda else 'cuda') @@ -387,14 +409,23 @@ device = th.device('cpu' if not has_cuda else 'cuda') pipe = DiffusionPipeline.from_pretrained( "CompVis/stable-diffusion-v1-4", use_auth_token=True, +<<<<<<< HEAD custom_pipeline="seed_resize_stable_diffusion" ).to(device) +======= + custom_pipeline="imagic_stable_diffusion", + scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False) +).to(device) + + +>>>>>>> initial commit to add imagic to stable diffusion community pipelines def dummy(images, **kwargs): return images, False pipe.safety_checker = dummy +<<<<<<< HEAD images = [] th.manual_seed(0) @@ -456,4 +487,38 @@ res = pipe_compare( image = res.images[0] image.save('./seed_resize/seed_resize_{w}_{h}_image_compare.png'.format(w=width, h=height)) -``` \ No newline at end of file +``` +======= +images = [] +generator = th.Generator("cuda").manual_seed(0) + +seed = 0 +prompt = "A photo of Barack Obama smiling with a big grin" + +images = [] + +url = 'https://www.dropbox.com/s/6tlwzr73jd1r9yk/obama.png?dl=1' + +response = requests.get(url) +init_image = Image.open(BytesIO(response.content)).convert("RGB") +init_image = init_image.resize((512, 512)) + +res = pipe.train( + prompt, + init_image, + guidance_scale=7.5, + num_inference_steps=50, + generator=generator) +res = pipe(alpha=1) +image = res.images[0] +image.save('./imagic/imagic_image_alpha_1.png') + +res = pipe(alpha=1.5) +image = res.images[0] +image.save('./imagic/imagic_image_alpha_1_5.png') + +res = pipe(alpha=2) +image = res.images[0] +image.save('./imagic/imagic_image_alpha_2.png') +``` +>>>>>>> initial commit to add imagic to stable diffusion community pipelines diff --git a/examples/community/imagic_stable_diffusion.py b/examples/community/imagic_stable_diffusion.py new file mode 100644 index 000000000000..2b3f5cc9ccc0 --- /dev/null +++ b/examples/community/imagic_stable_diffusion.py @@ -0,0 +1,350 @@ +""" + modeled after the textual_inversion.py / train_dreambooth.py and the work + of justinpinkney here: https://github.com/justinpinkney/stable-diffusion/blob/main/notebooks/imagic.ipynb +""" +import warnings +from typing import List, Optional, Union + +import numpy as np +import torch +import torch.nn.functional as F + +import PIL +from accelerate import Accelerator +from diffusers import StableDiffusionPipeline +from diffusers.models import AutoencoderKL, UNet2DConditionModel +from diffusers.pipeline_utils import DiffusionPipeline +from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput +from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker +from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler +from tqdm.auto import tqdm +from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer + + +def freeze_params(params): + for param in params: + param.requires_grad = False + + +def unfreeze_params(params): + for param in params: + param.requires_grad = True + + +def preprocess(image): + w, h = image.size + w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32 + image = image.resize((w, h), 
resample=PIL.Image.LANCZOS) + image = np.array(image).astype(np.float32) / 255.0 + image = image[None].transpose(0, 3, 1, 2) + image = torch.from_numpy(image) + return 2.0 * image - 1.0 + + +class ImagicStableDiffusionPipeline(DiffusionPipeline): + r""" + Pipeline for imagic image editing. + See paper here: https://arxiv.org/pdf/2210.09276.pdf + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offsensive or harmful. + Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details. + feature_extractor ([`CLIPFeatureExtractor`]): + Model that extracts features from generated images to be used as inputs for the `safety_checker`. + """ + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + ): + super().__init__() + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + + def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): + r""" + Enable sliced attention computation. + When this option is enabled, the attention module will split the input tensor in slices, to compute attention + in several steps. This is useful to save some memory in exchange for a small speed decrease. + Args: + slice_size (`str` or `int`, *optional*, defaults to `"auto"`): + When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If + a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case, + `attention_head_dim` must be a multiple of `slice_size`. + """ + if slice_size == "auto": + # half the attention head size is usually a good trade-off between + # speed and memory + slice_size = self.unet.config.attention_head_dim // 2 + self.unet.set_attention_slice(slice_size) + + def disable_attention_slicing(self): + r""" + Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go + back to computing attention in one step. 
+ """ + # set slice_size = `None` to disable `attention slicing` + self.enable_attention_slicing(None) + + # @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]], + init_image: Union[torch.FloatTensor, PIL.Image.Image], + alpha: float = 1.2, + height: Optional[int] = 512, + width: Optional[int] = 512, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + generator: Optional[torch.Generator] = None, + return_dict: bool = True, + **kwargs, + ): + r""" + Function invoked when calling the pipeline for generation. + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator`, *optional*): + A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation + deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `nd.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. + When returning a tuple, the first element is a list with the generated images, and the second element is a + list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, according to the `safety_checker`. 
+ """ + embedding_learning_rate = 0.001 + diffusion_model_learning_rate = 2e-6 + text_embedding_optimization_steps = 500 + model_fine_tuning_optimization_steps = 1000 + + accelerator = Accelerator( + gradient_accumulation_steps=1, + mixed_precision="fp16", + log_with="wandb", + logging_dir="./logging", + ) + + if "torch_device" in kwargs: + device = kwargs.pop("torch_device") + warnings.warn( + "`torch_device` is deprecated as an input argument to `__call__` and will be removed in v0.3.0." + " Consider using `pipe.to(torch_device)` instead." + ) + + if device is None: + device = "cuda" if torch.cuda.is_available() else "cpu" + self.to(device) + + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + # Freeze vae and unet + freeze_params(self.vae.parameters()) + freeze_params(self.unet.parameters()) + freeze_params(self.text_encoder.parameters()) + self.unet.eval() + self.vae.eval() + self.text_encoder.eval() + + if accelerator.is_main_process: + accelerator.init_trackers( + "imagic", + config={ + "embedding_learning_rate": embedding_learning_rate, + "text_embedding_optimization_steps": text_embedding_optimization_steps, + }, + ) + + # get text embeddings for prompt + text_input = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncaton=True, + return_tensors="pt", + ) + text_embeddings = torch.nn.Parameter( + self.text_encoder(text_input.input_ids.to(self.device))[0], requires_grad=True + ) + text_embeddings = text_embeddings.detach() + text_embeddings.requires_grad_() + text_embeddings_orig = text_embeddings.clone() + + # Initialize the optimizer + optimizer = torch.optim.Adam( + [text_embeddings], # only optimize the embeddings + lr=embedding_learning_rate, + ) + + if isinstance(init_image, PIL.Image.Image): + init_image = preprocess(init_image) + + latents_dtype = text_embeddings.dtype + init_image = init_image.to(device=self.device, dtype=latents_dtype) + init_latent_image_dist = self.vae.encode(init_image).latent_dist + init_image_latents = init_latent_image_dist.sample(generator=generator) + init_image_latents = 0.18215 * init_image_latents + + pipeline = StableDiffusionPipeline( + text_encoder=self.text_encoder, + vae=self.vae, + unet=self.unet, + tokenizer=self.tokenizer, + scheduler=self.scheduler, + safety_checker=self.safety_checker, + feature_extractor=self.feature_extractor, + ) + pipeline = pipeline.to("cuda") + + progress_bar = tqdm(range(text_embedding_optimization_steps), disable=not accelerator.is_local_main_process) + progress_bar.set_description("Steps") + + global_step = 0 + + print("First optimizing the text embedding to better reconstruct the init image") + for _ in range(text_embedding_optimization_steps): + with accelerator.accumulate(text_embeddings): + # Sample noise that we'll add to the latents + noise = torch.randn(init_image_latents.shape).to(init_image_latents.device) + timesteps = torch.randint(1000, (1,), device=init_image_latents.device) + + # Add noise to the latents according to the noise magnitude at each timestep + # (this is the forward diffusion process) + noisy_latents = self.scheduler.add_noise(init_image_latents, noise, timesteps) + + # Predict the noise residual + noise_pred = self.unet(noisy_latents, timesteps, text_embeddings).sample + + loss = F.mse_loss(noise_pred, noise, reduction="none").mean([1, 2, 3]).mean() + accelerator.backward(loss) + + optimizer.step() + optimizer.zero_grad() + + # Checks if the 
accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + progress_bar.update(1) + global_step += 1 + + logs = {"loss": loss.detach().item()} # , "lr": lr_scheduler.get_last_lr()[0]} + progress_bar.set_postfix(**logs) + accelerator.log(logs, step=global_step) + + accelerator.wait_for_everyone() + + text_embeddings.requires_grad_(False) + freeze_params(text_embeddings) + + # Now we fine tune the unet to better reconstruct the image + unfreeze_params(self.unet.parameters()) + self.unet.train() + optimizer = torch.optim.Adam( + self.unet.parameters(), # only optimize unet + lr=diffusion_model_learning_rate, + ) + progress_bar = tqdm(range(model_fine_tuning_optimization_steps), disable=not accelerator.is_local_main_process) + + print("Next fine tuning the entire model to better reconstruct the init image") + for _ in range(model_fine_tuning_optimization_steps): + with accelerator.accumulate(self.unet.parameters()): + # Sample noise that we'll add to the latents + noise = torch.randn(init_image_latents.shape).to(init_image_latents.device) + timesteps = torch.randint(1000, (1,), device=init_image_latents.device) + + # Add noise to the latents according to the noise magnitude at each timestep + # (this is the forward diffusion process) + noisy_latents = self.scheduler.add_noise(init_image_latents, noise, timesteps) + + # Predict the noise residual + noise_pred = self.unet(noisy_latents, timesteps, text_embeddings).sample + + loss = F.mse_loss(noise_pred, noise, reduction="none").mean([1, 2, 3]).mean() + accelerator.backward(loss) + + optimizer.step() + optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + progress_bar.update(1) + global_step += 1 + + logs = {"loss": loss.detach().item()} # , "lr": lr_scheduler.get_last_lr()[0]} + progress_bar.set_postfix(**logs) + accelerator.log(logs, step=global_step) + + accelerator.wait_for_everyone() + + new_text_embeddings = alpha * text_embeddings_orig + (1 - alpha) * text_embeddings + image = pipeline( + prompt, text_embeddings=new_text_embeddings, scale=7.5, num_inference_steps=num_inference_steps + ).images[ + 0 + ] # , latents=noise_latents).images[0] + + # run safety checker + safety_cheker_input = self.feature_extractor(image, return_tensors="pt").to(self.device) + image, has_nsfw_concept = self.safety_checker(images=image, clip_input=safety_cheker_input.pixel_values) + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py index 5c94df25cca0..7b7e13bc0c6c 100644 --- a/src/diffusers/pipeline_utils.py +++ b/src/diffusers/pipeline_utils.py @@ -52,7 +52,7 @@ INDEX_FILE = "diffusion_pytorch_model.bin" -CUSTOM_PIPELINE_FILE_NAME = "pipeline.py" +CUSTOM_PIPELINE_FILE_NAME = "imagic_stable_diffusion.py" DUMMY_MODULES_FOLDER = "diffusers.utils" diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 5927f36b12a1..809516bfba5e 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -174,6 +174,7 @@ def __call__( return_dict: bool = True, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback_steps: Optional[int] = 1, + 
text_embeddings: Optional[torch.FloatTensor] = None, **kwargs, ): r""" @@ -264,7 +265,9 @@ def __call__( f" {self.tokenizer.model_max_length} tokens: {removed_text}" ) text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] - text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0] + + if text_embeddings is None: + text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0] # duplicate text embeddings for each generation per prompt, using mps friendly method bs_embed, seq_len, _ = text_embeddings.shape From 10349e52de16c8d1e27a39f6f714217512760375 Mon Sep 17 00:00:00 2001 From: Mark Rich Date: Sun, 23 Oct 2022 19:39:30 -0700 Subject: [PATCH 02/10] remove some testing changes --- src/diffusers/pipeline_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py index 7b7e13bc0c6c..5c94df25cca0 100644 --- a/src/diffusers/pipeline_utils.py +++ b/src/diffusers/pipeline_utils.py @@ -52,7 +52,7 @@ INDEX_FILE = "diffusion_pytorch_model.bin" -CUSTOM_PIPELINE_FILE_NAME = "imagic_stable_diffusion.py" +CUSTOM_PIPELINE_FILE_NAME = "pipeline.py" DUMMY_MODULES_FOLDER = "diffusers.utils" From 9c6bf052418023bb347ff0a52d88ff97ad2a297b Mon Sep 17 00:00:00 2001 From: Mark Rich Date: Thu, 27 Oct 2022 10:28:34 -0700 Subject: [PATCH 03/10] comments from PR review for imagic stable diffusion --- examples/community/imagic_stable_diffusion.py | 218 ++++++++++++++---- 1 file changed, 179 insertions(+), 39 deletions(-) diff --git a/examples/community/imagic_stable_diffusion.py b/examples/community/imagic_stable_diffusion.py index 2b3f5cc9ccc0..d1e93b33045e 100644 --- a/examples/community/imagic_stable_diffusion.py +++ b/examples/community/imagic_stable_diffusion.py @@ -4,6 +4,7 @@ """ import warnings from typing import List, Optional, Union +import inspect import numpy as np import torch @@ -19,6 +20,9 @@ from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler from tqdm.auto import tqdm from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer +from diffusers.utils import logging + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name def freeze_params(params): @@ -115,18 +119,18 @@ def disable_attention_slicing(self): # set slice_size = `None` to disable `attention slicing` self.enable_attention_slicing(None) - # @torch.no_grad() - def __call__( + + def train( self, prompt: Union[str, List[str]], init_image: Union[torch.FloatTensor, PIL.Image.Image], - alpha: float = 1.2, height: Optional[int] = 512, width: Optional[int] = 512, - num_inference_steps: Optional[int] = 50, - guidance_scale: Optional[float] = 7.5, generator: Optional[torch.Generator] = None, - return_dict: bool = True, + embedding_learning_rate: float = 0.001, + diffusion_model_learning_rate: float = 2e-6, + text_embedding_optimization_steps: int = 500, + model_fine_tuning_optimization_steps: int = 1000, **kwargs, ): r""" @@ -170,11 +174,6 @@ def __call__( list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`. 
""" - embedding_learning_rate = 0.001 - diffusion_model_learning_rate = 2e-6 - text_embedding_optimization_steps = 500 - model_fine_tuning_optimization_steps = 1000 - accelerator = Accelerator( gradient_accumulation_steps=1, mixed_precision="fp16", @@ -197,9 +196,9 @@ def __call__( raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") # Freeze vae and unet - freeze_params(self.vae.parameters()) - freeze_params(self.unet.parameters()) - freeze_params(self.text_encoder.parameters()) + self.vae.requires_grad_(False) + self.unet.requires_grad_(False) + self.text_encoder.requires_grad_(False) self.unet.eval() self.vae.eval() self.text_encoder.eval() @@ -243,23 +242,12 @@ def __call__( init_image_latents = init_latent_image_dist.sample(generator=generator) init_image_latents = 0.18215 * init_image_latents - pipeline = StableDiffusionPipeline( - text_encoder=self.text_encoder, - vae=self.vae, - unet=self.unet, - tokenizer=self.tokenizer, - scheduler=self.scheduler, - safety_checker=self.safety_checker, - feature_extractor=self.feature_extractor, - ) - pipeline = pipeline.to("cuda") - progress_bar = tqdm(range(text_embedding_optimization_steps), disable=not accelerator.is_local_main_process) progress_bar.set_description("Steps") global_step = 0 - print("First optimizing the text embedding to better reconstruct the init image") + logger.info("First optimizing the text embedding to better reconstruct the init image") for _ in range(text_embedding_optimization_steps): with accelerator.accumulate(text_embeddings): # Sample noise that we'll add to the latents @@ -291,10 +279,9 @@ def __call__( accelerator.wait_for_everyone() text_embeddings.requires_grad_(False) - freeze_params(text_embeddings) # Now we fine tune the unet to better reconstruct the image - unfreeze_params(self.unet.parameters()) + self.unet.requires_grad_(True) self.unet.train() optimizer = torch.optim.Adam( self.unet.parameters(), # only optimize unet @@ -302,7 +289,7 @@ def __call__( ) progress_bar = tqdm(range(model_fine_tuning_optimization_steps), disable=not accelerator.is_local_main_process) - print("Next fine tuning the entire model to better reconstruct the init image") + logger.info("Next fine tuning the entire model to better reconstruct the init image") for _ in range(model_fine_tuning_optimization_steps): with accelerator.accumulate(self.unet.parameters()): # Sample noise that we'll add to the latents @@ -332,19 +319,172 @@ def __call__( accelerator.log(logs, step=global_step) accelerator.wait_for_everyone() + self.text_embeddings_orig = text_embeddings_orig + self.text_embeddings = text_embeddings - new_text_embeddings = alpha * text_embeddings_orig + (1 - alpha) * text_embeddings - image = pipeline( - prompt, text_embeddings=new_text_embeddings, scale=7.5, num_inference_steps=num_inference_steps - ).images[ - 0 - ] # , latents=noise_latents).images[0] + @torch.no_grad() + def __call__( + self, + alpha: float = 1.2, + height: Optional[int] = 512, + width: Optional[int] = 512, + num_inference_steps: Optional[int] = 50, + generator: Optional[torch.Generator] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + guidance_scale: float = 7.5, + eta: float = 0.0, + **kwargs, + ): + r""" + Function invoked when calling the pipeline for generation. + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. 
+ width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator`, *optional*): + A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation + deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `nd.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. + When returning a tuple, the first element is a list with the generated images, and the second element is a + list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, according to the `safety_checker`. + """ + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + if self.text_embeddings is None: + raise ValueError("Please run the pipe.train() before trying to generate an image.") + if self.text_embeddings_orig is None: + raise ValueError("Please run the pipe.train() before trying to generate an image.") + + text_embeddings = alpha * self.text_embeddings_orig + (1 - alpha) * self.text_embeddings + + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. 
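+ # (applied further below as: noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond))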
+ do_classifier_free_guidance = guidance_scale > 1.0 + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance: + uncond_tokens = [""] + max_length = self.tokenizer.model_max_length + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0] + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = uncond_embeddings.shape[1] + uncond_embeddings = uncond_embeddings.view(1, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + text_embeddings = torch.cat([uncond_embeddings, text_embeddings]) + + # get the initial random noise unless the user supplied it + + # Unlike in other pipelines, latents need to be generated in the target device + # for 1-to-1 results reproducibility with the CompVis implementation. + # However this currently doesn't work in `mps`. + latents_shape = (1, self.unet.in_channels, height // 8, width // 8) + latents_dtype = text_embeddings.dtype + if self.device.type == "mps": + # randn does not exist on mps + latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to( + self.device + ) + else: + latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype) + + # set timesteps + self.scheduler.set_timesteps(num_inference_steps) + + # Some schedulers like PNDM have timesteps as arrays + # It's more optimized to move all timesteps to correct device beforehand + timesteps_tensor = self.scheduler.timesteps.to(self.device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + for i, t in enumerate(self.progress_bar(timesteps_tensor)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + latents = 1 / 0.18215 * latents + image = self.vae.decode(latents).sample + + image = (image / 2 + 0.5).clamp(0, 1) + + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to( + self.device + ) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype) + ) + else: + has_nsfw_concept = None - # run safety checker - safety_cheker_input = self.feature_extractor(image, return_tensors="pt").to(self.device) - image, has_nsfw_concept = self.safety_checker(images=image, clip_input=safety_cheker_input.pixel_values) + if output_type == "pil": + image = self.numpy_to_pil(image) if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) \ No newline at end of file From 983e032dbe096dd45ec389c77cada1e1d093329a Mon Sep 17 00:00:00 2001 From: Mark Rich Date: Thu, 27 Oct 2022 10:32:07 -0700 Subject: [PATCH 04/10] remove changes from pipeline_stable_diffusion as part of imagic pipeline --- examples/community/imagic_stable_diffusion.py | 11 +++++------ .../stable_diffusion/pipeline_stable_diffusion.py | 4 ---- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/examples/community/imagic_stable_diffusion.py b/examples/community/imagic_stable_diffusion.py index d1e93b33045e..60c2756de8a9 100644 --- a/examples/community/imagic_stable_diffusion.py +++ b/examples/community/imagic_stable_diffusion.py @@ -2,9 +2,9 @@ modeled after the textual_inversion.py / train_dreambooth.py and the work of justinpinkney here: https://github.com/justinpinkney/stable-diffusion/blob/main/notebooks/imagic.ipynb """ +import inspect import warnings from typing import List, Optional, Union -import inspect import numpy as np import torch @@ -18,9 +18,10 @@ from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler +from diffusers.utils import logging from tqdm.auto import tqdm from transformers import 
CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer -from diffusers.utils import logging + logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -119,7 +120,6 @@ def disable_attention_slicing(self): # set slice_size = `None` to disable `attention slicing` self.enable_attention_slicing(None) - def train( self, prompt: Union[str, List[str]], @@ -379,14 +379,13 @@ def __call__( """ if height % 8 != 0 or width % 8 != 0: raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - if self.text_embeddings is None: + if self.text_embeddings is None: raise ValueError("Please run the pipe.train() before trying to generate an image.") if self.text_embeddings_orig is None: raise ValueError("Please run the pipe.train() before trying to generate an image.") text_embeddings = alpha * self.text_embeddings_orig + (1 - alpha) * self.text_embeddings - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` # corresponds to doing no classifier free guidance. @@ -487,4 +486,4 @@ def __call__( if not return_dict: return (image, has_nsfw_concept) - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) \ No newline at end of file + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 809516bfba5e..cbce0b33a942 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -174,7 +174,6 @@ def __call__( return_dict: bool = True, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback_steps: Optional[int] = 1, - text_embeddings: Optional[torch.FloatTensor] = None, **kwargs, ): r""" @@ -266,9 +265,6 @@ def __call__( ) text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] - if text_embeddings is None: - text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0] - # duplicate text embeddings for each generation per prompt, using mps friendly method bs_embed, seq_len, _ = text_embeddings.shape text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1) From ca8e4c0a7d8eba1cb3db95e7fdeaf191c97a80a5 Mon Sep 17 00:00:00 2001 From: Mark Rich Date: Thu, 27 Oct 2022 10:35:06 -0700 Subject: [PATCH 05/10] clean up example code and add line back in to pipeline_stable_diffusion for imagic pipeline --- examples/community/README.md | 13 ++++--------- .../stable_diffusion/pipeline_stable_diffusion.py | 1 + 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/examples/community/README.md b/examples/community/README.md index d1dcbf7a2ce2..8b7c47e2b499 100644 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -1,6 +1,4 @@ -# Community Examples - -> **For more information about community pipelines, please have a look at [this issue](https://github.com/huggingface/diffusers/issues/841).** +# Community Examples > **For more information about community pipelines, please have a look at [this issue](https://github.com/huggingface/diffusers/issues/841).** **Community** examples consist of both inference and training examples that have been added by the community. Please have a look at the following table to get an overview of all community examples. 
Click on the **Code Example** to get a copy-and-paste ready code example that you can try out. @@ -392,8 +390,6 @@ Allows you to edit an image using stable diffusion. ```python - - import torch as th import numpy as np import requests @@ -419,6 +415,7 @@ pipe = DiffusionPipeline.from_pretrained( ).to(device) +<<<<<<< HEAD >>>>>>> initial commit to add imagic to stable diffusion community pipelines def dummy(images, **kwargs): return images, False @@ -490,13 +487,11 @@ image.save('./seed_resize/seed_resize_{w}_{h}_image_compare.png'.format(w=width, ``` ======= images = [] +======= +>>>>>>> clean up example code and add line back in to pipeline_stable_diffusion for imagic pipeline generator = th.Generator("cuda").manual_seed(0) - seed = 0 prompt = "A photo of Barack Obama smiling with a big grin" - -images = [] - url = 'https://www.dropbox.com/s/6tlwzr73jd1r9yk/obama.png?dl=1' response = requests.get(url) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index cbce0b33a942..5927f36b12a1 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -264,6 +264,7 @@ def __call__( f" {self.tokenizer.model_max_length} tokens: {removed_text}" ) text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] + text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0] # duplicate text embeddings for each generation per prompt, using mps friendly method bs_embed, seq_len, _ = text_embeddings.shape From 9af9c7fda9035daa74cf9cc77248ce75ac4954ef Mon Sep 17 00:00:00 2001 From: Mark Rich Date: Thu, 27 Oct 2022 10:39:36 -0700 Subject: [PATCH 06/10] remove unused functions --- examples/community/imagic_stable_diffusion.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/examples/community/imagic_stable_diffusion.py b/examples/community/imagic_stable_diffusion.py index 60c2756de8a9..8d877132fa2d 100644 --- a/examples/community/imagic_stable_diffusion.py +++ b/examples/community/imagic_stable_diffusion.py @@ -25,17 +25,6 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name - -def freeze_params(params): - for param in params: - param.requires_grad = False - - -def unfreeze_params(params): - for param in params: - param.requires_grad = True - - def preprocess(image): w, h = image.size w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32 From a3448a3aa532066ffb518856d4967a9a2fe80cd5 Mon Sep 17 00:00:00 2001 From: Mark Rich Date: Thu, 27 Oct 2022 10:42:08 -0700 Subject: [PATCH 07/10] small code quality changes for imagic pipeline --- examples/community/README.md | 3 ++- examples/community/imagic_stable_diffusion.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/community/README.md b/examples/community/README.md index 8b7c47e2b499..b709f67af698 100644 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -1,5 +1,6 @@ -# Community Examples > **For more information about community pipelines, please have a look at [this issue](https://github.com/huggingface/diffusers/issues/841).** +# Community Examples +> **For more information about community pipelines, please have a look at [this issue](https://github.com/huggingface/diffusers/issues/841).** **Community** examples consist of both inference and training examples that have been added by the community. 
Please have a look at the following table to get an overview of all community examples. Click on the **Code Example** to get a copy-and-paste ready code example that you can try out. If a community doesn't work as expected, please open an issue and ping the author on it. diff --git a/examples/community/imagic_stable_diffusion.py b/examples/community/imagic_stable_diffusion.py index 8d877132fa2d..2aada7fa165c 100644 --- a/examples/community/imagic_stable_diffusion.py +++ b/examples/community/imagic_stable_diffusion.py @@ -12,7 +12,6 @@ import PIL from accelerate import Accelerator -from diffusers import StableDiffusionPipeline from diffusers.models import AutoencoderKL, UNet2DConditionModel from diffusers.pipeline_utils import DiffusionPipeline from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput @@ -25,6 +24,7 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name + def preprocess(image): w, h = image.size w, h = map(lambda x: x - x % 32, (w, h)) # resize to integer multiple of 32 From fe912d77b00f1fca1e348124b87cd2191a397224 Mon Sep 17 00:00:00 2001 From: Mark Rich Date: Mon, 31 Oct 2022 20:29:26 -0700 Subject: [PATCH 08/10] clean up readme --- examples/community/README.md | 99 +++++++++++++++--------------------- 1 file changed, 41 insertions(+), 58 deletions(-) diff --git a/examples/community/README.md b/examples/community/README.md index b709f67af698..8ab3dd9e71df 100644 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -1,6 +1,7 @@ # Community Examples > **For more information about community pipelines, please have a look at [this issue](https://github.com/huggingface/diffusers/issues/841).** + **Community** examples consist of both inference and training examples that have been added by the community. Please have a look at the following table to get an overview of all community examples. Click on the **Code Example** to get a copy-and-paste ready code example that you can try out. If a community doesn't work as expected, please open an issue and ping the author on it. @@ -15,12 +16,9 @@ If a community doesn't work as expected, please open an issue and ping the autho | Speech to Image | Using automatic-speech-recognition to transcribe text and Stable Diffusion to generate images | [Speech to Image](#speech-to-image) | - | [Mikail Duzenli](https://github.com/MikailINTech) | Wild Card Stable Diffusion | Stable Diffusion Pipeline that supports prompts that contain wildcard terms (indicated by surrounding double underscores), with values instantiated randomly from a corresponding txt file or a dictionary of possible values | [Wildcard Stable Diffusion](#wildcard-stable-diffusion) | - | [Shyam Sudhakaran](https://github.com/shyamsn97) | | Composable Stable Diffusion| Stable Diffusion Pipeline that supports prompts that contain "|" in prompts (as an AND condition) and weights (separated by "|" as well) to positively / negatively weight prompts. | [Composable Stable Diffusion](#composable-stable-diffusion) | - | [Mark Rich](https://github.com/MarkRich) | -<<<<<<< HEAD | Seed Resizing Stable Diffusion| Stable Diffusion Pipeline that supports resizing an image and retaining the concepts of the 512 by 512 generation. 
| [Seed Resizing](#seed-resizing) | - | [Mark Rich](https://github.com/MarkRich) | -======= | Imagic Stable Diffusion | Stable Diffusion Pipeline that enables writing a text prompt to edit an existing image| [Imagic Stable Diffusion](#imagic-stable-diffusion) | - | [Mark Rich](https://github.com/MarkRich) | ->>>>>>> initial commit to add imagic to stable diffusion community pipelines To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly. @@ -376,29 +374,56 @@ for i in range(4): for i, img in enumerate(images): img.save(f"./composable_diffusion/image_{i}.png") ``` -<<<<<<< HEAD -### Seed Resizing -Test seed resizing. Originally generate an image in 512 by 512, then generate image with same seed at 512 by 592 using seed resizing. Finally, generate 512 by 592 using original stable diffusion pipeline. - -```python -import torch as th -import numpy as np -from diffusers import DiffusionPipeline -======= ### Imagic Stable Diffusion Allows you to edit an image using stable diffusion. - ```python -import torch as th -import numpy as np import requests from PIL import Image from io import BytesIO import torch from diffusers import DiffusionPipeline, DDIMScheduler ->>>>>>> initial commit to add imagic to stable diffusion community pipelines +has_cuda = torch.cuda.is_available() +device = torch.device('cpu' if not has_cuda else 'cuda') +pipe = DiffusionPipeline.from_pretrained( + "CompVis/stable-diffusion-v1-4", + safety_checker=None, + use_auth_token=True, + custom_pipeline="imagic_stable_diffusion", + scheduler = DDIMScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler") +).to(device) +generator = th.Generator("cuda").manual_seed(0) +seed = 0 +prompt = "A photo of Barack Obama smiling with a big grin" +url = 'https://www.dropbox.com/s/6tlwzr73jd1r9yk/obama.png?dl=1' +response = requests.get(url) +init_image = Image.open(BytesIO(response.content)).convert("RGB") +init_image = init_image.resize((512, 512)) +res = pipe.train( + prompt, + init_image, + guidance_scale=7.5, + num_inference_steps=50, + generator=generator) +res = pipe(alpha=1) +image = res.images[0] +image.save('./imagic/imagic_image_alpha_1.png') +res = pipe(alpha=1.5) +image = res.images[0] +image.save('./imagic/imagic_image_alpha_1_5.png') +res = pipe(alpha=2) +image = res.images[0] +image.save('./imagic/imagic_image_alpha_2.png') +``` + +### Seed Resizing +Test seed resizing. Originally generate an image in 512 by 512, then generate image with same seed at 512 by 592 using seed resizing. Finally, generate 512 by 592 using original stable diffusion pipeline. 
+ +```python +import torch as th +import numpy as np +from diffusers import DiffusionPipeline has_cuda = th.cuda.is_available() device = th.device('cpu' if not has_cuda else 'cuda') @@ -406,24 +431,14 @@ device = th.device('cpu' if not has_cuda else 'cuda') pipe = DiffusionPipeline.from_pretrained( "CompVis/stable-diffusion-v1-4", use_auth_token=True, -<<<<<<< HEAD custom_pipeline="seed_resize_stable_diffusion" ).to(device) -======= - custom_pipeline="imagic_stable_diffusion", - scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False) -).to(device) - - -<<<<<<< HEAD ->>>>>>> initial commit to add imagic to stable diffusion community pipelines def dummy(images, **kwargs): return images, False pipe.safety_checker = dummy -<<<<<<< HEAD images = [] th.manual_seed(0) @@ -486,35 +501,3 @@ res = pipe_compare( image = res.images[0] image.save('./seed_resize/seed_resize_{w}_{h}_image_compare.png'.format(w=width, h=height)) ``` -======= -images = [] -======= ->>>>>>> clean up example code and add line back in to pipeline_stable_diffusion for imagic pipeline -generator = th.Generator("cuda").manual_seed(0) -seed = 0 -prompt = "A photo of Barack Obama smiling with a big grin" -url = 'https://www.dropbox.com/s/6tlwzr73jd1r9yk/obama.png?dl=1' - -response = requests.get(url) -init_image = Image.open(BytesIO(response.content)).convert("RGB") -init_image = init_image.resize((512, 512)) - -res = pipe.train( - prompt, - init_image, - guidance_scale=7.5, - num_inference_steps=50, - generator=generator) -res = pipe(alpha=1) -image = res.images[0] -image.save('./imagic/imagic_image_alpha_1.png') - -res = pipe(alpha=1.5) -image = res.images[0] -image.save('./imagic/imagic_image_alpha_1_5.png') - -res = pipe(alpha=2) -image = res.images[0] -image.save('./imagic/imagic_image_alpha_2.png') -``` ->>>>>>> initial commit to add imagic to stable diffusion community pipelines From d41229d7fabe367153d27d31a18fb158888c4235 Mon Sep 17 00:00:00 2001 From: Mark Rich Date: Mon, 31 Oct 2022 20:31:55 -0700 Subject: [PATCH 09/10] remove hardcoded logging values for imagic community example --- examples/community/imagic_stable_diffusion.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/community/imagic_stable_diffusion.py b/examples/community/imagic_stable_diffusion.py index 2aada7fa165c..92aa677b4626 100644 --- a/examples/community/imagic_stable_diffusion.py +++ b/examples/community/imagic_stable_diffusion.py @@ -166,8 +166,6 @@ def train( accelerator = Accelerator( gradient_accumulation_steps=1, mixed_precision="fp16", - log_with="wandb", - logging_dir="./logging", ) if "torch_device" in kwargs: From 8d1d60c4c2554c355f856ef330fb7b7f00f329b9 Mon Sep 17 00:00:00 2001 From: Mark Rich Date: Mon, 31 Oct 2022 21:25:46 -0700 Subject: [PATCH 10/10] undo change for DDIMScheduler --- examples/community/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/community/README.md b/examples/community/README.md index 8ab3dd9e71df..bb3964e1a7c4 100644 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -391,7 +391,7 @@ pipe = DiffusionPipeline.from_pretrained( safety_checker=None, use_auth_token=True, custom_pipeline="imagic_stable_diffusion", - scheduler = DDIMScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler") + scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False) ).to(device) generator = 
th.Generator("cuda").manual_seed(0)
 seed = 0
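
Taken together, the series leaves `pipe.train()` responsible for the two optimization stages and `pipe(...)` responsible for generation. Below is a minimal sketch of reusing a trained pipeline for an `alpha` sweep, assuming `pipe` has already been trained as in the README example above (note that the `./imagic` output directory used there is never created by the example itself):

```python
# Sketch: sweep `alpha` on an already-trained Imagic pipeline.
# Assumes `pipe` is the trained pipeline from the README example.
import os

os.makedirs("./imagic", exist_ok=True)  # the README example saves here without creating the directory

# alpha near 1 stays close to the input image; larger values apply the
# text edit more strongly (the pipeline's default is 1.2).
for alpha in (1.0, 1.25, 1.5, 1.75, 2.0):
    result = pipe(alpha=alpha, guidance_scale=7.5, num_inference_steps=50)
    result.images[0].save(f"./imagic/imagic_image_alpha_{alpha}.png")
```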