From a48b0266e170747f35d02be7d5e394f3ed796357 Mon Sep 17 00:00:00 2001
From: Ben Glickenhaus
Date: Thu, 3 Nov 2022 11:00:04 -0400
Subject: [PATCH 01/18] v diffusion support for ddpm

---
 src/diffusers/schedulers/scheduling_ddpm.py | 92 ++++++++++++++-------
 1 file changed, 60 insertions(+), 32 deletions(-)

diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py
index ee4f608e09aa..19acf3dc2ff5 100644
--- a/src/diffusers/schedulers/scheduling_ddpm.py
+++ b/src/diffusers/schedulers/scheduling_ddpm.py
@@ -16,16 +16,25 @@
 import math
 from dataclasses import dataclass
-from typing import Optional, Tuple, Union
-
+from typing import Optional, Tuple, Union, Literal
 import numpy as np
 import torch
 
 from ..configuration_utils import ConfigMixin, register_to_config
-from ..utils import BaseOutput, deprecate
+from ..utils import BaseOutput
 from .scheduling_utils import SchedulerMixin
 
 
+def expand_to_shape(input, timesteps, shape, device):
+    """
+    Helper indexes a 1D tensor `input` using a 1D index tensor `timesteps`, then reshapes the result to broadcast nicely with `shape`. Useful for parallelizing operations over `shape[0]` number of diffusion steps at once.
+    """
+    out = torch.gather(input.to(device), 0, timesteps.to(device))
+    reshape = [shape[0]] + [1] * (len(shape) - 1)
+    out = out.reshape(*reshape)
+    return out
+
+
 @dataclass
 class DDPMSchedulerOutput(BaseOutput):
     """
@@ -102,6 +111,14 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
 
     """
 
+    _compatible_classes = [
+        "DDIMScheduler",
+        "PNDMScheduler",
+        "LMSDiscreteScheduler",
+        "EulerDiscreteScheduler",
+        "EulerAncestralDiscreteScheduler",
+    ]
+
     @register_to_config
     def __init__(
         self,
@@ -112,15 +129,7 @@ def __init__(
         trained_betas: Optional[np.ndarray] = None,
         variance_type: str = "fixed_small",
         clip_sample: bool = True,
-        **kwargs,
     ):
-        deprecate(
-            "tensor_format",
-            "0.6.0",
-            "If you're running your code in PyTorch, you can safely remove this argument.",
-            take_from=kwargs,
-        )
-
         if trained_betas is not None:
             self.betas = torch.from_numpy(trained_betas)
         elif beta_schedule == "linear":
@@ -142,8 +151,8 @@ def __init__(
 
         self.alphas = 1.0 - self.betas
         self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
-        self.sigmas = 1 - self.alphas**2
-        self.one = torch.tensor(1.0)
+        self.sqrt_alphas_cumprod = torch.sqrt(self.alphas_cumprod)
+        self.sqrt_one_minus_alphas_cumprod = torch.sqrt(1 - self.alphas_cumprod)
 
         # standard deviation of the initial noise distribution
         self.init_noise_sigma = 1.0
@@ -185,11 +194,11 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic
 
     def _get_variance(self, timestep, predicted_variance=None, variance_type=None):
         alpha_prod_t = self.alphas_cumprod[timestep]
-        alpha_prod_t_prev = self.alphas_cumprod[timestep - 1] if timestep > 0 else self.one
+        alpha_prod_t_prev = self.alphas_cumprod[timestep - 1] if timestep > 0 else torch.tensor(1.0)
 
-        # For timestep > 0, compute predicted variance βt (see formula (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf)
+        # For t > 0, compute predicted variance βt (see formula (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf)
         # and sample from it to get previous sample
-        # x_{timestep-1} ~ N(pred_prev_sample, variance) == add variance to pred_sample
+        # x_{t-1} ~ N(pred_prev_sample, variance) == add variance to pred_sample
         variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * self.betas[timestep]
 
         if variance_type is None:
@@ -213,6 +222,8 @@ def _get_variance(self, timestep, predicted_variance=None, variance_type=None):
             max_log = self.betas[timestep]
             frac = (predicted_variance + 1) / 2
             variance = frac * max_log + (1 - frac) * min_log
+        elif variance_type == "v_diffusion":
+            variance = torch.log(self.betas[timestep] * (1 - alpha_prod_t_prev) / (1 - alpha_prod_t))
 
         return variance
 
@@ -221,9 +232,10 @@ def step(
         model_output: torch.FloatTensor,
         timestep: int,
         sample: torch.FloatTensor,
-        prediction_type: str = "epsilon",
+        prediction_type: Literal["epsilon", "sample", "v"] = "epsilon",
         generator=None,
         return_dict: bool = True,
+        v_prediction: bool = True,
     ) -> Union[DDPMSchedulerOutput, Tuple]:
         """
         Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
@@ -234,10 +246,8 @@ def step(
             timestep (`int`): current discrete timestep in the diffusion chain.
             sample (`torch.FloatTensor`):
                 current instance of sample being created by diffusion process.
-            prediction_type (`str`):
-                prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
-                process), `sample` (directly predicting the noisy sample), or `v` (see section 2.4
-                https://imagen.research.google/video/paper.pdf)
+            prediction_type (`Literal["epsilon", "sample", "v"]`, optional):
+                prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process), `sample` (directly predicting the noisy sample) or `v` (see section 2.4 https://imagen.research.google/video/paper.pdf)
             generator: random number generator.
             return_dict (`bool`): option for returning tuple rather than DDPMSchedulerOutput class
 
@@ -254,23 +264,26 @@ def step(
 
         # 1. compute alphas, betas
         alpha_prod_t = self.alphas_cumprod[timestep]
-        alpha_prod_t_prev = self.alphas_cumprod[timestep - 1] if timestep > 0 else self.one
+        alpha_prod_t_prev = self.alphas_cumprod[timestep - 1] if timestep > 0 else torch.tensor(1.0)
         beta_prod_t = 1 - alpha_prod_t
         beta_prod_t_prev = 1 - alpha_prod_t_prev
 
         # 2. compute predicted original sample from predicted noise also called
         # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf
-        if prediction_type == "epsilon":
+        if prediction_type == "v":
+            # x_recon in p_mean_variance
+            pred_original_sample = (
+                sample * self.sqrt_alphas_cumprod[timestep]
+                - model_output * self.sqrt_one_minus_alphas_cumprod[timestep]
+            )
+            eps = (
+                model_output * self.sqrt_alphas_cumprod[timestep]
+                - sample * self.sqrt_one_minus_alphas_cumprod[timestep]
+            )
+        elif prediction_type == "epsilon":
             pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
-        elif prediction_type == "sample":
-            pred_original_sample = model_output
-        elif prediction_type == "v":
-            # v_t = alpha_t * epsilon - sigma_t * x
-            # need to merge the PRs for sigma to be available in DDPM
-            pred = sample * self.alphas[timestep] - model_output * self.sigmas[timestep]
-            eps = model_output * self.alphas[timestep] - sample * self.sigmas[timestep]
         else:
-            raise ValueError(f"prediction_type given as {prediction_type} must be one of `epsilon`, `sample`, or `v`")
+            pred_original_sample = model_output
 
         # 3. Clip "predicted x_0"
         if self.config.clip_sample:
@@ -291,7 +304,12 @@ def step(
             noise = torch.randn(
                 model_output.size(), dtype=model_output.dtype, layout=model_output.layout, generator=generator
             ).to(model_output.device)
-            variance = (self._get_variance(timestep, predicted_variance=predicted_variance) ** 0.5) * noise
+            if self.variance_type == "fixed_small_log":
+                variance = self._get_variance(t, predicted_variance=predicted_variance) * noise
+            elif self.variance_type == "v_diffusion":
+                variance = torch.exp(0.5 * self._get_variance(timestep, predicted_variance)) * noise
+            else:
+                variance = (self._get_variance(timestep, predicted_variance=predicted_variance) ** 0.5) * noise
 
         pred_prev_sample = pred_prev_sample + variance
 
@@ -306,6 +324,11 @@ def add_noise(
         noise: torch.FloatTensor,
         timesteps: torch.IntTensor,
     ) -> torch.FloatTensor:
+        if self.variance_type == "v_diffusion":
+            alpha, sigma = self.get_alpha_sigma(original_samples, timesteps, original_samples.device)
+            z_t = alpha * original_samples + sigma * noise
+            return z_t
+
         # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
         self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype)
         timesteps = timesteps.to(original_samples.device)
@@ -325,3 +348,8 @@ def add_noise(
 
     def __len__(self):
         return self.config.num_train_timesteps
+
+    def get_alpha_sigma(self, sample, timesteps, device):
+        alpha = expand_to_shape(self.sqrt_alphas_cumprod, timesteps, sample.shape, device)
+        sigma = expand_to_shape(self.sqrt_one_minus_alphas_cumprod, timesteps, sample.shape, device)
+        return alpha, sigma

From 3d702c6d652cb4d7a50d9cea750ca7115b3c4375 Mon Sep 17 00:00:00 2001
From: Ben Glickenhaus
Date: Thu, 3 Nov 2022 11:03:26 -0400
Subject: [PATCH 02/18] quality and style

---
 src/diffusers/schedulers/scheduling_ddpm.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py
index 19acf3dc2ff5..c3c72171f1cc 100644
--- a/src/diffusers/schedulers/scheduling_ddpm.py
+++ b/src/diffusers/schedulers/scheduling_ddpm.py
@@ -16,7 +16,8 @@
 
 import math
 from dataclasses import dataclass
-from typing import Optional, Tuple, Union, Literal
+from typing import Literal, Optional, Tuple, Union
+
 import numpy as np
 import torch
 
@@ -27,7 +28,8 @@
 
 def expand_to_shape(input, timesteps, shape, device):
     """
-    Helper indexes a 1D tensor `input` using a 1D index tensor `timesteps`, then reshapes the result to broadcast nicely with `shape`. Useful for parallelizing operations over `shape[0]` number of diffusion steps at once.
+    Helper indexes a 1D tensor `input` using a 1D index tensor `timesteps`, then reshapes the result to broadcast
+    nicely with `shape`. Useful for parallelizing operations over `shape[0]` number of diffusion steps at once.
     """
     out = torch.gather(input.to(device), 0, timesteps.to(device))
     reshape = [shape[0]] + [1] * (len(shape) - 1)
@@ -247,7 +249,9 @@ def step(
             sample (`torch.FloatTensor`):
                 current instance of sample being created by diffusion process.
             prediction_type (`Literal["epsilon", "sample", "v"]`, optional):
-                prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process), `sample` (directly predicting the noisy sample) or `v` (see section 2.4 https://imagen.research.google/video/paper.pdf)
+                prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
+                process), `sample` (directly predicting the noisy sample) or `v` (see section 2.4
+                https://imagen.research.google/video/paper.pdf)
             generator: random number generator.
             return_dict (`bool`): option for returning tuple rather than DDPMSchedulerOutput class
 
@@ -276,10 +280,6 @@ def step(
                 sample * self.sqrt_alphas_cumprod[timestep]
                 - model_output * self.sqrt_one_minus_alphas_cumprod[timestep]
             )
-            eps = (
-                model_output * self.sqrt_alphas_cumprod[timestep]
-                - sample * self.sqrt_one_minus_alphas_cumprod[timestep]
-            )
         elif prediction_type == "epsilon":
             pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
         else:
@@ -305,7 +305,7 @@ def step(
                 model_output.size(), dtype=model_output.dtype, layout=model_output.layout, generator=generator
             ).to(model_output.device)
             if self.variance_type == "fixed_small_log":
-                variance = self._get_variance(t, predicted_variance=predicted_variance) * noise
+                variance = self._get_variance(timestep, predicted_variance=predicted_variance) * noise
             elif self.variance_type == "v_diffusion":
                 variance = torch.exp(0.5 * self._get_variance(timestep, predicted_variance)) * noise
             else:

From 0889fd1d1178fc203321e5a095b737a111c9c9fb Mon Sep 17 00:00:00 2001
From: Ben Glickenhaus
Date: Thu, 3 Nov 2022 11:04:44 -0400
Subject: [PATCH 03/18] variable name consistency

---
 src/diffusers/schedulers/scheduling_ddpm.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py
index c3c72171f1cc..25b081da87f1 100644
--- a/src/diffusers/schedulers/scheduling_ddpm.py
+++ b/src/diffusers/schedulers/scheduling_ddpm.py
@@ -198,9 +198,9 @@ def _get_variance(self, timestep, predicted_variance=None, variance_type=None):
         alpha_prod_t = self.alphas_cumprod[timestep]
         alpha_prod_t_prev = self.alphas_cumprod[timestep - 1] if timestep > 0 else torch.tensor(1.0)
 
-        # For t > 0, compute predicted variance βt (see formula (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf)
+        # For timestep > 0, compute predicted variance βt (see formula (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf)
         # and sample from it to get previous sample
-        # x_{t-1} ~ N(pred_prev_sample, variance) == add variance to pred_sample
+        # x_{timestep-1} ~ N(pred_prev_sample, variance) == add variance to pred_sample
         variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * self.betas[timestep]
 

From f7c709518fe1c036866ea60b09119fd3013b534b Mon Sep 17 00:00:00 2001
From: Ben Glickenhaus
Date: Thu, 3 Nov 2022 11:07:38 -0400
Subject: [PATCH 04/18] missing base case

---
 src/diffusers/schedulers/scheduling_ddpm.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py
index 25b081da87f1..878d67e817d1 100644
--- a/src/diffusers/schedulers/scheduling_ddpm.py
+++ b/src/diffusers/schedulers/scheduling_ddpm.py
@@ -282,8 +282,11 @@ def step(
             )
         elif prediction_type == "epsilon":
             pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
-        else:
+
+        elif prediction_type == "sample":
             pred_original_sample = model_output
+        else:
+            raise ValueError(f"prediction_type given as {prediction_type} must be one of `epsilon`, `sample`, or `v`")
 
         # 3. Clip "predicted x_0"
         if self.config.clip_sample:

From 0c23e1162ad31fee62b327ce6ce4ab5ba7a82188 Mon Sep 17 00:00:00 2001
From: Ben Glickenhaus
Date: Thu, 3 Nov 2022 11:23:35 -0400
Subject: [PATCH 05/18] pass prediction type along in the pipeline

---
 src/diffusers/pipelines/ddpm/pipeline_ddpm.py | 7 +++++--
 src/diffusers/schedulers/scheduling_ddpm.py   | 3 ++-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
index aae29737aae3..3d5afc94e3df 100644
--- a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
+++ b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 
 
-from typing import Optional, Tuple, Union
+from typing import Optional, Tuple, Union, Literal
 
 import torch
 
@@ -44,6 +44,7 @@ def __call__(
         generator: Optional[torch.Generator] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
+        prediction_type: Literal["epsilon", "sample", "v"] = "epsilon",
         **kwargs,
     ) -> Union[ImagePipelineOutput, Tuple]:
         r"""
@@ -80,7 +81,9 @@ def __call__(
             model_output = self.unet(image, t).sample
 
             # 2. compute previous image: x_t -> t_t-1
-            image = self.scheduler.step(model_output, t, image, generator=generator).prev_sample
+            image = self.scheduler.step(
+                model_output, t, image, generator=generator, prediction_type=prediction_type
+            ).prev_sample
 
         image = (image / 2 + 0.5).clamp(0, 1)
         image = image.cpu().permute(0, 2, 3, 1).numpy()
diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py
index 878d67e817d1..1813592a069d 100644
--- a/src/diffusers/schedulers/scheduling_ddpm.py
+++ b/src/diffusers/schedulers/scheduling_ddpm.py
@@ -237,7 +237,6 @@ def step(
         prediction_type: Literal["epsilon", "sample", "v"] = "epsilon",
         generator=None,
         return_dict: bool = True,
-        v_prediction: bool = True,
     ) -> Union[DDPMSchedulerOutput, Tuple]:
         """
         Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
@@ -261,6 +260,8 @@ def step(
             returning a tuple, the first element is the sample tensor.
 
         """
+        if self.variance_type == "v_diffusion":
+            assert prediction_type == "v", "Need to use v prediction with v_diffusion"
         if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in ["learned", "learned_range"]:
             model_output, predicted_variance = torch.split(model_output, sample.shape[1], dim=1)
         else:

From b46327e89e079b2eaaa2351a27c86900d77c168d Mon Sep 17 00:00:00 2001
From: Ben Glickenhaus
Date: Mon, 7 Nov 2022 09:19:22 -0500
Subject: [PATCH 06/18] put prediction type in scheduler config

---
 src/diffusers/pipelines/ddpm/pipeline_ddpm.py |  7 ++-----
 src/diffusers/schedulers/scheduling_ddpm.py   | 16 ++++++++++------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
index 3d5afc94e3df..a9284063e884 100644
--- a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
+++ b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 
 
-from typing import Optional, Tuple, Union, Literal
+from typing import Literal, Optional, Tuple, Union
 
 import torch
 
@@ -44,7 +44,6 @@ def __call__(
         generator: Optional[torch.Generator] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
-        prediction_type: Literal["epsilon", "sample", "v"] = "epsilon",
         **kwargs,
     ) -> Union[ImagePipelineOutput, Tuple]:
         r"""
@@ -81,9 +80,7 @@ def __call__(
             model_output = self.unet(image, t).sample
 
             # 2. compute previous image: x_t -> t_t-1
-            image = self.scheduler.step(
-                model_output, t, image, generator=generator, prediction_type=prediction_type
-            ).prev_sample
+            image = self.scheduler.step(model_output, t, image, generator=generator).prev_sample
 
         image = (image / 2 + 0.5).clamp(0, 1)
         image = image.cpu().permute(0, 2, 3, 1).numpy()
diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py
index 1813592a069d..0327c44e3c4a 100644
--- a/src/diffusers/schedulers/scheduling_ddpm.py
+++ b/src/diffusers/schedulers/scheduling_ddpm.py
@@ -131,6 +131,7 @@ def __init__(
         trained_betas: Optional[np.ndarray] = None,
         variance_type: str = "fixed_small",
         clip_sample: bool = True,
+        prediction_type: Literal["epsilon", "sample", "v"] = "epsilon",
     ):
         if trained_betas is not None:
             self.betas = torch.from_numpy(trained_betas)
@@ -164,6 +165,7 @@ def __init__(
         self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy())
 
         self.variance_type = variance_type
+        self.prediction_type = prediction_type
 
     def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor:
         """
@@ -234,7 +236,7 @@ def step(
         model_output: torch.FloatTensor,
         timestep: int,
         sample: torch.FloatTensor,
-        prediction_type: Literal["epsilon", "sample", "v"] = "epsilon",
+        # prediction_type: Literal["epsilon", "sample", "v"] = "epsilon",
         generator=None,
         return_dict: bool = True,
     ) -> Union[DDPMSchedulerOutput, Tuple]:
@@ -261,7 +263,7 @@ def step(
 
         """
         if self.variance_type == "v_diffusion":
-            assert prediction_type == "v", "Need to use v prediction with v_diffusion"
+            assert self.prediction_type == "v", "Need to use v prediction with v_diffusion"
         if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in ["learned", "learned_range"]:
             model_output, predicted_variance = torch.split(model_output, sample.shape[1], dim=1)
         else:
@@ -275,19 +277,21 @@ def step(
 
         # 2. compute predicted original sample from predicted noise also called
         # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf
-        if prediction_type == "v":
+        if self.prediction_type == "v":
             # x_recon in p_mean_variance
             pred_original_sample = (
                 sample * self.sqrt_alphas_cumprod[timestep]
                 - model_output * self.sqrt_one_minus_alphas_cumprod[timestep]
             )
-        elif prediction_type == "epsilon":
+        elif self.prediction_type == "epsilon":
             pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
 
-        elif prediction_type == "sample":
+        elif self.prediction_type == "sample":
             pred_original_sample = model_output
         else:
-            raise ValueError(f"prediction_type given as {prediction_type} must be one of `epsilon`, `sample`, or `v`")
+            raise ValueError(
+                f"prediction_type given as {self.prediction_type} must be one of `epsilon`, `sample`, or `v`"
+            )
 
         # 3. Clip "predicted x_0"
         if self.config.clip_sample:

From 45c36c85d71ae278ca7d7ad87263394128b30eb1 Mon Sep 17 00:00:00 2001
From: Ben Glickenhaus
Date: Mon, 7 Nov 2022 09:20:16 -0500
Subject: [PATCH 07/18] style

---
 src/diffusers/pipelines/ddpm/pipeline_ddpm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
index a9284063e884..aae29737aae3 100644
--- a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
+++ b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 
 
-from typing import Literal, Optional, Tuple, Union
+from typing import Optional, Tuple, Union
 
 import torch
 

From 13404a6857e96a2314df4a4d82b268a1ccb4922d Mon Sep 17 00:00:00 2001
From: Ben Glickenhaus
Date: Wed, 9 Nov 2022 15:16:21 -0500
Subject: [PATCH 08/18] try to train on ddim

---
 examples/v_prediction/train_butterflies.py  | 239 ++++++++++++++++++++
 src/diffusers/schedulers/scheduling_ddim.py |  83 ++++---
 2 files changed, 295 insertions(+), 27 deletions(-)
 create mode 100644 examples/v_prediction/train_butterflies.py

diff --git a/examples/v_prediction/train_butterflies.py b/examples/v_prediction/train_butterflies.py
new file mode 100644
index 000000000000..8eaa971c80d3
--- /dev/null
+++ b/examples/v_prediction/train_butterflies.py
@@ -0,0 +1,239 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class TrainingConfig:
+    image_size = 128  # the generated image resolution
+    train_batch_size = 16
+    eval_batch_size = 16  # how many images to sample during evaluation
+    num_epochs = 50
+    gradient_accumulation_steps = 1
+    learning_rate = 5e-5
+    lr_warmup_steps = 500
+    save_image_epochs = 10
+    save_model_epochs = 30
+    mixed_precision = "fp16"  # `no` for float32, `fp16` for automatic mixed precision
+    output_dir = "ddim-butterflies-128-v-diffusion"  # the model name locally and on the HF Hub
+
+    push_to_hub = False  # whether to upload the saved model to the HF Hub
+    hub_private_repo = False
+    overwrite_output_dir = True  # overwrite the old model when re-running the notebook
+    seed = 0
+
+
+config = TrainingConfig()
+from datasets import load_dataset
+
+config.dataset_name = "huggan/smithsonian_butterflies_subset"
+dataset = load_dataset(config.dataset_name, split="train")
+
+import matplotlib.pyplot as plt
+
+from torchvision import transforms
+
+preprocess = transforms.Compose(
+    [
+        transforms.Resize((config.image_size, config.image_size)),
+        transforms.RandomHorizontalFlip(),
+        transforms.ToTensor(),
+        transforms.Normalize([0.5], [0.5]),
+    ]
+)
+
+
+def transform(examples):
+    images = [preprocess(image.convert("RGB")) for image in examples["image"]]
+    return {"images": images}
+
+
+dataset.set_transform(transform)
+
+import torch
+
+train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=config.train_batch_size, shuffle=True)
+
+from diffusers import UNet2DModel
+
+
+model = UNet2DModel(
+    sample_size=config.image_size,  # the target image resolution
+    in_channels=3,  # the number of input channels, 3 for RGB images
+    out_channels=3,  # the number of output channels
+    layers_per_block=2,  # how many ResNet layers to use per UNet block
+    block_out_channels=(128, 128, 256, 256, 512, 512),  # the number of output channels for each UNet block
+    down_block_types=(
+        "DownBlock2D",  # a regular ResNet downsampling block
+        "DownBlock2D",
+        "DownBlock2D",
+        "DownBlock2D",
+        "AttnDownBlock2D",  # a ResNet downsampling block with spatial self-attention
+        "DownBlock2D",
+    ),
+    up_block_types=(
"UpBlock2D", # a regular ResNet upsampling block + "AttnUpBlock2D", # a ResNet upsampling block with spatial self-attention + "UpBlock2D", + "UpBlock2D", + "UpBlock2D", + "UpBlock2D", + ), +) + +from diffusers import DDPMScheduler, DDIMPipeline, DDIMScheduler + +noise_scheduler = DDIMScheduler( + num_train_timesteps=1000, + beta_schedule="squaredcos_cap_v2", + variance_type="v_diffusion", +) + +import torch +import torch.nn.functional as F + +from PIL import Image + +optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate) + + +from diffusers.optimization import get_cosine_schedule_with_warmup + +lr_scheduler = get_cosine_schedule_with_warmup( + optimizer=optimizer, + num_warmup_steps=config.lr_warmup_steps, + num_training_steps=(len(train_dataloader) * config.num_epochs), +) + +from diffusers import DDPMPipeline + +import math + + +def make_grid(images, rows, cols): + w, h = images[0].size + grid = Image.new("RGB", size=(cols * w, rows * h)) + for i, image in enumerate(images): + grid.paste(image, box=(i % cols * w, i // cols * h)) + return grid + + +def evaluate(config, epoch, pipeline): + # Sample some images from random noise (this is the backward diffusion process). + # The default pipeline output type is `List[PIL.Image]` + images = pipeline( + batch_size=config.eval_batch_size, + generator=torch.manual_seed(config.seed), + ).images + + # Make a grid out of the images + image_grid = make_grid(images, rows=4, cols=4) + + # Save the images + test_dir = os.path.join(config.output_dir, "samples") + os.makedirs(test_dir, exist_ok=True) + image_grid.save(f"{test_dir}/{epoch:04d}.png") + + +from accelerate import Accelerator +from diffusers.hub_utils import init_git_repo, push_to_hub + +from tqdm.auto import tqdm +import os + + +def train_loop(config, model, noise_scheduler, optimizer, train_dataloader, lr_scheduler): + # Initialize accelerator and tensorboard logging + accelerator = Accelerator( + mixed_precision=config.mixed_precision, + gradient_accumulation_steps=config.gradient_accumulation_steps, + log_with="tensorboard", + logging_dir=os.path.join(config.output_dir, "logs"), + ) + if accelerator.is_main_process: + if config.push_to_hub: + repo = init_git_repo(config, at_init=True) + accelerator.init_trackers("train_example") + + # Prepare everything + # There is no specific order to remember, you just need to unpack the + # objects in the same order you gave them to the prepare method. 
+ model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + model, optimizer, train_dataloader, lr_scheduler + ) + + global_step = 0 + + # Now you train the model + for epoch in range(config.num_epochs): + progress_bar = tqdm(total=len(train_dataloader), disable=not accelerator.is_local_main_process) + progress_bar.set_description(f"Epoch {epoch}") + + for step, batch in enumerate(train_dataloader): + clean_images = batch["images"] + # Sample noise to add to the images + noise = torch.randn(clean_images.shape).to(clean_images.device) + bs = clean_images.shape[0] + + # Sample a random timestep for each image + timesteps = torch.randint(0, noise_scheduler.num_train_timesteps, (bs,), device=clean_images.device).long() + + with accelerator.accumulate(model): + # Predict the noise residual + alpha_t, sigma_t = noise_scheduler.get_alpha_sigma(clean_images, timesteps, accelerator.device) + z_t = alpha_t * clean_images + sigma_t * noise + noise_pred = model(z_t, timesteps).sample + v = alpha_t * noise - sigma_t * clean_images + loss = F.mse_loss(noise_pred, v) + accelerator.backward(loss) + + accelerator.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + progress_bar.update(1) + logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0], "step": global_step} + progress_bar.set_postfix(**logs) + accelerator.log(logs, step=global_step) + global_step += 1 + + # After each epoch you optionally sample some demo images with evaluate() and save the model + if accelerator.is_main_process: + pipeline = DDIMPipeline(unet=accelerator.unwrap_model(model), scheduler=noise_scheduler) + + if (epoch + 1) % config.save_image_epochs == 0 or epoch == config.num_epochs - 1: + evaluate(config, epoch, pipeline) + + if (epoch + 1) % config.save_model_epochs == 0 or epoch == config.num_epochs - 1: + if config.push_to_hub: + push_to_hub(config, pipeline, repo, commit_message=f"Epoch {epoch}", blocking=True) + else: + pipeline.save_pretrained(config.output_dir) + + +"""## Let's train! + +Let's launch the training (including multi-GPU training) from the notebook using Accelerate's `notebook_launcher` function: +""" + +from accelerate import notebook_launcher + +args = (config, model, noise_scheduler, optimizer, train_dataloader, lr_scheduler) + +train_loop(*args) + +"""Let's have a look at the final image grid produced by the trained diffusion model:""" + +import glob + +sample_images = sorted(glob.glob(f"{config.output_dir}/samples/*.png")) +Image.open(sample_images[-1]) + +"""Not bad! 
There's room for improvement of course, so feel free to play with the hyperparameters, model definition and image augmentations 🤗 + +If you've chosen to upload the model to the Hugging Face Hub, its repository should now look like so: +https://huggingface.co/anton-l/ddpm-butterflies-128 + +If you want to dive deeper into the code, we also have more advanced training scripts with features like Exponential Moving Average of model weights here: + +https://github.com/huggingface/diffusers/tree/main/examples +""" diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py index abdcb3e81a58..177bd65dc517 100644 --- a/src/diffusers/schedulers/scheduling_ddim.py +++ b/src/diffusers/schedulers/scheduling_ddim.py @@ -17,7 +17,7 @@ import math from dataclasses import dataclass -from typing import Optional, Tuple, Union +from typing import Optional, Tuple, Union, Literal import numpy as np import torch @@ -27,11 +27,21 @@ from .scheduling_utils import SchedulerMixin +def expand_to_shape(input, timesteps, shape, device): + """ + Helper indexes a 1D tensor `input` using a 1D index tensor `timesteps`, then reshapes the result to broadcast + nicely with `shape`. Useful for parellizing operations over `shape[0]` number of diffusion steps at once. + """ + out = torch.gather(input.to(device), 0, timesteps.to(device)) + reshape = [shape[0]] + [1] * (len(shape) - 1) + out = out.reshape(*reshape) + return out + + @dataclass class DDIMSchedulerOutput(BaseOutput): """ Output class for the scheduler's step function output. - Args: prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the @@ -49,16 +59,12 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. - Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up to that part of the diffusion process. - - Args: num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. - Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ @@ -78,14 +84,11 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin): """ Denoising diffusion implicit models is a scheduler that extends the denoising procedure introduced in denoising diffusion probabilistic models (DDPMs) with non-Markovian guidance. - [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and [`~ConfigMixin.from_config`] functions. - For more details, see the original paper: https://arxiv.org/abs/2010.02502 - Args: num_train_timesteps (`int`): number of diffusion steps used to train the model. beta_start (`float`): the starting `beta` value of inference. @@ -105,7 +108,6 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin): an offset added to the inference steps. 
You can use a combination of `offset=1` and `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in stable diffusion. - """ @register_to_config @@ -118,7 +120,9 @@ def __init__( trained_betas: Optional[np.ndarray] = None, clip_sample: bool = True, set_alpha_to_one: bool = True, + variance_type: str = "fixed", steps_offset: int = 0, + prediction_type: Literal["epsilon", "sample", "v"] = "epsilon", **kwargs, ): deprecate( @@ -159,35 +163,42 @@ def __init__( # setable values self.num_inference_steps = None self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy()) + self.variance_type = variance_type + self.prediction_type = prediction_type def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. - Args: sample (`torch.FloatTensor`): input sample timestep (`int`, optional): current timestep - Returns: `torch.FloatTensor`: scaled input sample """ return sample def _get_variance(self, timestep, prev_timestep): - alpha_prod_t = self.alphas_cumprod[timestep] - alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod - beta_prod_t = 1 - alpha_prod_t - beta_prod_t_prev = 1 - alpha_prod_t_prev - - variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev) + if self.variance_type == "fixed": + alpha_prod_t = self.alphas_cumprod[timestep] + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod + beta_prod_t = 1 - alpha_prod_t + beta_prod_t_prev = 1 - alpha_prod_t_prev + + variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev) + elif self.variance_type == "v_diffusion": + # If eta > 0, adjust the scaling factor for the predicted noise + # downward according to the amount of additional noise to add + ddim_sigma = (self.sigmas[timestep + 1] ** 2 / self.sigmas[timestep] ** 2).sqrt() * ( + 1 - self.alphas[timestep] ** 2 / self.alphas[timestep + 1] ** 2 + ).sqrt() + variance = (self.sigmas[timestep + 1] ** 2 - ddim_sigma**2).sqrt() return variance def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None, **kwargs): """ Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference. - Args: num_inference_steps (`int`): the number of diffusion steps used when generating samples with a pre-trained model. @@ -219,7 +230,6 @@ def step( """ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion process from the learned model outputs (most often the predicted noise). - Args: model_output (`torch.FloatTensor`): direct output from learned diffusion model. timestep (`int`): current discrete timestep in the diffusion chain. @@ -233,12 +243,10 @@ def step( use_clipped_model_output (`bool`): TODO generator: random number generator. return_dict (`bool`): option for returning tuple rather than DDIMSchedulerOutput class - Returns: [`~schedulers.scheduling_utils.DDIMSchedulerOutput`] or `tuple`: [`~schedulers.scheduling_utils.DDIMSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor. 
- """ if self.num_inference_steps is None: raise ValueError( @@ -295,19 +303,31 @@ def step( model_output = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * model_output + if self.prediction_type == "epsilon": + pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * model_output - # 7. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + eps * pred_sample_direction + # 7. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + eps * pred_sample_direction + else: + if timestep < len(self.timesteps) - 1: + prev_sample = pred_original_sample + self.alphas[timestep + 1] + eps * variance + else: + prev_sample = None if eta > 0: # randn_like does not support generator https://github.com/pytorch/pytorch/issues/27072 device = model_output.device if torch.is_tensor(model_output) else "cpu" noise = torch.randn(model_output.shape, dtype=model_output.dtype, generator=generator).to(device) - variance = self._get_variance(timestep, prev_timestep) ** (0.5) * eta * noise + if self.prediction_type == "epsilon": - prev_sample = prev_sample + variance + variance = self._get_variance(timestep, prev_timestep) ** (0.5) * eta * noise + prev_sample = prev_sample + variance + else: + ddim_sigma = (self.sigmas[timestep + 1] ** 2 / self.sigmas[timestep] ** 2).sqrt() * ( + 1 - self.alphas[timestep] ** 2 / self.alphas[timestep + 1] ** 2 + ).sqrt() + prev_sample = prev_sample + ddim_sigma * noise if not return_dict: return (prev_sample,) @@ -319,6 +339,10 @@ def add_noise( noise: torch.FloatTensor, timesteps: torch.IntTensor, ) -> torch.FloatTensor: + if self.variance_type == "v_diffusion": + alpha, sigma = self.get_alpha_sigma(original_samples, timesteps, original_samples.device) + z_t = alpha * original_samples + sigma * noise + return z_t # Make sure alphas_cumprod and timestep have same device and dtype as original_samples self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype) timesteps = timesteps.to(original_samples.device) @@ -338,3 +362,8 @@ def add_noise( def __len__(self): return self.config.num_train_timesteps + + def get_alpha_sigma(self, sample, timesteps, device): + alpha = expand_to_shape(self.sqrt_alphas_cumprod, timesteps, sample.shape, device) + sigma = expand_to_shape(self.sqrt_one_minus_alphas_cumprod, timesteps, sample.shape, device) + return alpha, sigma From 1fa3cc8ad7099d929adb7982508087cb5f6afb7a Mon Sep 17 00:00:00 2001 From: Ben Glickenhaus Date: Tue, 15 Nov 2022 12:27:10 -0500 Subject: [PATCH 09/18] changes to ddim --- src/diffusers/schedulers/scheduling_ddim.py | 45 ++++++++++++++------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py index 177bd65dc517..eebd1f2cd975 100644 --- a/src/diffusers/schedulers/scheduling_ddim.py +++ b/src/diffusers/schedulers/scheduling_ddim.py @@ -80,6 +80,14 @@ def alpha_bar(time_step): return torch.tensor(betas) +def t_to_alpha_sigma(num_diffusion_timesteps): + """Returns the scaling factors for the clean image and for the noise, given + a timestep.""" + alphas = 
torch.cos(torch.tensor([t * math.pi / 2 for t in range(num_diffusion_timesteps)])) + sigmas = torch.sin(torch.tensor([t * math.pi / 2 for t in range(num_diffusion_timesteps)])) + return alphas, sigmas + + class DDIMScheduler(SchedulerMixin, ConfigMixin): """ Denoising diffusion implicit models is a scheduler that extends the denoising procedure introduced in denoising @@ -149,7 +157,8 @@ def __init__( self.alphas = 1.0 - self.betas self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) - self.sigmas = 1 - self.alphas**2 + if prediction_type == "v": + self.alphas, self.sigmas = t_to_alpha_sigma(num_train_timesteps) # At every step in ddim, we are looking into the previous alphas_cumprod # For the final step, there is no previous alphas_cumprod because we are already at 0 @@ -178,7 +187,7 @@ def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = """ return sample - def _get_variance(self, timestep, prev_timestep): + def _get_variance(self, timestep, prev_timestep, eta=0): if self.variance_type == "fixed": alpha_prod_t = self.alphas_cumprod[timestep] alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod @@ -189,9 +198,14 @@ def _get_variance(self, timestep, prev_timestep): elif self.variance_type == "v_diffusion": # If eta > 0, adjust the scaling factor for the predicted noise # downward according to the amount of additional noise to add - ddim_sigma = (self.sigmas[timestep + 1] ** 2 / self.sigmas[timestep] ** 2).sqrt() * ( - 1 - self.alphas[timestep] ** 2 / self.alphas[timestep + 1] ** 2 - ).sqrt() + if eta: + numerator = ( + eta * (self.sigmas[timestep + 1] ** 2 / self.sigmas[timestep] ** 2).clamp(min=1.0e-7).sqrt() + ) + else: + numerator = 0 + denominator = (self.alphas[timestep + 1] / self.alphas[timestep]).clamp(min=1.0e-7).sqrt() + ddim_sigma = (numerator / denominator).clamp(min=1.0e-7) variance = (self.sigmas[timestep + 1] ** 2 - ddim_sigma**2).sqrt() return variance @@ -221,7 +235,6 @@ def step( model_output: torch.FloatTensor, timestep: int, sample: torch.FloatTensor, - prediction_type: str = "epsilon", eta: float = 0.0, use_clipped_model_output: bool = False, generator=None, @@ -275,19 +288,21 @@ def step( # 3. compute predicted original sample from predicted noise also called # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - if prediction_type == "epsilon": + if self.prediction_type == "epsilon": pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) eps = torch.tensor(1) - elif prediction_type == "sample": + elif self.prediction_type == "sample": pred_original_sample = model_output eps = torch.tensor(1) - elif prediction_type == "v": + elif self.prediction_type == "v": # v_t = alpha_t * epsilon - sigma_t * x # need to merge the PRs for sigma to be available in DDPM pred_original_sample = sample * self.alphas[timestep] - model_output * self.sigmas[timestep] - eps = model_output * self.alphas[timestep] - sample * self.sigmas[timestep] + eps = model_output * self.alphas[timestep] + sample * self.sigmas[timestep] else: - raise ValueError(f"prediction_type given as {prediction_type} must be one of `epsilon`, `sample`, or `v`") + raise ValueError( + f"prediction_type given as {self.prediction_type} must be one of `epsilon`, `sample`, or `v`" + ) # 4. Clip "predicted x_0" if self.config.clip_sample: @@ -295,7 +310,7 @@ def step( # 5. 
compute variance: "sigma_t(η)" -> see formula (16) # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1) - variance = self._get_variance(timestep, prev_timestep) + variance = self._get_variance(timestep, prev_timestep, eta) std_dev_t = eta * variance ** (0.5) if use_clipped_model_output: @@ -309,7 +324,7 @@ def step( # 7. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + eps * pred_sample_direction else: - if timestep < len(self.timesteps) - 1: + if timestep < len(self.alphas) - 1: prev_sample = pred_original_sample + self.alphas[timestep + 1] + eps * variance else: prev_sample = None @@ -364,6 +379,6 @@ def __len__(self): return self.config.num_train_timesteps def get_alpha_sigma(self, sample, timesteps, device): - alpha = expand_to_shape(self.sqrt_alphas_cumprod, timesteps, sample.shape, device) - sigma = expand_to_shape(self.sqrt_one_minus_alphas_cumprod, timesteps, sample.shape, device) + alpha = expand_to_shape(self.alphas, timesteps, sample.shape, device) + sigma = expand_to_shape(self.sigmas, timesteps, sample.shape, device) return alpha, sigma From 0b60c2b427450506bff7672550327fe22649ff7a Mon Sep 17 00:00:00 2001 From: Ben Glickenhaus Date: Wed, 16 Nov 2022 09:49:12 -0500 Subject: [PATCH 10/18] ddim v prediction works to train butterflies example --- examples/v_prediction/train_butterflies.py | 40 +++++++++++++++--- src/diffusers/schedulers/scheduling_ddim.py | 46 +++++++++++---------- 2 files changed, 59 insertions(+), 27 deletions(-) diff --git a/examples/v_prediction/train_butterflies.py b/examples/v_prediction/train_butterflies.py index 8eaa971c80d3..bb87671a238b 100644 --- a/examples/v_prediction/train_butterflies.py +++ b/examples/v_prediction/train_butterflies.py @@ -81,11 +81,20 @@ def transform(examples): from diffusers import DDPMScheduler, DDIMPipeline, DDIMScheduler -noise_scheduler = DDIMScheduler( - num_train_timesteps=1000, - beta_schedule="squaredcos_cap_v2", - variance_type="v_diffusion", -) +if config.output_dir.startswith("ddpm"): + noise_scheduler = DDPMScheduler( + num_train_timesteps=1000, + beta_schedule="squaredcos_cap_v2", + variance_type="v_diffusion", + prediction_type="v", + ) +else: + noise_scheduler = DDIMScheduler( + num_train_timesteps=1000, + beta_schedule="squaredcos_cap_v2", + variance_type="v_diffusion", + prediction_type="v", + ) import torch import torch.nn.functional as F @@ -162,6 +171,21 @@ def train_loop(config, model, noise_scheduler, optimizer, train_dataloader, lr_s global_step = 0 + if config.output_dir.startswith("ddpm"): + + pipeline = DDPMPipeline(unet=accelerator.unwrap_model(model), scheduler=noise_scheduler) + else: + pipeline = DDIMPipeline(unet=accelerator.unwrap_model(model), scheduler=noise_scheduler) + + def t_to_alpha_sigma(t): + """Returns the scaling factors for the clean image and for the noise, given + a timestep.""" + return torch.cos(t * math.pi / 2), torch.sin(t * math.pi / 2) + + alpha_sigmas = [t_to_alpha_sigma(t) for t in noise_scheduler.timesteps] + + evaluate(config, 0, pipeline) + # Now you train the model for epoch in range(config.num_epochs): progress_bar = tqdm(total=len(train_dataloader), disable=not accelerator.is_local_main_process) @@ -198,7 +222,11 @@ def train_loop(config, model, noise_scheduler, optimizer, train_dataloader, lr_s # After each epoch you optionally sample some demo images with evaluate() and save the model if accelerator.is_main_process: - pipeline = 
DDIMPipeline(unet=accelerator.unwrap_model(model), scheduler=noise_scheduler) + if config.output_dir.startswith("ddpm"): + + pipeline = DDPMPipeline(unet=accelerator.unwrap_model(model), scheduler=noise_scheduler) + else: + pipeline = DDIMPipeline(unet=accelerator.unwrap_model(model), scheduler=noise_scheduler) if (epoch + 1) % config.save_image_epochs == 0 or epoch == config.num_epochs - 1: evaluate(config, epoch, pipeline) diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py index eebd1f2cd975..b6651e7c8daf 100644 --- a/src/diffusers/schedulers/scheduling_ddim.py +++ b/src/diffusers/schedulers/scheduling_ddim.py @@ -83,8 +83,12 @@ def alpha_bar(time_step): def t_to_alpha_sigma(num_diffusion_timesteps): """Returns the scaling factors for the clean image and for the noise, given a timestep.""" - alphas = torch.cos(torch.tensor([t * math.pi / 2 for t in range(num_diffusion_timesteps)])) - sigmas = torch.sin(torch.tensor([t * math.pi / 2 for t in range(num_diffusion_timesteps)])) + alphas = torch.cos( + torch.tensor([(t / num_diffusion_timesteps) * math.pi / 2 for t in range(num_diffusion_timesteps)]) + ) + sigmas = torch.sin( + torch.tensor([(t / num_diffusion_timesteps) * math.pi / 2 for t in range(num_diffusion_timesteps)]) + ) return alphas, sigmas @@ -155,6 +159,7 @@ def __init__( else: raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + self.variance_type = variance_type self.alphas = 1.0 - self.betas self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) if prediction_type == "v": @@ -165,6 +170,7 @@ def __init__( # `set_alpha_to_one` decides whether we set this parameter simply to one or # whether we use the final alpha of the "non-previous" one. self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0] + self.final_sigma = torch.tensor(0.0) if set_alpha_to_one else self.sigmas[0] # standard deviation of the initial noise distribution self.init_noise_sigma = 1.0 @@ -188,26 +194,29 @@ def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = return sample def _get_variance(self, timestep, prev_timestep, eta=0): + alpha_prod_t = self.alphas_cumprod[timestep] + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod + beta_prod_t = 1 - alpha_prod_t + beta_prod_t_prev = 1 - alpha_prod_t_prev + if self.variance_type == "fixed": - alpha_prod_t = self.alphas_cumprod[timestep] - alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod - beta_prod_t = 1 - alpha_prod_t - beta_prod_t_prev = 1 - alpha_prod_t_prev variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev) elif self.variance_type == "v_diffusion": # If eta > 0, adjust the scaling factor for the predicted noise # downward according to the amount of additional noise to add + # variance = torch.log(self.betas[timestep] * (1 - alpha_prod_t_prev) / (1 - alpha_prod_t)) + alpha_prev = self.alphas[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod + sigma_prev = self.sigmas[prev_timestep] if prev_timestep >= 0 else self.final_sigma if eta: - numerator = ( - eta * (self.sigmas[timestep + 1] ** 2 / self.sigmas[timestep] ** 2).clamp(min=1.0e-7).sqrt() - ) + numerator = eta * (sigma_prev**2 / self.sigmas[timestep] ** 2).clamp(min=1.0e-7).sqrt() else: numerator = 0 - denominator = (self.alphas[timestep + 1] / self.alphas[timestep]).clamp(min=1.0e-7).sqrt() 
- ddim_sigma = (numerator / denominator).clamp(min=1.0e-7) - variance = (self.sigmas[timestep + 1] ** 2 - ddim_sigma**2).sqrt() - + denominator = (1 - self.alphas[timestep] ** 2 / alpha_prev**2).clamp(min=1.0e-7).sqrt() + ddim_sigma = (numerator * denominator).clamp(min=1.0e-7) + variance = (sigma_prev**2 - ddim_sigma**2).sqrt() + if torch.isnan(variance): + variance = 0 return variance def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None, **kwargs): @@ -324,10 +333,8 @@ def step( # 7. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + eps * pred_sample_direction else: - if timestep < len(self.alphas) - 1: - prev_sample = pred_original_sample + self.alphas[timestep + 1] + eps * variance - else: - prev_sample = None + alpha_prev = self.alphas[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod + prev_sample = pred_original_sample * alpha_prev + eps * variance if eta > 0: # randn_like does not support generator https://github.com/pytorch/pytorch/issues/27072 @@ -339,10 +346,7 @@ def step( prev_sample = prev_sample + variance else: - ddim_sigma = (self.sigmas[timestep + 1] ** 2 / self.sigmas[timestep] ** 2).sqrt() * ( - 1 - self.alphas[timestep] ** 2 / self.alphas[timestep + 1] ** 2 - ).sqrt() - prev_sample = prev_sample + ddim_sigma * noise + prev_sample = prev_sample + variance * noise if not return_dict: return (prev_sample,) From 8311d8980547ee5a6e4559573407236958220d11 Mon Sep 17 00:00:00 2001 From: Ben Glickenhaus Date: Wed, 16 Nov 2022 10:03:07 -0500 Subject: [PATCH 11/18] fix bad merge, style and quality --- examples/v_prediction/train_butterflies.py | 70 +++++---------------- src/diffusers/schedulers/scheduling_ddim.py | 61 ++++++++---------- 2 files changed, 40 insertions(+), 91 deletions(-) diff --git a/examples/v_prediction/train_butterflies.py b/examples/v_prediction/train_butterflies.py index bb87671a238b..5074ece86a98 100644 --- a/examples/v_prediction/train_butterflies.py +++ b/examples/v_prediction/train_butterflies.py @@ -1,5 +1,19 @@ +import glob +import os from dataclasses import dataclass +import torch +import torch.nn.functional as F + +from accelerate import Accelerator +from datasets import load_dataset +from diffusers import DDIMPipeline, DDIMScheduler, DDPMPipeline, DDPMScheduler, UNet2DModel +from diffusers.hub_utils import init_git_repo, push_to_hub +from diffusers.optimization import get_cosine_schedule_with_warmup +from PIL import Image +from torchvision import transforms +from tqdm.auto import tqdm + @dataclass class TrainingConfig: @@ -22,14 +36,11 @@ class TrainingConfig: config = TrainingConfig() -from datasets import load_dataset + config.dataset_name = "huggan/smithsonian_butterflies_subset" dataset = load_dataset(config.dataset_name, split="train") -import matplotlib.pyplot as plt - -from torchvision import transforms preprocess = transforms.Compose( [ @@ -48,12 +59,9 @@ def transform(examples): dataset.set_transform(transform) -import torch train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=config.train_batch_size, shuffle=True) -from diffusers import UNet2DModel - model = UNet2DModel( sample_size=config.image_size, # the target image resolution @@ -79,7 +87,6 @@ def transform(examples): ), ) -from diffusers import DDPMScheduler, DDIMPipeline, DDIMScheduler if config.output_dir.startswith("ddpm"): noise_scheduler = DDPMScheduler( @@ -96,26 +103,16 @@ def transform(examples): 
prediction_type="v", ) -import torch -import torch.nn.functional as F - -from PIL import Image optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate) -from diffusers.optimization import get_cosine_schedule_with_warmup - lr_scheduler = get_cosine_schedule_with_warmup( optimizer=optimizer, num_warmup_steps=config.lr_warmup_steps, num_training_steps=(len(train_dataloader) * config.num_epochs), ) -from diffusers import DDPMPipeline - -import math - def make_grid(images, rows, cols): w, h = images[0].size @@ -142,13 +139,6 @@ def evaluate(config, epoch, pipeline): image_grid.save(f"{test_dir}/{epoch:04d}.png") -from accelerate import Accelerator -from diffusers.hub_utils import init_git_repo, push_to_hub - -from tqdm.auto import tqdm -import os - - def train_loop(config, model, noise_scheduler, optimizer, train_dataloader, lr_scheduler): # Initialize accelerator and tensorboard logging accelerator = Accelerator( @@ -172,18 +162,10 @@ def train_loop(config, model, noise_scheduler, optimizer, train_dataloader, lr_s global_step = 0 if config.output_dir.startswith("ddpm"): - pipeline = DDPMPipeline(unet=accelerator.unwrap_model(model), scheduler=noise_scheduler) else: pipeline = DDIMPipeline(unet=accelerator.unwrap_model(model), scheduler=noise_scheduler) - def t_to_alpha_sigma(t): - """Returns the scaling factors for the clean image and for the noise, given - a timestep.""" - return torch.cos(t * math.pi / 2), torch.sin(t * math.pi / 2) - - alpha_sigmas = [t_to_alpha_sigma(t) for t in noise_scheduler.timesteps] - evaluate(config, 0, pipeline) # Now you train the model @@ -223,7 +205,6 @@ def t_to_alpha_sigma(t): # After each epoch you optionally sample some demo images with evaluate() and save the model if accelerator.is_main_process: if config.output_dir.startswith("ddpm"): - pipeline = DDPMPipeline(unet=accelerator.unwrap_model(model), scheduler=noise_scheduler) else: pipeline = DDIMPipeline(unet=accelerator.unwrap_model(model), scheduler=noise_scheduler) @@ -238,30 +219,9 @@ def t_to_alpha_sigma(t): pipeline.save_pretrained(config.output_dir) -"""## Let's train! - -Let's launch the training (including multi-GPU training) from the notebook using Accelerate's `notebook_launcher` function: -""" - -from accelerate import notebook_launcher - args = (config, model, noise_scheduler, optimizer, train_dataloader, lr_scheduler) train_loop(*args) -"""Let's have a look at the final image grid produced by the trained diffusion model:""" - -import glob - sample_images = sorted(glob.glob(f"{config.output_dir}/samples/*.png")) Image.open(sample_images[-1]) - -"""Not bad! 
There's room for improvement of course, so feel free to play with the hyperparameters, model definition and image augmentations 🤗 - -If you've chosen to upload the model to the Hugging Face Hub, its repository should now look like so: -https://huggingface.co/anton-l/ddpm-butterflies-128 - -If you want to dive deeper into the code, we also have more advanced training scripts with features like Exponential Moving Average of model weights here: - -https://github.com/huggingface/diffusers/tree/main/examples -""" diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py index 9e4dc2ee0627..a41ba49cb156 100644 --- a/src/diffusers/schedulers/scheduling_ddim.py +++ b/src/diffusers/schedulers/scheduling_ddim.py @@ -17,7 +17,7 @@ import math from dataclasses import dataclass -from typing import Optional, Tuple, Union, Literal +from typing import Literal, Optional, Tuple, Union import numpy as np import torch @@ -42,8 +42,8 @@ def expand_to_shape(input, timesteps, shape, device): # Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM class DDIMSchedulerOutput(BaseOutput): """ - Output class for the scheduler's step function output. Args: + Output class for the scheduler's step function output. prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the denoising loop. @@ -58,13 +58,12 @@ class DDIMSchedulerOutput(BaseOutput): def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor: """ - Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of - (1-beta) over time from t = [0,1]. - Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up - to that part of the diffusion process. Args: - num_diffusion_timesteps (`int`): the number of betas to produce. - max_beta (`float`): the maximum beta to use; use values lower than 1 to + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. Contains a function alpha_bar that takes an argument t and transforms it to the + cumulative product of (1-beta) up to that part of the diffusion process. + num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; + use values lower than 1 to prevent singularities. Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs @@ -95,18 +94,15 @@ def t_to_alpha_sigma(num_diffusion_timesteps): class DDIMScheduler(SchedulerMixin, ConfigMixin): """ - Denoising diffusion implicit models is a scheduler that extends the denoising procedure introduced in denoising - diffusion probabilistic models (DDPMs) with non-Markovian guidance. - [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` - function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. - [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and - [`~ConfigMixin.from_config`] functions. - For more details, see the original paper: https://arxiv.org/abs/2010.02502 Args: - num_train_timesteps (`int`): number of diffusion steps used to train the model. 
@@ -95,18 +94,15 @@ def t_to_alpha_sigma(num_diffusion_timesteps):
 class DDIMScheduler(SchedulerMixin, ConfigMixin):
     """
-    Denoising diffusion implicit models is a scheduler that extends the denoising procedure introduced in denoising
-    diffusion probabilistic models (DDPMs) with non-Markovian guidance.
-    [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
-    function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
-    [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
-    [`~ConfigMixin.from_config`] functions.
-    For more details, see the original paper: https://arxiv.org/abs/2010.02502
     Args:
-        num_train_timesteps (`int`): number of diffusion steps used to train the model.
-        beta_start (`float`): the starting `beta` value of inference.
-        beta_end (`float`): the final `beta` value.
-        beta_schedule (`str`):
+    Denoising diffusion implicit models is a scheduler that extends the denoising procedure introduced in denoising
+    diffusion probabilistic models (DDPMs) with non-Markovian guidance. [`~ConfigMixin`] takes care of storing all
+    config attributes that are passed in the scheduler's `__init__` function, such as `num_train_timesteps`. They can
+    be accessed via `scheduler.config.num_train_timesteps`. [`~ConfigMixin`] also provides general loading and saving
+    functionality via the [`~ConfigMixin.save_config`] and [`~ConfigMixin.from_config`] functions. For more details,
+    see the original paper: https://arxiv.org/abs/2010.02502
+    num_train_timesteps (`int`): number of diffusion steps used to train the model. beta_start (`float`): the
+    starting `beta` value of inference. beta_end (`float`): the final `beta` value. beta_schedule (`str`):
         the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
         `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
     trained_betas (`np.ndarray`, optional):
@@ -186,11 +182,10 @@ def __init__(
 
     def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor:
         """
+        Args:
         Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
         current timestep.
-        Args:
-            sample (`torch.FloatTensor`): input sample
-            timestep (`int`, optional): current timestep
+            sample (`torch.FloatTensor`): input sample timestep (`int`, optional): current timestep
         Returns:
             `torch.FloatTensor`: scaled input sample
         """
@@ -203,7 +198,6 @@ def _get_variance(self, timestep, prev_timestep, eta=0):
         beta_prod_t_prev = 1 - alpha_prod_t_prev
 
         if self.variance_type == "fixed":
-
             variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
         elif self.variance_type == "v_diffusion":
             # If eta > 0, adjust the scaling factor for the predicted noise
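The `fixed` branch above is the standard DDIM posterior variance. A worked instance with illustrative numbers — the `alphas_cumprod` table here is made up, not the scheduler's actual schedule:

    import torch

    alphas_cumprod = torch.linspace(0.9999, 0.01, 1000)  # hypothetical cumulative alphas
    timestep, prev_timestep = 500, 480
    alpha_prod_t = alphas_cumprod[timestep]
    alpha_prod_t_prev = alphas_cumprod[prev_timestep]
    beta_prod_t, beta_prod_t_prev = 1 - alpha_prod_t, 1 - alpha_prod_t_prev

    # the "fixed" DDIM variance, exactly as in _get_variance above
    variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
    print(variance)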
@@ -224,8 +218,8 @@ def _get_variance(self, timestep, prev_timestep, eta=0):
 
     def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
         """
-        Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
         Args:
+        Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
             num_inference_steps (`int`):
                 the number of diffusion steps used when generating samples with a pre-trained model.
         """
@@ -249,24 +243,23 @@ def step(
         return_dict: bool = True,
     ) -> Union[DDIMSchedulerOutput, Tuple]:
         """
+        Args:
         Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
         process from the learned model outputs (most often the predicted noise).
-        Args:
-            model_output (`torch.FloatTensor`): direct output from learned diffusion model.
-            timestep (`int`): current discrete timestep in the diffusion chain.
-            sample (`torch.FloatTensor`):
+            model_output (`torch.FloatTensor`): direct output from learned diffusion model. timestep (`int`): current
+            discrete timestep in the diffusion chain. sample (`torch.FloatTensor`):
                 current instance of sample being created by diffusion process.
             prediction_type (`str`):
                 prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
                 process), `sample` (directly predicting the noisy sample), or `v` (see section 2.4
                 https://imagen.research.google/video/paper.pdf)
-            eta (`float`): weight of noise for added noise in diffusion step.
-            use_clipped_model_output (`bool`): if `True`, compute "corrected" `model_output` from the clipped
+            eta (`float`): weight of noise for added noise in diffusion step. use_clipped_model_output (`bool`): if
+            `True`, compute "corrected" `model_output` from the clipped
                 predicted original sample. Necessary because predicted original sample is clipped to [-1, 1] when
                 `self.config.clip_sample` is `True`. If no clipping has happened, "corrected" `model_output` would
                 coincide with the one provided as input and `use_clipped_model_output` will have no effect.
-            generator: random number generator.
-            variance_noise (`torch.FloatTensor`): instead of generating noise for the variance using `generator`, we
+            generator: random number generator. variance_noise (`torch.FloatTensor`): instead of generating noise for
+            the variance using `generator`, we
                 can directly provide the noise for the variance itself. This is useful for methods such as
                 CycleDiffusion. (https://arxiv.org/abs/2210.05559)
             return_dict (`bool`): option for returning tuple rather than DDIMSchedulerOutput class
@@ -362,10 +355,6 @@ def step(
             variance = self._get_variance(timestep, prev_timestep) ** (0.5) * eta * variance_noise
 
             prev_sample = prev_sample + variance
-
-            prev_sample = prev_sample + variance
-        else:
-            prev_sample = prev_sample + variance * noise
 
         if not return_dict:
             return (prev_sample,)
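That completes the functional part of the series for the DDIM scheduler; the patches that follow only repair docstrings mangled by the formatter. A sketch of how the new code path would be exercised — `scheduler` is assumed to be a scheduler built from this branch, `model` a trained UNet, the 64x64 shapes are placeholders, and `model(sample, t).sample` assumes a UNet2DModel-style output object:

    import torch

    scheduler.set_timesteps(50)
    sample = torch.randn(1, 3, 64, 64) * scheduler.init_noise_sigma
    for t in scheduler.timesteps:
        with torch.no_grad():
            model_output = model(sample, t).sample  # predicted v
        # step() converts v back to x_0 and eps internally, then forms x_{t-1}
        sample = scheduler.step(model_output, t, sample, prediction_type="v").prev_sample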
From 7117ff9abefc3d1107d3022685d1d5630e55b2cb Mon Sep 17 00:00:00 2001
From: Ben Glickenhaus
Date: Thu, 17 Nov 2022 09:13:27 -0500
Subject: [PATCH 12/18] try to fix broken doc strings

---
 src/diffusers/schedulers/scheduling_ddim.py | 58 ++++++++++++---------
 1 file changed, 33 insertions(+), 25 deletions(-)

diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py
index a41ba49cb156..08fcbb20abb4 100644
--- a/src/diffusers/schedulers/scheduling_ddim.py
+++ b/src/diffusers/schedulers/scheduling_ddim.py
@@ -42,8 +42,11 @@ def expand_to_shape(input, timesteps, shape, device):
 # Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM
 class DDIMSchedulerOutput(BaseOutput):
     """
-    Args:
     Output class for the scheduler's step function output.
+
+
+    Args:
+
         prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
             Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
             denoising loop.
@@ -58,13 +61,14 @@ class DDIMSchedulerOutput(BaseOutput):
 def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor:
     """
-    Args:
     Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
-    (1-beta) over time from t = [0,1]. Contains a function alpha_bar that takes an argument t and transforms it to the
-    cumulative product of (1-beta) up to that part of the diffusion process.
-    num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use;
-    use values lower than 1 to
-        prevent singularities.
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+        prevent singularities.
     Returns:
         betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
@@ -94,21 +98,21 @@ def t_to_alpha_sigma(num_diffusion_timesteps):
 class DDIMScheduler(SchedulerMixin, ConfigMixin):
     """
-    Args:
     Denoising diffusion implicit models is a scheduler that extends the denoising procedure introduced in denoising
     diffusion probabilistic models (DDPMs) with non-Markovian guidance. [`~ConfigMixin`] takes care of storing all
     config attributes that are passed in the scheduler's `__init__` function, such as `num_train_timesteps`. They can
     be accessed via `scheduler.config.num_train_timesteps`. [`~ConfigMixin`] also provides general loading and saving
     functionality via the [`~ConfigMixin.save_config`] and [`~ConfigMixin.from_config`] functions. For more details,
     see the original paper: https://arxiv.org/abs/2010.02502
-    num_train_timesteps (`int`): number of diffusion steps used to train the model. beta_start (`float`): the
-    starting `beta` value of inference. beta_end (`float`): the final `beta` value. beta_schedule (`str`):
-        the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
-        `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
-    trained_betas (`np.ndarray`, optional):
-        option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
-    clip_sample (`bool`, default `True`):
-        option to clip predicted sample between -1 and 1 for numerical stability.
+
+
+    Args:
+        num_train_timesteps (`int`): number of diffusion steps used to train the model.
+        beta_start (`float`): the starting `beta` value of inference.
+        beta_end (`float`): the final `beta` value.
+        beta_schedule (`str`): the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        trained_betas (`np.ndarray`, optional): option to pass an array of betas directly to the constructor to bypass`beta_start`, `beta_end` etc.
+        clip_sample (`bool`, default `True`): option to clip predicted sample between -1 and 1 for numerical stability.
     set_alpha_to_one (`bool`, default `True`):
         each diffusion step uses the value of alphas product at that step and at the previous one. For the final step
         there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
@@ -182,10 +186,13 @@ def __init__(
 
     def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor:
         """
-        Args:
         Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
         current timestep.
-            sample (`torch.FloatTensor`): input sample timestep (`int`, optional): current timestep
+
+
+        Args:
+            sample (`torch.FloatTensor`): input sample
+            timestep (`int`, optional): current timestep
         Returns:
             `torch.FloatTensor`: scaled input sample
         """
@@ -211,17 +218,16 @@ def _get_variance(self, timestep, prev_timestep, eta=0):
             numerator = 0
             denominator = (1 - self.alphas[timestep] ** 2 / alpha_prev**2).clamp(min=1.0e-7).sqrt()
             ddim_sigma = (numerator * denominator).clamp(min=1.0e-7)
-            variance = (sigma_prev**2 - ddim_sigma**2).sqrt()
-            if torch.isnan(variance):
-                variance = 0
+            variance = (sigma_prev**2 - ddim_sigma**2).clamp(min=1.0e-7).sqrt()
         return variance
 
     def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
         """
-        Args:
         Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
-            num_inference_steps (`int`):
-                the number of diffusion steps used when generating samples with a pre-trained model.
+
+
+        Args:
+            num_inference_steps (`int`): the number of diffusion steps used when generating samples with a pre-trained model.
         """
         self.num_inference_steps = num_inference_steps
         step_ratio = self.config.num_train_timesteps // self.num_inference_steps
@@ -243,9 +249,11 @@ def step(
         return_dict: bool = True,
     ) -> Union[DDIMSchedulerOutput, Tuple]:
         """
-        Args:
         Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
         process from the learned model outputs (most often the predicted noise).
+
+
+        Args:
             model_output (`torch.FloatTensor`): direct output from learned diffusion model. timestep (`int`): current
             discrete timestep in the diffusion chain. sample (`torch.FloatTensor`):
                 current instance of sample being created by diffusion process.
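An aside on the `_get_variance` change in this patch: clamping before the square root replaces the earlier `isnan` check, since float round-off can push the radicand slightly negative. A two-line illustration with an arbitrary value:

    import torch

    x = torch.tensor(-1e-12)        # tiny negative value from round-off
    print(x.sqrt())                 # tensor(nan) -- what the old isnan check caught
    print(x.clamp(min=1.0e-7).sqrt())  # tensor(0.0003) -- finite, effectively zero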
From de0c55841ddb013e8bc6a55f69fd183a948f3b32 Mon Sep 17 00:00:00 2001
From: Ben Glickenhaus
Date: Thu, 17 Nov 2022 09:20:06 -0500
Subject: [PATCH 13/18] second pass

---
 src/diffusers/schedulers/scheduling_ddim.py | 44 ++++++++++++---------
 1 file changed, 25 insertions(+), 19 deletions(-)

diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py
index 08fcbb20abb4..d7adcd3c0629 100644
--- a/src/diffusers/schedulers/scheduling_ddim.py
+++ b/src/diffusers/schedulers/scheduling_ddim.py
@@ -44,9 +44,7 @@ class DDIMSchedulerOutput(BaseOutput):
     """
     Output class for the scheduler's step function output.
 
-
     Args:
-
         prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
             Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
             denoising loop.
@@ -66,9 +64,11 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor
     Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
     to that part of the diffusion process.
+
     Args:
         num_diffusion_timesteps (`int`): the number of betas to produce.
-        max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+        prevent singularities.
     Returns:
         betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
@@ -99,20 +99,25 @@ def t_to_alpha_sigma(num_diffusion_timesteps):
 class DDIMScheduler(SchedulerMixin, ConfigMixin):
     """
     Denoising diffusion implicit models is a scheduler that extends the denoising procedure introduced in denoising
-    diffusion probabilistic models (DDPMs) with non-Markovian guidance. [`~ConfigMixin`] takes care of storing all
-    config attributes that are passed in the scheduler's `__init__` function, such as `num_train_timesteps`. They can
-    be accessed via `scheduler.config.num_train_timesteps`. [`~ConfigMixin`] also provides general loading and saving
-    functionality via the [`~ConfigMixin.save_config`] and [`~ConfigMixin.from_config`] functions. For more details,
-    see the original paper: https://arxiv.org/abs/2010.02502
+    diffusion probabilistic models (DDPMs) with non-Markovian guidance.
+
+    [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+    function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+    [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
+    [`~ConfigMixin.from_config`] functions.
 
     Args:
         num_train_timesteps (`int`): number of diffusion steps used to train the model.
         beta_start (`float`): the starting `beta` value of inference.
         beta_end (`float`): the final `beta` value.
-        beta_schedule (`str`): the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
-        trained_betas (`np.ndarray`, optional): option to pass an array of betas directly to the constructor to bypass`beta_start`, `beta_end` etc.
-        clip_sample (`bool`, default `True`): option to clip predicted sample between -1 and 1 for numerical stability.
+        beta_schedule (`str`):
+            the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        trained_betas (`np.ndarray`, optional):
+            option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
+        clip_sample (`bool`, default `True`):
+            option to clip predicted sample between -1 and 1 for numerical stability.
     set_alpha_to_one (`bool`, default `True`):
         each diffusion step uses the value of alphas product at that step and at the previous one. For the final step
         there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
@@ -232,7 +232,8 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic
 
 
         Args:
-            num_inference_steps (`int`): the number of diffusion steps used when generating samples with a pre-trained model.
+            num_inference_steps (`int`):
+                the number of diffusion steps used when generating samples with a pre-trained model.
         """
         self.num_inference_steps = num_inference_steps
         step_ratio = self.config.num_train_timesteps // self.num_inference_steps
@@ -254,20 +260,20 @@ def step(
 
 
         Args:
-            model_output (`torch.FloatTensor`): direct output from learned diffusion model. timestep (`int`): current
-            discrete timestep in the diffusion chain. sample (`torch.FloatTensor`):
-                current instance of sample being created by diffusion process.
+            model_output (`torch.FloatTensor`): direct output from learned diffusion model.
+            timestep (`int`): current discrete timestep in the diffusion chain.
+            sample (`torch.FloatTensor`): current instance of sample being created by diffusion process.
             prediction_type (`str`):
                 prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
                 process), `sample` (directly predicting the noisy sample), or `v` (see section 2.4
                 https://imagen.research.google/video/paper.pdf)
-            eta (`float`): weight of noise for added noise in diffusion step. use_clipped_model_output (`bool`): if
-            `True`, compute "corrected" `model_output` from the clipped
+            eta (`float`): weight of noise for added noise in diffusion step.
+            use_clipped_model_output (`bool`): if `True`, compute "corrected" `model_output` from the clipped
                 predicted original sample. Necessary because predicted original sample is clipped to [-1, 1] when
                 `self.config.clip_sample` is `True`. If no clipping has happened, "corrected" `model_output` would
                 coincide with the one provided as input and `use_clipped_model_output` will have no effect.
-            generator: random number generator. variance_noise (`torch.FloatTensor`): instead of generating noise for
-            the variance using `generator`, we
+            generator: random number generator.
+            variance_noise (`torch.FloatTensor`): instead of generating noise for the variance using `generator`, we
                 can directly provide the noise for the variance itself. This is useful for methods such as
                 CycleDiffusion. (https://arxiv.org/abs/2210.05559)
             return_dict (`bool`): option for returning tuple rather than DDIMSchedulerOutput class
From 39ccf3262f26f7face0db818966dd9bb0d629299 Mon Sep 17 00:00:00 2001
From: Ben Glickenhaus
Date: Thu, 17 Nov 2022 09:24:10 -0500
Subject: [PATCH 14/18] one more

---
 src/diffusers/schedulers/scheduling_ddim.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py
index d7adcd3c0629..66968f34575d 100644
--- a/src/diffusers/schedulers/scheduling_ddim.py
+++ b/src/diffusers/schedulers/scheduling_ddim.py
@@ -44,6 +44,7 @@ class DDIMSchedulerOutput(BaseOutput):
     """
     Output class for the scheduler's step function output.
 
+
     Args:
         prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
             Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
@@ -106,6 +107,7 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
     [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
     [`~ConfigMixin.from_config`] functions.
 
+    For more details, see the original paper: https://arxiv.org/abs/2010.02502
 
     Args:
         num_train_timesteps (`int`): number of diffusion steps used to train the model.
@@ -126,6 +128,7 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
         an offset added to the inference steps. You can use a combination of `offset=1` and
         `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in
         stable diffusion.
+
     """
 
     _compatible_classes = [
@@ -262,7 +265,8 @@ def step(
         Args:
             model_output (`torch.FloatTensor`): direct output from learned diffusion model.
             timestep (`int`): current discrete timestep in the diffusion chain.
-            sample (`torch.FloatTensor`): current instance of sample being created by diffusion process.
+            sample (`torch.FloatTensor`):
+                current instance of sample being created by diffusion process.
             prediction_type (`str`):
                 prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
                 process), `sample` (directly predicting the noisy sample), or `v` (see section 2.4
                 https://imagen.research.google/video/paper.pdf)
From 4b79f209bfc701cbe35bcfd5455e0b965f1bcb1d Mon Sep 17 00:00:00 2001
From: Ben Glickenhaus
Date: Thu, 17 Nov 2022 09:25:58 -0500
Subject: [PATCH 15/18] white space

---
 src/diffusers/schedulers/scheduling_ddim.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py
index 66968f34575d..691ee3c0bad4 100644
--- a/src/diffusers/schedulers/scheduling_ddim.py
+++ b/src/diffusers/schedulers/scheduling_ddim.py
@@ -66,10 +66,12 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor
 
     Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
     to that part of the diffusion process.
+
     Args:
         num_diffusion_timesteps (`int`): the number of betas to produce.
         max_beta (`float`): the maximum beta to use; use values lower than 1 to
-        prevent singularities.
+        prevent singularities.
+
     Returns:
         betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
@@ -281,10 +283,12 @@ def step(
             can directly provide the noise for the variance itself. This is useful for methods such as
             CycleDiffusion. (https://arxiv.org/abs/2210.05559)
         return_dict (`bool`): option for returning tuple rather than DDIMSchedulerOutput class
+
         Returns:
             [`~schedulers.scheduling_utils.DDIMSchedulerOutput`] or `tuple`:
             [`~schedulers.scheduling_utils.DDIMSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When
             returning a tuple, the first element is the sample tensor.
+
         """
         if self.num_inference_steps is None:
             raise ValueError(

From dbf206baef85dbebe5d30f1a695f0c532d52c2d3 Mon Sep 17 00:00:00 2001
From: Nathan Lambert
Date: Thu, 17 Nov 2022 10:09:18 -0800
Subject: [PATCH 16/18] Update src/diffusers/schedulers/scheduling_ddim.py

---
 src/diffusers/schedulers/scheduling_ddim.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py
index 691ee3c0bad4..1f33957450bf 100644
--- a/src/diffusers/schedulers/scheduling_ddim.py
+++ b/src/diffusers/schedulers/scheduling_ddim.py
@@ -44,7 +44,6 @@ class DDIMSchedulerOutput(BaseOutput):
     """
     Output class for the scheduler's step function output.
 
-
     Args:
         prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
            Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
From 5fbbe50148a8f2e026349d782bab354bc370b8d0 Mon Sep 17 00:00:00 2001
From: Nathan Lambert
Date: Thu, 17 Nov 2022 10:22:29 -0800
Subject: [PATCH 17/18] remove extra lines

---
 src/diffusers/schedulers/scheduling_ddim.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py
index 1f33957450bf..0cf8782292dd 100644
--- a/src/diffusers/schedulers/scheduling_ddim.py
+++ b/src/diffusers/schedulers/scheduling_ddim.py
@@ -198,10 +198,10 @@ def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] =
         Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
         current timestep.
-
         Args:
             sample (`torch.FloatTensor`): input sample
             timestep (`int`, optional): current timestep
+
         Returns:
             `torch.FloatTensor`: scaled input sample
         """
@@ -218,7 +218,6 @@ def _get_variance(self, timestep, prev_timestep, eta=0):
         elif self.variance_type == "v_diffusion":
             # If eta > 0, adjust the scaling factor for the predicted noise
             # downward according to the amount of additional noise to add
-            # variance = torch.log(self.betas[timestep] * (1 - alpha_prod_t_prev) / (1 - alpha_prod_t))
             alpha_prev = self.alphas[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
             sigma_prev = self.sigmas[prev_timestep] if prev_timestep >= 0 else self.final_sigma
             if eta:
@@ -234,7 +233,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic
         """
         Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
-
         Args:
             num_inference_steps (`int`):
                 the number of diffusion steps used when generating samples with a pre-trained model.
@@ -262,7 +260,6 @@ def step(
         Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
         process from the learned model outputs (most often the predicted noise).
-
         Args:
             model_output (`torch.FloatTensor`): direct output from learned diffusion model.
             timestep (`int`): current discrete timestep in the diffusion chain.

From 5812e313298c76e3b64a85c37c982ac44b7bd00b Mon Sep 17 00:00:00 2001
From: Nathan Lambert
Date: Thu, 17 Nov 2022 10:23:04 -0800
Subject: [PATCH 18/18] Update src/diffusers/schedulers/scheduling_ddim.py

---
 src/diffusers/schedulers/scheduling_ddim.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py
index 0cf8782292dd..89d90ba60ad4 100644
--- a/src/diffusers/schedulers/scheduling_ddim.py
+++ b/src/diffusers/schedulers/scheduling_ddim.py
@@ -201,7 +201,6 @@ def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] =
         Args:
             sample (`torch.FloatTensor`): input sample
             timestep (`int`, optional): current timestep
-
         Returns:
             `torch.FloatTensor`: scaled input sample
         """
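As a closing sanity check on the parameterization this series implements: with alpha_t^2 + sigma_t^2 = 1, the definitions x_t = alpha*x_0 + sigma*eps and v = alpha*eps - sigma*x_0 invert exactly, which is what lets step() recover both x_0 and eps from a single v prediction. A standalone numerical verification — these are the standard identities from the v-prediction literature, not code from the patch:

    import torch

    a, s = torch.tensor(0.8), torch.tensor(0.6)  # alpha_t, sigma_t with a**2 + s**2 == 1
    x0, eps = torch.randn(4), torch.randn(4)
    xt = a * x0 + s * eps                        # forward (noising) process
    v = a * eps - s * x0                         # v-prediction target

    assert torch.allclose(a * xt - s * v, x0, atol=1e-6)   # recover x_0
    assert torch.allclose(s * xt + a * v, eps, atol=1e-6)  # recover eps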