From a48b0266e170747f35d02be7d5e394f3ed796357 Mon Sep 17 00:00:00 2001
From: Ben Glickenhaus
Date: Thu, 3 Nov 2022 11:00:04 -0400
Subject: [PATCH 01/18] v diffusion support for ddpm

---
 src/diffusers/schedulers/scheduling_ddpm.py | 92 ++++++++++++++-------
 1 file changed, 60 insertions(+), 32 deletions(-)

diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py
index ee4f608e09aa..19acf3dc2ff5 100644
--- a/src/diffusers/schedulers/scheduling_ddpm.py
+++ b/src/diffusers/schedulers/scheduling_ddpm.py
@@ -16,16 +16,25 @@
 import math
 from dataclasses import dataclass
-from typing import Optional, Tuple, Union
-
+from typing import Optional, Tuple, Union, Literal
 import numpy as np
 import torch
 
 from ..configuration_utils import ConfigMixin, register_to_config
-from ..utils import BaseOutput, deprecate
+from ..utils import BaseOutput
 from .scheduling_utils import SchedulerMixin
 
 
+def expand_to_shape(input, timesteps, shape, device):
+    """
+    Helper indexes a 1D tensor `input` using a 1D index tensor `timesteps`, then reshapes the result to broadcast nicely with `shape`. Useful for parallelizing operations over `shape[0]` number of diffusion steps at once.
+    """
+    out = torch.gather(input.to(device), 0, timesteps.to(device))
+    reshape = [shape[0]] + [1] * (len(shape) - 1)
+    out = out.reshape(*reshape)
+    return out
+
+
 @dataclass
 class DDPMSchedulerOutput(BaseOutput):
     """
@@ -102,6 +111,14 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
 
     """
 
+    _compatible_classes = [
+        "DDIMScheduler",
+        "PNDMScheduler",
+        "LMSDiscreteScheduler",
+        "EulerDiscreteScheduler",
+        "EulerAncestralDiscreteScheduler",
+    ]
+
     @register_to_config
     def __init__(
         self,
@@ -112,15 +129,7 @@ def __init__(
         trained_betas: Optional[np.ndarray] = None,
         variance_type: str = "fixed_small",
         clip_sample: bool = True,
-        **kwargs,
     ):
-        deprecate(
-            "tensor_format",
-            "0.6.0",
-            "If you're running your code in PyTorch, you can safely remove this argument.",
-            take_from=kwargs,
-        )
-
         if trained_betas is not None:
             self.betas = torch.from_numpy(trained_betas)
         elif beta_schedule == "linear":
@@ -142,8 +151,8 @@ def __init__(
 
         self.alphas = 1.0 - self.betas
         self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
-        self.sigmas = 1 - self.alphas**2
-        self.one = torch.tensor(1.0)
+        self.sqrt_alphas_cumprod = torch.sqrt(self.alphas_cumprod)
+        self.sqrt_one_minus_alphas_cumprod = torch.sqrt(1 - self.alphas_cumprod)
 
         # standard deviation of the initial noise distribution
         self.init_noise_sigma = 1.0
@@ -185,11 +194,11 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic
 
     def _get_variance(self, timestep, predicted_variance=None, variance_type=None):
         alpha_prod_t = self.alphas_cumprod[timestep]
-        alpha_prod_t_prev = self.alphas_cumprod[timestep - 1] if timestep > 0 else self.one
+        alpha_prod_t_prev = self.alphas_cumprod[timestep - 1] if timestep > 0 else torch.tensor(1.0)
 
-        # For timestep > 0, compute predicted variance βt (see formula (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf)
+        # For t > 0, compute predicted variance βt (see formula (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf)
         # and sample from it to get previous sample
-        # x_{timestep-1} ~ N(pred_prev_sample, variance) == add variance to pred_sample
+        # x_{t-1} ~ N(pred_prev_sample, variance) == add variance to pred_sample
         variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * self.betas[timestep]
 
         if variance_type is None:
@@ -213,6 +222,8 @@ def _get_variance(self, timestep, predicted_variance=None, variance_type=None):
             max_log = self.betas[timestep]
             frac = (predicted_variance + 1) / 2
             variance = frac * max_log + (1 - frac) * min_log
+        elif variance_type == "v_diffusion":
+            variance = torch.log(self.betas[timestep] * (1 - alpha_prod_t_prev) / (1 - alpha_prod_t))
 
         return variance
 
@@ -221,9 +232,10 @@ def step(
         model_output: torch.FloatTensor,
         timestep: int,
         sample: torch.FloatTensor,
-        prediction_type: str = "epsilon",
+        prediction_type: Literal["epsilon", "sample", "v"] = "epsilon",
         generator=None,
         return_dict: bool = True,
+        v_prediction: bool = True,
     ) -> Union[DDPMSchedulerOutput, Tuple]:
         """
         Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
@@ -234,10 +246,8 @@ def step(
             timestep (`int`): current discrete timestep in the diffusion chain.
             sample (`torch.FloatTensor`):
                 current instance of sample being created by diffusion process.
-            prediction_type (`str`):
-                prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
-                process), `sample` (directly predicting the noisy sample), or `v` (see section 2.4
-                https://imagen.research.google/video/paper.pdf)
+            prediction_type (`Literal["epsilon", "sample", "v"]`, optional):
+                prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process), `sample` (directly predicting the noisy sample) or `v` (see section 2.4 https://imagen.research.google/video/paper.pdf)
             generator: random number generator.
             return_dict (`bool`): option for returning tuple rather than DDPMSchedulerOutput class
 
@@ -254,23 +264,26 @@ def step(
 
         # 1. compute alphas, betas
         alpha_prod_t = self.alphas_cumprod[timestep]
-        alpha_prod_t_prev = self.alphas_cumprod[timestep - 1] if timestep > 0 else self.one
+        alpha_prod_t_prev = self.alphas_cumprod[timestep - 1] if timestep > 0 else torch.tensor(1.0)
         beta_prod_t = 1 - alpha_prod_t
         beta_prod_t_prev = 1 - alpha_prod_t_prev
 
         # 2. compute predicted original sample from predicted noise also called
         # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf
-        if prediction_type == "epsilon":
+        if prediction_type == "v":
+            # x_recon in p_mean_variance
+            pred_original_sample = (
+                sample * self.sqrt_alphas_cumprod[timestep]
+                - model_output * self.sqrt_one_minus_alphas_cumprod[timestep]
+            )
+            eps = (
+                model_output * self.sqrt_alphas_cumprod[timestep]
+                - sample * self.sqrt_one_minus_alphas_cumprod[timestep]
+            )
+        elif prediction_type == "epsilon":
             pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
-        elif prediction_type == "sample":
-            pred_original_sample = model_output
-        elif prediction_type == "v":
-            # v_t = alpha_t * epsilon - sigma_t * x
-            # need to merge the PRs for sigma to be available in DDPM
-            pred = sample * self.alphas[timestep] - model_output * self.sigmas[timestep]
-            eps = model_output * self.alphas[timestep] - sample * self.sigmas[timestep]
         else:
-            raise ValueError(f"prediction_type given as {prediction_type} must be one of `epsilon`, `sample`, or `v`")
+            pred_original_sample = model_output
 
         # 3. Clip "predicted x_0"
         if self.config.clip_sample:
@@ -291,7 +304,12 @@ def step(
             noise = torch.randn(
                 model_output.size(), dtype=model_output.dtype, layout=model_output.layout, generator=generator
             ).to(model_output.device)
-            variance = (self._get_variance(timestep, predicted_variance=predicted_variance) ** 0.5) * noise
+            if self.variance_type == "fixed_small_log":
+                variance = self._get_variance(t, predicted_variance=predicted_variance) * noise
+            elif self.variance_type == "v_diffusion":
+                variance = torch.exp(0.5 * self._get_variance(timestep, predicted_variance)) * noise
+            else:
+                variance = (self._get_variance(timestep, predicted_variance=predicted_variance) ** 0.5) * noise
 
         pred_prev_sample = pred_prev_sample + variance
 
@@ -306,6 +324,11 @@ def add_noise(
         noise: torch.FloatTensor,
         timesteps: torch.IntTensor,
     ) -> torch.FloatTensor:
+        if self.variance_type == "v_diffusion":
+            alpha, sigma = self.get_alpha_sigma(original_samples, timesteps, original_samples.device)
+            z_t = alpha * original_samples + sigma * noise
+            return z_t
+
         # Make sure alphas_cumprod and timestep have same device and dtype as original_samples
         self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype)
         timesteps = timesteps.to(original_samples.device)
@@ -325,3 +348,8 @@ def add_noise(
 
     def __len__(self):
         return self.config.num_train_timesteps
+
+    def get_alpha_sigma(self, sample, timesteps, device):
+        alpha = expand_to_shape(self.sqrt_alphas_cumprod, timesteps, sample.shape, device)
+        sigma = expand_to_shape(self.sqrt_one_minus_alphas_cumprod, timesteps, sample.shape, device)
+        return alpha, sigma

From 3d702c6d652cb4d7a50d9cea750ca7115b3c4375 Mon Sep 17 00:00:00 2001
From: Ben Glickenhaus
Date: Thu, 3 Nov 2022 11:03:26 -0400
Subject: [PATCH 02/18] quality and style

---
 src/diffusers/schedulers/scheduling_ddpm.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py
index 19acf3dc2ff5..c3c72171f1cc 100644
--- a/src/diffusers/schedulers/scheduling_ddpm.py
+++ b/src/diffusers/schedulers/scheduling_ddpm.py
@@ -16,7 +16,8 @@
 
 import math
 from dataclasses import dataclass
-from typing import Optional, Tuple, Union, Literal
+from typing import Literal, Optional, Tuple, Union
+
 import numpy as np
 import torch
 
@@ -27,7 +28,8 @@
 
 def expand_to_shape(input, timesteps, shape, device):
     """
-    Helper indexes a 1D tensor `input` using a 1D index tensor `timesteps`, then reshapes the result to broadcast nicely with `shape`. Useful for parallelizing operations over `shape[0]` number of diffusion steps at once.
+    Helper indexes a 1D tensor `input` using a 1D index tensor `timesteps`, then reshapes the result to broadcast
+    nicely with `shape`. Useful for parallelizing operations over `shape[0]` number of diffusion steps at once.
     """
     out = torch.gather(input.to(device), 0, timesteps.to(device))
     reshape = [shape[0]] + [1] * (len(shape) - 1)
@@ -247,7 +249,9 @@ def step(
             sample (`torch.FloatTensor`):
                 current instance of sample being created by diffusion process.
             prediction_type (`Literal["epsilon", "sample", "v"]`, optional):
-                prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process), `sample` (directly predicting the noisy sample) or `v` (see section 2.4 https://imagen.research.google/video/paper.pdf)
+                prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
+                process), `sample` (directly predicting the noisy sample) or `v` (see section 2.4
+                https://imagen.research.google/video/paper.pdf)
             generator: random number generator.
             return_dict (`bool`): option for returning tuple rather than DDPMSchedulerOutput class
 
@@ -276,10 +280,6 @@ def step(
                 sample * self.sqrt_alphas_cumprod[timestep]
                 - model_output * self.sqrt_one_minus_alphas_cumprod[timestep]
             )
-            eps = (
-                model_output * self.sqrt_alphas_cumprod[timestep]
-                - sample * self.sqrt_one_minus_alphas_cumprod[timestep]
-            )
         elif prediction_type == "epsilon":
             pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
         else:
@@ -305,7 +305,7 @@ def step(
                 model_output.size(), dtype=model_output.dtype, layout=model_output.layout, generator=generator
             ).to(model_output.device)
             if self.variance_type == "fixed_small_log":
-                variance = self._get_variance(t, predicted_variance=predicted_variance) * noise
+                variance = self._get_variance(timestep, predicted_variance=predicted_variance) * noise
             elif self.variance_type == "v_diffusion":
                 variance = torch.exp(0.5 * self._get_variance(timestep, predicted_variance)) * noise
             else:

From 0889fd1d1178fc203321e5a095b737a111c9c9fb Mon Sep 17 00:00:00 2001
From: Ben Glickenhaus
Date: Thu, 3 Nov 2022 11:04:44 -0400
Subject: [PATCH 03/18] variable name consistency

---
 src/diffusers/schedulers/scheduling_ddpm.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py
index c3c72171f1cc..25b081da87f1 100644
--- a/src/diffusers/schedulers/scheduling_ddpm.py
+++ b/src/diffusers/schedulers/scheduling_ddpm.py
@@ -198,9 +198,9 @@ def _get_variance(self, timestep, predicted_variance=None, variance_type=None):
         alpha_prod_t = self.alphas_cumprod[timestep]
         alpha_prod_t_prev = self.alphas_cumprod[timestep - 1] if timestep > 0 else torch.tensor(1.0)
 
-        # For t > 0, compute predicted variance βt (see formula (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf)
+        # For timestep > 0, compute predicted variance βt (see formula (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf)
         # and sample from it to get previous sample
-        # x_{t-1} ~ N(pred_prev_sample, variance) == add variance to pred_sample
+        # x_{timestep-1} ~ N(pred_prev_sample, variance) == add variance to pred_sample
         variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * self.betas[timestep]
 

From f7c709518fe1c036866ea60b09119fd3013b534b Mon Sep 17 00:00:00 2001
From: Ben Glickenhaus
Date: Thu, 3 Nov 2022 11:07:38 -0400
Subject: [PATCH 04/18] missing base case

---
 src/diffusers/schedulers/scheduling_ddpm.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py
index 25b081da87f1..878d67e817d1 100644
--- a/src/diffusers/schedulers/scheduling_ddpm.py
+++ b/src/diffusers/schedulers/scheduling_ddpm.py
@@ -282,8 +282,11 @@ def step(
             )
         elif prediction_type == "epsilon":
             pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
-        else:
+
+        elif prediction_type == "sample":
             pred_original_sample = model_output
+        else:
+            raise ValueError(f"prediction_type given as {prediction_type} must be one of `epsilon`, `sample`, or `v`")
 
         # 3. Clip "predicted x_0"
         if self.config.clip_sample:

From 0c23e1162ad31fee62b327ce6ce4ab5ba7a82188 Mon Sep 17 00:00:00 2001
From: Ben Glickenhaus
Date: Thu, 3 Nov 2022 11:23:35 -0400
Subject: [PATCH 05/18] pass prediction type along in the pipeline

---
 src/diffusers/pipelines/ddpm/pipeline_ddpm.py | 7 +++++--
 src/diffusers/schedulers/scheduling_ddpm.py   | 3 ++-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
index aae29737aae3..3d5afc94e3df 100644
--- a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
+++ b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 
 
-from typing import Optional, Tuple, Union
+from typing import Optional, Tuple, Union, Literal
 
 import torch
 
@@ -44,6 +44,7 @@ def __call__(
         generator: Optional[torch.Generator] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
+        prediction_type: Literal["epsilon", "sample", "v"] = "epsilon",
         **kwargs,
     ) -> Union[ImagePipelineOutput, Tuple]:
         r"""
@@ -80,7 +81,9 @@ def __call__(
             model_output = self.unet(image, t).sample
 
             # 2. compute previous image: x_t -> t_t-1
-            image = self.scheduler.step(model_output, t, image, generator=generator).prev_sample
+            image = self.scheduler.step(
+                model_output, t, image, generator=generator, prediction_type=prediction_type
+            ).prev_sample
 
         image = (image / 2 + 0.5).clamp(0, 1)
         image = image.cpu().permute(0, 2, 3, 1).numpy()
diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py
index 878d67e817d1..1813592a069d 100644
--- a/src/diffusers/schedulers/scheduling_ddpm.py
+++ b/src/diffusers/schedulers/scheduling_ddpm.py
@@ -237,7 +237,6 @@ def step(
         prediction_type: Literal["epsilon", "sample", "v"] = "epsilon",
         generator=None,
         return_dict: bool = True,
-        v_prediction: bool = True,
     ) -> Union[DDPMSchedulerOutput, Tuple]:
         """
         Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
@@ -261,6 +260,8 @@ def step(
             returning a tuple, the first element is the sample tensor.
 
         """
+        if self.variance_type == "v_diffusion":
+            assert prediction_type == "v", "Need to use v prediction with v_diffusion"
         if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in ["learned", "learned_range"]:
             model_output, predicted_variance = torch.split(model_output, sample.shape[1], dim=1)
         else:

From b46327e89e079b2eaaa2351a27c86900d77c168d Mon Sep 17 00:00:00 2001
From: Ben Glickenhaus
Date: Mon, 7 Nov 2022 09:19:22 -0500
Subject: [PATCH 06/18] put prediction type in scheduler config

---
 src/diffusers/pipelines/ddpm/pipeline_ddpm.py |  7 ++-----
 src/diffusers/schedulers/scheduling_ddpm.py   | 16 ++++++++++------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
index 3d5afc94e3df..a9284063e884 100644
--- a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
+++ b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 
 
-from typing import Optional, Tuple, Union, Literal
+from typing import Literal, Optional, Tuple, Union
 
 import torch
 
@@ -44,7 +44,6 @@ def __call__(
         generator: Optional[torch.Generator] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
-        prediction_type: Literal["epsilon", "sample", "v"] = "epsilon",
         **kwargs,
     ) -> Union[ImagePipelineOutput, Tuple]:
         r"""
@@ -81,9 +80,7 @@ def __call__(
             model_output = self.unet(image, t).sample
 
             # 2. compute previous image: x_t -> t_t-1
-            image = self.scheduler.step(
-                model_output, t, image, generator=generator, prediction_type=prediction_type
-            ).prev_sample
+            image = self.scheduler.step(model_output, t, image, generator=generator).prev_sample
 
         image = (image / 2 + 0.5).clamp(0, 1)
         image = image.cpu().permute(0, 2, 3, 1).numpy()
diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py
index 1813592a069d..0327c44e3c4a 100644
--- a/src/diffusers/schedulers/scheduling_ddpm.py
+++ b/src/diffusers/schedulers/scheduling_ddpm.py
@@ -131,6 +131,7 @@ def __init__(
         trained_betas: Optional[np.ndarray] = None,
         variance_type: str = "fixed_small",
         clip_sample: bool = True,
+        prediction_type: Literal["epsilon", "sample", "v"] = "epsilon",
     ):
         if trained_betas is not None:
             self.betas = torch.from_numpy(trained_betas)
@@ -164,6 +165,7 @@ def __init__(
         self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy())
 
         self.variance_type = variance_type
+        self.prediction_type = prediction_type
 
     def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor:
         """
@@ -234,7 +236,7 @@ def step(
         model_output: torch.FloatTensor,
         timestep: int,
         sample: torch.FloatTensor,
-        prediction_type: Literal["epsilon", "sample", "v"] = "epsilon",
+        # prediction_type: Literal["epsilon", "sample", "v"] = "epsilon",
         generator=None,
         return_dict: bool = True,
     ) -> Union[DDPMSchedulerOutput, Tuple]:
@@ -261,7 +263,7 @@ def step(
 
         """
         if self.variance_type == "v_diffusion":
-            assert prediction_type == "v", "Need to use v prediction with v_diffusion"
+            assert self.prediction_type == "v", "Need to use v prediction with v_diffusion"
         if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in ["learned", "learned_range"]:
             model_output, predicted_variance = torch.split(model_output, sample.shape[1], dim=1)
         else:
@@ -275,19 +277,21 @@ def step(
 
         # 2. compute predicted original sample from predicted noise also called
         # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf
-        if prediction_type == "v":
+        if self.prediction_type == "v":
             # x_recon in p_mean_variance
             pred_original_sample = (
                 sample * self.sqrt_alphas_cumprod[timestep]
                 - model_output * self.sqrt_one_minus_alphas_cumprod[timestep]
             )
-        elif prediction_type == "epsilon":
+        elif self.prediction_type == "epsilon":
             pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
 
-        elif prediction_type == "sample":
+        elif self.prediction_type == "sample":
             pred_original_sample = model_output
         else:
-            raise ValueError(f"prediction_type given as {prediction_type} must be one of `epsilon`, `sample`, or `v`")
+            raise ValueError(
+                f"prediction_type given as {self.prediction_type} must be one of `epsilon`, `sample`, or `v`"
+            )
 
         # 3. Clip "predicted x_0"
         if self.config.clip_sample:

From 45c36c85d71ae278ca7d7ad87263394128b30eb1 Mon Sep 17 00:00:00 2001
From: Ben Glickenhaus
Date: Mon, 7 Nov 2022 09:20:16 -0500
Subject: [PATCH 07/18] style

---
 src/diffusers/pipelines/ddpm/pipeline_ddpm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
index a9284063e884..aae29737aae3 100644
--- a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
+++ b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 
 
-from typing import Literal, Optional, Tuple, Union
+from typing import Optional, Tuple, Union
 
 import torch
 

From 13404a6857e96a2314df4a4d82b268a1ccb4922d Mon Sep 17 00:00:00 2001
From: Ben Glickenhaus
Date: Wed, 9 Nov 2022 15:16:21 -0500
Subject: [PATCH 08/18] try to train on ddim

---
 examples/v_prediction/train_butterflies.py  | 239 ++++++++++++++++++++
 src/diffusers/schedulers/scheduling_ddim.py |  83 ++++---
 2 files changed, 295 insertions(+), 27 deletions(-)
 create mode 100644 examples/v_prediction/train_butterflies.py

diff --git a/examples/v_prediction/train_butterflies.py b/examples/v_prediction/train_butterflies.py
new file mode 100644
index 000000000000..8eaa971c80d3
--- /dev/null
+++ b/examples/v_prediction/train_butterflies.py
@@ -0,0 +1,239 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class TrainingConfig:
+    image_size = 128  # the generated image resolution
+    train_batch_size = 16
+    eval_batch_size = 16  # how many images to sample during evaluation
+    num_epochs = 50
+    gradient_accumulation_steps = 1
+    learning_rate = 5e-5
+    lr_warmup_steps = 500
+    save_image_epochs = 10
+    save_model_epochs = 30
+    mixed_precision = "fp16"  # `no` for float32, `fp16` for automatic mixed precision
+    output_dir = "ddim-butterflies-128-v-diffusion"  # the model name locally and on the HF Hub
+
+    push_to_hub = False  # whether to upload the saved model to the HF Hub
+    hub_private_repo = False
+    overwrite_output_dir = True  # overwrite the old model when re-running the notebook
+    seed = 0
+
+
+config = TrainingConfig()
+from datasets import load_dataset
+
+config.dataset_name = "huggan/smithsonian_butterflies_subset"
+dataset = load_dataset(config.dataset_name, split="train")
+
+import matplotlib.pyplot as plt
+
+from torchvision import transforms
+
+preprocess = transforms.Compose(
+    [
+        transforms.Resize((config.image_size, config.image_size)),
+        transforms.RandomHorizontalFlip(),
+        transforms.ToTensor(),
+        transforms.Normalize([0.5], [0.5]),
+    ]
+)
+
+
+def transform(examples):
+    images = [preprocess(image.convert("RGB")) for image in examples["image"]]
+    return {"images": images}
+
+
+dataset.set_transform(transform)
+
+import torch
+
+train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=config.train_batch_size, shuffle=True)
+
+from diffusers import UNet2DModel
+
+
+model = UNet2DModel(
+    sample_size=config.image_size,  # the target image resolution
+    in_channels=3,  # the number of input channels, 3 for RGB images
+    out_channels=3,  # the number of output channels
+    layers_per_block=2,  # how many ResNet layers to use per UNet block
+    block_out_channels=(128, 128, 256, 256, 512, 512),  # the number of output channels for each UNet block
+    down_block_types=(
+        "DownBlock2D",  # a regular ResNet downsampling block
+        "DownBlock2D",
+        "DownBlock2D",
+        "DownBlock2D",
+        "AttnDownBlock2D",  # a ResNet downsampling block with spatial self-attention
+        "DownBlock2D",
+    ),
+    up_block_types=(
"UpBlock2D", # a regular ResNet upsampling block + "AttnUpBlock2D", # a ResNet upsampling block with spatial self-attention + "UpBlock2D", + "UpBlock2D", + "UpBlock2D", + "UpBlock2D", + ), +) + +from diffusers import DDPMScheduler, DDIMPipeline, DDIMScheduler + +noise_scheduler = DDIMScheduler( + num_train_timesteps=1000, + beta_schedule="squaredcos_cap_v2", + variance_type="v_diffusion", +) + +import torch +import torch.nn.functional as F + +from PIL import Image + +optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate) + + +from diffusers.optimization import get_cosine_schedule_with_warmup + +lr_scheduler = get_cosine_schedule_with_warmup( + optimizer=optimizer, + num_warmup_steps=config.lr_warmup_steps, + num_training_steps=(len(train_dataloader) * config.num_epochs), +) + +from diffusers import DDPMPipeline + +import math + + +def make_grid(images, rows, cols): + w, h = images[0].size + grid = Image.new("RGB", size=(cols * w, rows * h)) + for i, image in enumerate(images): + grid.paste(image, box=(i % cols * w, i // cols * h)) + return grid + + +def evaluate(config, epoch, pipeline): + # Sample some images from random noise (this is the backward diffusion process). + # The default pipeline output type is `List[PIL.Image]` + images = pipeline( + batch_size=config.eval_batch_size, + generator=torch.manual_seed(config.seed), + ).images + + # Make a grid out of the images + image_grid = make_grid(images, rows=4, cols=4) + + # Save the images + test_dir = os.path.join(config.output_dir, "samples") + os.makedirs(test_dir, exist_ok=True) + image_grid.save(f"{test_dir}/{epoch:04d}.png") + + +from accelerate import Accelerator +from diffusers.hub_utils import init_git_repo, push_to_hub + +from tqdm.auto import tqdm +import os + + +def train_loop(config, model, noise_scheduler, optimizer, train_dataloader, lr_scheduler): + # Initialize accelerator and tensorboard logging + accelerator = Accelerator( + mixed_precision=config.mixed_precision, + gradient_accumulation_steps=config.gradient_accumulation_steps, + log_with="tensorboard", + logging_dir=os.path.join(config.output_dir, "logs"), + ) + if accelerator.is_main_process: + if config.push_to_hub: + repo = init_git_repo(config, at_init=True) + accelerator.init_trackers("train_example") + + # Prepare everything + # There is no specific order to remember, you just need to unpack the + # objects in the same order you gave them to the prepare method. 
+ model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + model, optimizer, train_dataloader, lr_scheduler + ) + + global_step = 0 + + # Now you train the model + for epoch in range(config.num_epochs): + progress_bar = tqdm(total=len(train_dataloader), disable=not accelerator.is_local_main_process) + progress_bar.set_description(f"Epoch {epoch}") + + for step, batch in enumerate(train_dataloader): + clean_images = batch["images"] + # Sample noise to add to the images + noise = torch.randn(clean_images.shape).to(clean_images.device) + bs = clean_images.shape[0] + + # Sample a random timestep for each image + timesteps = torch.randint(0, noise_scheduler.num_train_timesteps, (bs,), device=clean_images.device).long() + + with accelerator.accumulate(model): + # Predict the noise residual + alpha_t, sigma_t = noise_scheduler.get_alpha_sigma(clean_images, timesteps, accelerator.device) + z_t = alpha_t * clean_images + sigma_t * noise + noise_pred = model(z_t, timesteps).sample + v = alpha_t * noise - sigma_t * clean_images + loss = F.mse_loss(noise_pred, v) + accelerator.backward(loss) + + accelerator.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + progress_bar.update(1) + logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0], "step": global_step} + progress_bar.set_postfix(**logs) + accelerator.log(logs, step=global_step) + global_step += 1 + + # After each epoch you optionally sample some demo images with evaluate() and save the model + if accelerator.is_main_process: + pipeline = DDIMPipeline(unet=accelerator.unwrap_model(model), scheduler=noise_scheduler) + + if (epoch + 1) % config.save_image_epochs == 0 or epoch == config.num_epochs - 1: + evaluate(config, epoch, pipeline) + + if (epoch + 1) % config.save_model_epochs == 0 or epoch == config.num_epochs - 1: + if config.push_to_hub: + push_to_hub(config, pipeline, repo, commit_message=f"Epoch {epoch}", blocking=True) + else: + pipeline.save_pretrained(config.output_dir) + + +"""## Let's train! + +Let's launch the training (including multi-GPU training) from the notebook using Accelerate's `notebook_launcher` function: +""" + +from accelerate import notebook_launcher + +args = (config, model, noise_scheduler, optimizer, train_dataloader, lr_scheduler) + +train_loop(*args) + +"""Let's have a look at the final image grid produced by the trained diffusion model:""" + +import glob + +sample_images = sorted(glob.glob(f"{config.output_dir}/samples/*.png")) +Image.open(sample_images[-1]) + +"""Not bad! 
There's room for improvement of course, so feel free to play with the hyperparameters, model definition and image augmentations 🤗 + +If you've chosen to upload the model to the Hugging Face Hub, its repository should now look like so: +https://huggingface.co/anton-l/ddpm-butterflies-128 + +If you want to dive deeper into the code, we also have more advanced training scripts with features like Exponential Moving Average of model weights here: + +https://github.com/huggingface/diffusers/tree/main/examples +""" diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py index abdcb3e81a58..177bd65dc517 100644 --- a/src/diffusers/schedulers/scheduling_ddim.py +++ b/src/diffusers/schedulers/scheduling_ddim.py @@ -17,7 +17,7 @@ import math from dataclasses import dataclass -from typing import Optional, Tuple, Union +from typing import Optional, Tuple, Union, Literal import numpy as np import torch @@ -27,11 +27,21 @@ from .scheduling_utils import SchedulerMixin +def expand_to_shape(input, timesteps, shape, device): + """ + Helper indexes a 1D tensor `input` using a 1D index tensor `timesteps`, then reshapes the result to broadcast + nicely with `shape`. Useful for parellizing operations over `shape[0]` number of diffusion steps at once. + """ + out = torch.gather(input.to(device), 0, timesteps.to(device)) + reshape = [shape[0]] + [1] * (len(shape) - 1) + out = out.reshape(*reshape) + return out + + @dataclass class DDIMSchedulerOutput(BaseOutput): """ Output class for the scheduler's step function output. - Args: prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the @@ -49,16 +59,12 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. - Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up to that part of the diffusion process. - - Args: num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. - Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ @@ -78,14 +84,11 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin): """ Denoising diffusion implicit models is a scheduler that extends the denoising procedure introduced in denoising diffusion probabilistic models (DDPMs) with non-Markovian guidance. - [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and [`~ConfigMixin.from_config`] functions. - For more details, see the original paper: https://arxiv.org/abs/2010.02502 - Args: num_train_timesteps (`int`): number of diffusion steps used to train the model. beta_start (`float`): the starting `beta` value of inference. @@ -105,7 +108,6 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin): an offset added to the inference steps. 
You can use a combination of `offset=1` and `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in stable diffusion. - """ @register_to_config @@ -118,7 +120,9 @@ def __init__( trained_betas: Optional[np.ndarray] = None, clip_sample: bool = True, set_alpha_to_one: bool = True, + variance_type: str = "fixed", steps_offset: int = 0, + prediction_type: Literal["epsilon", "sample", "v"] = "epsilon", **kwargs, ): deprecate( @@ -159,35 +163,42 @@ def __init__( # setable values self.num_inference_steps = None self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy()) + self.variance_type = variance_type + self.prediction_type = prediction_type def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: """ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the current timestep. - Args: sample (`torch.FloatTensor`): input sample timestep (`int`, optional): current timestep - Returns: `torch.FloatTensor`: scaled input sample """ return sample def _get_variance(self, timestep, prev_timestep): - alpha_prod_t = self.alphas_cumprod[timestep] - alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod - beta_prod_t = 1 - alpha_prod_t - beta_prod_t_prev = 1 - alpha_prod_t_prev - - variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev) + if self.variance_type == "fixed": + alpha_prod_t = self.alphas_cumprod[timestep] + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod + beta_prod_t = 1 - alpha_prod_t + beta_prod_t_prev = 1 - alpha_prod_t_prev + + variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev) + elif self.variance_type == "v_diffusion": + # If eta > 0, adjust the scaling factor for the predicted noise + # downward according to the amount of additional noise to add + ddim_sigma = (self.sigmas[timestep + 1] ** 2 / self.sigmas[timestep] ** 2).sqrt() * ( + 1 - self.alphas[timestep] ** 2 / self.alphas[timestep + 1] ** 2 + ).sqrt() + variance = (self.sigmas[timestep + 1] ** 2 - ddim_sigma**2).sqrt() return variance def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None, **kwargs): """ Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference. - Args: num_inference_steps (`int`): the number of diffusion steps used when generating samples with a pre-trained model. @@ -219,7 +230,6 @@ def step( """ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion process from the learned model outputs (most often the predicted noise). - Args: model_output (`torch.FloatTensor`): direct output from learned diffusion model. timestep (`int`): current discrete timestep in the diffusion chain. @@ -233,12 +243,10 @@ def step( use_clipped_model_output (`bool`): TODO generator: random number generator. return_dict (`bool`): option for returning tuple rather than DDIMSchedulerOutput class - Returns: [`~schedulers.scheduling_utils.DDIMSchedulerOutput`] or `tuple`: [`~schedulers.scheduling_utils.DDIMSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor. 
- """ if self.num_inference_steps is None: raise ValueError( @@ -295,19 +303,31 @@ def step( model_output = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * model_output + if self.prediction_type == "epsilon": + pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * model_output - # 7. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + eps * pred_sample_direction + # 7. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + eps * pred_sample_direction + else: + if timestep < len(self.timesteps) - 1: + prev_sample = pred_original_sample + self.alphas[timestep + 1] + eps * variance + else: + prev_sample = None if eta > 0: # randn_like does not support generator https://github.com/pytorch/pytorch/issues/27072 device = model_output.device if torch.is_tensor(model_output) else "cpu" noise = torch.randn(model_output.shape, dtype=model_output.dtype, generator=generator).to(device) - variance = self._get_variance(timestep, prev_timestep) ** (0.5) * eta * noise + if self.prediction_type == "epsilon": - prev_sample = prev_sample + variance + variance = self._get_variance(timestep, prev_timestep) ** (0.5) * eta * noise + prev_sample = prev_sample + variance + else: + ddim_sigma = (self.sigmas[timestep + 1] ** 2 / self.sigmas[timestep] ** 2).sqrt() * ( + 1 - self.alphas[timestep] ** 2 / self.alphas[timestep + 1] ** 2 + ).sqrt() + prev_sample = prev_sample + ddim_sigma * noise if not return_dict: return (prev_sample,) @@ -319,6 +339,10 @@ def add_noise( noise: torch.FloatTensor, timesteps: torch.IntTensor, ) -> torch.FloatTensor: + if self.variance_type == "v_diffusion": + alpha, sigma = self.get_alpha_sigma(original_samples, timesteps, original_samples.device) + z_t = alpha * original_samples + sigma * noise + return z_t # Make sure alphas_cumprod and timestep have same device and dtype as original_samples self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype) timesteps = timesteps.to(original_samples.device) @@ -338,3 +362,8 @@ def add_noise( def __len__(self): return self.config.num_train_timesteps + + def get_alpha_sigma(self, sample, timesteps, device): + alpha = expand_to_shape(self.sqrt_alphas_cumprod, timesteps, sample.shape, device) + sigma = expand_to_shape(self.sqrt_one_minus_alphas_cumprod, timesteps, sample.shape, device) + return alpha, sigma From 1fa3cc8ad7099d929adb7982508087cb5f6afb7a Mon Sep 17 00:00:00 2001 From: Ben Glickenhaus Date: Tue, 15 Nov 2022 12:27:10 -0500 Subject: [PATCH 09/18] changes to ddim --- src/diffusers/schedulers/scheduling_ddim.py | 45 ++++++++++++++------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py index 177bd65dc517..eebd1f2cd975 100644 --- a/src/diffusers/schedulers/scheduling_ddim.py +++ b/src/diffusers/schedulers/scheduling_ddim.py @@ -80,6 +80,14 @@ def alpha_bar(time_step): return torch.tensor(betas) +def t_to_alpha_sigma(num_diffusion_timesteps): + """Returns the scaling factors for the clean image and for the noise, given + a timestep.""" + alphas = 
torch.cos(torch.tensor([t * math.pi / 2 for t in range(num_diffusion_timesteps)])) + sigmas = torch.sin(torch.tensor([t * math.pi / 2 for t in range(num_diffusion_timesteps)])) + return alphas, sigmas + + class DDIMScheduler(SchedulerMixin, ConfigMixin): """ Denoising diffusion implicit models is a scheduler that extends the denoising procedure introduced in denoising @@ -149,7 +157,8 @@ def __init__( self.alphas = 1.0 - self.betas self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) - self.sigmas = 1 - self.alphas**2 + if prediction_type == "v": + self.alphas, self.sigmas = t_to_alpha_sigma(num_train_timesteps) # At every step in ddim, we are looking into the previous alphas_cumprod # For the final step, there is no previous alphas_cumprod because we are already at 0 @@ -178,7 +187,7 @@ def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = """ return sample - def _get_variance(self, timestep, prev_timestep): + def _get_variance(self, timestep, prev_timestep, eta=0): if self.variance_type == "fixed": alpha_prod_t = self.alphas_cumprod[timestep] alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod @@ -189,9 +198,14 @@ def _get_variance(self, timestep, prev_timestep): elif self.variance_type == "v_diffusion": # If eta > 0, adjust the scaling factor for the predicted noise # downward according to the amount of additional noise to add - ddim_sigma = (self.sigmas[timestep + 1] ** 2 / self.sigmas[timestep] ** 2).sqrt() * ( - 1 - self.alphas[timestep] ** 2 / self.alphas[timestep + 1] ** 2 - ).sqrt() + if eta: + numerator = ( + eta * (self.sigmas[timestep + 1] ** 2 / self.sigmas[timestep] ** 2).clamp(min=1.0e-7).sqrt() + ) + else: + numerator = 0 + denominator = (self.alphas[timestep + 1] / self.alphas[timestep]).clamp(min=1.0e-7).sqrt() + ddim_sigma = (numerator / denominator).clamp(min=1.0e-7) variance = (self.sigmas[timestep + 1] ** 2 - ddim_sigma**2).sqrt() return variance @@ -221,7 +235,6 @@ def step( model_output: torch.FloatTensor, timestep: int, sample: torch.FloatTensor, - prediction_type: str = "epsilon", eta: float = 0.0, use_clipped_model_output: bool = False, generator=None, @@ -275,19 +288,21 @@ def step( # 3. compute predicted original sample from predicted noise also called # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - if prediction_type == "epsilon": + if self.prediction_type == "epsilon": pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) eps = torch.tensor(1) - elif prediction_type == "sample": + elif self.prediction_type == "sample": pred_original_sample = model_output eps = torch.tensor(1) - elif prediction_type == "v": + elif self.prediction_type == "v": # v_t = alpha_t * epsilon - sigma_t * x # need to merge the PRs for sigma to be available in DDPM pred_original_sample = sample * self.alphas[timestep] - model_output * self.sigmas[timestep] - eps = model_output * self.alphas[timestep] - sample * self.sigmas[timestep] + eps = model_output * self.alphas[timestep] + sample * self.sigmas[timestep] else: - raise ValueError(f"prediction_type given as {prediction_type} must be one of `epsilon`, `sample`, or `v`") + raise ValueError( + f"prediction_type given as {self.prediction_type} must be one of `epsilon`, `sample`, or `v`" + ) # 4. Clip "predicted x_0" if self.config.clip_sample: @@ -295,7 +310,7 @@ def step( # 5. 
compute variance: "sigma_t(η)" -> see formula (16) # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1) - variance = self._get_variance(timestep, prev_timestep) + variance = self._get_variance(timestep, prev_timestep, eta) std_dev_t = eta * variance ** (0.5) if use_clipped_model_output: @@ -309,7 +324,7 @@ def step( # 7. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + eps * pred_sample_direction else: - if timestep < len(self.timesteps) - 1: + if timestep < len(self.alphas) - 1: prev_sample = pred_original_sample + self.alphas[timestep + 1] + eps * variance else: prev_sample = None @@ -364,6 +379,6 @@ def __len__(self): return self.config.num_train_timesteps def get_alpha_sigma(self, sample, timesteps, device): - alpha = expand_to_shape(self.sqrt_alphas_cumprod, timesteps, sample.shape, device) - sigma = expand_to_shape(self.sqrt_one_minus_alphas_cumprod, timesteps, sample.shape, device) + alpha = expand_to_shape(self.alphas, timesteps, sample.shape, device) + sigma = expand_to_shape(self.sigmas, timesteps, sample.shape, device) return alpha, sigma From 0b60c2b427450506bff7672550327fe22649ff7a Mon Sep 17 00:00:00 2001 From: Ben Glickenhaus Date: Wed, 16 Nov 2022 09:49:12 -0500 Subject: [PATCH 10/18] ddim v prediction works to train butterflies example --- examples/v_prediction/train_butterflies.py | 40 +++++++++++++++--- src/diffusers/schedulers/scheduling_ddim.py | 46 +++++++++++---------- 2 files changed, 59 insertions(+), 27 deletions(-) diff --git a/examples/v_prediction/train_butterflies.py b/examples/v_prediction/train_butterflies.py index 8eaa971c80d3..bb87671a238b 100644 --- a/examples/v_prediction/train_butterflies.py +++ b/examples/v_prediction/train_butterflies.py @@ -81,11 +81,20 @@ def transform(examples): from diffusers import DDPMScheduler, DDIMPipeline, DDIMScheduler -noise_scheduler = DDIMScheduler( - num_train_timesteps=1000, - beta_schedule="squaredcos_cap_v2", - variance_type="v_diffusion", -) +if config.output_dir.startswith("ddpm"): + noise_scheduler = DDPMScheduler( + num_train_timesteps=1000, + beta_schedule="squaredcos_cap_v2", + variance_type="v_diffusion", + prediction_type="v", + ) +else: + noise_scheduler = DDIMScheduler( + num_train_timesteps=1000, + beta_schedule="squaredcos_cap_v2", + variance_type="v_diffusion", + prediction_type="v", + ) import torch import torch.nn.functional as F @@ -162,6 +171,21 @@ def train_loop(config, model, noise_scheduler, optimizer, train_dataloader, lr_s global_step = 0 + if config.output_dir.startswith("ddpm"): + + pipeline = DDPMPipeline(unet=accelerator.unwrap_model(model), scheduler=noise_scheduler) + else: + pipeline = DDIMPipeline(unet=accelerator.unwrap_model(model), scheduler=noise_scheduler) + + def t_to_alpha_sigma(t): + """Returns the scaling factors for the clean image and for the noise, given + a timestep.""" + return torch.cos(t * math.pi / 2), torch.sin(t * math.pi / 2) + + alpha_sigmas = [t_to_alpha_sigma(t) for t in noise_scheduler.timesteps] + + evaluate(config, 0, pipeline) + # Now you train the model for epoch in range(config.num_epochs): progress_bar = tqdm(total=len(train_dataloader), disable=not accelerator.is_local_main_process) @@ -198,7 +222,11 @@ def train_loop(config, model, noise_scheduler, optimizer, train_dataloader, lr_s # After each epoch you optionally sample some demo images with evaluate() and save the model if accelerator.is_main_process: - pipeline = 
DDIMPipeline(unet=accelerator.unwrap_model(model), scheduler=noise_scheduler) + if config.output_dir.startswith("ddpm"): + + pipeline = DDPMPipeline(unet=accelerator.unwrap_model(model), scheduler=noise_scheduler) + else: + pipeline = DDIMPipeline(unet=accelerator.unwrap_model(model), scheduler=noise_scheduler) if (epoch + 1) % config.save_image_epochs == 0 or epoch == config.num_epochs - 1: evaluate(config, epoch, pipeline) diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py index eebd1f2cd975..b6651e7c8daf 100644 --- a/src/diffusers/schedulers/scheduling_ddim.py +++ b/src/diffusers/schedulers/scheduling_ddim.py @@ -83,8 +83,12 @@ def alpha_bar(time_step): def t_to_alpha_sigma(num_diffusion_timesteps): """Returns the scaling factors for the clean image and for the noise, given a timestep.""" - alphas = torch.cos(torch.tensor([t * math.pi / 2 for t in range(num_diffusion_timesteps)])) - sigmas = torch.sin(torch.tensor([t * math.pi / 2 for t in range(num_diffusion_timesteps)])) + alphas = torch.cos( + torch.tensor([(t / num_diffusion_timesteps) * math.pi / 2 for t in range(num_diffusion_timesteps)]) + ) + sigmas = torch.sin( + torch.tensor([(t / num_diffusion_timesteps) * math.pi / 2 for t in range(num_diffusion_timesteps)]) + ) return alphas, sigmas @@ -155,6 +159,7 @@ def __init__( else: raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + self.variance_type = variance_type self.alphas = 1.0 - self.betas self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) if prediction_type == "v": @@ -165,6 +170,7 @@ def __init__( # `set_alpha_to_one` decides whether we set this parameter simply to one or # whether we use the final alpha of the "non-previous" one. self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0] + self.final_sigma = torch.tensor(0.0) if set_alpha_to_one else self.sigmas[0] # standard deviation of the initial noise distribution self.init_noise_sigma = 1.0 @@ -188,26 +194,29 @@ def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = return sample def _get_variance(self, timestep, prev_timestep, eta=0): + alpha_prod_t = self.alphas_cumprod[timestep] + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod + beta_prod_t = 1 - alpha_prod_t + beta_prod_t_prev = 1 - alpha_prod_t_prev + if self.variance_type == "fixed": - alpha_prod_t = self.alphas_cumprod[timestep] - alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod - beta_prod_t = 1 - alpha_prod_t - beta_prod_t_prev = 1 - alpha_prod_t_prev variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev) elif self.variance_type == "v_diffusion": # If eta > 0, adjust the scaling factor for the predicted noise # downward according to the amount of additional noise to add + # variance = torch.log(self.betas[timestep] * (1 - alpha_prod_t_prev) / (1 - alpha_prod_t)) + alpha_prev = self.alphas[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod + sigma_prev = self.sigmas[prev_timestep] if prev_timestep >= 0 else self.final_sigma if eta: - numerator = ( - eta * (self.sigmas[timestep + 1] ** 2 / self.sigmas[timestep] ** 2).clamp(min=1.0e-7).sqrt() - ) + numerator = eta * (sigma_prev**2 / self.sigmas[timestep] ** 2).clamp(min=1.0e-7).sqrt() else: numerator = 0 - denominator = (self.alphas[timestep + 1] / self.alphas[timestep]).clamp(min=1.0e-7).sqrt() 
- ddim_sigma = (numerator / denominator).clamp(min=1.0e-7) - variance = (self.sigmas[timestep + 1] ** 2 - ddim_sigma**2).sqrt() - + denominator = (1 - self.alphas[timestep] ** 2 / alpha_prev**2).clamp(min=1.0e-7).sqrt() + ddim_sigma = (numerator * denominator).clamp(min=1.0e-7) + variance = (sigma_prev**2 - ddim_sigma**2).sqrt() + if torch.isnan(variance): + variance = 0 return variance def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None, **kwargs): @@ -324,10 +333,8 @@ def step( # 7. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + eps * pred_sample_direction else: - if timestep < len(self.alphas) - 1: - prev_sample = pred_original_sample + self.alphas[timestep + 1] + eps * variance - else: - prev_sample = None + alpha_prev = self.alphas[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod + prev_sample = pred_original_sample * alpha_prev + eps * variance if eta > 0: # randn_like does not support generator https://github.com/pytorch/pytorch/issues/27072 @@ -339,10 +346,7 @@ def step( prev_sample = prev_sample + variance else: - ddim_sigma = (self.sigmas[timestep + 1] ** 2 / self.sigmas[timestep] ** 2).sqrt() * ( - 1 - self.alphas[timestep] ** 2 / self.alphas[timestep + 1] ** 2 - ).sqrt() - prev_sample = prev_sample + ddim_sigma * noise + prev_sample = prev_sample + variance * noise if not return_dict: return (prev_sample,) From 8311d8980547ee5a6e4559573407236958220d11 Mon Sep 17 00:00:00 2001 From: Ben Glickenhaus Date: Wed, 16 Nov 2022 10:03:07 -0500 Subject: [PATCH 11/18] fix bad merge, style and quality --- examples/v_prediction/train_butterflies.py | 70 +++++---------------- src/diffusers/schedulers/scheduling_ddim.py | 61 ++++++++---------- 2 files changed, 40 insertions(+), 91 deletions(-) diff --git a/examples/v_prediction/train_butterflies.py b/examples/v_prediction/train_butterflies.py index bb87671a238b..5074ece86a98 100644 --- a/examples/v_prediction/train_butterflies.py +++ b/examples/v_prediction/train_butterflies.py @@ -1,5 +1,19 @@ +import glob +import os from dataclasses import dataclass +import torch +import torch.nn.functional as F + +from accelerate import Accelerator +from datasets import load_dataset +from diffusers import DDIMPipeline, DDIMScheduler, DDPMPipeline, DDPMScheduler, UNet2DModel +from diffusers.hub_utils import init_git_repo, push_to_hub +from diffusers.optimization import get_cosine_schedule_with_warmup +from PIL import Image +from torchvision import transforms +from tqdm.auto import tqdm + @dataclass class TrainingConfig: @@ -22,14 +36,11 @@ class TrainingConfig: config = TrainingConfig() -from datasets import load_dataset + config.dataset_name = "huggan/smithsonian_butterflies_subset" dataset = load_dataset(config.dataset_name, split="train") -import matplotlib.pyplot as plt - -from torchvision import transforms preprocess = transforms.Compose( [ @@ -48,12 +59,9 @@ def transform(examples): dataset.set_transform(transform) -import torch train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=config.train_batch_size, shuffle=True) -from diffusers import UNet2DModel - model = UNet2DModel( sample_size=config.image_size, # the target image resolution @@ -79,7 +87,6 @@ def transform(examples): ), ) -from diffusers import DDPMScheduler, DDIMPipeline, DDIMScheduler if config.output_dir.startswith("ddpm"): noise_scheduler = DDPMScheduler( @@ -96,26 +103,16 @@ def transform(examples): 
prediction_type="v", ) -import torch -import torch.nn.functional as F - -from PIL import Image optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate) -from diffusers.optimization import get_cosine_schedule_with_warmup - lr_scheduler = get_cosine_schedule_with_warmup( optimizer=optimizer, num_warmup_steps=config.lr_warmup_steps, num_training_steps=(len(train_dataloader) * config.num_epochs), ) -from diffusers import DDPMPipeline - -import math - def make_grid(images, rows, cols): w, h = images[0].size @@ -142,13 +139,6 @@ def evaluate(config, epoch, pipeline): image_grid.save(f"{test_dir}/{epoch:04d}.png") -from accelerate import Accelerator -from diffusers.hub_utils import init_git_repo, push_to_hub - -from tqdm.auto import tqdm -import os - - def train_loop(config, model, noise_scheduler, optimizer, train_dataloader, lr_scheduler): # Initialize accelerator and tensorboard logging accelerator = Accelerator( @@ -172,18 +162,10 @@ def train_loop(config, model, noise_scheduler, optimizer, train_dataloader, lr_s global_step = 0 if config.output_dir.startswith("ddpm"): - pipeline = DDPMPipeline(unet=accelerator.unwrap_model(model), scheduler=noise_scheduler) else: pipeline = DDIMPipeline(unet=accelerator.unwrap_model(model), scheduler=noise_scheduler) - def t_to_alpha_sigma(t): - """Returns the scaling factors for the clean image and for the noise, given - a timestep.""" - return torch.cos(t * math.pi / 2), torch.sin(t * math.pi / 2) - - alpha_sigmas = [t_to_alpha_sigma(t) for t in noise_scheduler.timesteps] - evaluate(config, 0, pipeline) # Now you train the model @@ -223,7 +205,6 @@ def t_to_alpha_sigma(t): # After each epoch you optionally sample some demo images with evaluate() and save the model if accelerator.is_main_process: if config.output_dir.startswith("ddpm"): - pipeline = DDPMPipeline(unet=accelerator.unwrap_model(model), scheduler=noise_scheduler) else: pipeline = DDIMPipeline(unet=accelerator.unwrap_model(model), scheduler=noise_scheduler) @@ -238,30 +219,9 @@ def t_to_alpha_sigma(t): pipeline.save_pretrained(config.output_dir) -"""## Let's train! - -Let's launch the training (including multi-GPU training) from the notebook using Accelerate's `notebook_launcher` function: -""" - -from accelerate import notebook_launcher - args = (config, model, noise_scheduler, optimizer, train_dataloader, lr_scheduler) train_loop(*args) -"""Let's have a look at the final image grid produced by the trained diffusion model:""" - -import glob - sample_images = sorted(glob.glob(f"{config.output_dir}/samples/*.png")) Image.open(sample_images[-1]) - -"""Not bad! 
There's room for improvement of course, so feel free to play with the hyperparameters, model definition and image augmentations 🤗 - -If you've chosen to upload the model to the Hugging Face Hub, its repository should now look like so: -https://huggingface.co/anton-l/ddpm-butterflies-128 - -If you want to dive deeper into the code, we also have more advanced training scripts with features like Exponential Moving Average of model weights here: - -https://github.com/huggingface/diffusers/tree/main/examples -""" diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py index 9e4dc2ee0627..a41ba49cb156 100644 --- a/src/diffusers/schedulers/scheduling_ddim.py +++ b/src/diffusers/schedulers/scheduling_ddim.py @@ -17,7 +17,7 @@ import math from dataclasses import dataclass -from typing import Optional, Tuple, Union, Literal +from typing import Literal, Optional, Tuple, Union import numpy as np import torch @@ -42,8 +42,8 @@ def expand_to_shape(input, timesteps, shape, device): # Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM class DDIMSchedulerOutput(BaseOutput): """ - Output class for the scheduler's step function output. Args: + Output class for the scheduler's step function output. prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the denoising loop. @@ -58,13 +58,12 @@ class DDIMSchedulerOutput(BaseOutput): def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor: """ - Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of - (1-beta) over time from t = [0,1]. - Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up - to that part of the diffusion process. Args: - num_diffusion_timesteps (`int`): the number of betas to produce. - max_beta (`float`): the maximum beta to use; use values lower than 1 to + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. Contains a function alpha_bar that takes an argument t and transforms it to the + cumulative product of (1-beta) up to that part of the diffusion process. + num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; + use values lower than 1 to prevent singularities. Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs @@ -95,18 +94,15 @@ def t_to_alpha_sigma(num_diffusion_timesteps): class DDIMScheduler(SchedulerMixin, ConfigMixin): """ - Denoising diffusion implicit models is a scheduler that extends the denoising procedure introduced in denoising - diffusion probabilistic models (DDPMs) with non-Markovian guidance. - [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` - function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. - [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and - [`~ConfigMixin.from_config`] functions. - For more details, see the original paper: https://arxiv.org/abs/2010.02502 Args: - num_train_timesteps (`int`): number of diffusion steps used to train the model. 
@@ -95,18 +94,15 @@ def t_to_alpha_sigma(num_diffusion_timesteps):
 class DDIMScheduler(SchedulerMixin, ConfigMixin):
     """
-    Denoising diffusion implicit models is a scheduler that extends the denoising procedure introduced in denoising
-    diffusion probabilistic models (DDPMs) with non-Markovian guidance.
-    [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
-    function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
-    [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
-    [`~ConfigMixin.from_config`] functions.
-    For more details, see the original paper: https://arxiv.org/abs/2010.02502
     Args:
-        num_train_timesteps (`int`): number of diffusion steps used to train the model.
-        beta_start (`float`): the starting `beta` value of inference.
-        beta_end (`float`): the final `beta` value.
-        beta_schedule (`str`):
+    Denoising diffusion implicit models is a scheduler that extends the denoising procedure introduced in denoising
+    diffusion probabilistic models (DDPMs) with non-Markovian guidance. [`~ConfigMixin`] takes care of storing all
+    config attributes that are passed in the scheduler's `__init__` function, such as `num_train_timesteps`. They can
+    be accessed via `scheduler.config.num_train_timesteps`. [`~ConfigMixin`] also provides general loading and saving
+    functionality via the [`~ConfigMixin.save_config`] and [`~ConfigMixin.from_config`] functions. For more details,
+    see the original paper: https://arxiv.org/abs/2010.02502
+    num_train_timesteps (`int`): number of diffusion steps used to train the model. beta_start (`float`): the
+    starting `beta` value of inference. beta_end (`float`): the final `beta` value. beta_schedule (`str`):
         the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
         `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
     trained_betas (`np.ndarray`, optional):
@@ -186,11 +182,10 @@ def __init__(
 
     def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor:
         """
+        Args:
         Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
         current timestep.
-        Args:
-            sample (`torch.FloatTensor`): input sample
-            timestep (`int`, optional): current timestep
+            sample (`torch.FloatTensor`): input sample timestep (`int`, optional): current timestep
         Returns:
             `torch.FloatTensor`: scaled input sample
         """
@@ -203,7 +198,6 @@ def _get_variance(self, timestep, prev_timestep, eta=0):
         beta_prod_t_prev = 1 - alpha_prod_t_prev
 
         if self.variance_type == "fixed":
-
             variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
         elif self.variance_type == "v_diffusion":
             # If eta > 0, adjust the scaling factor for the predicted noise
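The `fixed` branch above is the standard DDIM posterior variance. A worked instance with illustrative numbers — the `alphas_cumprod` table here is made up, not the scheduler's actual schedule:

    import torch

    alphas_cumprod = torch.linspace(0.9999, 0.01, 1000)  # hypothetical cumulative alphas
    timestep, prev_timestep = 500, 480
    alpha_prod_t = alphas_cumprod[timestep]
    alpha_prod_t_prev = alphas_cumprod[prev_timestep]
    beta_prod_t, beta_prod_t_prev = 1 - alpha_prod_t, 1 - alpha_prod_t_prev

    # the "fixed" DDIM variance, exactly as in _get_variance above
    variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
    print(variance)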
@@ -224,8 +218,8 @@ def _get_variance(self, timestep, prev_timestep, eta=0):
 
     def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
         """
-        Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
         Args:
+        Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
             num_inference_steps (`int`):
                 the number of diffusion steps used when generating samples with a pre-trained model.
         """
@@ -249,24 +243,23 @@ def step(
         return_dict: bool = True,
     ) -> Union[DDIMSchedulerOutput, Tuple]:
         """
+        Args:
         Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
         process from the learned model outputs (most often the predicted noise).
-        Args:
-            model_output (`torch.FloatTensor`): direct output from learned diffusion model.
-            timestep (`int`): current discrete timestep in the diffusion chain.
-            sample (`torch.FloatTensor`):
+            model_output (`torch.FloatTensor`): direct output from learned diffusion model. timestep (`int`): current
+            discrete timestep in the diffusion chain. sample (`torch.FloatTensor`):
                 current instance of sample being created by diffusion process.
             prediction_type (`str`):
                 prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
                 process), `sample` (directly predicting the noisy sample), or `v` (see section 2.4
                 https://imagen.research.google/video/paper.pdf)
-            eta (`float`): weight of noise for added noise in diffusion step.
-            use_clipped_model_output (`bool`): if `True`, compute "corrected" `model_output` from the clipped
+            eta (`float`): weight of noise for added noise in diffusion step. use_clipped_model_output (`bool`): if
+            `True`, compute "corrected" `model_output` from the clipped
                 predicted original sample. Necessary because predicted original sample is clipped to [-1, 1] when
                 `self.config.clip_sample` is `True`. If no clipping has happened, "corrected" `model_output` would
                 coincide with the one provided as input and `use_clipped_model_output` will have no effect.
-            generator: random number generator.
-            variance_noise (`torch.FloatTensor`): instead of generating noise for the variance using `generator`, we
+            generator: random number generator. variance_noise (`torch.FloatTensor`): instead of generating noise for
+            the variance using `generator`, we
                 can directly provide the noise for the variance itself. This is useful for methods such as
                 CycleDiffusion. (https://arxiv.org/abs/2210.05559)
             return_dict (`bool`): option for returning tuple rather than DDIMSchedulerOutput class
@@ -362,10 +355,6 @@ def step(
             variance = self._get_variance(timestep, prev_timestep) ** (0.5) * eta * variance_noise
 
             prev_sample = prev_sample + variance
-
-            prev_sample = prev_sample + variance
-        else:
-            prev_sample = prev_sample + variance * noise
 
         if not return_dict:
             return (prev_sample,)
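That completes the functional part of the series for the DDIM scheduler; the patches that follow only repair docstrings mangled by the formatter. A sketch of how the new code path would be exercised — `scheduler` is assumed to be a scheduler built from this branch, `model` a trained UNet, the 64x64 shapes are placeholders, and `model(sample, t).sample` assumes a UNet2DModel-style output object:

    import torch

    scheduler.set_timesteps(50)
    sample = torch.randn(1, 3, 64, 64) * scheduler.init_noise_sigma
    for t in scheduler.timesteps:
        with torch.no_grad():
            model_output = model(sample, t).sample  # predicted v
        # step() converts v back to x_0 and eps internally, then forms x_{t-1}
        sample = scheduler.step(model_output, t, sample, prediction_type="v").prev_sample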
From 7117ff9abefc3d1107d3022685d1d5630e55b2cb Mon Sep 17 00:00:00 2001
From: Ben Glickenhaus
Date: Thu, 17 Nov 2022 09:13:27 -0500
Subject: [PATCH 12/18] try to fix broken doc strings

---
 src/diffusers/schedulers/scheduling_ddim.py | 58 ++++++++++++---------
 1 file changed, 33 insertions(+), 25 deletions(-)

diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py
index a41ba49cb156..08fcbb20abb4 100644
--- a/src/diffusers/schedulers/scheduling_ddim.py
+++ b/src/diffusers/schedulers/scheduling_ddim.py
@@ -42,8 +42,11 @@ def expand_to_shape(input, timesteps, shape, device):
 # Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM
 class DDIMSchedulerOutput(BaseOutput):
     """
-    Args:
     Output class for the scheduler's step function output.
+
+
+    Args:
+
         prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
             Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
             denoising loop.
@@ -58,13 +61,14 @@ class DDIMSchedulerOutput(BaseOutput):
 def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor:
     """
-    Args:
     Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
-    (1-beta) over time from t = [0,1]. Contains a function alpha_bar that takes an argument t and transforms it to the
-    cumulative product of (1-beta) up to that part of the diffusion process.
-    num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use;
-    use values lower than 1 to
-        prevent singularities.
+    (1-beta) over time from t = [0,1].
+
+    Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
+    to that part of the diffusion process.
+    Args:
+        num_diffusion_timesteps (`int`): the number of betas to produce.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+        prevent singularities.
     Returns:
         betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
@@ -94,21 +98,21 @@ def t_to_alpha_sigma(num_diffusion_timesteps):
 class DDIMScheduler(SchedulerMixin, ConfigMixin):
     """
-    Args:
     Denoising diffusion implicit models is a scheduler that extends the denoising procedure introduced in denoising
     diffusion probabilistic models (DDPMs) with non-Markovian guidance. [`~ConfigMixin`] takes care of storing all
     config attributes that are passed in the scheduler's `__init__` function, such as `num_train_timesteps`. They can
     be accessed via `scheduler.config.num_train_timesteps`. [`~ConfigMixin`] also provides general loading and saving
     functionality via the [`~ConfigMixin.save_config`] and [`~ConfigMixin.from_config`] functions. For more details,
     see the original paper: https://arxiv.org/abs/2010.02502
-    num_train_timesteps (`int`): number of diffusion steps used to train the model. beta_start (`float`): the
-    starting `beta` value of inference. beta_end (`float`): the final `beta` value. beta_schedule (`str`):
-        the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
-        `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
-    trained_betas (`np.ndarray`, optional):
-        option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
-    clip_sample (`bool`, default `True`):
-        option to clip predicted sample between -1 and 1 for numerical stability.
+
+
+    Args:
+        num_train_timesteps (`int`): number of diffusion steps used to train the model.
+        beta_start (`float`): the starting `beta` value of inference.
+        beta_end (`float`): the final `beta` value.
+        beta_schedule (`str`): the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        trained_betas (`np.ndarray`, optional): option to pass an array of betas directly to the constructor to bypass`beta_start`, `beta_end` etc.
+        clip_sample (`bool`, default `True`): option to clip predicted sample between -1 and 1 for numerical stability.
     set_alpha_to_one (`bool`, default `True`):
         each diffusion step uses the value of alphas product at that step and at the previous one. For the final step
         there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
@@ -182,10 +186,13 @@ def __init__(
 
     def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor:
         """
-        Args:
         Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
         current timestep.
-            sample (`torch.FloatTensor`): input sample timestep (`int`, optional): current timestep
+
+
+        Args:
+            sample (`torch.FloatTensor`): input sample
+            timestep (`int`, optional): current timestep
         Returns:
             `torch.FloatTensor`: scaled input sample
         """
@@ -211,17 +218,16 @@ def _get_variance(self, timestep, prev_timestep, eta=0):
             numerator = 0
             denominator = (1 - self.alphas[timestep] ** 2 / alpha_prev**2).clamp(min=1.0e-7).sqrt()
             ddim_sigma = (numerator * denominator).clamp(min=1.0e-7)
-            variance = (sigma_prev**2 - ddim_sigma**2).sqrt()
-            if torch.isnan(variance):
-                variance = 0
+            variance = (sigma_prev**2 - ddim_sigma**2).clamp(min=1.0e-7).sqrt()
         return variance
 
     def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
         """
-        Args:
         Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
-            num_inference_steps (`int`):
-                the number of diffusion steps used when generating samples with a pre-trained model.
+
+
+        Args:
+            num_inference_steps (`int`): the number of diffusion steps used when generating samples with a pre-trained model.
         """
         self.num_inference_steps = num_inference_steps
         step_ratio = self.config.num_train_timesteps // self.num_inference_steps
@@ -243,9 +249,11 @@ def step(
         return_dict: bool = True,
     ) -> Union[DDIMSchedulerOutput, Tuple]:
         """
-        Args:
         Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
         process from the learned model outputs (most often the predicted noise).
+
+
+        Args:
             model_output (`torch.FloatTensor`): direct output from learned diffusion model. timestep (`int`): current
             discrete timestep in the diffusion chain. sample (`torch.FloatTensor`):
                 current instance of sample being created by diffusion process.
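An aside on the `_get_variance` change in this patch: clamping before the square root replaces the earlier `isnan` check, since float round-off can push the radicand slightly negative. A two-line illustration with an arbitrary value:

    import torch

    x = torch.tensor(-1e-12)        # tiny negative value from round-off
    print(x.sqrt())                 # tensor(nan) -- what the old isnan check caught
    print(x.clamp(min=1.0e-7).sqrt())  # tensor(0.0003) -- finite, effectively zero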
From de0c55841ddb013e8bc6a55f69fd183a948f3b32 Mon Sep 17 00:00:00 2001
From: Ben Glickenhaus
Date: Thu, 17 Nov 2022 09:20:06 -0500
Subject: [PATCH 13/18] second pass

---
 src/diffusers/schedulers/scheduling_ddim.py | 44 ++++++++++++---------
 1 file changed, 25 insertions(+), 19 deletions(-)

diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py
index 08fcbb20abb4..d7adcd3c0629 100644
--- a/src/diffusers/schedulers/scheduling_ddim.py
+++ b/src/diffusers/schedulers/scheduling_ddim.py
@@ -44,9 +44,7 @@ class DDIMSchedulerOutput(BaseOutput):
     """
     Output class for the scheduler's step function output.
 
-
     Args:
-
         prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
             Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
             denoising loop.
@@ -66,9 +64,11 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor
     Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
     to that part of the diffusion process.
+
     Args:
         num_diffusion_timesteps (`int`): the number of betas to produce.
-        max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities.
+        max_beta (`float`): the maximum beta to use; use values lower than 1 to
+        prevent singularities.
     Returns:
         betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
@@ -99,20 +99,25 @@ def t_to_alpha_sigma(num_diffusion_timesteps):
 class DDIMScheduler(SchedulerMixin, ConfigMixin):
     """
     Denoising diffusion implicit models is a scheduler that extends the denoising procedure introduced in denoising
-    diffusion probabilistic models (DDPMs) with non-Markovian guidance. [`~ConfigMixin`] takes care of storing all
-    config attributes that are passed in the scheduler's `__init__` function, such as `num_train_timesteps`. They can
-    be accessed via `scheduler.config.num_train_timesteps`. [`~ConfigMixin`] also provides general loading and saving
-    functionality via the [`~ConfigMixin.save_config`] and [`~ConfigMixin.from_config`] functions. For more details,
-    see the original paper: https://arxiv.org/abs/2010.02502
+    diffusion probabilistic models (DDPMs) with non-Markovian guidance.
+
+    [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__`
+    function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`.
+    [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
+    [`~ConfigMixin.from_config`] functions.
 
     Args:
         num_train_timesteps (`int`): number of diffusion steps used to train the model.
         beta_start (`float`): the starting `beta` value of inference.
         beta_end (`float`): the final `beta` value.
-        beta_schedule (`str`): the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
-        trained_betas (`np.ndarray`, optional): option to pass an array of betas directly to the constructor to bypass`beta_start`, `beta_end` etc.
-        clip_sample (`bool`, default `True`): option to clip predicted sample between -1 and 1 for numerical stability.
+        beta_schedule (`str`):
+            the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
+            `linear`, `scaled_linear`, or `squaredcos_cap_v2`.
+        trained_betas (`np.ndarray`, optional):
+            option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc.
+        clip_sample (`bool`, default `True`):
+            option to clip predicted sample between -1 and 1 for numerical stability.
     set_alpha_to_one (`bool`, default `True`):
         each diffusion step uses the value of alphas product at that step and at the previous one. For the final step
         there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`,
@@ -232,7 +232,8 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic
 
 
         Args:
-            num_inference_steps (`int`): the number of diffusion steps used when generating samples with a pre-trained model.
+            num_inference_steps (`int`):
+                the number of diffusion steps used when generating samples with a pre-trained model.
         """
         self.num_inference_steps = num_inference_steps
         step_ratio = self.config.num_train_timesteps // self.num_inference_steps
@@ -254,20 +260,20 @@ def step(
 
 
         Args:
-            model_output (`torch.FloatTensor`): direct output from learned diffusion model. timestep (`int`): current
-            discrete timestep in the diffusion chain. sample (`torch.FloatTensor`):
-                current instance of sample being created by diffusion process.
+            model_output (`torch.FloatTensor`): direct output from learned diffusion model.
+            timestep (`int`): current discrete timestep in the diffusion chain.
+            sample (`torch.FloatTensor`): current instance of sample being created by diffusion process.
             prediction_type (`str`):
                 prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
                 process), `sample` (directly predicting the noisy sample), or `v` (see section 2.4
                 https://imagen.research.google/video/paper.pdf)
-            eta (`float`): weight of noise for added noise in diffusion step. use_clipped_model_output (`bool`): if
-            `True`, compute "corrected" `model_output` from the clipped
+            eta (`float`): weight of noise for added noise in diffusion step.
+            use_clipped_model_output (`bool`): if `True`, compute "corrected" `model_output` from the clipped
                 predicted original sample. Necessary because predicted original sample is clipped to [-1, 1] when
                 `self.config.clip_sample` is `True`. If no clipping has happened, "corrected" `model_output` would
                 coincide with the one provided as input and `use_clipped_model_output` will have no effect.
-            generator: random number generator. variance_noise (`torch.FloatTensor`): instead of generating noise for
-            the variance using `generator`, we
+            generator: random number generator.
+            variance_noise (`torch.FloatTensor`): instead of generating noise for the variance using `generator`, we
                 can directly provide the noise for the variance itself. This is useful for methods such as
                 CycleDiffusion. (https://arxiv.org/abs/2210.05559)
             return_dict (`bool`): option for returning tuple rather than DDIMSchedulerOutput class
From 39ccf3262f26f7face0db818966dd9bb0d629299 Mon Sep 17 00:00:00 2001
From: Ben Glickenhaus
Date: Thu, 17 Nov 2022 09:24:10 -0500
Subject: [PATCH 14/18] one more

---
 src/diffusers/schedulers/scheduling_ddim.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py
index d7adcd3c0629..66968f34575d 100644
--- a/src/diffusers/schedulers/scheduling_ddim.py
+++ b/src/diffusers/schedulers/scheduling_ddim.py
@@ -44,6 +44,7 @@ class DDIMSchedulerOutput(BaseOutput):
     """
     Output class for the scheduler's step function output.
 
+
     Args:
         prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
             Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
@@ -106,6 +107,7 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
     [`~ConfigMixin`] also provides general loading and saving functionality via the [`~ConfigMixin.save_config`] and
     [`~ConfigMixin.from_config`] functions.
 
+    For more details, see the original paper: https://arxiv.org/abs/2010.02502
 
     Args:
         num_train_timesteps (`int`): number of diffusion steps used to train the model.
@@ -126,6 +128,7 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
         an offset added to the inference steps. You can use a combination of `offset=1` and
         `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in
         stable diffusion.
+
     """
 
     _compatible_classes = [
@@ -262,7 +265,8 @@ def step(
         Args:
             model_output (`torch.FloatTensor`): direct output from learned diffusion model.
             timestep (`int`): current discrete timestep in the diffusion chain.
-            sample (`torch.FloatTensor`): current instance of sample being created by diffusion process.
+            sample (`torch.FloatTensor`):
+                current instance of sample being created by diffusion process.
             prediction_type (`str`):
                 prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
                 process), `sample` (directly predicting the noisy sample), or `v` (see section 2.4
                 https://imagen.research.google/video/paper.pdf)
From 4b79f209bfc701cbe35bcfd5455e0b965f1bcb1d Mon Sep 17 00:00:00 2001
From: Ben Glickenhaus
Date: Thu, 17 Nov 2022 09:25:58 -0500
Subject: [PATCH 15/18] white space

---
 src/diffusers/schedulers/scheduling_ddim.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py
index 66968f34575d..691ee3c0bad4 100644
--- a/src/diffusers/schedulers/scheduling_ddim.py
+++ b/src/diffusers/schedulers/scheduling_ddim.py
@@ -66,10 +66,12 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor
 
     Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
     to that part of the diffusion process.
+
     Args:
         num_diffusion_timesteps (`int`): the number of betas to produce.
         max_beta (`float`): the maximum beta to use; use values lower than 1 to
-        prevent singularities.
+        prevent singularities.
+
     Returns:
         betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
@@ -281,10 +283,12 @@ def step(
             can directly provide the noise for the variance itself. This is useful for methods such as
             CycleDiffusion. (https://arxiv.org/abs/2210.05559)
         return_dict (`bool`): option for returning tuple rather than DDIMSchedulerOutput class
+
         Returns:
             [`~schedulers.scheduling_utils.DDIMSchedulerOutput`] or `tuple`:
             [`~schedulers.scheduling_utils.DDIMSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When
             returning a tuple, the first element is the sample tensor.
+
         """
         if self.num_inference_steps is None:
             raise ValueError(

From dbf206baef85dbebe5d30f1a695f0c532d52c2d3 Mon Sep 17 00:00:00 2001
From: Nathan Lambert
Date: Thu, 17 Nov 2022 10:09:18 -0800
Subject: [PATCH 16/18] Update src/diffusers/schedulers/scheduling_ddim.py

---
 src/diffusers/schedulers/scheduling_ddim.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py
index 691ee3c0bad4..1f33957450bf 100644
--- a/src/diffusers/schedulers/scheduling_ddim.py
+++ b/src/diffusers/schedulers/scheduling_ddim.py
@@ -44,7 +44,6 @@ class DDIMSchedulerOutput(BaseOutput):
     """
     Output class for the scheduler's step function output.
 
-
     Args:
         prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
            Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the
From 5fbbe50148a8f2e026349d782bab354bc370b8d0 Mon Sep 17 00:00:00 2001
From: Nathan Lambert
Date: Thu, 17 Nov 2022 10:22:29 -0800
Subject: [PATCH 17/18] remove extra lines

---
 src/diffusers/schedulers/scheduling_ddim.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py
index 1f33957450bf..0cf8782292dd 100644
--- a/src/diffusers/schedulers/scheduling_ddim.py
+++ b/src/diffusers/schedulers/scheduling_ddim.py
@@ -198,10 +198,10 @@ def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] =
         Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
         current timestep.
-
         Args:
             sample (`torch.FloatTensor`): input sample
             timestep (`int`, optional): current timestep
+
         Returns:
             `torch.FloatTensor`: scaled input sample
         """
@@ -218,7 +218,6 @@ def _get_variance(self, timestep, prev_timestep, eta=0):
         elif self.variance_type == "v_diffusion":
             # If eta > 0, adjust the scaling factor for the predicted noise
             # downward according to the amount of additional noise to add
-            # variance = torch.log(self.betas[timestep] * (1 - alpha_prod_t_prev) / (1 - alpha_prod_t))
             alpha_prev = self.alphas[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
             sigma_prev = self.sigmas[prev_timestep] if prev_timestep >= 0 else self.final_sigma
             if eta:
@@ -234,7 +233,6 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic
         """
         Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.
-
         Args:
             num_inference_steps (`int`):
                 the number of diffusion steps used when generating samples with a pre-trained model.
@@ -262,7 +260,6 @@ def step(
         Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
         process from the learned model outputs (most often the predicted noise).
-
         Args:
             model_output (`torch.FloatTensor`): direct output from learned diffusion model.
             timestep (`int`): current discrete timestep in the diffusion chain.

From 5812e313298c76e3b64a85c37c982ac44b7bd00b Mon Sep 17 00:00:00 2001
From: Nathan Lambert
Date: Thu, 17 Nov 2022 10:23:04 -0800
Subject: [PATCH 18/18] Update src/diffusers/schedulers/scheduling_ddim.py

---
 src/diffusers/schedulers/scheduling_ddim.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py
index 0cf8782292dd..89d90ba60ad4 100644
--- a/src/diffusers/schedulers/scheduling_ddim.py
+++ b/src/diffusers/schedulers/scheduling_ddim.py
@@ -201,7 +201,6 @@ def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] =
         Args:
             sample (`torch.FloatTensor`): input sample
             timestep (`int`, optional): current timestep
-
         Returns:
             `torch.FloatTensor`: scaled input sample
         """
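As a closing sanity check on the parameterization this series implements: with alpha_t^2 + sigma_t^2 = 1, the definitions x_t = alpha*x_0 + sigma*eps and v = alpha*eps - sigma*x_0 invert exactly, which is what lets step() recover both x_0 and eps from a single v prediction. A standalone numerical verification — these are the standard identities from the v-prediction literature, not code from the patch:

    import torch

    a, s = torch.tensor(0.8), torch.tensor(0.6)  # alpha_t, sigma_t with a**2 + s**2 == 1
    x0, eps = torch.randn(4), torch.randn(4)
    xt = a * x0 + s * eps                        # forward (noising) process
    v = a * eps - s * x0                         # v-prediction target

    assert torch.allclose(a * xt - s * v, x0, atol=1e-6)   # recover x_0
    assert torch.allclose(s * xt + a * v, eps, atol=1e-6)  # recover eps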