Commit 4c68504

add ddim

1 parent 3eb2593

2 files changed (+41, -24)

src/diffusers/schedulers/scheduling_ddim.py (24 additions, 5 deletions)

```diff
@@ -145,6 +145,7 @@ def __init__(
 
         self.alphas = 1.0 - self.betas
         self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+        self.sigmas = 1 - self.alphas**2
 
         # At every step in ddim, we are looking into the previous alphas_cumprod
         # For the final step, there is no previous alphas_cumprod because we are already at 0
@@ -209,6 +210,7 @@ def step(
         model_output: torch.FloatTensor,
         timestep: int,
         sample: torch.FloatTensor,
+        prediction_type: str = "epsilon",
         eta: float = 0.0,
         use_clipped_model_output: bool = False,
         generator=None,
@@ -223,6 +225,10 @@ def step(
             timestep (`int`): current discrete timestep in the diffusion chain.
             sample (`torch.FloatTensor`):
                 current instance of sample being created by diffusion process.
+            prediction_type (`str`):
+                prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion
+                process), `sample` (directly predicting the noisy sample), or `v` (see section 2.4
+                https://imagen.research.google/video/paper.pdf)
             eta (`float`): weight of noise for added noise in diffusion step.
             use_clipped_model_output (`bool`): TODO
             generator: random number generator.
@@ -243,14 +249,14 @@ def step(
         # Ideally, read DDIM paper in-detail understanding
 
         # Notation (<variable name> -> <name in paper>
-        # - pred_noise_t -> e_theta(x_t, t)
-        # - pred_original_sample -> f_theta(x_t, t) or x_0
+        # - pred_noise_t -> e_theta(x_t, timestep)
+        # - pred_original_sample -> f_theta(x_t, timestep) or x_0
         # - std_dev_t -> sigma_t
         # - eta -> η
         # - pred_sample_direction -> "direction pointing to x_t"
         # - pred_prev_sample -> "x_t-1"
 
-        # 1. get previous step value (=t-1)
+        # 1. get previous step value (=timestep-1)
         prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps
 
         # 2. compute alphas, betas
@@ -261,7 +267,20 @@ def step(
 
         # 3. compute predicted original sample from predicted noise also called
         # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
-        pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
+        if prediction_type == "epsilon":
+            pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
+            eps = torch.tensor(1)
+        elif prediction_type == "sample":
+            pred_original_sample = model_output
+            eps = torch.tensor(1)
+        elif prediction_type == "v":
+            # v_t = alpha_t * epsilon - sigma_t * x
+            # need to merge the PRs for sigma to be available in DDPM
+            pred_original_sample = sample * self.alphas[timestep] - model_output * self.sigmas[timestep]
+            eps = model_output * self.alphas[timestep] - sample * self.sigmas[timestep]
+            raise NotImplementedError(f"v prediction not yet implemented for DDPM")
+        else:
+            raise ValueError(f"prediction_type given as {prediction_type} must be one of `epsilon`, `sample`, or `v`")
 
         # 4. Clip "predicted x_0"
         if self.config.clip_sample:
@@ -280,7 +299,7 @@ def step(
         pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * model_output
 
         # 7. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
-        prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction
+        prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + eps * pred_sample_direction
 
         if eta > 0:
             # randn_like does not support generator https://github.com/pytorch/pytorch/issues/27072
```
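For context, a minimal sketch of how the new `prediction_type` argument to `DDIMScheduler.step` would be exercised at this revision. The random `model_output` is a hypothetical stand-in for a real noise-predicting UNet, and the shapes are illustrative:

```python
import torch
from diffusers import DDIMScheduler

# Minimal sketch, assuming the step() signature added in this commit.
scheduler = DDIMScheduler(num_train_timesteps=1000)
scheduler.set_timesteps(50)

sample = torch.randn(1, 3, 32, 32)  # start from pure noise
for t in scheduler.timesteps:
    # Placeholder for a real model call such as unet(sample, t).sample
    model_output = torch.randn_like(sample)
    # "epsilon" keeps the previous behaviour: the model output is
    # interpreted as the predicted noise of the diffusion process.
    sample = scheduler.step(model_output, int(t), sample, prediction_type="epsilon").prev_sample
```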

src/diffusers/schedulers/scheduling_ddpm.py (17 additions, 19 deletions)

```diff
@@ -183,14 +183,14 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic
         )[::-1].copy()
         self.timesteps = torch.from_numpy(timesteps).to(device)
 
-    def _get_variance(self, t, predicted_variance=None, variance_type=None):
-        alpha_prod_t = self.alphas_cumprod[t]
-        alpha_prod_t_prev = self.alphas_cumprod[t - 1] if t > 0 else self.one
+    def _get_variance(self, timestep, predicted_variance=None, variance_type=None):
+        alpha_prod_t = self.alphas_cumprod[timestep]
+        alpha_prod_t_prev = self.alphas_cumprod[timestep - 1] if timestep > 0 else self.one
 
-        # For t > 0, compute predicted variance βt (see formula (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf)
+        # For timestep > 0, compute predicted variance βt (see formula (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf)
         # and sample from it to get previous sample
-        # x_{t-1} ~ N(pred_prev_sample, variance) == add variance to pred_sample
-        variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * self.betas[t]
+        # x_{timestep-1} ~ N(pred_prev_sample, variance) == add variance to pred_sample
+        variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * self.betas[timestep]
 
         if variance_type is None:
             variance_type = self.config.variance_type
@@ -202,15 +202,15 @@ def _get_variance(self, t, predicted_variance=None, variance_type=None):
         elif variance_type == "fixed_small_log":
             variance = torch.log(torch.clamp(variance, min=1e-20))
         elif variance_type == "fixed_large":
-            variance = self.betas[t]
+            variance = self.betas[timestep]
         elif variance_type == "fixed_large_log":
             # Glide max_log
-            variance = torch.log(self.betas[t])
+            variance = torch.log(self.betas[timestep])
         elif variance_type == "learned":
             return predicted_variance
         elif variance_type == "learned_range":
             min_log = variance
-            max_log = self.betas[t]
+            max_log = self.betas[timestep]
             frac = (predicted_variance + 1) / 2
             variance = frac * max_log + (1 - frac) * min_log
 
@@ -247,16 +247,14 @@ def step(
             returning a tuple, the first element is the sample tensor.
 
         """
-        t = timestep
-
         if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in ["learned", "learned_range"]:
             model_output, predicted_variance = torch.split(model_output, sample.shape[1], dim=1)
         else:
             predicted_variance = None
 
         # 1. compute alphas, betas
-        alpha_prod_t = self.alphas_cumprod[t]
-        alpha_prod_t_prev = self.alphas_cumprod[t - 1] if t > 0 else self.one
+        alpha_prod_t = self.alphas_cumprod[timestep]
+        alpha_prod_t_prev = self.alphas_cumprod[timestep - 1] if timestep > 0 else self.one
         beta_prod_t = 1 - alpha_prod_t
         beta_prod_t_prev = 1 - alpha_prod_t_prev
 
@@ -269,8 +267,8 @@ def step(
         elif prediction_type == "v":
             # v_t = alpha_t * epsilon - sigma_t * x
             # need to merge the PRs for sigma to be available in DDPM
-            pred_original_sample = sample * self.alphas[t] - model_output * self.sigmas[t]
-            eps = model_output * self.alphas[t] - sample * self.sigmas[t]
+            pred = sample * self.alphas[timestep] - model_output * self.sigmas[timestep]
+            eps = model_output * self.alphas[timestep] - sample * self.sigmas[timestep]
             raise NotImplementedError(f"v prediction not yet implemented for DDPM")
         else:
             raise ValueError(f"prediction_type given as {prediction_type} must be one of `epsilon`, `sample`, or `v`")
@@ -281,20 +279,20 @@ def step(
 
         # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t
         # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
-        pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * self.betas[t]) / beta_prod_t
-        current_sample_coeff = self.alphas[t] ** (0.5) * beta_prod_t_prev / beta_prod_t
+        pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * self.betas[timestep]) / beta_prod_t
+        current_sample_coeff = self.alphas[timestep] ** (0.5) * beta_prod_t_prev / beta_prod_t
 
         # 5. Compute predicted previous sample µ_t
         # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
         pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample
 
         # 6. Add noise
         variance = 0
-        if t > 0:
+        if timestep > 0:
             noise = torch.randn(
                 model_output.size(), dtype=model_output.dtype, layout=model_output.layout, generator=generator
             ).to(model_output.device)
-            variance = (self._get_variance(t, predicted_variance=predicted_variance) ** 0.5) * noise
+            variance = (self._get_variance(timestep, predicted_variance=predicted_variance) ** 0.5) * noise
 
         pred_prev_sample = pred_prev_sample + variance
 
```
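The `v` branch in both schedulers is still guarded by a `NotImplementedError`, and the `self.sigmas = 1 - self.alphas**2` added in `__init__` is per-step rather than cumulative. For reference only, a common formulation of v-prediction (Salimans & Ho, 2022, "Progressive Distillation for Fast Sampling of Diffusion Models") inverts v = alpha_t * eps - sigma_t * x_0 with alpha_t = sqrt(alphas_cumprod[t]) and sigma_t = sqrt(1 - alphas_cumprod[t]). The helper below is a hypothetical sketch of that algebra, not what this commit implements:

```python
import torch

def v_to_x0_and_eps(v: torch.Tensor, x_t: torch.Tensor,
                    alphas_cumprod: torch.Tensor, t: int):
    # Hypothetical helper, not part of this commit: recovers x_0 and epsilon
    # from a v-prediction using v = alpha_t * eps - sigma_t * x_0 together
    # with the forward process x_t = alpha_t * x_0 + sigma_t * eps.
    alpha_t = alphas_cumprod[t] ** 0.5
    sigma_t = (1 - alphas_cumprod[t]) ** 0.5
    x_0 = alpha_t * x_t - sigma_t * v
    eps = alpha_t * v + sigma_t * x_t
    return x_0, eps
```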
