@@ -1009,7 +1009,10 @@ static void sample_k_diffusion(sample_method_t method,
10091009 case DDIM_TRAILING: // Denoising Diffusion Implicit Models
10101010 // with the "trailing" timestep spacing
10111011 {
1012- // DDIM itself needs alphas_cumprod (DDPM, Ho et al.,
1012+ // See J. Song et al., "Denoising Diffusion Implicit
1013+ // Models", arXiv:2010.02502 [cs.LG]
1014+ //
1015+ // DDIM itself needs alphas_cumprod (DDPM, J. Ho et al.,
10131016 // arXiv:2006.11239 [cs.LG] with k-diffusion's start and
10141017 // end beta) (which unfortunately k-diffusion's data
10151018 // structure hides from the denoiser), and the sigmas are
@@ -1045,9 +1048,8 @@ static void sample_k_diffusion(sample_method_t method,
10451048 // are Flawed", arXiv:2305.08891 [cs], p. 4, Table
10461049 // 2. Most variables below follow Diffusers naming
10471050 //
1048- // Diffuser naming vs. J. Song et al., "Denoising
1049- // Diffusion Implicit Models", arXiv:2010.02502, p. 5,
1050- // (12) and p. 16, (16) (<variable name> -> <name in
1051+ // Diffusers naming vs. Song et al. (2020), p. 5, (12)
1052+ // and p. 16, (16) (<variable name> -> <name in
10511053 // paper>):
10521054 //
10531055 // - pred_noise_t -> epsilon_theta^(t)(x_t)
@@ -1100,9 +1102,8 @@ static void sample_k_diffusion(sample_method_t method,
11001102 }
11011103 // Note (also noise_pred in Diffusers' pipeline)
11021104 // model_output = model() is the D(x, sigma) as
1103- // defined in T. Karras et al., arXiv:2206.00364,
1104- // p. 3, Table 1 and p. 8 (7), compare also p. 38
1105- // (226) therein.
1105+ // defined in Karras et al. (2022), p. 3, Table 1 and
1106+ // p. 8 (7), compare also p. 38 (226) therein.
11061107 struct ggml_tensor * model_output =
11071108 model (x, sigma, i + 1 );
11081109 // Here model_output is still the k-diffusion denoiser
@@ -1202,6 +1203,10 @@ static void sample_k_diffusion(sample_method_t method,
12021203 case TCD: // Strategic Stochastic Sampling (Algorithm 4) in
12031204 // Trajectory Consistency Distillation
12041205 {
1206+ // See J. Zheng et al., "Trajectory Consistency
1207+ // Distillation: Improved Latent Consistency Distillation
1208+ // by Semi-Linear Consistency Function with Trajectory
1209+ // Mapping", arXiv:2402.19159 [cs.CV]
12051210 float beta_start = 0 .00085f ;
12061211 float beta_end = 0 .0120f ;
12071212 std::vector<double > alphas_cumprod;
@@ -1238,7 +1243,9 @@ static void sample_k_diffusion(sample_method_t method,
12381243 (int )floor ((i + 1 ) *
12391244 ((float )original_steps / steps));
12401245 // Here timestep_s is tau_n' in Algorithm 4. The _s
1241- // notation appears to be that from DPM-Solver, C. Lu,
1246+ // notation appears to be that from C. Lu et al.,
1247+ // "DPM-Solver: A Fast ODE Solver for Diffusion
1248+ // Probabilistic Model Sampling in Around 10 Steps",
12421249 // arXiv:2206.00927 [cs.LG], but this notation is not
12431250 // continued in Algorithm 4, where _n' is used.
12441251 int timestep_s =
@@ -1315,12 +1322,12 @@ static void sample_k_diffusion(sample_method_t method,
13151322 }
13161323 }
13171324 // This consistency function step can be difficult to
1318- // decipher from Algorithm 4, as it involves a
1319- // difficult notation ("|->"). In Diffusers it is
1320- // borrowed verbatim (with the same comments below for
1321- // step (4)) from LCMScheduler's noise injection step,
1322- // compare in S. Luo et al., arXiv:2310.04378 p. 14,
1323- // Algorithm 3 .
1325+ // decipher from Algorithm 4, as it is simply stated
1326+ // using a consistency function. This step is the
1327+ // modified DDIM, i.e. p. 8 (32) in Zheng et
1328+ // al. (2024), with eta set to 0 (see the paragraph
1329+ // immediately thereafter that states this somewhat
1330+ // obliquely) .
13241331 {
13251332 float * vec_pred_original_sample =
13261333 (float *)pred_original_sample->data ;
0 commit comments