@@ -111,6 +111,8 @@ class StableDiffusionGGML {
111111 bool is_using_v_parameterization = false ;
112112 bool is_using_edm_v_parameterization = false ;
113113
114+ bool has_vision = false ;
115+
114116 std::map<std::string, struct ggml_tensor *> tensors;
115117
116118 std::string lora_model_dir;
@@ -412,6 +414,7 @@ class StableDiffusionGGML {
412414 if (!vae_decode_only) {
413415 enable_vision = true ;
414416 }
417+ has_vision = enable_vision;
415418 cond_stage_model = std::make_shared<Qwen2_5_VLCLIPEmbedder>(clip_backend,
416419 offload_params_to_cpu,
417420 model_loader.tensor_storages_types ,
@@ -1141,7 +1144,8 @@ class StableDiffusionGGML {
11411144 float img_cfg_scale = std::isfinite (guidance.img_cfg ) ? guidance.img_cfg : guidance.txt_cfg ;
11421145 float slg_scale = guidance.slg .scale ;
11431146
1144- if (img_cfg_scale != 1.0 && !sd_version_is_inpaint_or_unet_edit (version)) {
1147+ if (img_cfg_scale != 1.0 && !sd_version_is_inpaint_or_unet_edit (version)
1148+ && (version != VERSION_FLUX || ref_latents.size ()==0 ) && (version != VERSION_QWEN_IMAGE || ref_latents.size ()==0 )) {
11451149 LOG_WARN (" 2-conditioning CFG is not supported with this model, disabling it for better performance..." );
11461150 img_cfg_scale = 1 .0f ;
11471151 }
@@ -1155,7 +1159,7 @@ class StableDiffusionGGML {
11551159 }
11561160
11571161 struct ggml_tensor * noised_input = ggml_dup_tensor (work_ctx, x);
1158-
1162+
11591163 bool has_skiplayer = slg_scale != 0.0 && skip_layers.size () > 0 ;
11601164 bool has_conditionned = (has_skiplayer || cfg_scale != 0.0 ) && cond.c_crossattn != nullptr ;
11611165 bool has_unconditioned = cfg_scale != img_cfg_scale && uncond.c_crossattn != nullptr ;
@@ -1284,9 +1288,10 @@ class StableDiffusionGGML {
12841288
12851289 float * img_uncond_data = nullptr ;
12861290 if (has_img_uncond) {
1287- diffusion_params.context = img_uncond.c_crossattn ;
1288- diffusion_params.c_concat = img_uncond.c_concat ;
1289- diffusion_params.y = img_uncond.c_vector ;
1291+ diffusion_params.ref_latents = {};
1292+ diffusion_params.context = img_uncond.c_crossattn ;
1293+ diffusion_params.c_concat = img_uncond.c_concat ;
1294+ diffusion_params.y = img_uncond.c_vector ;
12901295 work_diffusion_model->compute (n_threads,
12911296 diffusion_params,
12921297 &out_img_cond);
@@ -2270,6 +2275,15 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
22702275 sd_ctx->sd ->n_threads ,
22712276 condition_params);
22722277 }
2278+ SDCondition img_uncond = uncond;
2279+ if (uncond.c_crossattn != nullptr && guidance.img_cfg != 1.0 && sd_ctx->sd ->has_vision && condition_params.ref_images .size () > 0 ) {
2280+ // Recompute negative conditioning without ref images
2281+ condition_params.ref_images = {};
2282+ img_uncond = sd_ctx->sd ->cond_stage_model ->get_learned_condition (work_ctx,
2283+ sd_ctx->sd ->n_threads ,
2284+ condition_params);
2285+ }
2286+
22732287 int64_t t1 = ggml_time_ms ();
22742288 LOG_INFO (" get_learned_condition completed, taking %" PRId64 " ms" , t1 - t0);
22752289
@@ -2377,10 +2391,9 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
23772391 }
23782392 uncond.c_concat = cond.c_concat ;
23792393 }
2380- SDCondition img_uncond = uncond;
23812394 if (uncond.c_crossattn != nullptr &&
23822395 (sd_version_is_inpaint_or_unet_edit (sd_ctx->sd ->version ) && guidance.img_cfg != 1.0 )) {
2383- img_uncond = SDCondition (uncond. c_crossattn , uncond. c_vector , empty_latent) ;
2396+ img_uncond. c_concat = empty_latent;
23842397 }
23852398 for (int b = 0 ; b < batch_count; b++) {
23862399 int64_t sampling_start = ggml_time_ms ();
0 commit comments