Skip to content

Commit c12989d

Browse files
committed
support img_cfg for qwen image edit (and flux kontext?)
1 parent 99d320a commit c12989d

File tree

1 file changed

+20
-7
lines changed

1 file changed

+20
-7
lines changed

stable-diffusion.cpp

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,8 @@ class StableDiffusionGGML {
111111
bool is_using_v_parameterization = false;
112112
bool is_using_edm_v_parameterization = false;
113113

114+
bool has_vision = false;
115+
114116
std::map<std::string, struct ggml_tensor*> tensors;
115117

116118
std::string lora_model_dir;
@@ -412,6 +414,7 @@ class StableDiffusionGGML {
412414
if (!vae_decode_only) {
413415
enable_vision = true;
414416
}
417+
has_vision = enable_vision;
415418
cond_stage_model = std::make_shared<Qwen2_5_VLCLIPEmbedder>(clip_backend,
416419
offload_params_to_cpu,
417420
model_loader.tensor_storages_types,
@@ -1141,7 +1144,8 @@ class StableDiffusionGGML {
11411144
float img_cfg_scale = std::isfinite(guidance.img_cfg) ? guidance.img_cfg : guidance.txt_cfg;
11421145
float slg_scale = guidance.slg.scale;
11431146

1144-
if (img_cfg_scale != 1.0 && !sd_version_is_inpaint_or_unet_edit(version)) {
1147+
if (img_cfg_scale != 1.0 && !sd_version_is_inpaint_or_unet_edit(version)
1148+
&& (version != VERSION_FLUX || ref_latents.size()==0) && (version != VERSION_QWEN_IMAGE || ref_latents.size()==0)) {
11451149
LOG_WARN("2-conditioning CFG is not supported with this model, disabling it for better performance...");
11461150
img_cfg_scale = 1.0f;
11471151
}
@@ -1155,7 +1159,7 @@ class StableDiffusionGGML {
11551159
}
11561160

11571161
struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, x);
1158-
1162+
11591163
bool has_skiplayer = slg_scale != 0.0 && skip_layers.size() > 0;
11601164
bool has_conditionned = (has_skiplayer || cfg_scale != 0.0) && cond.c_crossattn != nullptr;
11611165
bool has_unconditioned = cfg_scale != img_cfg_scale && uncond.c_crossattn != nullptr;
@@ -1284,9 +1288,10 @@ class StableDiffusionGGML {
12841288

12851289
float* img_uncond_data = nullptr;
12861290
if (has_img_uncond) {
1287-
diffusion_params.context = img_uncond.c_crossattn;
1288-
diffusion_params.c_concat = img_uncond.c_concat;
1289-
diffusion_params.y = img_uncond.c_vector;
1291+
diffusion_params.ref_latents = {};
1292+
diffusion_params.context = img_uncond.c_crossattn;
1293+
diffusion_params.c_concat = img_uncond.c_concat;
1294+
diffusion_params.y = img_uncond.c_vector;
12901295
work_diffusion_model->compute(n_threads,
12911296
diffusion_params,
12921297
&out_img_cond);
@@ -2270,6 +2275,15 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
22702275
sd_ctx->sd->n_threads,
22712276
condition_params);
22722277
}
2278+
SDCondition img_uncond = uncond;
2279+
if (uncond.c_crossattn != nullptr && guidance.img_cfg != 1.0 && sd_ctx->sd->has_vision && condition_params.ref_images.size() > 0) {
2280+
// Recompute negative conditioning without ref images
2281+
condition_params.ref_images = {};
2282+
img_uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx,
2283+
sd_ctx->sd->n_threads,
2284+
condition_params);
2285+
}
2286+
22732287
int64_t t1 = ggml_time_ms();
22742288
LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t1 - t0);
22752289

@@ -2377,10 +2391,9 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
23772391
}
23782392
uncond.c_concat = cond.c_concat;
23792393
}
2380-
SDCondition img_uncond = uncond;
23812394
if (uncond.c_crossattn != nullptr &&
23822395
(sd_version_is_inpaint_or_unet_edit(sd_ctx->sd->version) && guidance.img_cfg != 1.0)) {
2383-
img_uncond = SDCondition(uncond.c_crossattn, uncond.c_vector, empty_latent);
2396+
img_uncond.c_concat = empty_latent;
23842397
}
23852398
for (int b = 0; b < batch_count; b++) {
23862399
int64_t sampling_start = ggml_time_ms();

0 commit comments

Comments
 (0)