@@ -245,15 +245,15 @@ class FeedForward : public GGMLBlock {
                 Activation activation = Activation::GEGLU,
                 bool force_prec_f32   = false) {
         int64_t inner_dim = dim * mult;
-
+        SD_UNUSED(force_prec_f32);
         if (activation == Activation::GELU) {
             blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GELU(dim, inner_dim));
         } else {
             blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GEGLU(dim, inner_dim));
         }

         // net_1 is nn.Dropout(), skip for inference
-        blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out, true, false, force_prec_f32));
+        blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out));
     }

     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
@@ -264,7 +264,13 @@ class FeedForward : public GGMLBlock {
         auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);

         x = net_0->forward(ctx, x);  // [ne3, ne2, ne1, inner_dim]
-        x = net_2->forward(ctx, x);  // [ne3, ne2, ne1, dim_out]
+        // The purpose of the scale here is to prevent NaN issues in certain situations.
+        // For example, when using Vulkan without enabling force_prec_f32,
+        // or when using CUDA but the weights are k-quants.
+        float scale = 1.f / 128.f;
+        x           = ggml_scale(ctx, x, scale);
+        x           = net_2->forward(ctx, x);  // [ne3, ne2, ne1, dim_out]
+        x           = ggml_scale(ctx, x, 1.f / scale);
         return x;
     }
 };
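
For reference, below is a minimal standalone sketch of the same pre/post scaling pattern, outside the patch. It assumes the plain ggml context API (ggml_init, ggml_set_f32, ggml_graph_compute_with_ctx) and uses a bias-free matmul with made-up toy shapes, so it is a simplification of what net_2 does: because matmul is linear, scaling the input by 1/128 and the result by 128 cancels out exactly, while the values the fp16 kernels actually accumulate stay in a much smaller range.

// Standalone sketch (not from the patch): the pre/post scaling trick around a
// bias-free ggml matmul, with toy shapes and values chosen for the example.
// Since matmul is linear, (W @ (s*x)) * (1/s) == W @ x, so the final result is
// unchanged while intermediate values stay in a smaller, overflow-safe range.
#include "ggml.h"
#include <cstdio>

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context* ctx = ggml_init(params);

    // Toy weight [in=4, out=2] and input [in=4, batch=1] in ggml's layout.
    struct ggml_tensor* w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 2);
    struct ggml_tensor* x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 1);
    ggml_set_f32(w, 0.5f);
    ggml_set_f32(x, 3.0f);

    const float scale = 1.f / 128.f;                     // same constant as the patch
    struct ggml_tensor* xs = ggml_scale(ctx, x, scale);  // shrink activations first
    struct ggml_tensor* y  = ggml_mul_mat(ctx, w, xs);   // matmul runs on the reduced range
    y = ggml_scale(ctx, y, 1.f / scale);                 // undo the scaling afterwards

    struct ggml_cgraph* gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, y);
    ggml_graph_compute_with_ctx(ctx, gf, 1);

    // Expect 4 * 0.5 * 3.0 = 6.0 per output element, identical to the unscaled matmul.
    printf("%f %f\n", ggml_get_f32_1d(y, 0), ggml_get_f32_1d(y, 1));

    ggml_free(ctx);
    return 0;
}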