@@ -245,15 +245,15 @@ class FeedForward : public GGMLBlock {
                 Activation activation = Activation::GEGLU,
                 bool force_prec_f32   = false) {
         int64_t inner_dim = dim * mult;
-
+        SD_UNUSED(force_prec_f32);
         if (activation == Activation::GELU) {
             blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GELU(dim, inner_dim));
         } else {
             blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GEGLU(dim, inner_dim));
         }

         // net_1 is nn.Dropout(), skip for inference
-        blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out, true, false, force_prec_f32));
+        blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out));
     }

     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
@@ -264,7 +264,13 @@ class FeedForward : public GGMLBlock {
         auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);

         x = net_0->forward(ctx, x);  // [ne3, ne2, ne1, inner_dim]
-        x = net_2->forward(ctx, x);  // [ne3, ne2, ne1, dim_out]
+        // The purpose of the scale here is to prevent NaN issues in certain situations.
+        // For example, when using Vulkan without enabling force_prec_f32,
+        // or when using CUDA but the weights are k-quants.
+        float scale = 1.f / 128.f;
+        x           = ggml_scale(ctx, x, scale);
+        x           = net_2->forward(ctx, x);  // [ne3, ne2, ne1, dim_out]
+        x           = ggml_scale(ctx, x, 1.f / scale);
         return x;
     }
 };
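
For reference, below is a minimal standalone sketch of the same pre/post scaling pattern, outside the patch. It assumes the plain ggml context API (ggml_init, ggml_set_f32, ggml_graph_compute_with_ctx) and uses a bias-free matmul with made-up toy shapes, so it is a simplification of what net_2 does: because matmul is linear, scaling the input by 1/128 and the result by 128 cancels out exactly, while the values the fp16 kernels actually accumulate stay in a much smaller range.

// Standalone sketch (not from the patch): the pre/post scaling trick around a
// bias-free ggml matmul, with toy shapes and values chosen for the example.
// Since matmul is linear, (W @ (s*x)) * (1/s) == W @ x, so the final result is
// unchanged while intermediate values stay in a smaller, overflow-safe range.
#include "ggml.h"
#include <cstdio>

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context* ctx = ggml_init(params);

    // Toy weight [in=4, out=2] and input [in=4, batch=1] in ggml's layout.
    struct ggml_tensor* w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 2);
    struct ggml_tensor* x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 1);
    ggml_set_f32(w, 0.5f);
    ggml_set_f32(x, 3.0f);

    const float scale = 1.f / 128.f;                     // same constant as the patch
    struct ggml_tensor* xs = ggml_scale(ctx, x, scale);  // shrink activations first
    struct ggml_tensor* y  = ggml_mul_mat(ctx, w, xs);   // matmul runs on the reduced range
    y = ggml_scale(ctx, y, 1.f / scale);                 // undo the scaling afterwards

    struct ggml_cgraph* gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, y);
    ggml_graph_compute_with_ctx(ctx, gf, 1);

    // Expect 4 * 0.5 * 3.0 = 6.0 per output element, identical to the unscaled matmul.
    printf("%f %f\n", ggml_get_f32_1d(y, 0), ggml_get_f32_1d(y, 1));

    ggml_free(ctx);
    return 0;
}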