From ff452b8d8c0dc6d5683d99ecc9826b88e4ee68a6 Mon Sep 17 00:00:00 2001 From: Daniele <57776841+daniandtheweb@users.noreply.github.com> Date: Mon, 28 Jul 2025 16:44:13 +0200 Subject: [PATCH 01/12] Conv2DDirect for VAE stage --- common.hpp | 8 +++---- ggml_extend.hpp | 64 +++++++++++++++++++++++++++++++++++++++++++++++++ vae.hpp | 58 ++++++++++++++++++++++---------------------- 3 files changed, 97 insertions(+), 33 deletions(-) diff --git a/common.hpp b/common.hpp index 9b5cc53be..2afee2260 100644 --- a/common.hpp +++ b/common.hpp @@ -17,7 +17,7 @@ class DownSampleBlock : public GGMLBlock { out_channels(out_channels), vae_downsample(vae_downsample) { if (vae_downsample) { - blocks["conv"] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {0, 0})); + blocks["conv"] = std::shared_ptr(new Conv2dDirect(channels, out_channels, {3, 3}, {2, 2}, {0, 0})); } else { blocks["op"] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {1, 1})); } @@ -26,7 +26,7 @@ class DownSampleBlock : public GGMLBlock { struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { // x: [N, channels, h, w] if (vae_downsample) { - auto conv = std::dynamic_pointer_cast(blocks["conv"]); + auto conv = std::dynamic_pointer_cast(blocks["conv"]); x = ggml_pad(ctx, x, 1, 1, 0, 0); x = conv->forward(ctx, x); @@ -49,12 +49,12 @@ class UpSampleBlock : public GGMLBlock { int out_channels) : channels(channels), out_channels(out_channels) { - blocks["conv"] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1})); + blocks["conv"] = std::shared_ptr(new Conv2dDirect(channels, out_channels, {3, 3}, {1, 1}, {1, 1})); } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { // x: [N, channels, h, w] - auto conv = std::dynamic_pointer_cast(blocks["conv"]); + auto conv = std::dynamic_pointer_cast(blocks["conv"]); x = ggml_upscale(ctx, x, 2, GGML_SCALE_MODE_NEAREST); // [N, channels, h*2, w*2] x = conv->forward(ctx, x); // [N, out_channels, h*2, w*2] diff --git a/ggml_extend.hpp b/ggml_extend.hpp index eb33f0248..a17162f43 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -706,6 +706,25 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d(struct ggml_context* ctx, return x; } +__STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d_direct(struct ggml_context* ctx, + struct ggml_tensor* x, + struct ggml_tensor* w, + struct ggml_tensor* b, + int s0 = 1, + int s1 = 1, + int p0 = 0, + int p1 = 0, + int d0 = 1, + int d1 = 1) { + x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1); + if (b != NULL) { + b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1); + // b = ggml_repeat(ctx, b, x); + x = ggml_add(ctx, x, b); + } + return x; +} + // w: [OC,IC, KD, 1 * 1] // x: [N, IC, IH, IW] // b: [OC,] @@ -1492,6 +1511,51 @@ class Conv2d : public UnaryBlock { } }; +class Conv2dDirect : public UnaryBlock { +protected: + int64_t in_channels; + int64_t out_channels; + std::pair kernel_size; + std::pair stride; + std::pair padding; + std::pair dilation; + bool bias; + + void init_params(struct ggml_context* ctx, std::map& tensor_types, const std::string prefix = "") { + enum ggml_type wtype = GGML_TYPE_F16; //(tensor_types.find(prefix + "weight") != tensor_types.end()) ? tensor_types[prefix + "weight"] : GGML_TYPE_F16; + params["weight"] = ggml_new_tensor_4d(ctx, wtype, kernel_size.second, kernel_size.first, in_channels, out_channels); + if (bias) { + enum ggml_type wtype = GGML_TYPE_F32; // (tensor_types.find(prefix + "bias") != tensor_types.end()) ? 
tensor_types[prefix + "bias"] : GGML_TYPE_F32; + params["bias"] = ggml_new_tensor_1d(ctx, wtype, out_channels); + } + } + +public: + Conv2dDirect(int64_t in_channels, + int64_t out_channels, + std::pair kernel_size, + std::pair stride = {1, 1}, + std::pair padding = {0, 0}, + std::pair dilation = {1, 1}, + bool bias = true) + : in_channels(in_channels), + out_channels(out_channels), + kernel_size(kernel_size), + stride(stride), + padding(padding), + dilation(dilation), + bias(bias) {} + + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + struct ggml_tensor* w = params["weight"]; + struct ggml_tensor* b = NULL; + if (bias) { + b = params["bias"]; + } + return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first); + } +}; + class Conv3dnx1x1 : public UnaryBlock { protected: int64_t in_channels; diff --git a/vae.hpp b/vae.hpp index 4add881f6..88bcc349c 100644 --- a/vae.hpp +++ b/vae.hpp @@ -20,13 +20,13 @@ class ResnetBlock : public UnaryBlock { out_channels(out_channels) { // temb_channels is always 0 blocks["norm1"] = std::shared_ptr(new GroupNorm32(in_channels)); - blocks["conv1"] = std::shared_ptr(new Conv2d(in_channels, out_channels, {3, 3}, {1, 1}, {1, 1})); + blocks["conv1"] = std::shared_ptr(new Conv2dDirect(in_channels, out_channels, {3, 3}, {1, 1}, {1, 1})); blocks["norm2"] = std::shared_ptr(new GroupNorm32(out_channels)); - blocks["conv2"] = std::shared_ptr(new Conv2d(out_channels, out_channels, {3, 3}, {1, 1}, {1, 1})); + blocks["conv2"] = std::shared_ptr(new Conv2dDirect(out_channels, out_channels, {3, 3}, {1, 1}, {1, 1})); if (out_channels != in_channels) { - blocks["nin_shortcut"] = std::shared_ptr(new Conv2d(in_channels, out_channels, {1, 1})); + blocks["nin_shortcut"] = std::shared_ptr(new Conv2dDirect(in_channels, out_channels, {1, 1})); } } @@ -34,9 +34,9 @@ class ResnetBlock : public UnaryBlock { // x: [N, in_channels, h, w] // t_emb is always None auto norm1 = std::dynamic_pointer_cast(blocks["norm1"]); - auto conv1 = std::dynamic_pointer_cast(blocks["conv1"]); + auto conv1 = std::dynamic_pointer_cast(blocks["conv1"]); auto norm2 = std::dynamic_pointer_cast(blocks["norm2"]); - auto conv2 = std::dynamic_pointer_cast(blocks["conv2"]); + auto conv2 = std::dynamic_pointer_cast(blocks["conv2"]); auto h = x; h = norm1->forward(ctx, h); @@ -51,7 +51,7 @@ class ResnetBlock : public UnaryBlock { // skip connection if (out_channels != in_channels) { - auto nin_shortcut = std::dynamic_pointer_cast(blocks["nin_shortcut"]); + auto nin_shortcut = std::dynamic_pointer_cast(blocks["nin_shortcut"]); x = nin_shortcut->forward(ctx, x); // [N, out_channels, h, w] } @@ -69,20 +69,20 @@ class AttnBlock : public UnaryBlock { AttnBlock(int64_t in_channels) : in_channels(in_channels) { blocks["norm"] = std::shared_ptr(new GroupNorm32(in_channels)); - blocks["q"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1})); - blocks["k"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1})); - blocks["v"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1})); + blocks["q"] = std::shared_ptr(new Conv2dDirect(in_channels, in_channels, {1, 1})); + blocks["k"] = std::shared_ptr(new Conv2dDirect(in_channels, in_channels, {1, 1})); + blocks["v"] = std::shared_ptr(new Conv2dDirect(in_channels, in_channels, {1, 1})); - blocks["proj_out"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1})); + blocks["proj_out"] = std::shared_ptr(new Conv2dDirect(in_channels, 
in_channels, {1, 1})); } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { // x: [N, in_channels, h, w] auto norm = std::dynamic_pointer_cast(blocks["norm"]); - auto q_proj = std::dynamic_pointer_cast(blocks["q"]); - auto k_proj = std::dynamic_pointer_cast(blocks["k"]); - auto v_proj = std::dynamic_pointer_cast(blocks["v"]); - auto proj_out = std::dynamic_pointer_cast(blocks["proj_out"]); + auto q_proj = std::dynamic_pointer_cast(blocks["q"]); + auto k_proj = std::dynamic_pointer_cast(blocks["k"]); + auto v_proj = std::dynamic_pointer_cast(blocks["v"]); + auto proj_out = std::dynamic_pointer_cast(blocks["proj_out"]); auto h_ = norm->forward(ctx, x); @@ -114,7 +114,7 @@ class AttnBlock : public UnaryBlock { } }; -class AE3DConv : public Conv2d { +class AE3DConv : public Conv2dDirect { public: AE3DConv(int64_t in_channels, int64_t out_channels, @@ -124,7 +124,7 @@ class AE3DConv : public Conv2d { std::pair padding = {0, 0}, std::pair dilation = {1, 1}, bool bias = true) - : Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias) { + : Conv2dDirect(in_channels, out_channels, kernel_size, stride, padding, dilation, bias) { int64_t kernel_padding = video_kernel_size / 2; blocks["time_mix_conv"] = std::shared_ptr(new Conv3dnx1x1(out_channels, out_channels, @@ -141,7 +141,7 @@ class AE3DConv : public Conv2d { // result: [N, OC, OH, OW] auto time_mix_conv = std::dynamic_pointer_cast(blocks["time_mix_conv"]); - x = Conv2d::forward(ctx, x); + x = Conv2dDirect::forward(ctx, x); // timesteps = x.shape[0] // x = rearrange(x, "(b t) c h w -> b c t h w", t=timesteps) // x = conv3d(x) @@ -240,7 +240,7 @@ class Encoder : public GGMLBlock { in_channels(in_channels), z_channels(z_channels), double_z(double_z) { - blocks["conv_in"] = std::shared_ptr(new Conv2d(in_channels, ch, {3, 3}, {1, 1}, {1, 1})); + blocks["conv_in"] = std::shared_ptr(new Conv2dDirect(in_channels, ch, {3, 3}, {1, 1}, {1, 1})); size_t num_resolutions = ch_mult.size(); @@ -268,18 +268,18 @@ class Encoder : public GGMLBlock { blocks["mid.block_2"] = std::shared_ptr(new ResnetBlock(block_in, block_in)); blocks["norm_out"] = std::shared_ptr(new GroupNorm32(block_in)); - blocks["conv_out"] = std::shared_ptr(new Conv2d(block_in, double_z ? z_channels * 2 : z_channels, {3, 3}, {1, 1}, {1, 1})); + blocks["conv_out"] = std::shared_ptr(new Conv2dDirect(block_in, double_z ? 
z_channels * 2 : z_channels, {3, 3}, {1, 1}, {1, 1})); } virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { // x: [N, in_channels, h, w] - auto conv_in = std::dynamic_pointer_cast(blocks["conv_in"]); + auto conv_in = std::dynamic_pointer_cast(blocks["conv_in"]); auto mid_block_1 = std::dynamic_pointer_cast(blocks["mid.block_1"]); auto mid_attn_1 = std::dynamic_pointer_cast(blocks["mid.attn_1"]); auto mid_block_2 = std::dynamic_pointer_cast(blocks["mid.block_2"]); auto norm_out = std::dynamic_pointer_cast(blocks["norm_out"]); - auto conv_out = std::dynamic_pointer_cast(blocks["conv_out"]); + auto conv_out = std::dynamic_pointer_cast(blocks["conv_out"]); auto h = conv_in->forward(ctx, x); // [N, ch, h, w] @@ -332,7 +332,7 @@ class Decoder : public GGMLBlock { if (video_decoder) { return std::shared_ptr(new AE3DConv(in_channels, out_channels, kernel_size, video_kernel_size, stride, padding)); } else { - return std::shared_ptr(new Conv2d(in_channels, out_channels, kernel_size, stride, padding)); + return std::shared_ptr(new Conv2dDirect(in_channels, out_channels, kernel_size, stride, padding)); } } @@ -363,7 +363,7 @@ class Decoder : public GGMLBlock { size_t num_resolutions = ch_mult.size(); int block_in = ch * ch_mult[num_resolutions - 1]; - blocks["conv_in"] = std::shared_ptr(new Conv2d(z_channels, block_in, {3, 3}, {1, 1}, {1, 1})); + blocks["conv_in"] = std::shared_ptr(new Conv2dDirect(z_channels, block_in, {3, 3}, {1, 1}, {1, 1})); blocks["mid.block_1"] = get_resnet_block(block_in, block_in); blocks["mid.attn_1"] = std::shared_ptr(new AttnBlock(block_in)); @@ -394,12 +394,12 @@ class Decoder : public GGMLBlock { // merge_strategy is always learned // time_mode is always conv-only, so we need to replace conv_out_op/resnet_op to AE3DConv/VideoResBlock // AttnVideoBlock will not be used - auto conv_in = std::dynamic_pointer_cast(blocks["conv_in"]); + auto conv_in = std::dynamic_pointer_cast(blocks["conv_in"]); auto mid_block_1 = std::dynamic_pointer_cast(blocks["mid.block_1"]); auto mid_attn_1 = std::dynamic_pointer_cast(blocks["mid.attn_1"]); auto mid_block_2 = std::dynamic_pointer_cast(blocks["mid.block_2"]); auto norm_out = std::dynamic_pointer_cast(blocks["norm_out"]); - auto conv_out = std::dynamic_pointer_cast(blocks["conv_out"]); + auto conv_out = std::dynamic_pointer_cast(blocks["conv_out"]); // conv_in auto h = conv_in->forward(ctx, z); // [N, block_in, h, w] @@ -472,7 +472,7 @@ class AutoencodingEngine : public GGMLBlock { dd_config.z_channels, use_video_decoder)); if (use_quant) { - blocks["post_quant_conv"] = std::shared_ptr(new Conv2d(dd_config.z_channels, + blocks["post_quant_conv"] = std::shared_ptr(new Conv2dDirect(dd_config.z_channels, embed_dim, {1, 1})); } @@ -486,7 +486,7 @@ class AutoencodingEngine : public GGMLBlock { if (use_quant) { int factor = dd_config.double_z ? 
2 : 1; - blocks["quant_conv"] = std::shared_ptr(new Conv2d(embed_dim * factor, + blocks["quant_conv"] = std::shared_ptr(new Conv2dDirect(embed_dim * factor, dd_config.z_channels * factor, {1, 1})); } @@ -496,7 +496,7 @@ class AutoencodingEngine : public GGMLBlock { struct ggml_tensor* decode(struct ggml_context* ctx, struct ggml_tensor* z) { // z: [N, z_channels, h, w] if (use_quant) { - auto post_quant_conv = std::dynamic_pointer_cast(blocks["post_quant_conv"]); + auto post_quant_conv = std::dynamic_pointer_cast(blocks["post_quant_conv"]); z = post_quant_conv->forward(ctx, z); // [N, z_channels, h, w] } auto decoder = std::dynamic_pointer_cast(blocks["decoder"]); @@ -513,7 +513,7 @@ class AutoencodingEngine : public GGMLBlock { auto h = encoder->forward(ctx, x); // [N, 2*z_channels, h/8, w/8] if (use_quant) { - auto quant_conv = std::dynamic_pointer_cast(blocks["quant_conv"]); + auto quant_conv = std::dynamic_pointer_cast(blocks["quant_conv"]); h = quant_conv->forward(ctx, h); // [N, 2*embed_dim, h/8, w/8] } return h; From 6624650eaa9ae5f4773723b370f57fce4f00bdd6 Mon Sep 17 00:00:00 2001 From: Daniele <57776841+daniandtheweb@users.noreply.github.com> Date: Mon, 28 Jul 2025 18:58:16 +0200 Subject: [PATCH 02/12] Enable only for Vulkan, reduced duplicated code --- common.hpp | 8 ++--- ggml_extend.hpp | 60 ++++++++---------------------------- vae.hpp | 82 +++++++++++++++++++++++++++++-------------------- 3 files changed, 65 insertions(+), 85 deletions(-) diff --git a/common.hpp b/common.hpp index 2afee2260..c2aa397ca 100644 --- a/common.hpp +++ b/common.hpp @@ -17,7 +17,7 @@ class DownSampleBlock : public GGMLBlock { out_channels(out_channels), vae_downsample(vae_downsample) { if (vae_downsample) { - blocks["conv"] = std::shared_ptr(new Conv2dDirect(channels, out_channels, {3, 3}, {2, 2}, {0, 0})); + blocks["conv"] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {0, 0}, {1, 1}, true, true)); } else { blocks["op"] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {1, 1})); } @@ -26,7 +26,7 @@ class DownSampleBlock : public GGMLBlock { struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { // x: [N, channels, h, w] if (vae_downsample) { - auto conv = std::dynamic_pointer_cast(blocks["conv"]); + auto conv = std::dynamic_pointer_cast(blocks["conv"]); x = ggml_pad(ctx, x, 1, 1, 0, 0); x = conv->forward(ctx, x); @@ -49,12 +49,12 @@ class UpSampleBlock : public GGMLBlock { int out_channels) : channels(channels), out_channels(out_channels) { - blocks["conv"] = std::shared_ptr(new Conv2dDirect(channels, out_channels, {3, 3}, {1, 1}, {1, 1})); + blocks["conv"] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, true)); } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { // x: [N, channels, h, w] - auto conv = std::dynamic_pointer_cast(blocks["conv"]); + auto conv = std::dynamic_pointer_cast(blocks["conv"]); x = ggml_upscale(ctx, x, 2, GGML_SCALE_MODE_NEAREST); // [N, channels, h*2, w*2] x = conv->forward(ctx, x); // [N, out_channels, h*2, w*2] diff --git a/ggml_extend.hpp b/ggml_extend.hpp index a17162f43..161b48e91 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -1475,6 +1475,7 @@ class Conv2d : public UnaryBlock { std::pair padding; std::pair dilation; bool bias; + bool direct; void init_params(struct ggml_context* ctx, std::map& tensor_types, const std::string prefix = "") { enum ggml_type wtype = GGML_TYPE_F16; //(tensor_types.find(prefix + "weight") != 
tensor_types.end()) ? tensor_types[prefix + "weight"] : GGML_TYPE_F16; @@ -1492,14 +1493,16 @@ class Conv2d : public UnaryBlock { std::pair stride = {1, 1}, std::pair padding = {0, 0}, std::pair dilation = {1, 1}, - bool bias = true) + bool bias = true, + bool direct = false) : in_channels(in_channels), out_channels(out_channels), kernel_size(kernel_size), stride(stride), padding(padding), dilation(dilation), - bias(bias) {} + bias(bias), + direct(direct) {} struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { struct ggml_tensor* w = params["weight"]; @@ -1507,52 +1510,15 @@ class Conv2d : public UnaryBlock { if (bias) { b = params["bias"]; } - return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first); - } -}; - -class Conv2dDirect : public UnaryBlock { -protected: - int64_t in_channels; - int64_t out_channels; - std::pair kernel_size; - std::pair stride; - std::pair padding; - std::pair dilation; - bool bias; - - void init_params(struct ggml_context* ctx, std::map& tensor_types, const std::string prefix = "") { - enum ggml_type wtype = GGML_TYPE_F16; //(tensor_types.find(prefix + "weight") != tensor_types.end()) ? tensor_types[prefix + "weight"] : GGML_TYPE_F16; - params["weight"] = ggml_new_tensor_4d(ctx, wtype, kernel_size.second, kernel_size.first, in_channels, out_channels); - if (bias) { - enum ggml_type wtype = GGML_TYPE_F32; // (tensor_types.find(prefix + "bias") != tensor_types.end()) ? tensor_types[prefix + "bias"] : GGML_TYPE_F32; - params["bias"] = ggml_new_tensor_1d(ctx, wtype, out_channels); - } - } - -public: - Conv2dDirect(int64_t in_channels, - int64_t out_channels, - std::pair kernel_size, - std::pair stride = {1, 1}, - std::pair padding = {0, 0}, - std::pair dilation = {1, 1}, - bool bias = true) - : in_channels(in_channels), - out_channels(out_channels), - kernel_size(kernel_size), - stride(stride), - padding(padding), - dilation(dilation), - bias(bias) {} - - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { - struct ggml_tensor* w = params["weight"]; - struct ggml_tensor* b = NULL; - if (bias) { - b = params["bias"]; + if (direct) { + #if defined(SD_USE_CUDA) || defined(SD_USE_SYCL) || defined(SD_USE_METAL) || defined(SD_USE_OPENCL) + return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first); + #else + return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first); + #endif + } else { + return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first); } - return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first); } }; diff --git a/vae.hpp b/vae.hpp index 88bcc349c..42e64a95e 100644 --- a/vae.hpp +++ b/vae.hpp @@ -20,13 +20,13 @@ class ResnetBlock : public UnaryBlock { out_channels(out_channels) { // temb_channels is always 0 blocks["norm1"] = std::shared_ptr(new GroupNorm32(in_channels)); - blocks["conv1"] = std::shared_ptr(new Conv2dDirect(in_channels, out_channels, {3, 3}, {1, 1}, {1, 1})); + blocks["conv1"] = std::shared_ptr(new Conv2d(in_channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, true)); blocks["norm2"] = std::shared_ptr(new GroupNorm32(out_channels)); - blocks["conv2"] = std::shared_ptr(new Conv2dDirect(out_channels, out_channels, {3, 
3}, {1, 1}, {1, 1})); + blocks["conv2"] = std::shared_ptr(new Conv2d(out_channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, true)); if (out_channels != in_channels) { - blocks["nin_shortcut"] = std::shared_ptr(new Conv2dDirect(in_channels, out_channels, {1, 1})); + blocks["nin_shortcut"] = std::shared_ptr(new Conv2d(in_channels, out_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, true)); } } @@ -34,9 +34,9 @@ class ResnetBlock : public UnaryBlock { // x: [N, in_channels, h, w] // t_emb is always None auto norm1 = std::dynamic_pointer_cast(blocks["norm1"]); - auto conv1 = std::dynamic_pointer_cast(blocks["conv1"]); + auto conv1 = std::dynamic_pointer_cast(blocks["conv1"]); auto norm2 = std::dynamic_pointer_cast(blocks["norm2"]); - auto conv2 = std::dynamic_pointer_cast(blocks["conv2"]); + auto conv2 = std::dynamic_pointer_cast(blocks["conv2"]); auto h = x; h = norm1->forward(ctx, h); @@ -51,7 +51,7 @@ class ResnetBlock : public UnaryBlock { // skip connection if (out_channels != in_channels) { - auto nin_shortcut = std::dynamic_pointer_cast(blocks["nin_shortcut"]); + auto nin_shortcut = std::dynamic_pointer_cast(blocks["nin_shortcut"]); x = nin_shortcut->forward(ctx, x); // [N, out_channels, h, w] } @@ -69,20 +69,20 @@ class AttnBlock : public UnaryBlock { AttnBlock(int64_t in_channels) : in_channels(in_channels) { blocks["norm"] = std::shared_ptr(new GroupNorm32(in_channels)); - blocks["q"] = std::shared_ptr(new Conv2dDirect(in_channels, in_channels, {1, 1})); - blocks["k"] = std::shared_ptr(new Conv2dDirect(in_channels, in_channels, {1, 1})); - blocks["v"] = std::shared_ptr(new Conv2dDirect(in_channels, in_channels, {1, 1})); + blocks["q"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, true)); + blocks["k"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, true)); + blocks["v"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, true)); - blocks["proj_out"] = std::shared_ptr(new Conv2dDirect(in_channels, in_channels, {1, 1})); + blocks["proj_out"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, true)); } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { // x: [N, in_channels, h, w] auto norm = std::dynamic_pointer_cast(blocks["norm"]); - auto q_proj = std::dynamic_pointer_cast(blocks["q"]); - auto k_proj = std::dynamic_pointer_cast(blocks["k"]); - auto v_proj = std::dynamic_pointer_cast(blocks["v"]); - auto proj_out = std::dynamic_pointer_cast(blocks["proj_out"]); + auto q_proj = std::dynamic_pointer_cast(blocks["q"]); + auto k_proj = std::dynamic_pointer_cast(blocks["k"]); + auto v_proj = std::dynamic_pointer_cast(blocks["v"]); + auto proj_out = std::dynamic_pointer_cast(blocks["proj_out"]); auto h_ = norm->forward(ctx, x); @@ -114,7 +114,7 @@ class AttnBlock : public UnaryBlock { } }; -class AE3DConv : public Conv2dDirect { +class AE3DConv : public Conv2d { public: AE3DConv(int64_t in_channels, int64_t out_channels, @@ -123,8 +123,9 @@ class AE3DConv : public Conv2dDirect { std::pair stride = {1, 1}, std::pair padding = {0, 0}, std::pair dilation = {1, 1}, - bool bias = true) - : Conv2dDirect(in_channels, out_channels, kernel_size, stride, padding, dilation, bias) { + bool bias = true, + bool direct = false) + : Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias, direct) { int64_t kernel_padding = video_kernel_size / 2; blocks["time_mix_conv"] = 
std::shared_ptr(new Conv3dnx1x1(out_channels, out_channels, @@ -141,7 +142,7 @@ class AE3DConv : public Conv2dDirect { // result: [N, OC, OH, OW] auto time_mix_conv = std::dynamic_pointer_cast(blocks["time_mix_conv"]); - x = Conv2dDirect::forward(ctx, x); + x = Conv2d::forward(ctx, x); // timesteps = x.shape[0] // x = rearrange(x, "(b t) c h w -> b c t h w", t=timesteps) // x = conv3d(x) @@ -240,7 +241,7 @@ class Encoder : public GGMLBlock { in_channels(in_channels), z_channels(z_channels), double_z(double_z) { - blocks["conv_in"] = std::shared_ptr(new Conv2dDirect(in_channels, ch, {3, 3}, {1, 1}, {1, 1})); + blocks["conv_in"] = std::shared_ptr(new Conv2d(in_channels, ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, true)); size_t num_resolutions = ch_mult.size(); @@ -268,18 +269,18 @@ class Encoder : public GGMLBlock { blocks["mid.block_2"] = std::shared_ptr(new ResnetBlock(block_in, block_in)); blocks["norm_out"] = std::shared_ptr(new GroupNorm32(block_in)); - blocks["conv_out"] = std::shared_ptr(new Conv2dDirect(block_in, double_z ? z_channels * 2 : z_channels, {3, 3}, {1, 1}, {1, 1})); + blocks["conv_out"] = std::shared_ptr(new Conv2d(block_in, double_z ? z_channels * 2 : z_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, true)); } virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { // x: [N, in_channels, h, w] - auto conv_in = std::dynamic_pointer_cast(blocks["conv_in"]); + auto conv_in = std::dynamic_pointer_cast(blocks["conv_in"]); auto mid_block_1 = std::dynamic_pointer_cast(blocks["mid.block_1"]); auto mid_attn_1 = std::dynamic_pointer_cast(blocks["mid.attn_1"]); auto mid_block_2 = std::dynamic_pointer_cast(blocks["mid.block_2"]); auto norm_out = std::dynamic_pointer_cast(blocks["norm_out"]); - auto conv_out = std::dynamic_pointer_cast(blocks["conv_out"]); + auto conv_out = std::dynamic_pointer_cast(blocks["conv_out"]); auto h = conv_in->forward(ctx, x); // [N, ch, h, w] @@ -328,11 +329,14 @@ class Decoder : public GGMLBlock { int64_t out_channels, std::pair kernel_size, std::pair stride = {1, 1}, - std::pair padding = {0, 0}) { + std::pair padding = {0, 0}, + std::pair dilation = {1, 1}, + bool bias = true, + bool direct = false){ if (video_decoder) { return std::shared_ptr(new AE3DConv(in_channels, out_channels, kernel_size, video_kernel_size, stride, padding)); } else { - return std::shared_ptr(new Conv2dDirect(in_channels, out_channels, kernel_size, stride, padding)); + return std::shared_ptr(new Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias, direct)); } } @@ -363,7 +367,7 @@ class Decoder : public GGMLBlock { size_t num_resolutions = ch_mult.size(); int block_in = ch * ch_mult[num_resolutions - 1]; - blocks["conv_in"] = std::shared_ptr(new Conv2dDirect(z_channels, block_in, {3, 3}, {1, 1}, {1, 1})); + blocks["conv_in"] = std::shared_ptr(new Conv2d(z_channels, block_in, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, true)); blocks["mid.block_1"] = get_resnet_block(block_in, block_in); blocks["mid.attn_1"] = std::shared_ptr(new AttnBlock(block_in)); @@ -385,7 +389,7 @@ class Decoder : public GGMLBlock { } blocks["norm_out"] = std::shared_ptr(new GroupNorm32(block_in)); - blocks["conv_out"] = get_conv_out(block_in, out_ch, {3, 3}, {1, 1}, {1, 1}); + blocks["conv_out"] = get_conv_out(block_in, out_ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, true); } virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* z) { @@ -394,12 +398,12 @@ class Decoder : public GGMLBlock { // merge_strategy is always learned // 
time_mode is always conv-only, so we need to replace conv_out_op/resnet_op to AE3DConv/VideoResBlock // AttnVideoBlock will not be used - auto conv_in = std::dynamic_pointer_cast(blocks["conv_in"]); + auto conv_in = std::dynamic_pointer_cast(blocks["conv_in"]); auto mid_block_1 = std::dynamic_pointer_cast(blocks["mid.block_1"]); auto mid_attn_1 = std::dynamic_pointer_cast(blocks["mid.attn_1"]); auto mid_block_2 = std::dynamic_pointer_cast(blocks["mid.block_2"]); auto norm_out = std::dynamic_pointer_cast(blocks["norm_out"]); - auto conv_out = std::dynamic_pointer_cast(blocks["conv_out"]); + auto conv_out = std::dynamic_pointer_cast(blocks["conv_out"]); // conv_in auto h = conv_in->forward(ctx, z); // [N, block_in, h, w] @@ -472,9 +476,14 @@ class AutoencodingEngine : public GGMLBlock { dd_config.z_channels, use_video_decoder)); if (use_quant) { - blocks["post_quant_conv"] = std::shared_ptr(new Conv2dDirect(dd_config.z_channels, + blocks["post_quant_conv"] = std::shared_ptr(new Conv2d(dd_config.z_channels, embed_dim, - {1, 1})); + {1, 1}, + {1, 1}, + {0, 0}, + {1, 1}, + true, + true)); } if (!decode_only) { blocks["encoder"] = std::shared_ptr(new Encoder(dd_config.ch, @@ -486,9 +495,14 @@ class AutoencodingEngine : public GGMLBlock { if (use_quant) { int factor = dd_config.double_z ? 2 : 1; - blocks["quant_conv"] = std::shared_ptr(new Conv2dDirect(embed_dim * factor, + blocks["quant_conv"] = std::shared_ptr(new Conv2d(embed_dim * factor, dd_config.z_channels * factor, - {1, 1})); + {1, 1}, + {1, 1}, + {0, 0}, + {1, 1}, + true, + true)); } } } @@ -496,7 +510,7 @@ class AutoencodingEngine : public GGMLBlock { struct ggml_tensor* decode(struct ggml_context* ctx, struct ggml_tensor* z) { // z: [N, z_channels, h, w] if (use_quant) { - auto post_quant_conv = std::dynamic_pointer_cast(blocks["post_quant_conv"]); + auto post_quant_conv = std::dynamic_pointer_cast(blocks["post_quant_conv"]); z = post_quant_conv->forward(ctx, z); // [N, z_channels, h, w] } auto decoder = std::dynamic_pointer_cast(blocks["decoder"]); @@ -513,7 +527,7 @@ class AutoencodingEngine : public GGMLBlock { auto h = encoder->forward(ctx, x); // [N, 2*z_channels, h/8, w/8] if (use_quant) { - auto quant_conv = std::dynamic_pointer_cast(blocks["quant_conv"]); + auto quant_conv = std::dynamic_pointer_cast(blocks["quant_conv"]); h = quant_conv->forward(ctx, h); // [N, 2*embed_dim, h/8, w/8] } return h; From b06ddf9853792db9d73aca2504d454a39c801a7f Mon Sep 17 00:00:00 2001 From: Daniele <57776841+daniandtheweb@users.noreply.github.com> Date: Mon, 28 Jul 2025 19:21:24 +0200 Subject: [PATCH 03/12] Cmake option to use conv2d direct --- CMakeLists.txt | 6 ++++++ ggml_extend.hpp | 10 +++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 06de0d58b..f7b63c9e0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,6 +31,7 @@ option(SD_VULKAN "sd: vulkan backend" OFF) option(SD_OPENCL "sd: opencl backend" OFF) option(SD_SYCL "sd: sycl backend" OFF) option(SD_MUSA "sd: musa backend" OFF) +option(SD_CONV2D_DIRECT "sd: enable conv2d direct support" OFF) option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF) option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF) #option(SD_BUILD_SERVER "sd: build server example" ON) @@ -77,6 +78,11 @@ if(SD_MUSA) endif() endif() +if(SD_CONV2D_DIRECT) + message("-- Use CONV2D Direct for VAE") + add_definitions(-DSD_USE_CONV2D_DIRECT) +endif () + set(SD_LIB stable-diffusion) 
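The SD_CONV2D_DIRECT option added above is consumed at configure time. As a rough sketch (the build directory and Release config are illustrative and not part of the patch), a Vulkan build with the direct convolution path enabled would be configured like:

    cmake -B build -DSD_VULKAN=ON -DSD_CONV2D_DIRECT=ON
    cmake --build build --config Release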
 file(GLOB SD_LIB_SOURCES
diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index 161b48e91..45ad2381d 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -1511,10 +1511,14 @@ class Conv2d : public UnaryBlock {
             b = params["bias"];
         }
         if (direct) {
-            #if defined(SD_USE_CUDA) || defined(SD_USE_SYCL) || defined(SD_USE_METAL) || defined(SD_USE_OPENCL)
-                return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
+            #if defined(SD_USE_CONV2D_DIRECT)
+                #if defined(SD_USE_CUDA) || defined(SD_USE_SYCL) || defined(SD_USE_METAL) || defined(SD_USE_OPENCL)
+                    return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
+                #else
+                    return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
+                #endif
             #else
-                return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
+                return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
             #endif
         } else {
             return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);

From f5b5f5c77463476c6e4e7f059ba4a05ae3da682d Mon Sep 17 00:00:00 2001
From: Daniele <57776841+daniandtheweb@users.noreply.github.com>
Date: Mon, 28 Jul 2025 19:28:08 +0200
Subject: [PATCH 04/12] conv2d direct always on for opencl

---
 CMakeLists.txt  | 1 +
 ggml_extend.hpp | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f7b63c9e0..8bdbcd211 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -58,6 +58,7 @@ if (SD_OPENCL)
     message("-- Use OpenCL as backend stable-diffusion")
     set(GGML_OPENCL ON)
     add_definitions(-DSD_USE_OPENCL)
+    add_definitions(-DSD_USE_CONV2D_DIRECT)
 endif ()

diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index 45ad2381d..d5cfd00f8 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -1510,9 +1510,12 @@ class Conv2d : public UnaryBlock {
         if (bias) {
             b = params["bias"];
         }
+        #if defined(SD_USE_OPENCL)
+        direct = true;
+        #endif
         if (direct) {
             #if defined(SD_USE_CONV2D_DIRECT)
-                #if defined(SD_USE_CUDA) || defined(SD_USE_SYCL) || defined(SD_USE_METAL) || defined(SD_USE_OPENCL)
+                #if defined(SD_USE_CUDA) || defined(SD_USE_SYCL) || defined(SD_USE_METAL)
                     return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
                 #else
                     return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);

From 7ce4f3b031828a8bc7cd6246ab50a686481f8b40 Mon Sep 17 00:00:00 2001
From: Daniele <57776841+daniandtheweb@users.noreply.github.com>
Date: Wed, 30 Jul 2025 20:10:01 +0200
Subject: [PATCH 05/12] conv direct as a flag

---
 CMakeLists.txt        |  7 ----
 README.md             |  2 +
 common.hpp            | 27 +++++-----
 diffusion_model.hpp   |  5 ++-
 examples/cli/main.cpp | 10 +++++
 ggml_extend.hpp       | 10 ++---
 stable-diffusion.cpp  | 15 +++++--
 stable-diffusion.h    |  2 +
 tae.hpp               | 80 ++++++++++++++++++++----------------
 unet.hpp              | 19 +++++----
 vae.hpp               | 95 ++++++++++++++++++++++++++-----------------
 11 files changed, 161 insertions(+), 111 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8bdbcd211..06de0d58b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -31,7 +31,6 @@ option(SD_VULKAN "sd: vulkan backend" OFF)
 option(SD_OPENCL "sd: opencl backend" OFF)
 option(SD_SYCL "sd: sycl backend" OFF)
 option(SD_MUSA "sd: musa backend" OFF)
-option(SD_CONV2D_DIRECT "sd: enable conv2d direct support" OFF)
 option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
 option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
 #option(SD_BUILD_SERVER "sd: build server example" ON)
@@ -58,7 +57,6 @@ if (SD_OPENCL)
     message("-- Use OpenCL as backend stable-diffusion")
     set(GGML_OPENCL ON)
     add_definitions(-DSD_USE_OPENCL)
-    add_definitions(-DSD_USE_CONV2D_DIRECT)
 endif ()

 if (SD_HIPBLAS)
@@ -79,11 +77,6 @@ if(SD_MUSA)
     endif()
 endif()

-if(SD_CONV2D_DIRECT)
-    message("-- Use CONV2D Direct for VAE")
-    add_definitions(-DSD_USE_CONV2D_DIRECT)
-endif ()
-
 set(SD_LIB stable-diffusion)

 file(GLOB SD_LIB_SOURCES
diff --git a/README.md b/README.md
index 89eb095ec..3513b9f42 100644
--- a/README.md
+++ b/README.md
@@ -339,6 +339,8 @@ arguments:
   --vae-on-cpu                       keep vae in cpu (for low vram)
   --clip-on-cpu                      keep clip in cpu (for low vram)
   --diffusion-fa                     use flash attention in the diffusion model (for low vram)
+  --diffusion-conv-direct            use Conv2D direct in the diffusion model
+  --vae-conv-direct                  use Conv2D direct in the vae model (should improve performance)
                                      Might lower quality, since it implies converting k and v to f16.
                                      This might crash if it is not supported by the backend.
   --control-net-cpu                  keep controlnet in cpu (for low vram)
diff --git a/common.hpp b/common.hpp
index c2aa397ca..5dff00eff 100644
--- a/common.hpp
+++ b/common.hpp
@@ -8,18 +8,21 @@ class DownSampleBlock : public GGMLBlock {
     int channels;
     int out_channels;
     bool vae_downsample;
+    bool direct = false;

 public:
     DownSampleBlock(int channels,
                     int out_channels,
-                    bool vae_downsample = false)
+                    bool vae_downsample = false,
+                    bool direct = false)
         : channels(channels),
           out_channels(out_channels),
-          vae_downsample(vae_downsample) {
+          vae_downsample(vae_downsample),
+          direct(direct) {
         if (vae_downsample) {
-            blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {0, 0}, {1, 1}, true, true));
+            blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {0, 0}, {1, 1}, true, direct));
         } else {
-            blocks["op"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {1, 1}));
+            blocks["op"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, true, direct));
         }
     }

@@ -43,13 +46,16 @@ class UpSampleBlock : public GGMLBlock {
 protected:
     int channels;
     int out_channels;
+    bool direct = false;

 public:
     UpSampleBlock(int channels,
-                  int out_channels)
+                  int out_channels,
+                  bool direct = false)
         : channels(channels),
-          out_channels(out_channels) {
-        blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, true));
+          out_channels(out_channels),
+          direct(direct) {
+        blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct));
     }

     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
@@ -381,7 +387,8 @@ class SpatialTransformer : public GGMLBlock {
                        int64_t d_head,
                        int64_t depth,
                        int64_t context_dim,
-                       bool flash_attn = false)
+                       bool flash_attn = false,
+                       bool direct = false)
         : in_channels(in_channels),
           n_head(n_head),
           d_head(d_head),
@@ -391,14 +398,14 @@ class SpatialTransformer : public GGMLBlock {
         // disable_self_attn is always False
         int64_t inner_dim = n_head * d_head;  // in_channels
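As a usage illustration for the README entries introduced by this patch (binary name, model file, and prompt below are placeholders, not taken from the series), the new switches are plain runtime flags; when set, the run should also print the "Using Conv2D direct ..." messages added to stable-diffusion.cpp further down:

    ./sd -m sd-v1-5.safetensors -p "a photo of a cat" --diffusion-conv-direct --vae-conv-direct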
         blocks["norm"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
-        blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, inner_dim, {1, 1}));
+        blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, inner_dim, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct));

         for (int i = 0; i < depth; i++) {
             std::string name = "transformer_blocks." + std::to_string(i);
             blocks[name] = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim, false, flash_attn));
         }

-        blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(inner_dim, in_channels, {1, 1}));
+        blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(inner_dim, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct));
     }

     virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
diff --git a/diffusion_model.hpp b/diffusion_model.hpp
index 5c349439d..fe799251b 100644
--- a/diffusion_model.hpp
+++ b/diffusion_model.hpp
@@ -34,8 +34,9 @@ struct UNetModel : public DiffusionModel {
     UNetModel(ggml_backend_t backend,
               std::map<std::string, enum ggml_type>& tensor_types,
               SDVersion version = VERSION_SD1,
-              bool flash_attn = false)
-        : unet(backend, tensor_types, "model.diffusion_model", version, flash_attn) {
+              bool flash_attn = false,
+              bool direct = false)
+        : unet(backend, tensor_types, "model.diffusion_model", version, flash_attn, direct) {
     }

     void alloc_params_buffer() {
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index 140e3843a..3a13ef128 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -97,6 +97,8 @@ struct SDParams {
     bool clip_on_cpu = false;
     bool vae_on_cpu = false;
     bool diffusion_flash_attn = false;
+    bool diffusion_conv_direct = false;
+    bool vae_conv_direct = false;
     bool canny_preprocess = false;
     bool color = false;
     int upscale_repeats = 1;
@@ -142,6 +144,8 @@ void print_params(SDParams params) {
     printf(" controlnet cpu: %s\n", params.control_net_cpu ? "true" : "false");
     printf(" vae decoder on cpu:%s\n", params.vae_on_cpu ? "true" : "false");
     printf(" diffusion flash attention:%s\n", params.diffusion_flash_attn ? "true" : "false");
+    printf(" diffusion Conv2D direct:%s\n", params.diffusion_conv_direct ? "true" : "false");
+    printf(" vae Conv2D direct:%s\n", params.vae_conv_direct ? "true" : "false");
     printf(" strength(control): %.2f\n", params.control_strength);
     printf(" prompt: %s\n", params.prompt.c_str());
     printf(" negative_prompt: %s\n", params.negative_prompt.c_str());
@@ -232,6 +236,8 @@ void print_usage(int argc, const char* argv[]) {
     printf("  --diffusion-fa                     use flash attention in the diffusion model (for low vram)\n");
     printf("                                     Might lower quality, since it implies converting k and v to f16.\n");
     printf("                                     This might crash if it is not supported by the backend.\n");
+    printf("  --diffusion-conv-direct            use Conv2D direct in the diffusion model\n");
+    printf("  --vae-conv-direct                  use Conv2D direct in the vae model (should improve performance)\n");
     printf("  --control-net-cpu                  keep controlnet in cpu (for low vram)\n");
     printf("  --canny                            apply canny preprocessor (edge detection)\n");
     printf("  --color                            colors the logging tags according to level\n");
@@ -422,6 +428,8 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         {"", "--clip-on-cpu", "", true, &params.clip_on_cpu},
         {"", "--vae-on-cpu", "", true, &params.vae_on_cpu},
         {"", "--diffusion-fa", "", true, &params.diffusion_flash_attn},
+        {"", "--diffusion-conv-direct", "", true, &params.diffusion_conv_direct},
+        {"", "--vae-conv-direct", "", true, &params.vae_conv_direct},
         {"", "--canny", "", true, &params.canny_preprocess},
         {"-v", "--verbos", "", true, &params.verbose},
         {"", "--color", "", true, &params.color},
@@ -901,6 +909,8 @@ int main(int argc, const char* argv[]) {
         params.control_net_cpu,
         params.vae_on_cpu,
         params.diffusion_flash_attn,
+        params.diffusion_conv_direct,
+        params.vae_conv_direct,
         params.chroma_use_dit_mask,
         params.chroma_use_t5_mask,
         params.chroma_t5_mask_pad,
diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index d5cfd00f8..d3d437be8 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -1514,14 +1514,10 @@ class Conv2d : public UnaryBlock {
         direct = true;
         #endif
         if (direct) {
-            #if defined(SD_USE_CONV2D_DIRECT)
-                #if defined(SD_USE_CUDA) || defined(SD_USE_SYCL) || defined(SD_USE_METAL)
-                    return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
-                #else
-                    return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
-                #endif
-            #else
+            #if defined(SD_USE_CUDA) || defined(SD_USE_SYCL) || defined(SD_USE_METAL)
                 return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
+            #else
+                return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
             #endif
         } else {
             return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 2594ba2b7..7baa74282 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -326,6 +326,12 @@ class StableDiffusionGGML {
             LOG_INFO("CLIP: Using CPU backend");
             clip_backend = ggml_backend_cpu_init();
         }
+        if (sd_ctx_params->diffusion_conv_direct) {
+            LOG_INFO("Using Conv2D direct in the diffusion model");
+        }
+        if (sd_ctx_params->vae_conv_direct) {
+            LOG_INFO("Using Conv2D direct in the vae model");
+        }
         if (sd_ctx_params->diffusion_flash_attn) {
             LOG_INFO("Using flash attention in the diffusion model");
         }
@@ -373,7 +379,8 @@ class StableDiffusionGGML {
             diffusion_model = std::make_shared<UNetModel>(backend,
                                                           model_loader.tensor_storages_types,
                                                           version,
sd_ctx_params->diffusion_flash_attn, + sd_ctx_params->diffusion_conv_direct); } cond_stage_model->alloc_params_buffer(); @@ -394,7 +401,8 @@ class StableDiffusionGGML { "first_stage_model", vae_decode_only, false, - version); + version, + sd_ctx_params->vae_conv_direct); first_stage_model->alloc_params_buffer(); first_stage_model->get_param_tensors(tensors, "first_stage_model"); } else { @@ -402,7 +410,8 @@ class StableDiffusionGGML { model_loader.tensor_storages_types, "decoder.layers", vae_decode_only, - version); + version, + sd_ctx_params->vae_conv_direct); } // first_stage_model->get_param_tensors(tensors, "first_stage_model."); diff --git a/stable-diffusion.h b/stable-diffusion.h index a60325923..fc68f9b13 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -134,6 +134,8 @@ typedef struct { bool keep_control_net_on_cpu; bool keep_vae_on_cpu; bool diffusion_flash_attn; + bool diffusion_conv_direct; + bool vae_conv_direct; bool chroma_use_dit_mask; bool chroma_use_t5_mask; int chroma_t5_mask_pad; diff --git a/tae.hpp b/tae.hpp index 678c44c57..54ec7d659 100644 --- a/tae.hpp +++ b/tae.hpp @@ -17,15 +17,16 @@ class TAEBlock : public UnaryBlock { protected: int n_in; int n_out; + bool direct = false; public: - TAEBlock(int n_in, int n_out) - : n_in(n_in), n_out(n_out) { - blocks["conv.0"] = std::shared_ptr(new Conv2d(n_in, n_out, {3, 3}, {1, 1}, {1, 1})); - blocks["conv.2"] = std::shared_ptr(new Conv2d(n_out, n_out, {3, 3}, {1, 1}, {1, 1})); - blocks["conv.4"] = std::shared_ptr(new Conv2d(n_out, n_out, {3, 3}, {1, 1}, {1, 1})); + TAEBlock(int n_in, int n_out, bool direct = false) + : n_in(n_in), n_out(n_out), direct(direct) { + blocks["conv.0"] = std::shared_ptr(new Conv2d(n_in, n_out, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["conv.2"] = std::shared_ptr(new Conv2d(n_out, n_out, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["conv.4"] = std::shared_ptr(new Conv2d(n_out, n_out, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); if (n_in != n_out) { - blocks["skip"] = std::shared_ptr(new Conv2d(n_in, n_out, {1, 1}, {1, 1}, {1, 1}, {1, 1}, false)); + blocks["skip"] = std::shared_ptr(new Conv2d(n_in, n_out, {1, 1}, {1, 1}, {1, 1}, {1, 1}, false, direct)); } } @@ -60,30 +61,32 @@ class TinyEncoder : public UnaryBlock { int channels = 64; int z_channels = 4; int num_blocks = 3; + bool direct = false; public: - TinyEncoder(int z_channels = 4) - : z_channels(z_channels) { + TinyEncoder(int z_channels = 4, bool direct = false) + : z_channels(z_channels), + direct(direct) { int index = 0; - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(in_channels, channels, {3, 3}, {1, 1}, {1, 1})); - blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(in_channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels, direct)); - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false, direct)); for (int i = 0; i < num_blocks; i++) { - blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels)); + blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels, direct)); } - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, 
{2, 2}, {1, 1}, {1, 1}, false)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false, direct)); for (int i = 0; i < num_blocks; i++) { - blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels)); + blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels, direct)); } - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false, direct)); for (int i = 0; i < num_blocks; i++) { - blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels)); + blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels, direct)); } - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, z_channels, {3, 3}, {1, 1}, {1, 1})); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, z_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { @@ -105,35 +108,37 @@ class TinyDecoder : public UnaryBlock { int channels = 64; int out_channels = 3; int num_blocks = 3; + bool direct = false; public: - TinyDecoder(int z_channels = 4) - : z_channels(z_channels) { + TinyDecoder(int z_channels = 4, bool direct = false) + : z_channels(z_channels), + direct(direct) { int index = 0; - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(z_channels, channels, {3, 3}, {1, 1}, {1, 1})); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(z_channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); index++; // nn.ReLU() for (int i = 0; i < num_blocks; i++) { - blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels)); + blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels, direct)); } index++; // nn.Upsample() - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false, direct)); for (int i = 0; i < num_blocks; i++) { - blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels)); + blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels, direct)); } index++; // nn.Upsample() - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false, direct)); for (int i = 0; i < num_blocks; i++) { - blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels)); + blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels, direct)); } index++; // nn.Upsample() - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false, direct)); - blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels)); - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1})); + blocks[std::to_string(index++)] = 
std::shared_ptr(new TAEBlock(channels, channels, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* z) { @@ -165,18 +170,20 @@ class TinyDecoder : public UnaryBlock { class TAESD : public GGMLBlock { protected: bool decode_only; + bool direct = false; public: - TAESD(bool decode_only = true, SDVersion version = VERSION_SD1) - : decode_only(decode_only) { + TAESD(bool decode_only = true, SDVersion version = VERSION_SD1, bool direct = false) + : decode_only(decode_only), + direct(direct) { int z_channels = 4; if (sd_version_is_dit(version)) { z_channels = 16; } - blocks["decoder.layers"] = std::shared_ptr(new TinyDecoder(z_channels)); + blocks["decoder.layers"] = std::shared_ptr(new TinyDecoder(z_channels, direct)); if (!decode_only) { - blocks["encoder.layers"] = std::shared_ptr(new TinyEncoder(z_channels)); + blocks["encoder.layers"] = std::shared_ptr(new TinyEncoder(z_channels, direct)); } } @@ -194,14 +201,17 @@ class TAESD : public GGMLBlock { struct TinyAutoEncoder : public GGMLRunner { TAESD taesd; bool decode_only = false; + bool direct = false; TinyAutoEncoder(ggml_backend_t backend, std::map& tensor_types, const std::string prefix, bool decoder_only = true, - SDVersion version = VERSION_SD1) + SDVersion version = VERSION_SD1, + bool direct = false) : decode_only(decoder_only), - taesd(decoder_only, version), + taesd(decoder_only, version, direct), + direct(direct), GGMLRunner(backend) { taesd.init(params_ctx, tensor_types, prefix); } @@ -258,4 +268,4 @@ struct TinyAutoEncoder : public GGMLRunner { } }; -#endif // __TAE_HPP__ \ No newline at end of file +#endif // __TAE_HPP__ diff --git a/unet.hpp b/unet.hpp index 9193dcd67..d5db40e8f 100644 --- a/unet.hpp +++ b/unet.hpp @@ -184,7 +184,7 @@ class UnetModelBlock : public GGMLBlock { int model_channels = 320; int adm_in_channels = 2816; // only for VERSION_SDXL/SVD - UnetModelBlock(SDVersion version = VERSION_SD1, std::map& tensor_types = empty_tensor_types, bool flash_attn = false) + UnetModelBlock(SDVersion version = VERSION_SD1, std::map& tensor_types = empty_tensor_types, bool flash_attn = false, bool direct = false) : version(version) { if (sd_version_is_sd2(version)) { context_dim = 1024; @@ -225,7 +225,7 @@ class UnetModelBlock : public GGMLBlock { } // input_blocks - blocks["input_blocks.0.0"] = std::shared_ptr(new Conv2d(in_channels, model_channels, {3, 3}, {1, 1}, {1, 1})); + blocks["input_blocks.0.0"] = std::shared_ptr(new Conv2d(in_channels, model_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); std::vector input_block_chans; input_block_chans.push_back(model_channels); @@ -237,7 +237,7 @@ class UnetModelBlock : public GGMLBlock { if (version == VERSION_SVD) { return new VideoResBlock(channels, emb_channels, out_channels); } else { - return new ResBlock(channels, emb_channels, out_channels); + return new ResBlock(channels, emb_channels, out_channels, {3, 3}); } }; @@ -249,7 +249,7 @@ class UnetModelBlock : public GGMLBlock { if (version == VERSION_SVD) { return new SpatialVideoTransformer(in_channels, n_head, d_head, depth, context_dim); } else { - return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, flash_attn); + return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, flash_attn, direct); } }; @@ -281,7 +281,7 @@ class UnetModelBlock : public GGMLBlock { if (i != len_mults - 1) { input_block_idx 
+= 1; std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0"; - blocks[name] = std::shared_ptr(new DownSampleBlock(ch, ch)); + blocks[name] = std::shared_ptr(new DownSampleBlock(ch, ch, false, direct)); input_block_chans.push_back(ch); ds *= 2; @@ -331,7 +331,7 @@ class UnetModelBlock : public GGMLBlock { if (i > 0 && j == num_res_blocks) { std::string name = "output_blocks." + std::to_string(output_block_idx) + "." + std::to_string(up_sample_idx); - blocks[name] = std::shared_ptr(new UpSampleBlock(ch, ch)); + blocks[name] = std::shared_ptr(new UpSampleBlock(ch, ch, direct)); ds /= 2; } @@ -343,7 +343,7 @@ class UnetModelBlock : public GGMLBlock { // out blocks["out.0"] = std::shared_ptr(new GroupNorm32(ch)); // ch == model_channels // out_1 is nn.SiLU() - blocks["out.2"] = std::shared_ptr(new Conv2d(model_channels, out_channels, {3, 3}, {1, 1}, {1, 1})); + blocks["out.2"] = std::shared_ptr(new Conv2d(model_channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); } struct ggml_tensor* resblock_forward(std::string name, @@ -542,8 +542,9 @@ struct UNetModelRunner : public GGMLRunner { std::map& tensor_types, const std::string prefix, SDVersion version = VERSION_SD1, - bool flash_attn = false) - : GGMLRunner(backend), unet(version, tensor_types, flash_attn) { + bool flash_attn = false, + bool direct = false) + : GGMLRunner(backend), unet(version, tensor_types, flash_attn, direct) { unet.init(params_ctx, tensor_types, prefix); } diff --git a/vae.hpp b/vae.hpp index 42e64a95e..07afe28c0 100644 --- a/vae.hpp +++ b/vae.hpp @@ -12,21 +12,24 @@ class ResnetBlock : public UnaryBlock { protected: int64_t in_channels; int64_t out_channels; + bool direct = false; public: ResnetBlock(int64_t in_channels, - int64_t out_channels) + int64_t out_channels, + bool direct = false) : in_channels(in_channels), - out_channels(out_channels) { + out_channels(out_channels), + direct(direct){ // temb_channels is always 0 blocks["norm1"] = std::shared_ptr(new GroupNorm32(in_channels)); - blocks["conv1"] = std::shared_ptr(new Conv2d(in_channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, true)); + blocks["conv1"] = std::shared_ptr(new Conv2d(in_channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); blocks["norm2"] = std::shared_ptr(new GroupNorm32(out_channels)); - blocks["conv2"] = std::shared_ptr(new Conv2d(out_channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, true)); + blocks["conv2"] = std::shared_ptr(new Conv2d(out_channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); if (out_channels != in_channels) { - blocks["nin_shortcut"] = std::shared_ptr(new Conv2d(in_channels, out_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, true)); + blocks["nin_shortcut"] = std::shared_ptr(new Conv2d(in_channels, out_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct)); } } @@ -64,16 +67,19 @@ class ResnetBlock : public UnaryBlock { class AttnBlock : public UnaryBlock { protected: int64_t in_channels; + bool direct = false; public: - AttnBlock(int64_t in_channels) - : in_channels(in_channels) { + AttnBlock(int64_t in_channels, + bool direct = false) + : in_channels(in_channels), + direct(direct){ blocks["norm"] = std::shared_ptr(new GroupNorm32(in_channels)); - blocks["q"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, true)); - blocks["k"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, true)); - blocks["v"] = std::shared_ptr(new Conv2d(in_channels, 
in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, true)); + blocks["q"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct)); + blocks["k"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct)); + blocks["v"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct)); - blocks["proj_out"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, true)); + blocks["proj_out"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct)); } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { @@ -177,8 +183,9 @@ class VideoResnetBlock : public ResnetBlock { public: VideoResnetBlock(int64_t in_channels, int64_t out_channels, - int video_kernel_size = 3) - : ResnetBlock(in_channels, out_channels) { + int video_kernel_size = 3, + bool direct = false) + : ResnetBlock(in_channels, out_channels, direct) { // merge_strategy is always learned blocks["time_stack"] = std::shared_ptr(new ResBlock(out_channels, 0, out_channels, {video_kernel_size, 1}, 3, false, true)); } @@ -227,6 +234,7 @@ class Encoder : public GGMLBlock { int in_channels = 3; int z_channels = 4; bool double_z = true; + bool direct = false; public: Encoder(int ch, @@ -234,14 +242,16 @@ class Encoder : public GGMLBlock { int num_res_blocks, int in_channels, int z_channels, - bool double_z = true) + bool double_z = true, + bool direct = false) : ch(ch), ch_mult(ch_mult), num_res_blocks(num_res_blocks), in_channels(in_channels), z_channels(z_channels), - double_z(double_z) { - blocks["conv_in"] = std::shared_ptr(new Conv2d(in_channels, ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, true)); + double_z(double_z), + direct(direct){ + blocks["conv_in"] = std::shared_ptr(new Conv2d(in_channels, ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); size_t num_resolutions = ch_mult.size(); @@ -255,21 +265,21 @@ class Encoder : public GGMLBlock { int block_out = ch * ch_mult[i]; for (int j = 0; j < num_res_blocks; j++) { std::string name = "down." + std::to_string(i) + ".block." + std::to_string(j); - blocks[name] = std::shared_ptr(new ResnetBlock(block_in, block_out)); + blocks[name] = std::shared_ptr(new ResnetBlock(block_in, block_out, direct)); block_in = block_out; } if (i != num_resolutions - 1) { std::string name = "down." + std::to_string(i) + ".downsample"; - blocks[name] = std::shared_ptr(new DownSampleBlock(block_in, block_in, true)); + blocks[name] = std::shared_ptr(new DownSampleBlock(block_in, block_in, true, direct)); } } - blocks["mid.block_1"] = std::shared_ptr(new ResnetBlock(block_in, block_in)); - blocks["mid.attn_1"] = std::shared_ptr(new AttnBlock(block_in)); - blocks["mid.block_2"] = std::shared_ptr(new ResnetBlock(block_in, block_in)); + blocks["mid.block_1"] = std::shared_ptr(new ResnetBlock(block_in, block_in, direct)); + blocks["mid.attn_1"] = std::shared_ptr(new AttnBlock(block_in, direct)); + blocks["mid.block_2"] = std::shared_ptr(new ResnetBlock(block_in, block_in, direct)); blocks["norm_out"] = std::shared_ptr(new GroupNorm32(block_in)); - blocks["conv_out"] = std::shared_ptr(new Conv2d(block_in, double_z ? z_channels * 2 : z_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, true)); + blocks["conv_out"] = std::shared_ptr(new Conv2d(block_in, double_z ? 
z_channels * 2 : z_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); } virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { @@ -324,6 +334,7 @@ class Decoder : public GGMLBlock { int z_channels = 4; bool video_decoder = false; int video_kernel_size = 3; + bool direct = false; virtual std::shared_ptr get_conv_out(int64_t in_channels, int64_t out_channels, @@ -343,9 +354,9 @@ class Decoder : public GGMLBlock { virtual std::shared_ptr get_resnet_block(int64_t in_channels, int64_t out_channels) { if (video_decoder) { - return std::shared_ptr(new VideoResnetBlock(in_channels, out_channels, video_kernel_size)); + return std::shared_ptr(new VideoResnetBlock(in_channels, out_channels, video_kernel_size, direct)); } else { - return std::shared_ptr(new ResnetBlock(in_channels, out_channels)); + return std::shared_ptr(new ResnetBlock(in_channels, out_channels, direct)); } } @@ -356,21 +367,23 @@ class Decoder : public GGMLBlock { int num_res_blocks, int z_channels, bool video_decoder = false, - int video_kernel_size = 3) + int video_kernel_size = 3, + bool direct = false) : ch(ch), out_ch(out_ch), ch_mult(ch_mult), num_res_blocks(num_res_blocks), z_channels(z_channels), video_decoder(video_decoder), - video_kernel_size(video_kernel_size) { + video_kernel_size(video_kernel_size), + direct(direct) { size_t num_resolutions = ch_mult.size(); int block_in = ch * ch_mult[num_resolutions - 1]; - blocks["conv_in"] = std::shared_ptr(new Conv2d(z_channels, block_in, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, true)); + blocks["conv_in"] = std::shared_ptr(new Conv2d(z_channels, block_in, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); blocks["mid.block_1"] = get_resnet_block(block_in, block_in); - blocks["mid.attn_1"] = std::shared_ptr(new AttnBlock(block_in)); + blocks["mid.attn_1"] = std::shared_ptr(new AttnBlock(block_in, direct)); blocks["mid.block_2"] = get_resnet_block(block_in, block_in); for (int i = num_resolutions - 1; i >= 0; i--) { @@ -384,12 +397,12 @@ class Decoder : public GGMLBlock { } if (i != 0) { std::string name = "up." 
+ std::to_string(i) + ".upsample"; - blocks[name] = std::shared_ptr(new UpSampleBlock(block_in, block_in)); + blocks[name] = std::shared_ptr(new UpSampleBlock(block_in, block_in, direct)); } } blocks["norm_out"] = std::shared_ptr(new GroupNorm32(block_in)); - blocks["conv_out"] = get_conv_out(block_in, out_ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, true); + blocks["conv_out"] = get_conv_out(block_in, out_ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct); } virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* z) { @@ -446,6 +459,7 @@ class AutoencodingEngine : public GGMLBlock { bool use_video_decoder = false; bool use_quant = true; int embed_dim = 4; + bool direct = false; struct { int z_channels = 4; int resolution = 256; @@ -460,8 +474,9 @@ class AutoencodingEngine : public GGMLBlock { public: AutoencodingEngine(bool decode_only = true, bool use_video_decoder = false, - SDVersion version = VERSION_SD1) - : decode_only(decode_only), use_video_decoder(use_video_decoder) { + SDVersion version = VERSION_SD1, + bool direct = false) + : decode_only(decode_only), use_video_decoder(use_video_decoder), direct(direct) { if (sd_version_is_dit(version)) { dd_config.z_channels = 16; use_quant = false; @@ -474,7 +489,9 @@ class AutoencodingEngine : public GGMLBlock { dd_config.ch_mult, dd_config.num_res_blocks, dd_config.z_channels, - use_video_decoder)); + use_video_decoder, + 3, + direct)); if (use_quant) { blocks["post_quant_conv"] = std::shared_ptr(new Conv2d(dd_config.z_channels, embed_dim, @@ -483,7 +500,7 @@ class AutoencodingEngine : public GGMLBlock { {0, 0}, {1, 1}, true, - true)); + direct)); } if (!decode_only) { blocks["encoder"] = std::shared_ptr(new Encoder(dd_config.ch, @@ -491,7 +508,8 @@ class AutoencodingEngine : public GGMLBlock { dd_config.num_res_blocks, dd_config.in_channels, dd_config.z_channels, - dd_config.double_z)); + dd_config.double_z, + direct)); if (use_quant) { int factor = dd_config.double_z ? 
2 : 1; @@ -502,7 +520,7 @@ class AutoencodingEngine : public GGMLBlock { {0, 0}, {1, 1}, true, - true)); + direct)); } } } @@ -543,8 +561,9 @@ struct AutoEncoderKL : public GGMLRunner { const std::string prefix, bool decode_only = false, bool use_video_decoder = false, - SDVersion version = VERSION_SD1) - : decode_only(decode_only), ae(decode_only, use_video_decoder, version), GGMLRunner(backend) { + SDVersion version = VERSION_SD1, + bool direct = false) + : decode_only(decode_only), ae(decode_only, use_video_decoder, version, direct), GGMLRunner(backend) { ae.init(params_ctx, tensor_types, prefix); } From 70cef96a7654afb9efe2535cbd1af5f94df789e4 Mon Sep 17 00:00:00 2001 From: Daniele <57776841+daniandtheweb@users.noreply.github.com> Date: Wed, 30 Jul 2025 20:22:01 +0200 Subject: [PATCH 06/12] fix merge typo --- diffusion_model.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/diffusion_model.hpp b/diffusion_model.hpp index 33175b9f1..de65c1c36 100644 --- a/diffusion_model.hpp +++ b/diffusion_model.hpp @@ -34,7 +34,8 @@ struct UNetModel : public DiffusionModel { UNetModel(ggml_backend_t backend, const String2GGMLType& tensor_types = {}, SDVersion version = VERSION_SD1, - bool flash_attn = false) + bool flash_attn = false, + bool direct = false) : unet(backend, tensor_types, "model.diffusion_model", version, flash_attn, direct) { } From 9bbf53c3c67c1840d813e8c4b907f0e11f59c8f5 Mon Sep 17 00:00:00 2001 From: Daniele <57776841+daniandtheweb@users.noreply.github.com> Date: Thu, 31 Jul 2025 11:24:59 +0200 Subject: [PATCH 07/12] Align conv2d behavior to flash attention's --- examples/cli/main.cpp | 2 ++ ggml_extend.hpp | 9 +-------- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 3a13ef128..f8fedcb69 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -237,7 +237,9 @@ void print_usage(int argc, const char* argv[]) { printf(" Might lower quality, since it implies converting k and v to f16.\n"); printf(" This might crash if it is not supported by the backend.\n"); printf(" --diffusion-conv-direct use Conv2D direct in the diffusion model"); + printf(" This might crash if it is not supported by the backend.\n"); printf(" --vae-conv-direct use Conv2D direct in the vae model (should improve the performance)"); + printf(" This might crash if it is not supported by the backend.\n"); printf(" --control-net-cpu keep controlnet in cpu (for low vram)\n"); printf(" --canny apply canny preprocessor (edge detection)\n"); printf(" --color colors the logging tags according to level\n"); diff --git a/ggml_extend.hpp b/ggml_extend.hpp index ab4926ee1..de7b2bfa1 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -1518,15 +1518,8 @@ class Conv2d : public UnaryBlock { if (bias) { b = params["bias"]; } - #if defined(SD_USE_OPENCL) - direct = true - #endif if (direct) { - #if defined(SD_USE_CUDA) || defined(SD_USE_SYCL) || defined(SD_USE_METAL) - return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first); - #else - return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first); - #endif + return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first); } else { return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first); } From 
9a349b2f73969e36b48005887b59041ac89cba2c Mon Sep 17 00:00:00 2001 From: Daniele <57776841+daniandtheweb@users.noreply.github.com> Date: Thu, 31 Jul 2025 12:14:56 +0200 Subject: [PATCH 08/12] fix readme --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 3513b9f42..e07142c00 100644 --- a/README.md +++ b/README.md @@ -339,9 +339,11 @@ arguments: --vae-on-cpu keep vae in cpu (for low vram) --clip-on-cpu keep clip in cpu (for low vram) --diffusion-fa use flash attention in the diffusion model (for low vram) + Might lower quality, since it implies converting k and v to f16. + This might crash if it is not supported by the backend. --diffusion-conv-direct use Conv2D direct in the diffusion model + This might crash if it is not supported by the backend. --vae-conv-direct use Conv2D direct in the vae model (should improve the performance) - Might lower quality, since it implies converting k and v to f16. This might crash if it is not supported by the backend. --control-net-cpu keep controlnet in cpu (for low vram) --canny apply canny preprocessor (edge detection) From 8974ec134faa00cb87a1a13c97869f1d8849becd Mon Sep 17 00:00:00 2001 From: Daniele <57776841+daniandtheweb@users.noreply.github.com> Date: Thu, 31 Jul 2025 12:46:51 +0200 Subject: [PATCH 09/12] add conv2d direct for controlnet --- control.hpp | 34 +++++++++++++++++++--------------- stable-diffusion.cpp | 2 +- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/control.hpp b/control.hpp index d8f81fc0d..af28fcb33 100644 --- a/control.hpp +++ b/control.hpp @@ -27,13 +27,16 @@ class ControlNetBlock : public GGMLBlock { int num_heads = 8; int num_head_channels = -1; // channels // num_heads int context_dim = 768; // 1024 for VERSION_SD2, 2048 for VERSION_SDXL + bool direct = false; public: int model_channels = 320; int adm_in_channels = 2816; // only for VERSION_SDXL - ControlNetBlock(SDVersion version = VERSION_SD1) - : version(version) { + ControlNetBlock(SDVersion version = VERSION_SD1, + bool direct = false) + : version(version), + direct(direct) { if (sd_version_is_sd2(version)) { context_dim = 1024; num_head_channels = 64; @@ -65,7 +68,7 @@ class ControlNetBlock : public GGMLBlock { } // input_blocks - blocks["input_blocks.0.0"] = std::shared_ptr(new Conv2d(in_channels, model_channels, {3, 3}, {1, 1}, {1, 1})); + blocks["input_blocks.0.0"] = std::shared_ptr(new Conv2d(in_channels, model_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); std::vector input_block_chans; input_block_chans.push_back(model_channels); @@ -86,26 +89,26 @@ class ControlNetBlock : public GGMLBlock { }; auto make_zero_conv = [&](int64_t channels) { - return new Conv2d(channels, channels, {1, 1}); + return new Conv2d(channels, channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct); }; blocks["zero_convs.0.0"] = std::shared_ptr(make_zero_conv(model_channels)); - blocks["input_hint_block.0"] = std::shared_ptr(new Conv2d(hint_channels, 16, {3, 3}, {1, 1}, {1, 1})); + blocks["input_hint_block.0"] = std::shared_ptr(new Conv2d(hint_channels, 16, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); // nn.SiLU() - blocks["input_hint_block.2"] = std::shared_ptr(new Conv2d(16, 16, {3, 3}, {1, 1}, {1, 1})); + blocks["input_hint_block.2"] = std::shared_ptr(new Conv2d(16, 16, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); // nn.SiLU() - blocks["input_hint_block.4"] = std::shared_ptr(new Conv2d(16, 32, {3, 3}, {2, 2}, {1, 1})); + blocks["input_hint_block.4"] = std::shared_ptr(new Conv2d(16, 32, {3, 3}, 
{2, 2}, {1, 1}, {1, 1}, true, direct)); // nn.SiLU() - blocks["input_hint_block.6"] = std::shared_ptr(new Conv2d(32, 32, {3, 3}, {1, 1}, {1, 1})); + blocks["input_hint_block.6"] = std::shared_ptr(new Conv2d(32, 32, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); // nn.SiLU() - blocks["input_hint_block.8"] = std::shared_ptr(new Conv2d(32, 96, {3, 3}, {2, 2}, {1, 1})); + blocks["input_hint_block.8"] = std::shared_ptr(new Conv2d(32, 96, {3, 3}, {2, 2}, {1, 1}, {1, 1}, true, direct)); // nn.SiLU() - blocks["input_hint_block.10"] = std::shared_ptr(new Conv2d(96, 96, {3, 3}, {1, 1}, {1, 1})); + blocks["input_hint_block.10"] = std::shared_ptr(new Conv2d(96, 96, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); // nn.SiLU() - blocks["input_hint_block.12"] = std::shared_ptr(new Conv2d(96, 256, {3, 3}, {2, 2}, {1, 1})); + blocks["input_hint_block.12"] = std::shared_ptr(new Conv2d(96, 256, {3, 3}, {2, 2}, {1, 1}, {1, 1}, true, direct)); // nn.SiLU() - blocks["input_hint_block.14"] = std::shared_ptr(new Conv2d(256, model_channels, {3, 3}, {1, 1}, {1, 1})); + blocks["input_hint_block.14"] = std::shared_ptr(new Conv2d(256, model_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); size_t len_mults = channel_mult.size(); for (int i = 0; i < len_mults; i++) { @@ -318,8 +321,9 @@ struct ControlNet : public GGMLRunner { ControlNet(ggml_backend_t backend, const String2GGMLType& tensor_types = {}, - SDVersion version = VERSION_SD1) - : GGMLRunner(backend), control_net(version) { + SDVersion version = VERSION_SD1, + bool direct = false) + : GGMLRunner(backend), control_net(version, direct) { control_net.init(params_ctx, tensor_types, ""); } @@ -455,4 +459,4 @@ struct ControlNet : public GGMLRunner { } }; -#endif // __CONTROL_HPP__ \ No newline at end of file +#endif // __CONTROL_HPP__ diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 7baa74282..245ffea0e 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -423,7 +423,7 @@ class StableDiffusionGGML { } else { controlnet_backend = backend; } - control_net = std::make_shared(controlnet_backend, model_loader.tensor_storages_types, version); + control_net = std::make_shared(controlnet_backend, model_loader.tensor_storages_types, version, sd_ctx_params->diffusion_conv_direct); } if (strstr(SAFE_STR(sd_ctx_params->stacked_id_embed_dir), "v2")) { From 9b6339c915b62a27653b52492e198357636afc7e Mon Sep 17 00:00:00 2001 From: Daniele <57776841+daniandtheweb@users.noreply.github.com> Date: Thu, 31 Jul 2025 13:09:55 +0200 Subject: [PATCH 10/12] add conv2d direct for esrgan --- esrgan.hpp | 44 ++++++++++++++++++++++--------------------- examples/cli/main.cpp | 3 ++- stable-diffusion.h | 3 ++- upscaler.cpp | 14 +++++++++----- 4 files changed, 36 insertions(+), 28 deletions(-) diff --git a/esrgan.hpp b/esrgan.hpp index 4215db192..96997ac1e 100644 --- a/esrgan.hpp +++ b/esrgan.hpp @@ -16,15 +16,16 @@ class ResidualDenseBlock : public GGMLBlock { protected: int num_feat; int num_grow_ch; + bool direct = false; public: - ResidualDenseBlock(int num_feat = 64, int num_grow_ch = 32) - : num_feat(num_feat), num_grow_ch(num_grow_ch) { - blocks["conv1"] = std::shared_ptr(new Conv2d(num_feat, num_grow_ch, {3, 3}, {1, 1}, {1, 1})); - blocks["conv2"] = std::shared_ptr(new Conv2d(num_feat + num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1})); - blocks["conv3"] = std::shared_ptr(new Conv2d(num_feat + 2 * num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1})); - blocks["conv4"] = std::shared_ptr(new Conv2d(num_feat + 3 * num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 
1})); - blocks["conv5"] = std::shared_ptr(new Conv2d(num_feat + 4 * num_grow_ch, num_feat, {3, 3}, {1, 1}, {1, 1})); + ResidualDenseBlock(int num_feat = 64, int num_grow_ch = 32, bool direct = false) + : num_feat(num_feat), num_grow_ch(num_grow_ch), direct(direct) { + blocks["conv1"] = std::shared_ptr(new Conv2d(num_feat, num_grow_ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["conv2"] = std::shared_ptr(new Conv2d(num_feat + num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["conv3"] = std::shared_ptr(new Conv2d(num_feat + 2 * num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["conv4"] = std::shared_ptr(new Conv2d(num_feat + 3 * num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["conv5"] = std::shared_ptr(new Conv2d(num_feat + 4 * num_grow_ch, num_feat, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); } struct ggml_tensor* lrelu(struct ggml_context* ctx, struct ggml_tensor* x) { @@ -58,10 +59,10 @@ class ResidualDenseBlock : public GGMLBlock { class RRDB : public GGMLBlock { public: - RRDB(int num_feat, int num_grow_ch = 32) { - blocks["rdb1"] = std::shared_ptr(new ResidualDenseBlock(num_feat, num_grow_ch)); - blocks["rdb2"] = std::shared_ptr(new ResidualDenseBlock(num_feat, num_grow_ch)); - blocks["rdb3"] = std::shared_ptr(new ResidualDenseBlock(num_feat, num_grow_ch)); + RRDB(int num_feat, int num_grow_ch = 32, bool direct = false) { + blocks["rdb1"] = std::shared_ptr(new ResidualDenseBlock(num_feat, num_grow_ch, direct)); + blocks["rdb2"] = std::shared_ptr(new ResidualDenseBlock(num_feat, num_grow_ch, direct)); + blocks["rdb3"] = std::shared_ptr(new ResidualDenseBlock(num_feat, num_grow_ch, direct)); } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { @@ -89,20 +90,21 @@ class RRDBNet : public GGMLBlock { int num_out_ch = 3; int num_feat = 64; // default RealESRGAN_x4plus_anime_6B int num_grow_ch = 32; // default RealESRGAN_x4plus_anime_6B + bool direct = false; public: - RRDBNet() { - blocks["conv_first"] = std::shared_ptr(new Conv2d(num_in_ch, num_feat, {3, 3}, {1, 1}, {1, 1})); + RRDBNet(bool direct = false) { + blocks["conv_first"] = std::shared_ptr(new Conv2d(num_in_ch, num_feat, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); for (int i = 0; i < num_block; i++) { std::string name = "body." 
+ std::to_string(i); blocks[name] = std::shared_ptr(new RRDB(num_feat, num_grow_ch)); } - blocks["conv_body"] = std::shared_ptr(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1})); + blocks["conv_body"] = std::shared_ptr(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); // upsample - blocks["conv_up1"] = std::shared_ptr(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1})); - blocks["conv_up2"] = std::shared_ptr(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1})); - blocks["conv_hr"] = std::shared_ptr(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1})); - blocks["conv_last"] = std::shared_ptr(new Conv2d(num_feat, num_out_ch, {3, 3}, {1, 1}, {1, 1})); + blocks["conv_up1"] = std::shared_ptr(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["conv_up2"] = std::shared_ptr(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["conv_hr"] = std::shared_ptr(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["conv_last"] = std::shared_ptr(new Conv2d(num_feat, num_out_ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); } struct ggml_tensor* lrelu(struct ggml_context* ctx, struct ggml_tensor* x) { @@ -142,8 +144,8 @@ struct ESRGAN : public GGMLRunner { int scale = 4; int tile_size = 128; // avoid cuda OOM for 4gb VRAM - ESRGAN(ggml_backend_t backend, const String2GGMLType& tensor_types = {}) - : GGMLRunner(backend) { + ESRGAN(ggml_backend_t backend, const String2GGMLType& tensor_types = {}, bool direct = false) + : GGMLRunner(backend), rrdb_net(direct) { rrdb_net.init(params_ctx, tensor_types, ""); } @@ -194,4 +196,4 @@ struct ESRGAN : public GGMLRunner { } }; -#endif // __ESRGAN_HPP__ \ No newline at end of file +#endif // __ESRGAN_HPP__ diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index f8fedcb69..98aadb044 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -1024,7 +1024,8 @@ int main(int argc, const char* argv[]) { int upscale_factor = 4; // unused for RealESRGAN_x4plus_anime_6B.pth if (params.esrgan_path.size() > 0 && params.upscale_repeats > 0) { upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(), - params.n_threads); + params.n_threads, + params.diffusion_conv_direct); if (upscaler_ctx == NULL) { printf("new_upscaler_ctx failed\n"); diff --git a/stable-diffusion.h b/stable-diffusion.h index fc68f9b13..e87ac2ce2 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -238,7 +238,8 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s typedef struct upscaler_ctx_t upscaler_ctx_t; SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path, - int n_threads); + int n_threads, + bool direct); SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx); SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor); diff --git a/upscaler.cpp b/upscaler.cpp index 137213496..69d5ef392 100644 --- a/upscaler.cpp +++ b/upscaler.cpp @@ -9,9 +9,12 @@ struct UpscalerGGML { std::shared_ptr esrgan_upscaler; std::string esrgan_path; int n_threads; + bool direct = false; - UpscalerGGML(int n_threads) - : n_threads(n_threads) { + UpscalerGGML(int n_threads, + bool direct = false) + : n_threads(n_threads), + direct(direct) { } bool load_from_file(const std::string& esrgan_path) { @@ -46,7 +49,7 @@ struct UpscalerGGML { backend = ggml_backend_cpu_init(); } LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type)); - 
esrgan_upscaler = std::make_shared(backend, model_loader.tensor_storages_types); + esrgan_upscaler = std::make_shared(backend, model_loader.tensor_storages_types, direct); if (!esrgan_upscaler->load_from_file(esrgan_path)) { return false; } @@ -104,14 +107,15 @@ struct upscaler_ctx_t { }; upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str, - int n_threads) { + int n_threads, + bool direct = false) { upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t)); if (upscaler_ctx == NULL) { return NULL; } std::string esrgan_path(esrgan_path_c_str); - upscaler_ctx->upscaler = new UpscalerGGML(n_threads); + upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct); if (upscaler_ctx->upscaler == NULL) { return NULL; } From 2e85d2c4cfd8a8c50c7d6344f1a67e707ffa6cb8 Mon Sep 17 00:00:00 2001 From: leejet Date: Sat, 2 Aug 2025 13:47:41 +0800 Subject: [PATCH 11/12] clean code, use enable_conv2d_direct/get_all_blocks --- README.md | 4 +- common.hpp | 27 ++++----- control.hpp | 45 ++++++++------- diffusion_model.hpp | 5 +- esrgan.hpp | 55 ++++++++++-------- examples/cli/main.cpp | 8 +-- ggml_extend.hpp | 29 ++++++++-- stable-diffusion.cpp | 33 ++++++----- tae.hpp | 91 +++++++++++++++--------------- unet.hpp | 31 ++++++---- upscaler.cpp | 5 +- vae.hpp | 128 +++++++++++++++++------------------------- 12 files changed, 244 insertions(+), 217 deletions(-) diff --git a/README.md b/README.md index e07142c00..5a28052f6 100644 --- a/README.md +++ b/README.md @@ -341,9 +341,9 @@ arguments: --diffusion-fa use flash attention in the diffusion model (for low vram) Might lower quality, since it implies converting k and v to f16. This might crash if it is not supported by the backend. - --diffusion-conv-direct use Conv2D direct in the diffusion model + --diffusion-conv-direct use Conv2d direct in the diffusion model This might crash if it is not supported by the backend. - --vae-conv-direct use Conv2D direct in the vae model (should improve the performance) + --vae-conv-direct use Conv2d direct in the vae model (should improve the performance) This might crash if it is not supported by the backend. 
--control-net-cpu keep controlnet in cpu (for low vram) --canny apply canny preprocessor (edge detection) diff --git a/common.hpp b/common.hpp index b92d31722..3a1307767 100644 --- a/common.hpp +++ b/common.hpp @@ -8,21 +8,18 @@ class DownSampleBlock : public GGMLBlock { int channels; int out_channels; bool vae_downsample; - bool direct = false; public: DownSampleBlock(int channels, int out_channels, - bool vae_downsample = false, - bool direct = false) + bool vae_downsample = false) : channels(channels), out_channels(out_channels), - vae_downsample(vae_downsample), - direct(direct) { + vae_downsample(vae_downsample) { if (vae_downsample) { - blocks["conv"] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {0, 0}, {1, 1}, true, direct)); + blocks["conv"] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {0, 0})); } else { - blocks["op"] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, true, direct)); + blocks["op"] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {1, 1})); } } @@ -46,16 +43,13 @@ class UpSampleBlock : public GGMLBlock { protected: int channels; int out_channels; - bool direct = false; public: UpSampleBlock(int channels, - int out_channels, - bool direct = false) + int out_channels) : channels(channels), - out_channels(out_channels), - direct(direct) { - blocks["conv"] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + out_channels(out_channels) { + blocks["conv"] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1})); } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { @@ -387,8 +381,7 @@ class SpatialTransformer : public GGMLBlock { int64_t d_head, int64_t depth, int64_t context_dim, - bool flash_attn = false, - bool direct = false) + bool flash_attn = false) : in_channels(in_channels), n_head(n_head), d_head(d_head), @@ -398,14 +391,14 @@ class SpatialTransformer : public GGMLBlock { // disable_self_attn is always False int64_t inner_dim = n_head * d_head; // in_channels blocks["norm"] = std::shared_ptr(new GroupNorm32(in_channels)); - blocks["proj_in"] = std::shared_ptr(new Conv2d(in_channels, inner_dim, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct)); + blocks["proj_in"] = std::shared_ptr(new Conv2d(in_channels, inner_dim, {1, 1})); for (int i = 0; i < depth; i++) { std::string name = "transformer_blocks." 
+ std::to_string(i); blocks[name] = std::shared_ptr(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim, false, flash_attn)); } - blocks["proj_out"] = std::shared_ptr(new Conv2d(inner_dim, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct)); + blocks["proj_out"] = std::shared_ptr(new Conv2d(inner_dim, in_channels, {1, 1})); } virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) { diff --git a/control.hpp b/control.hpp index af28fcb33..63fe70455 100644 --- a/control.hpp +++ b/control.hpp @@ -27,16 +27,13 @@ class ControlNetBlock : public GGMLBlock { int num_heads = 8; int num_head_channels = -1; // channels // num_heads int context_dim = 768; // 1024 for VERSION_SD2, 2048 for VERSION_SDXL - bool direct = false; public: int model_channels = 320; int adm_in_channels = 2816; // only for VERSION_SDXL - ControlNetBlock(SDVersion version = VERSION_SD1, - bool direct = false) - : version(version), - direct(direct) { + ControlNetBlock(SDVersion version = VERSION_SD1) + : version(version) { if (sd_version_is_sd2(version)) { context_dim = 1024; num_head_channels = 64; @@ -68,7 +65,7 @@ class ControlNetBlock : public GGMLBlock { } // input_blocks - blocks["input_blocks.0.0"] = std::shared_ptr(new Conv2d(in_channels, model_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["input_blocks.0.0"] = std::shared_ptr(new Conv2d(in_channels, model_channels, {3, 3}, {1, 1}, {1, 1})); std::vector input_block_chans; input_block_chans.push_back(model_channels); @@ -89,26 +86,26 @@ class ControlNetBlock : public GGMLBlock { }; auto make_zero_conv = [&](int64_t channels) { - return new Conv2d(channels, channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct); + return new Conv2d(channels, channels, {1, 1}); }; blocks["zero_convs.0.0"] = std::shared_ptr(make_zero_conv(model_channels)); - blocks["input_hint_block.0"] = std::shared_ptr(new Conv2d(hint_channels, 16, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["input_hint_block.0"] = std::shared_ptr(new Conv2d(hint_channels, 16, {3, 3}, {1, 1}, {1, 1})); // nn.SiLU() - blocks["input_hint_block.2"] = std::shared_ptr(new Conv2d(16, 16, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["input_hint_block.2"] = std::shared_ptr(new Conv2d(16, 16, {3, 3}, {1, 1}, {1, 1})); // nn.SiLU() - blocks["input_hint_block.4"] = std::shared_ptr(new Conv2d(16, 32, {3, 3}, {2, 2}, {1, 1}, {1, 1}, true, direct)); + blocks["input_hint_block.4"] = std::shared_ptr(new Conv2d(16, 32, {3, 3}, {2, 2}, {1, 1})); // nn.SiLU() - blocks["input_hint_block.6"] = std::shared_ptr(new Conv2d(32, 32, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["input_hint_block.6"] = std::shared_ptr(new Conv2d(32, 32, {3, 3}, {1, 1}, {1, 1})); // nn.SiLU() - blocks["input_hint_block.8"] = std::shared_ptr(new Conv2d(32, 96, {3, 3}, {2, 2}, {1, 1}, {1, 1}, true, direct)); + blocks["input_hint_block.8"] = std::shared_ptr(new Conv2d(32, 96, {3, 3}, {2, 2}, {1, 1})); // nn.SiLU() - blocks["input_hint_block.10"] = std::shared_ptr(new Conv2d(96, 96, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["input_hint_block.10"] = std::shared_ptr(new Conv2d(96, 96, {3, 3}, {1, 1}, {1, 1})); // nn.SiLU() - blocks["input_hint_block.12"] = std::shared_ptr(new Conv2d(96, 256, {3, 3}, {2, 2}, {1, 1}, {1, 1}, true, direct)); + blocks["input_hint_block.12"] = std::shared_ptr(new Conv2d(96, 256, {3, 3}, {2, 2}, {1, 1})); // nn.SiLU() - blocks["input_hint_block.14"] = std::shared_ptr(new Conv2d(256, 
model_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["input_hint_block.14"] = std::shared_ptr(new Conv2d(256, model_channels, {3, 3}, {1, 1}, {1, 1})); size_t len_mults = channel_mult.size(); for (int i = 0; i < len_mults; i++) { @@ -321,12 +318,22 @@ struct ControlNet : public GGMLRunner { ControlNet(ggml_backend_t backend, const String2GGMLType& tensor_types = {}, - SDVersion version = VERSION_SD1, - bool direct = false) - : GGMLRunner(backend), control_net(version, direct) { + SDVersion version = VERSION_SD1) + : GGMLRunner(backend), control_net(version) { control_net.init(params_ctx, tensor_types, ""); } + void enable_conv2d_direct() { + std::vector blocks; + control_net.get_all_blocks(blocks); + for (auto block : blocks) { + if (block->get_desc() == "Conv2d") { + auto conv_block = (Conv2d*)block; + conv_block->enable_direct(); + } + } + } + ~ControlNet() { free_control_ctx(); } @@ -459,4 +466,4 @@ struct ControlNet : public GGMLRunner { } }; -#endif // __CONTROL_HPP__ +#endif // __CONTROL_HPP__ \ No newline at end of file diff --git a/diffusion_model.hpp b/diffusion_model.hpp index de65c1c36..787a4fa79 100644 --- a/diffusion_model.hpp +++ b/diffusion_model.hpp @@ -34,9 +34,8 @@ struct UNetModel : public DiffusionModel { UNetModel(ggml_backend_t backend, const String2GGMLType& tensor_types = {}, SDVersion version = VERSION_SD1, - bool flash_attn = false, - bool direct = false) - : unet(backend, tensor_types, "model.diffusion_model", version, flash_attn, direct) { + bool flash_attn = false) + : unet(backend, tensor_types, "model.diffusion_model", version, flash_attn) { } void alloc_params_buffer() { diff --git a/esrgan.hpp b/esrgan.hpp index 96997ac1e..3e41a8871 100644 --- a/esrgan.hpp +++ b/esrgan.hpp @@ -16,16 +16,15 @@ class ResidualDenseBlock : public GGMLBlock { protected: int num_feat; int num_grow_ch; - bool direct = false; public: - ResidualDenseBlock(int num_feat = 64, int num_grow_ch = 32, bool direct = false) - : num_feat(num_feat), num_grow_ch(num_grow_ch), direct(direct) { - blocks["conv1"] = std::shared_ptr(new Conv2d(num_feat, num_grow_ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); - blocks["conv2"] = std::shared_ptr(new Conv2d(num_feat + num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); - blocks["conv3"] = std::shared_ptr(new Conv2d(num_feat + 2 * num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); - blocks["conv4"] = std::shared_ptr(new Conv2d(num_feat + 3 * num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); - blocks["conv5"] = std::shared_ptr(new Conv2d(num_feat + 4 * num_grow_ch, num_feat, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + ResidualDenseBlock(int num_feat = 64, int num_grow_ch = 32) + : num_feat(num_feat), num_grow_ch(num_grow_ch) { + blocks["conv1"] = std::shared_ptr(new Conv2d(num_feat, num_grow_ch, {3, 3}, {1, 1}, {1, 1})); + blocks["conv2"] = std::shared_ptr(new Conv2d(num_feat + num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1})); + blocks["conv3"] = std::shared_ptr(new Conv2d(num_feat + 2 * num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1})); + blocks["conv4"] = std::shared_ptr(new Conv2d(num_feat + 3 * num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1})); + blocks["conv5"] = std::shared_ptr(new Conv2d(num_feat + 4 * num_grow_ch, num_feat, {3, 3}, {1, 1}, {1, 1})); } struct ggml_tensor* lrelu(struct ggml_context* ctx, struct ggml_tensor* x) { @@ -59,10 +58,10 @@ class ResidualDenseBlock : public GGMLBlock { class RRDB : public GGMLBlock { public: - RRDB(int 
num_feat, int num_grow_ch = 32, bool direct = false) { - blocks["rdb1"] = std::shared_ptr(new ResidualDenseBlock(num_feat, num_grow_ch, direct)); - blocks["rdb2"] = std::shared_ptr(new ResidualDenseBlock(num_feat, num_grow_ch, direct)); - blocks["rdb3"] = std::shared_ptr(new ResidualDenseBlock(num_feat, num_grow_ch, direct)); + RRDB(int num_feat, int num_grow_ch = 32) { + blocks["rdb1"] = std::shared_ptr(new ResidualDenseBlock(num_feat, num_grow_ch)); + blocks["rdb2"] = std::shared_ptr(new ResidualDenseBlock(num_feat, num_grow_ch)); + blocks["rdb3"] = std::shared_ptr(new ResidualDenseBlock(num_feat, num_grow_ch)); } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { @@ -90,21 +89,20 @@ class RRDBNet : public GGMLBlock { int num_out_ch = 3; int num_feat = 64; // default RealESRGAN_x4plus_anime_6B int num_grow_ch = 32; // default RealESRGAN_x4plus_anime_6B - bool direct = false; public: - RRDBNet(bool direct = false) { - blocks["conv_first"] = std::shared_ptr(new Conv2d(num_in_ch, num_feat, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + RRDBNet() { + blocks["conv_first"] = std::shared_ptr(new Conv2d(num_in_ch, num_feat, {3, 3}, {1, 1}, {1, 1})); for (int i = 0; i < num_block; i++) { std::string name = "body." + std::to_string(i); blocks[name] = std::shared_ptr(new RRDB(num_feat, num_grow_ch)); } - blocks["conv_body"] = std::shared_ptr(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["conv_body"] = std::shared_ptr(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1})); // upsample - blocks["conv_up1"] = std::shared_ptr(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); - blocks["conv_up2"] = std::shared_ptr(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); - blocks["conv_hr"] = std::shared_ptr(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); - blocks["conv_last"] = std::shared_ptr(new Conv2d(num_feat, num_out_ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["conv_up1"] = std::shared_ptr(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1})); + blocks["conv_up2"] = std::shared_ptr(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1})); + blocks["conv_hr"] = std::shared_ptr(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1})); + blocks["conv_last"] = std::shared_ptr(new Conv2d(num_feat, num_out_ch, {3, 3}, {1, 1}, {1, 1})); } struct ggml_tensor* lrelu(struct ggml_context* ctx, struct ggml_tensor* x) { @@ -144,11 +142,22 @@ struct ESRGAN : public GGMLRunner { int scale = 4; int tile_size = 128; // avoid cuda OOM for 4gb VRAM - ESRGAN(ggml_backend_t backend, const String2GGMLType& tensor_types = {}, bool direct = false) - : GGMLRunner(backend), rrdb_net(direct) { + ESRGAN(ggml_backend_t backend, const String2GGMLType& tensor_types = {}) + : GGMLRunner(backend) { rrdb_net.init(params_ctx, tensor_types, ""); } + void enable_conv2d_direct() { + std::vector blocks; + rrdb_net.get_all_blocks(blocks); + for (auto block : blocks) { + if (block->get_desc() == "Conv2d") { + auto conv_block = (Conv2d*)block; + conv_block->enable_direct(); + } + } + } + std::string get_desc() { return "esrgan"; } @@ -196,4 +205,4 @@ struct ESRGAN : public GGMLRunner { } }; -#endif // __ESRGAN_HPP__ +#endif // __ESRGAN_HPP__ \ No newline at end of file diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 98aadb044..ec04dfde3 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -144,8 +144,8 @@ void print_params(SDParams params) 
{ printf(" controlnet cpu: %s\n", params.control_net_cpu ? "true" : "false"); printf(" vae decoder on cpu:%s\n", params.vae_on_cpu ? "true" : "false"); printf(" diffusion flash attention:%s\n", params.diffusion_flash_attn ? "true" : "false"); - printf(" diffusion Conv2D direct:%s\n", params.diffusion_conv_direct ? "true" : "false"); - printf(" vae Conv2D direct:%s\n", params.vae_conv_direct ? "true" : "false"); + printf(" diffusion Conv2d direct:%s\n", params.diffusion_conv_direct ? "true" : "false"); + printf(" vae Conv2d direct:%s\n", params.vae_conv_direct ? "true" : "false"); printf(" strength(control): %.2f\n", params.control_strength); printf(" prompt: %s\n", params.prompt.c_str()); printf(" negative_prompt: %s\n", params.negative_prompt.c_str()); @@ -236,9 +236,9 @@ void print_usage(int argc, const char* argv[]) { printf(" --diffusion-fa use flash attention in the diffusion model (for low vram)\n"); printf(" Might lower quality, since it implies converting k and v to f16.\n"); printf(" This might crash if it is not supported by the backend.\n"); - printf(" --diffusion-conv-direct use Conv2D direct in the diffusion model"); + printf(" --diffusion-conv-direct use Conv2d direct in the diffusion model"); printf(" This might crash if it is not supported by the backend.\n"); - printf(" --vae-conv-direct use Conv2D direct in the vae model (should improve the performance)"); + printf(" --vae-conv-direct use Conv2d direct in the vae model (should improve the performance)"); printf(" This might crash if it is not supported by the backend.\n"); printf(" --control-net-cpu keep controlnet in cpu (for low vram)\n"); printf(" --canny apply canny preprocessor (edge detection)\n"); diff --git a/ggml_extend.hpp b/ggml_extend.hpp index de7b2bfa1..3c6ebd10e 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -1394,6 +1394,19 @@ class GGMLBlock { tensors[prefix + pair.first] = pair.second; } } + + virtual std::string get_desc() { + return "GGMLBlock"; + } + + void get_all_blocks(std::vector& result) { + result.push_back(this); + for (auto& block_iter : blocks) { + if (block_iter.second) { + block_iter.second->get_all_blocks(result); + } + } + } }; class UnaryBlock : public GGMLBlock { @@ -1483,7 +1496,7 @@ class Conv2d : public UnaryBlock { std::pair padding; std::pair dilation; bool bias; - bool direct; + bool direct = false; void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types, const std::string prefix = "") { enum ggml_type wtype = GGML_TYPE_F16; @@ -1501,16 +1514,22 @@ class Conv2d : public UnaryBlock { std::pair stride = {1, 1}, std::pair padding = {0, 0}, std::pair dilation = {1, 1}, - bool bias = true, - bool direct = false) + bool bias = true) : in_channels(in_channels), out_channels(out_channels), kernel_size(kernel_size), stride(stride), padding(padding), dilation(dilation), - bias(bias), - direct(direct) {} + bias(bias) {} + + void enable_direct() { + direct = true; + } + + std::string get_desc() { + return "Conv2d"; + } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { struct ggml_tensor* w = params["weight"]; diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 245ffea0e..f0d9c05af 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -326,12 +326,6 @@ class StableDiffusionGGML { LOG_INFO("CLIP: Using CPU backend"); clip_backend = ggml_backend_cpu_init(); } - if (sd_ctx_params->diffusion_conv_direct) { - LOG_INFO("Using Conv2D direct in the diffusion model"); - } - if (sd_ctx_params->vae_conv_direct){ - 
LOG_INFO("Using Conv2D direct in the vae model"); - } if (sd_ctx_params->diffusion_flash_attn) { LOG_INFO("Using flash attention in the diffusion model"); } @@ -379,8 +373,11 @@ class StableDiffusionGGML { diffusion_model = std::make_shared(backend, model_loader.tensor_storages_types, version, - sd_ctx_params->diffusion_flash_attn, - sd_ctx_params->diffusion_conv_direct); + sd_ctx_params->diffusion_flash_attn); + if (sd_ctx_params->diffusion_conv_direct) { + LOG_INFO("Using Conv2d direct in the diffusion model"); + std::dynamic_pointer_cast(diffusion_model)->unet.enable_conv2d_direct(); + } } cond_stage_model->alloc_params_buffer(); @@ -401,8 +398,11 @@ class StableDiffusionGGML { "first_stage_model", vae_decode_only, false, - version, - sd_ctx_params->vae_conv_direct); + version); + if (sd_ctx_params->vae_conv_direct){ + LOG_INFO("Using Conv2d direct in the vae model"); + first_stage_model->enable_conv2d_direct(); + } first_stage_model->alloc_params_buffer(); first_stage_model->get_param_tensors(tensors, "first_stage_model"); } else { @@ -410,8 +410,11 @@ class StableDiffusionGGML { model_loader.tensor_storages_types, "decoder.layers", vae_decode_only, - version, - sd_ctx_params->vae_conv_direct); + version); + if (sd_ctx_params->vae_conv_direct){ + LOG_INFO("Using Conv2d direct in the tae model"); + tae_first_stage->enable_conv2d_direct(); + } } // first_stage_model->get_param_tensors(tensors, "first_stage_model."); @@ -423,7 +426,11 @@ class StableDiffusionGGML { } else { controlnet_backend = backend; } - control_net = std::make_shared(controlnet_backend, model_loader.tensor_storages_types, version, sd_ctx_params->diffusion_conv_direct); + control_net = std::make_shared(controlnet_backend, model_loader.tensor_storages_types, version); + if (sd_ctx_params->diffusion_conv_direct) { + LOG_INFO("Using Conv2d direct in the control net"); + control_net->enable_conv2d_direct(); + } } if (strstr(SAFE_STR(sd_ctx_params->stacked_id_embed_dir), "v2")) { diff --git a/tae.hpp b/tae.hpp index fa83200b2..4959bbd08 100644 --- a/tae.hpp +++ b/tae.hpp @@ -17,16 +17,15 @@ class TAEBlock : public UnaryBlock { protected: int n_in; int n_out; - bool direct = false; public: - TAEBlock(int n_in, int n_out, bool direct = false) - : n_in(n_in), n_out(n_out), direct(direct) { - blocks["conv.0"] = std::shared_ptr(new Conv2d(n_in, n_out, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); - blocks["conv.2"] = std::shared_ptr(new Conv2d(n_out, n_out, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); - blocks["conv.4"] = std::shared_ptr(new Conv2d(n_out, n_out, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + TAEBlock(int n_in, int n_out) + : n_in(n_in), n_out(n_out) { + blocks["conv.0"] = std::shared_ptr(new Conv2d(n_in, n_out, {3, 3}, {1, 1}, {1, 1})); + blocks["conv.2"] = std::shared_ptr(new Conv2d(n_out, n_out, {3, 3}, {1, 1}, {1, 1})); + blocks["conv.4"] = std::shared_ptr(new Conv2d(n_out, n_out, {3, 3}, {1, 1}, {1, 1})); if (n_in != n_out) { - blocks["skip"] = std::shared_ptr(new Conv2d(n_in, n_out, {1, 1}, {1, 1}, {1, 1}, {1, 1}, false, direct)); + blocks["skip"] = std::shared_ptr(new Conv2d(n_in, n_out, {1, 1}, {1, 1}, {1, 1}, {1, 1}, false)); } } @@ -61,32 +60,30 @@ class TinyEncoder : public UnaryBlock { int channels = 64; int z_channels = 4; int num_blocks = 3; - bool direct = false; public: - TinyEncoder(int z_channels = 4, bool direct = false) - : z_channels(z_channels), - direct(direct) { + TinyEncoder(int z_channels = 4) + : z_channels(z_channels) { int index = 0; - blocks[std::to_string(index++)] = 
std::shared_ptr(new Conv2d(in_channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); - blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(in_channels, channels, {3, 3}, {1, 1}, {1, 1})); + blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels)); - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false)); for (int i = 0; i < num_blocks; i++) { - blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels)); } - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false)); for (int i = 0; i < num_blocks; i++) { - blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels)); } - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false)); for (int i = 0; i < num_blocks; i++) { - blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels)); } - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, z_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, z_channels, {3, 3}, {1, 1}, {1, 1})); } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { @@ -108,37 +105,35 @@ class TinyDecoder : public UnaryBlock { int channels = 64; int out_channels = 3; int num_blocks = 3; - bool direct = false; public: - TinyDecoder(int z_channels = 4, bool direct = false) - : z_channels(z_channels), - direct(direct) { + TinyDecoder(int z_channels = 4) + : z_channels(z_channels) { int index = 0; - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(z_channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(z_channels, channels, {3, 3}, {1, 1}, {1, 1})); index++; // nn.ReLU() for (int i = 0; i < num_blocks; i++) { - blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels)); } index++; // nn.Upsample() - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false)); for (int i = 0; i < num_blocks; i++) { - blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels)); } index++; // nn.Upsample() - blocks[std::to_string(index++)] = 
std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false)); for (int i = 0; i < num_blocks; i++) { - blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels)); } index++; // nn.Upsample() - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false)); - blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels, direct)); - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1})); } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* z) { @@ -170,20 +165,18 @@ class TinyDecoder : public UnaryBlock { class TAESD : public GGMLBlock { protected: bool decode_only; - bool direct = false; public: - TAESD(bool decode_only = true, SDVersion version = VERSION_SD1, bool direct = false) - : decode_only(decode_only), - direct(direct) { + TAESD(bool decode_only = true, SDVersion version = VERSION_SD1) + : decode_only(decode_only) { int z_channels = 4; if (sd_version_is_dit(version)) { z_channels = 16; } - blocks["decoder.layers"] = std::shared_ptr(new TinyDecoder(z_channels, direct)); + blocks["decoder.layers"] = std::shared_ptr(new TinyDecoder(z_channels)); if (!decode_only) { - blocks["encoder.layers"] = std::shared_ptr(new TinyEncoder(z_channels, direct)); + blocks["encoder.layers"] = std::shared_ptr(new TinyEncoder(z_channels)); } } @@ -201,21 +194,29 @@ class TAESD : public GGMLBlock { struct TinyAutoEncoder : public GGMLRunner { TAESD taesd; bool decode_only = false; - bool direct = false; TinyAutoEncoder(ggml_backend_t backend, const String2GGMLType& tensor_types, const std::string prefix, bool decoder_only = true, - SDVersion version = VERSION_SD1, - bool direct = false) + SDVersion version = VERSION_SD1) : decode_only(decoder_only), - taesd(decoder_only, version, direct), - direct(direct), + taesd(decoder_only, version), GGMLRunner(backend) { taesd.init(params_ctx, tensor_types, prefix); } + void enable_conv2d_direct() { + std::vector blocks; + taesd.get_all_blocks(blocks); + for (auto block : blocks) { + if (block->get_desc() == "Conv2d") { + auto conv_block = (Conv2d*)block; + conv_block->enable_direct(); + } + } + } + std::string get_desc() { return "taesd"; } @@ -268,4 +269,4 @@ struct TinyAutoEncoder : public GGMLRunner { } }; -#endif // __TAE_HPP__ +#endif // __TAE_HPP__ \ No newline at end of file diff --git a/unet.hpp b/unet.hpp index 7196fb03c..696bc6dfa 100644 --- a/unet.hpp +++ b/unet.hpp @@ -183,7 +183,7 @@ class UnetModelBlock : public GGMLBlock { int model_channels = 320; int adm_in_channels = 2816; // only for VERSION_SDXL/SVD - UnetModelBlock(SDVersion version = VERSION_SD1, const String2GGMLType& tensor_types = {}, bool flash_attn = false, bool direct = false) + UnetModelBlock(SDVersion version = VERSION_SD1, const String2GGMLType& tensor_types = {}, bool flash_attn = false) : version(version) { if 
(sd_version_is_sd2(version)) { context_dim = 1024; @@ -224,7 +224,7 @@ class UnetModelBlock : public GGMLBlock { } // input_blocks - blocks["input_blocks.0.0"] = std::shared_ptr(new Conv2d(in_channels, model_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["input_blocks.0.0"] = std::shared_ptr(new Conv2d(in_channels, model_channels, {3, 3}, {1, 1}, {1, 1})); std::vector input_block_chans; input_block_chans.push_back(model_channels); @@ -236,7 +236,7 @@ class UnetModelBlock : public GGMLBlock { if (version == VERSION_SVD) { return new VideoResBlock(channels, emb_channels, out_channels); } else { - return new ResBlock(channels, emb_channels, out_channels, {3, 3}); + return new ResBlock(channels, emb_channels, out_channels); } }; @@ -248,7 +248,7 @@ class UnetModelBlock : public GGMLBlock { if (version == VERSION_SVD) { return new SpatialVideoTransformer(in_channels, n_head, d_head, depth, context_dim); } else { - return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, flash_attn, direct); + return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, flash_attn); } }; @@ -280,7 +280,7 @@ class UnetModelBlock : public GGMLBlock { if (i != len_mults - 1) { input_block_idx += 1; std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0"; - blocks[name] = std::shared_ptr(new DownSampleBlock(ch, ch, false, direct)); + blocks[name] = std::shared_ptr(new DownSampleBlock(ch, ch)); input_block_chans.push_back(ch); ds *= 2; @@ -330,7 +330,7 @@ class UnetModelBlock : public GGMLBlock { if (i > 0 && j == num_res_blocks) { std::string name = "output_blocks." + std::to_string(output_block_idx) + "." + std::to_string(up_sample_idx); - blocks[name] = std::shared_ptr(new UpSampleBlock(ch, ch, direct)); + blocks[name] = std::shared_ptr(new UpSampleBlock(ch, ch)); ds /= 2; } @@ -342,7 +342,7 @@ class UnetModelBlock : public GGMLBlock { // out blocks["out.0"] = std::shared_ptr(new GroupNorm32(ch)); // ch == model_channels // out_1 is nn.SiLU() - blocks["out.2"] = std::shared_ptr(new Conv2d(model_channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["out.2"] = std::shared_ptr(new Conv2d(model_channels, out_channels, {3, 3}, {1, 1}, {1, 1})); } struct ggml_tensor* resblock_forward(std::string name, @@ -541,12 +541,23 @@ struct UNetModelRunner : public GGMLRunner { const String2GGMLType& tensor_types, const std::string prefix, SDVersion version = VERSION_SD1, - bool flash_attn = false, - bool direct = false) - : GGMLRunner(backend), unet(version, tensor_types, flash_attn, direct) { + bool flash_attn = false) + : GGMLRunner(backend), unet(version, tensor_types, flash_attn) { unet.init(params_ctx, tensor_types, prefix); } + void enable_conv2d_direct() { + std::vector blocks; + unet.get_all_blocks(blocks); + for (auto block : blocks) { + if (block->get_desc() == "Conv2d") { + LOG_DEBUG("block %s", block->get_desc().c_str()); + auto conv_block = (Conv2d*)block; + conv_block->enable_direct(); + } + } + } + std::string get_desc() { return "unet"; } diff --git a/upscaler.cpp b/upscaler.cpp index 69d5ef392..599f263f9 100644 --- a/upscaler.cpp +++ b/upscaler.cpp @@ -49,7 +49,10 @@ struct UpscalerGGML { backend = ggml_backend_cpu_init(); } LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type)); - esrgan_upscaler = std::make_shared(backend, model_loader.tensor_storages_types, direct); + esrgan_upscaler = std::make_shared(backend, model_loader.tensor_storages_types); + if (direct) { + 
+        }
         if (!esrgan_upscaler->load_from_file(esrgan_path)) {
             return false;
         }
diff --git a/vae.hpp b/vae.hpp
index 6f435fd6b..bdf160bb8 100644
--- a/vae.hpp
+++ b/vae.hpp
@@ -12,24 +12,21 @@ class ResnetBlock : public UnaryBlock {
 protected:
     int64_t in_channels;
     int64_t out_channels;
-    bool direct = false;
 
 public:
     ResnetBlock(int64_t in_channels,
-                int64_t out_channels,
-                bool direct = false)
+                int64_t out_channels)
         : in_channels(in_channels),
-          out_channels(out_channels),
-          direct(direct){
+          out_channels(out_channels) {
         // temb_channels is always 0
         blocks["norm1"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
-        blocks["conv1"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct));
+        blocks["conv1"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
 
         blocks["norm2"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(out_channels));
-        blocks["conv2"] = std::shared_ptr<GGMLBlock>(new Conv2d(out_channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct));
+        blocks["conv2"] = std::shared_ptr<GGMLBlock>(new Conv2d(out_channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
 
         if (out_channels != in_channels) {
-            blocks["nin_shortcut"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct));
+            blocks["nin_shortcut"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, {1, 1}));
         }
     }
@@ -67,19 +64,16 @@ class ResnetBlock : public UnaryBlock {
 class AttnBlock : public UnaryBlock {
 protected:
     int64_t in_channels;
-    bool direct = false;
 
 public:
-    AttnBlock(int64_t in_channels,
-              bool direct = false)
-        : in_channels(in_channels),
-          direct(direct){
+    AttnBlock(int64_t in_channels)
+        : in_channels(in_channels) {
         blocks["norm"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
-        blocks["q"]    = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct));
-        blocks["k"]    = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct));
-        blocks["v"]    = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct));
+        blocks["q"]    = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}));
+        blocks["k"]    = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}));
+        blocks["v"]    = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}));
 
-        blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct));
+        blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}));
     }
 
     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
@@ -129,9 +123,8 @@ class AE3DConv : public Conv2d {
             std::pair<int, int> stride   = {1, 1},
             std::pair<int, int> padding  = {0, 0},
             std::pair<int, int> dilation = {1, 1},
-            bool bias   = true,
-            bool direct = false)
-        : Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias, direct) {
+            bool bias = true)
+        : Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias) {
         int64_t kernel_padding = video_kernel_size / 2;
 
         blocks["time_mix_conv"] = std::shared_ptr<GGMLBlock>(new Conv3dnx1x1(out_channels,
                                                                              out_channels,
@@ -183,9 +176,8 @@ class VideoResnetBlock : public ResnetBlock {
 public:
     VideoResnetBlock(int64_t in_channels,
                      int64_t out_channels,
-                     int video_kernel_size = 3,
-                     bool direct           = false)
-        : ResnetBlock(in_channels, out_channels, direct) {
+                     int video_kernel_size = 3)
+        : ResnetBlock(in_channels, out_channels) {
         // merge_strategy is always learned
         blocks["time_stack"] = std::shared_ptr<GGMLBlock>(new ResBlock(out_channels, 0, out_channels, {video_kernel_size, 1}, 3, false, true));
     }
@@ -234,7 +226,6 @@ class Encoder : public GGMLBlock {
     int in_channels = 3;
     int z_channels  = 4;
     bool double_z   = true;
-    bool direct     = false;
 
 public:
     Encoder(int ch,
@@ -242,16 +233,14 @@
             int num_res_blocks,
             int in_channels,
             int z_channels,
-            bool double_z = true,
-            bool direct   = false)
+            bool double_z = true)
         : ch(ch),
           ch_mult(ch_mult),
           num_res_blocks(num_res_blocks),
           in_channels(in_channels),
           z_channels(z_channels),
-          double_z(double_z),
-          direct(direct){
+          double_z(double_z) {
-        blocks["conv_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct));
+        blocks["conv_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, ch, {3, 3}, {1, 1}, {1, 1}));
 
         size_t num_resolutions = ch_mult.size();
@@ -265,21 +254,21 @@
             int block_out = ch * ch_mult[i];
             for (int j = 0; j < num_res_blocks; j++) {
                 std::string name = "down." + std::to_string(i) + ".block." + std::to_string(j);
-                blocks[name]     = std::shared_ptr<GGMLBlock>(new ResnetBlock(block_in, block_out, direct));
+                blocks[name]     = std::shared_ptr<GGMLBlock>(new ResnetBlock(block_in, block_out));
                 block_in         = block_out;
             }
             if (i != num_resolutions - 1) {
                 std::string name = "down." + std::to_string(i) + ".downsample";
-                blocks[name]     = std::shared_ptr<GGMLBlock>(new DownSampleBlock(block_in, block_in, true, direct));
+                blocks[name]     = std::shared_ptr<GGMLBlock>(new DownSampleBlock(block_in, block_in, true));
             }
         }
 
-        blocks["mid.block_1"] = std::shared_ptr<GGMLBlock>(new ResnetBlock(block_in, block_in, direct));
-        blocks["mid.attn_1"]  = std::shared_ptr<GGMLBlock>(new AttnBlock(block_in, direct));
-        blocks["mid.block_2"] = std::shared_ptr<GGMLBlock>(new ResnetBlock(block_in, block_in, direct));
+        blocks["mid.block_1"] = std::shared_ptr<GGMLBlock>(new ResnetBlock(block_in, block_in));
+        blocks["mid.attn_1"]  = std::shared_ptr<GGMLBlock>(new AttnBlock(block_in));
+        blocks["mid.block_2"] = std::shared_ptr<GGMLBlock>(new ResnetBlock(block_in, block_in));
 
         blocks["norm_out"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(block_in));
-        blocks["conv_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(block_in, double_z ? z_channels * 2 : z_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct));
+        blocks["conv_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(block_in, double_z ? z_channels * 2 : z_channels, {3, 3}, {1, 1}, {1, 1}));
     }
 
     virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
@@ -334,29 +323,25 @@ class Decoder : public GGMLBlock {
     int z_channels        = 4;
     bool video_decoder    = false;
     int video_kernel_size = 3;
-    bool direct           = false;
 
     virtual std::shared_ptr<GGMLBlock> get_conv_out(int64_t in_channels,
                                                     int64_t out_channels,
                                                     std::pair<int, int> kernel_size,
                                                     std::pair<int, int> stride   = {1, 1},
-                                                    std::pair<int, int> padding  = {0, 0},
-                                                    std::pair<int, int> dilation = {1, 1},
-                                                    bool bias   = true,
-                                                    bool direct = false){
+                                                    std::pair<int, int> padding  = {0, 0}) {
         if (video_decoder) {
             return std::shared_ptr<GGMLBlock>(new AE3DConv(in_channels, out_channels, kernel_size, video_kernel_size, stride, padding));
         } else {
-            return std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias, direct));
+            return std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, kernel_size, stride, padding));
         }
     }
 
     virtual std::shared_ptr<GGMLBlock> get_resnet_block(int64_t in_channels, int64_t out_channels) {
         if (video_decoder) {
-            return std::shared_ptr<GGMLBlock>(new VideoResnetBlock(in_channels, out_channels, video_kernel_size, direct));
+            return std::shared_ptr<GGMLBlock>(new VideoResnetBlock(in_channels, out_channels, video_kernel_size));
         } else {
-            return std::shared_ptr<GGMLBlock>(new ResnetBlock(in_channels, out_channels, direct));
+            return std::shared_ptr<GGMLBlock>(new ResnetBlock(in_channels, out_channels));
         }
     }
@@ -367,23 +352,21 @@
             int num_res_blocks,
             int z_channels,
             bool video_decoder    = false,
-            int video_kernel_size = 3,
-            bool direct           = false)
+            int video_kernel_size = 3)
         : ch(ch),
           out_ch(out_ch),
           ch_mult(ch_mult),
           num_res_blocks(num_res_blocks),
           z_channels(z_channels),
           video_decoder(video_decoder),
-          video_kernel_size(video_kernel_size),
-          direct(direct) {
+          video_kernel_size(video_kernel_size) {
         size_t num_resolutions = ch_mult.size();
         int block_in           = ch * ch_mult[num_resolutions - 1];
 
-        blocks["conv_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(z_channels, block_in, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct));
+        blocks["conv_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(z_channels, block_in, {3, 3}, {1, 1}, {1, 1}));
 
         blocks["mid.block_1"] = get_resnet_block(block_in, block_in);
-        blocks["mid.attn_1"]  = std::shared_ptr<GGMLBlock>(new AttnBlock(block_in, direct));
+        blocks["mid.attn_1"]  = std::shared_ptr<GGMLBlock>(new AttnBlock(block_in));
         blocks["mid.block_2"] = get_resnet_block(block_in, block_in);
 
         for (int i = num_resolutions - 1; i >= 0; i--) {
@@ -397,12 +380,12 @@
             }
             if (i != 0) {
                 std::string name = "up." + std::to_string(i) + ".upsample";
-                blocks[name]     = std::shared_ptr<GGMLBlock>(new UpSampleBlock(block_in, block_in, direct));
+                blocks[name]     = std::shared_ptr<GGMLBlock>(new UpSampleBlock(block_in, block_in));
             }
         }
 
         blocks["norm_out"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(block_in));
-        blocks["conv_out"] = get_conv_out(block_in, out_ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct);
+        blocks["conv_out"] = get_conv_out(block_in, out_ch, {3, 3}, {1, 1}, {1, 1});
     }
 
     virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* z) {
@@ -459,7 +442,6 @@ class AutoencodingEngine : public GGMLBlock {
     bool use_video_decoder = false;
     bool use_quant         = true;
     int embed_dim          = 4;
-    bool direct            = false;
     struct {
         int z_channels = 4;
         int resolution = 256;
@@ -474,9 +456,8 @@
 public:
     AutoencodingEngine(bool decode_only       = true,
                        bool use_video_decoder = false,
-                       SDVersion version      = VERSION_SD1,
-                       bool direct            = false)
-        : decode_only(decode_only), use_video_decoder(use_video_decoder), direct(direct) {
+                       SDVersion version      = VERSION_SD1)
+        : decode_only(decode_only), use_video_decoder(use_video_decoder) {
         if (sd_version_is_dit(version)) {
             dd_config.z_channels = 16;
             use_quant            = false;
@@ -489,18 +470,11 @@
                                                                    dd_config.ch_mult,
                                                                    dd_config.num_res_blocks,
                                                                    dd_config.z_channels,
-                                                                   use_video_decoder,
-                                                                   3,
-                                                                   direct));
+                                                                   use_video_decoder));
         if (use_quant) {
             blocks["post_quant_conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(dd_config.z_channels,
                                                                               embed_dim,
-                                                                              {1, 1},
-                                                                              {1, 1},
-                                                                              {0, 0},
-                                                                              {1, 1},
-                                                                              true,
-                                                                              direct));
+                                                                              {1, 1}));
         }
         if (!decode_only) {
             blocks["encoder"] = std::shared_ptr<GGMLBlock>(new Encoder(dd_config.ch,
@@ -508,19 +482,13 @@
                                                                        dd_config.num_res_blocks,
                                                                        dd_config.in_channels,
                                                                        dd_config.z_channels,
-                                                                       dd_config.double_z,
-                                                                       direct));
+                                                                       dd_config.double_z));
             if (use_quant) {
                 int factor = dd_config.double_z ? 2 : 1;
                 blocks["quant_conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(embed_dim * factor,
                                                                              dd_config.z_channels * factor,
-                                                                             {1, 1},
-                                                                             {1, 1},
-                                                                             {0, 0},
-                                                                             {1, 1},
-                                                                             true,
-                                                                             direct));
+                                                                             {1, 1}));
             }
         }
     }
@@ -561,12 +529,22 @@ struct AutoEncoderKL : public GGMLRunner {
                   const std::string prefix,
                   bool decode_only       = false,
                   bool use_video_decoder = false,
-                  SDVersion version      = VERSION_SD1,
-                  bool direct            = false)
-        : decode_only(decode_only), ae(decode_only, use_video_decoder, version, direct), GGMLRunner(backend) {
+                  SDVersion version      = VERSION_SD1)
+        : decode_only(decode_only), ae(decode_only, use_video_decoder, version), GGMLRunner(backend) {
         ae.init(params_ctx, tensor_types, prefix);
     }
 
+    void enable_conv2d_direct() {
+        std::vector<GGMLBlock*> blocks;
+        ae.get_all_blocks(blocks);
+        for (auto block : blocks) {
+            if (block->get_desc() == "Conv2d") {
+                auto conv_block = (Conv2d*)block;
+                conv_block->enable_direct();
+            }
+        }
+    }
+
     std::string get_desc() {
         return "vae";
     }

From f053f736241a9d9a2327daed83543813a3f1f67e Mon Sep 17 00:00:00 2001
From: leejet
Date: Sat, 2 Aug 2025 13:52:43 +0800
Subject: [PATCH 12/12] format code

---
 ggml_extend.hpp      | 18 +++++++++---------
 stable-diffusion.cpp |  4 ++--
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index 3c6ebd10e..57c1e8aa6 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -707,15 +707,15 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d(struct ggml_context* ctx,
 }
 
 __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d_direct(struct ggml_context* ctx,
-                                                      struct ggml_tensor* x,
-                                                      struct ggml_tensor* w,
-                                                      struct ggml_tensor* b,
-                                                      int s0 = 1,
-                                                      int s1 = 1,
-                                                      int p0 = 0,
-                                                      int p1 = 0,
-                                                      int d0 = 1,
-                                                      int d1 = 1) {
+                                                             struct ggml_tensor* x,
+                                                             struct ggml_tensor* w,
+                                                             struct ggml_tensor* b,
+                                                             int s0 = 1,
+                                                             int s1 = 1,
+                                                             int p0 = 0,
+                                                             int p1 = 0,
+                                                             int d0 = 1,
+                                                             int d1 = 1) {
     x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1);
     if (b != NULL) {
         b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index f0d9c05af..c5448f927 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -399,7 +399,7 @@ class StableDiffusionGGML {
                                                            vae_decode_only,
                                                            false,
                                                            version);
-            if (sd_ctx_params->vae_conv_direct){
+            if (sd_ctx_params->vae_conv_direct) {
                 LOG_INFO("Using Conv2d direct in the vae model");
                 first_stage_model->enable_conv2d_direct();
             }
@@ -411,7 +411,7 @@ class StableDiffusionGGML {
                                                           "decoder.layers",
                                                           vae_decode_only,
                                                           version);
-            if (sd_ctx_params->vae_conv_direct){
+            if (sd_ctx_params->vae_conv_direct) {
                 LOG_INFO("Using Conv2d direct in the tae model");
                 tae_first_stage->enable_conv2d_direct();
             }
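// ---------------------------------------------------------------------------
// Note (illustrative sketch, not part of the patch series): the commits above
// drop the per-constructor `direct` flag and instead let each runner walk its
// block tree after construction and flip a per-Conv2d flag via
// enable_conv2d_direct(). The self-contained C++ below models that pattern
// with simplified stand-in types; Block, Conv2dSketch and VaeSketch are
// invented names, not the real GGMLBlock/Conv2d/AutoEncoderKL classes.
// ---------------------------------------------------------------------------
#include <cstdio>
#include <map>
#include <memory>
#include <string>
#include <vector>

struct Block {
    std::map<std::string, std::shared_ptr<Block>> blocks;
    virtual ~Block() = default;
    virtual std::string get_desc() { return "Block"; }
    // Collect this block and every nested block, depth-first.
    void get_all_blocks(std::vector<Block*>& out) {
        out.push_back(this);
        for (auto& kv : blocks) {
            kv.second->get_all_blocks(out);
        }
    }
};

struct Conv2dSketch : Block {
    bool direct = false;  // when true, the graph would use ggml_conv_2d_direct
    std::string get_desc() override { return "Conv2d"; }
    void enable_direct() { direct = true; }
};

struct VaeSketch : Block {
    VaeSketch() {
        blocks["conv_in"]  = std::make_shared<Conv2dSketch>();
        blocks["conv_out"] = std::make_shared<Conv2dSketch>();
    }
    // Same shape as the enable_conv2d_direct() methods added in the patch:
    // find every Conv2d in the tree and switch it to the direct kernel.
    void enable_conv2d_direct() {
        std::vector<Block*> all;
        get_all_blocks(all);
        for (auto* b : all) {
            if (b->get_desc() == "Conv2d") {
                static_cast<Conv2dSketch*>(b)->enable_direct();
            }
        }
    }
};

int main() {
    VaeSketch vae;
    bool vae_conv_direct = true;  // stands in for sd_ctx_params->vae_conv_direct
    if (vae_conv_direct) {
        vae.enable_conv2d_direct();
    }
    std::printf("conv_in uses direct conv: %d\n",
                static_cast<Conv2dSketch*>(vae.blocks["conv_in"].get())->direct);
    return 0;
}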