From ff452b8d8c0dc6d5683d99ecc9826b88e4ee68a6 Mon Sep 17 00:00:00 2001 From: Daniele <57776841+daniandtheweb@users.noreply.github.com> Date: Mon, 28 Jul 2025 16:44:13 +0200 Subject: [PATCH 01/12] Conv2DDirect for VAE stage --- common.hpp | 8 +++---- ggml_extend.hpp | 64 +++++++++++++++++++++++++++++++++++++++++++++++++ vae.hpp | 58 ++++++++++++++++++++++---------------------- 3 files changed, 97 insertions(+), 33 deletions(-) diff --git a/common.hpp b/common.hpp index 9b5cc53be..2afee2260 100644 --- a/common.hpp +++ b/common.hpp @@ -17,7 +17,7 @@ class DownSampleBlock : public GGMLBlock { out_channels(out_channels), vae_downsample(vae_downsample) { if (vae_downsample) { - blocks["conv"] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {0, 0})); + blocks["conv"] = std::shared_ptr(new Conv2dDirect(channels, out_channels, {3, 3}, {2, 2}, {0, 0})); } else { blocks["op"] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {1, 1})); } @@ -26,7 +26,7 @@ class DownSampleBlock : public GGMLBlock { struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { // x: [N, channels, h, w] if (vae_downsample) { - auto conv = std::dynamic_pointer_cast(blocks["conv"]); + auto conv = std::dynamic_pointer_cast(blocks["conv"]); x = ggml_pad(ctx, x, 1, 1, 0, 0); x = conv->forward(ctx, x); @@ -49,12 +49,12 @@ class UpSampleBlock : public GGMLBlock { int out_channels) : channels(channels), out_channels(out_channels) { - blocks["conv"] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1})); + blocks["conv"] = std::shared_ptr(new Conv2dDirect(channels, out_channels, {3, 3}, {1, 1}, {1, 1})); } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { // x: [N, channels, h, w] - auto conv = std::dynamic_pointer_cast(blocks["conv"]); + auto conv = std::dynamic_pointer_cast(blocks["conv"]); x = ggml_upscale(ctx, x, 2, GGML_SCALE_MODE_NEAREST); // [N, channels, h*2, w*2] x = conv->forward(ctx, x); // [N, out_channels, h*2, w*2] diff --git a/ggml_extend.hpp b/ggml_extend.hpp index eb33f0248..a17162f43 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -706,6 +706,25 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d(struct ggml_context* ctx, return x; } +__STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d_direct(struct ggml_context* ctx, + struct ggml_tensor* x, + struct ggml_tensor* w, + struct ggml_tensor* b, + int s0 = 1, + int s1 = 1, + int p0 = 0, + int p1 = 0, + int d0 = 1, + int d1 = 1) { + x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1); + if (b != NULL) { + b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1); + // b = ggml_repeat(ctx, b, x); + x = ggml_add(ctx, x, b); + } + return x; +} + // w: [OC,IC, KD, 1 * 1] // x: [N, IC, IH, IW] // b: [OC,] @@ -1492,6 +1511,51 @@ class Conv2d : public UnaryBlock { } }; +class Conv2dDirect : public UnaryBlock { +protected: + int64_t in_channels; + int64_t out_channels; + std::pair kernel_size; + std::pair stride; + std::pair padding; + std::pair dilation; + bool bias; + + void init_params(struct ggml_context* ctx, std::map& tensor_types, const std::string prefix = "") { + enum ggml_type wtype = GGML_TYPE_F16; //(tensor_types.find(prefix + "weight") != tensor_types.end()) ? tensor_types[prefix + "weight"] : GGML_TYPE_F16; + params["weight"] = ggml_new_tensor_4d(ctx, wtype, kernel_size.second, kernel_size.first, in_channels, out_channels); + if (bias) { + enum ggml_type wtype = GGML_TYPE_F32; // (tensor_types.find(prefix + "bias") != tensor_types.end()) ? 
tensor_types[prefix + "bias"] : GGML_TYPE_F32; + params["bias"] = ggml_new_tensor_1d(ctx, wtype, out_channels); + } + } + +public: + Conv2dDirect(int64_t in_channels, + int64_t out_channels, + std::pair kernel_size, + std::pair stride = {1, 1}, + std::pair padding = {0, 0}, + std::pair dilation = {1, 1}, + bool bias = true) + : in_channels(in_channels), + out_channels(out_channels), + kernel_size(kernel_size), + stride(stride), + padding(padding), + dilation(dilation), + bias(bias) {} + + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + struct ggml_tensor* w = params["weight"]; + struct ggml_tensor* b = NULL; + if (bias) { + b = params["bias"]; + } + return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first); + } +}; + class Conv3dnx1x1 : public UnaryBlock { protected: int64_t in_channels; diff --git a/vae.hpp b/vae.hpp index 4add881f6..88bcc349c 100644 --- a/vae.hpp +++ b/vae.hpp @@ -20,13 +20,13 @@ class ResnetBlock : public UnaryBlock { out_channels(out_channels) { // temb_channels is always 0 blocks["norm1"] = std::shared_ptr(new GroupNorm32(in_channels)); - blocks["conv1"] = std::shared_ptr(new Conv2d(in_channels, out_channels, {3, 3}, {1, 1}, {1, 1})); + blocks["conv1"] = std::shared_ptr(new Conv2dDirect(in_channels, out_channels, {3, 3}, {1, 1}, {1, 1})); blocks["norm2"] = std::shared_ptr(new GroupNorm32(out_channels)); - blocks["conv2"] = std::shared_ptr(new Conv2d(out_channels, out_channels, {3, 3}, {1, 1}, {1, 1})); + blocks["conv2"] = std::shared_ptr(new Conv2dDirect(out_channels, out_channels, {3, 3}, {1, 1}, {1, 1})); if (out_channels != in_channels) { - blocks["nin_shortcut"] = std::shared_ptr(new Conv2d(in_channels, out_channels, {1, 1})); + blocks["nin_shortcut"] = std::shared_ptr(new Conv2dDirect(in_channels, out_channels, {1, 1})); } } @@ -34,9 +34,9 @@ class ResnetBlock : public UnaryBlock { // x: [N, in_channels, h, w] // t_emb is always None auto norm1 = std::dynamic_pointer_cast(blocks["norm1"]); - auto conv1 = std::dynamic_pointer_cast(blocks["conv1"]); + auto conv1 = std::dynamic_pointer_cast(blocks["conv1"]); auto norm2 = std::dynamic_pointer_cast(blocks["norm2"]); - auto conv2 = std::dynamic_pointer_cast(blocks["conv2"]); + auto conv2 = std::dynamic_pointer_cast(blocks["conv2"]); auto h = x; h = norm1->forward(ctx, h); @@ -51,7 +51,7 @@ class ResnetBlock : public UnaryBlock { // skip connection if (out_channels != in_channels) { - auto nin_shortcut = std::dynamic_pointer_cast(blocks["nin_shortcut"]); + auto nin_shortcut = std::dynamic_pointer_cast(blocks["nin_shortcut"]); x = nin_shortcut->forward(ctx, x); // [N, out_channels, h, w] } @@ -69,20 +69,20 @@ class AttnBlock : public UnaryBlock { AttnBlock(int64_t in_channels) : in_channels(in_channels) { blocks["norm"] = std::shared_ptr(new GroupNorm32(in_channels)); - blocks["q"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1})); - blocks["k"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1})); - blocks["v"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1})); + blocks["q"] = std::shared_ptr(new Conv2dDirect(in_channels, in_channels, {1, 1})); + blocks["k"] = std::shared_ptr(new Conv2dDirect(in_channels, in_channels, {1, 1})); + blocks["v"] = std::shared_ptr(new Conv2dDirect(in_channels, in_channels, {1, 1})); - blocks["proj_out"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1})); + blocks["proj_out"] = std::shared_ptr(new Conv2dDirect(in_channels, 
in_channels, {1, 1})); } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { // x: [N, in_channels, h, w] auto norm = std::dynamic_pointer_cast(blocks["norm"]); - auto q_proj = std::dynamic_pointer_cast(blocks["q"]); - auto k_proj = std::dynamic_pointer_cast(blocks["k"]); - auto v_proj = std::dynamic_pointer_cast(blocks["v"]); - auto proj_out = std::dynamic_pointer_cast(blocks["proj_out"]); + auto q_proj = std::dynamic_pointer_cast(blocks["q"]); + auto k_proj = std::dynamic_pointer_cast(blocks["k"]); + auto v_proj = std::dynamic_pointer_cast(blocks["v"]); + auto proj_out = std::dynamic_pointer_cast(blocks["proj_out"]); auto h_ = norm->forward(ctx, x); @@ -114,7 +114,7 @@ class AttnBlock : public UnaryBlock { } }; -class AE3DConv : public Conv2d { +class AE3DConv : public Conv2dDirect { public: AE3DConv(int64_t in_channels, int64_t out_channels, @@ -124,7 +124,7 @@ class AE3DConv : public Conv2d { std::pair padding = {0, 0}, std::pair dilation = {1, 1}, bool bias = true) - : Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias) { + : Conv2dDirect(in_channels, out_channels, kernel_size, stride, padding, dilation, bias) { int64_t kernel_padding = video_kernel_size / 2; blocks["time_mix_conv"] = std::shared_ptr(new Conv3dnx1x1(out_channels, out_channels, @@ -141,7 +141,7 @@ class AE3DConv : public Conv2d { // result: [N, OC, OH, OW] auto time_mix_conv = std::dynamic_pointer_cast(blocks["time_mix_conv"]); - x = Conv2d::forward(ctx, x); + x = Conv2dDirect::forward(ctx, x); // timesteps = x.shape[0] // x = rearrange(x, "(b t) c h w -> b c t h w", t=timesteps) // x = conv3d(x) @@ -240,7 +240,7 @@ class Encoder : public GGMLBlock { in_channels(in_channels), z_channels(z_channels), double_z(double_z) { - blocks["conv_in"] = std::shared_ptr(new Conv2d(in_channels, ch, {3, 3}, {1, 1}, {1, 1})); + blocks["conv_in"] = std::shared_ptr(new Conv2dDirect(in_channels, ch, {3, 3}, {1, 1}, {1, 1})); size_t num_resolutions = ch_mult.size(); @@ -268,18 +268,18 @@ class Encoder : public GGMLBlock { blocks["mid.block_2"] = std::shared_ptr(new ResnetBlock(block_in, block_in)); blocks["norm_out"] = std::shared_ptr(new GroupNorm32(block_in)); - blocks["conv_out"] = std::shared_ptr(new Conv2d(block_in, double_z ? z_channels * 2 : z_channels, {3, 3}, {1, 1}, {1, 1})); + blocks["conv_out"] = std::shared_ptr(new Conv2dDirect(block_in, double_z ? 
z_channels * 2 : z_channels, {3, 3}, {1, 1}, {1, 1})); } virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { // x: [N, in_channels, h, w] - auto conv_in = std::dynamic_pointer_cast(blocks["conv_in"]); + auto conv_in = std::dynamic_pointer_cast(blocks["conv_in"]); auto mid_block_1 = std::dynamic_pointer_cast(blocks["mid.block_1"]); auto mid_attn_1 = std::dynamic_pointer_cast(blocks["mid.attn_1"]); auto mid_block_2 = std::dynamic_pointer_cast(blocks["mid.block_2"]); auto norm_out = std::dynamic_pointer_cast(blocks["norm_out"]); - auto conv_out = std::dynamic_pointer_cast(blocks["conv_out"]); + auto conv_out = std::dynamic_pointer_cast(blocks["conv_out"]); auto h = conv_in->forward(ctx, x); // [N, ch, h, w] @@ -332,7 +332,7 @@ class Decoder : public GGMLBlock { if (video_decoder) { return std::shared_ptr(new AE3DConv(in_channels, out_channels, kernel_size, video_kernel_size, stride, padding)); } else { - return std::shared_ptr(new Conv2d(in_channels, out_channels, kernel_size, stride, padding)); + return std::shared_ptr(new Conv2dDirect(in_channels, out_channels, kernel_size, stride, padding)); } } @@ -363,7 +363,7 @@ class Decoder : public GGMLBlock { size_t num_resolutions = ch_mult.size(); int block_in = ch * ch_mult[num_resolutions - 1]; - blocks["conv_in"] = std::shared_ptr(new Conv2d(z_channels, block_in, {3, 3}, {1, 1}, {1, 1})); + blocks["conv_in"] = std::shared_ptr(new Conv2dDirect(z_channels, block_in, {3, 3}, {1, 1}, {1, 1})); blocks["mid.block_1"] = get_resnet_block(block_in, block_in); blocks["mid.attn_1"] = std::shared_ptr(new AttnBlock(block_in)); @@ -394,12 +394,12 @@ class Decoder : public GGMLBlock { // merge_strategy is always learned // time_mode is always conv-only, so we need to replace conv_out_op/resnet_op to AE3DConv/VideoResBlock // AttnVideoBlock will not be used - auto conv_in = std::dynamic_pointer_cast(blocks["conv_in"]); + auto conv_in = std::dynamic_pointer_cast(blocks["conv_in"]); auto mid_block_1 = std::dynamic_pointer_cast(blocks["mid.block_1"]); auto mid_attn_1 = std::dynamic_pointer_cast(blocks["mid.attn_1"]); auto mid_block_2 = std::dynamic_pointer_cast(blocks["mid.block_2"]); auto norm_out = std::dynamic_pointer_cast(blocks["norm_out"]); - auto conv_out = std::dynamic_pointer_cast(blocks["conv_out"]); + auto conv_out = std::dynamic_pointer_cast(blocks["conv_out"]); // conv_in auto h = conv_in->forward(ctx, z); // [N, block_in, h, w] @@ -472,7 +472,7 @@ class AutoencodingEngine : public GGMLBlock { dd_config.z_channels, use_video_decoder)); if (use_quant) { - blocks["post_quant_conv"] = std::shared_ptr(new Conv2d(dd_config.z_channels, + blocks["post_quant_conv"] = std::shared_ptr(new Conv2dDirect(dd_config.z_channels, embed_dim, {1, 1})); } @@ -486,7 +486,7 @@ class AutoencodingEngine : public GGMLBlock { if (use_quant) { int factor = dd_config.double_z ? 
2 : 1; - blocks["quant_conv"] = std::shared_ptr(new Conv2d(embed_dim * factor, + blocks["quant_conv"] = std::shared_ptr(new Conv2dDirect(embed_dim * factor, dd_config.z_channels * factor, {1, 1})); } @@ -496,7 +496,7 @@ class AutoencodingEngine : public GGMLBlock { struct ggml_tensor* decode(struct ggml_context* ctx, struct ggml_tensor* z) { // z: [N, z_channels, h, w] if (use_quant) { - auto post_quant_conv = std::dynamic_pointer_cast(blocks["post_quant_conv"]); + auto post_quant_conv = std::dynamic_pointer_cast(blocks["post_quant_conv"]); z = post_quant_conv->forward(ctx, z); // [N, z_channels, h, w] } auto decoder = std::dynamic_pointer_cast(blocks["decoder"]); @@ -513,7 +513,7 @@ class AutoencodingEngine : public GGMLBlock { auto h = encoder->forward(ctx, x); // [N, 2*z_channels, h/8, w/8] if (use_quant) { - auto quant_conv = std::dynamic_pointer_cast(blocks["quant_conv"]); + auto quant_conv = std::dynamic_pointer_cast(blocks["quant_conv"]); h = quant_conv->forward(ctx, h); // [N, 2*embed_dim, h/8, w/8] } return h; From 6624650eaa9ae5f4773723b370f57fce4f00bdd6 Mon Sep 17 00:00:00 2001 From: Daniele <57776841+daniandtheweb@users.noreply.github.com> Date: Mon, 28 Jul 2025 18:58:16 +0200 Subject: [PATCH 02/12] Enable only for Vulkan, reduced duplicated code --- common.hpp | 8 ++--- ggml_extend.hpp | 60 ++++++++---------------------------- vae.hpp | 82 +++++++++++++++++++++++++++++-------------------- 3 files changed, 65 insertions(+), 85 deletions(-) diff --git a/common.hpp b/common.hpp index 2afee2260..c2aa397ca 100644 --- a/common.hpp +++ b/common.hpp @@ -17,7 +17,7 @@ class DownSampleBlock : public GGMLBlock { out_channels(out_channels), vae_downsample(vae_downsample) { if (vae_downsample) { - blocks["conv"] = std::shared_ptr(new Conv2dDirect(channels, out_channels, {3, 3}, {2, 2}, {0, 0})); + blocks["conv"] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {0, 0}, {1, 1}, true, true)); } else { blocks["op"] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {1, 1})); } @@ -26,7 +26,7 @@ class DownSampleBlock : public GGMLBlock { struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { // x: [N, channels, h, w] if (vae_downsample) { - auto conv = std::dynamic_pointer_cast(blocks["conv"]); + auto conv = std::dynamic_pointer_cast(blocks["conv"]); x = ggml_pad(ctx, x, 1, 1, 0, 0); x = conv->forward(ctx, x); @@ -49,12 +49,12 @@ class UpSampleBlock : public GGMLBlock { int out_channels) : channels(channels), out_channels(out_channels) { - blocks["conv"] = std::shared_ptr(new Conv2dDirect(channels, out_channels, {3, 3}, {1, 1}, {1, 1})); + blocks["conv"] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, true)); } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { // x: [N, channels, h, w] - auto conv = std::dynamic_pointer_cast(blocks["conv"]); + auto conv = std::dynamic_pointer_cast(blocks["conv"]); x = ggml_upscale(ctx, x, 2, GGML_SCALE_MODE_NEAREST); // [N, channels, h*2, w*2] x = conv->forward(ctx, x); // [N, out_channels, h*2, w*2] diff --git a/ggml_extend.hpp b/ggml_extend.hpp index a17162f43..161b48e91 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -1475,6 +1475,7 @@ class Conv2d : public UnaryBlock { std::pair padding; std::pair dilation; bool bias; + bool direct; void init_params(struct ggml_context* ctx, std::map& tensor_types, const std::string prefix = "") { enum ggml_type wtype = GGML_TYPE_F16; //(tensor_types.find(prefix + "weight") != 
tensor_types.end()) ? tensor_types[prefix + "weight"] : GGML_TYPE_F16; @@ -1492,14 +1493,16 @@ class Conv2d : public UnaryBlock { std::pair stride = {1, 1}, std::pair padding = {0, 0}, std::pair dilation = {1, 1}, - bool bias = true) + bool bias = true, + bool direct = false) : in_channels(in_channels), out_channels(out_channels), kernel_size(kernel_size), stride(stride), padding(padding), dilation(dilation), - bias(bias) {} + bias(bias), + direct(direct) {} struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { struct ggml_tensor* w = params["weight"]; @@ -1507,52 +1510,15 @@ class Conv2d : public UnaryBlock { if (bias) { b = params["bias"]; } - return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first); - } -}; - -class Conv2dDirect : public UnaryBlock { -protected: - int64_t in_channels; - int64_t out_channels; - std::pair kernel_size; - std::pair stride; - std::pair padding; - std::pair dilation; - bool bias; - - void init_params(struct ggml_context* ctx, std::map& tensor_types, const std::string prefix = "") { - enum ggml_type wtype = GGML_TYPE_F16; //(tensor_types.find(prefix + "weight") != tensor_types.end()) ? tensor_types[prefix + "weight"] : GGML_TYPE_F16; - params["weight"] = ggml_new_tensor_4d(ctx, wtype, kernel_size.second, kernel_size.first, in_channels, out_channels); - if (bias) { - enum ggml_type wtype = GGML_TYPE_F32; // (tensor_types.find(prefix + "bias") != tensor_types.end()) ? tensor_types[prefix + "bias"] : GGML_TYPE_F32; - params["bias"] = ggml_new_tensor_1d(ctx, wtype, out_channels); - } - } - -public: - Conv2dDirect(int64_t in_channels, - int64_t out_channels, - std::pair kernel_size, - std::pair stride = {1, 1}, - std::pair padding = {0, 0}, - std::pair dilation = {1, 1}, - bool bias = true) - : in_channels(in_channels), - out_channels(out_channels), - kernel_size(kernel_size), - stride(stride), - padding(padding), - dilation(dilation), - bias(bias) {} - - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { - struct ggml_tensor* w = params["weight"]; - struct ggml_tensor* b = NULL; - if (bias) { - b = params["bias"]; + if (direct) { + #if defined(SD_USE_CUDA) || defined(SD_USE_SYCL) || defined(SD_USE_METAL) || defined(SD_USE_OPENCL) + return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first); + #else + return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first); + #endif + } else { + return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first); } - return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first); } }; diff --git a/vae.hpp b/vae.hpp index 88bcc349c..42e64a95e 100644 --- a/vae.hpp +++ b/vae.hpp @@ -20,13 +20,13 @@ class ResnetBlock : public UnaryBlock { out_channels(out_channels) { // temb_channels is always 0 blocks["norm1"] = std::shared_ptr(new GroupNorm32(in_channels)); - blocks["conv1"] = std::shared_ptr(new Conv2dDirect(in_channels, out_channels, {3, 3}, {1, 1}, {1, 1})); + blocks["conv1"] = std::shared_ptr(new Conv2d(in_channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, true)); blocks["norm2"] = std::shared_ptr(new GroupNorm32(out_channels)); - blocks["conv2"] = std::shared_ptr(new Conv2dDirect(out_channels, out_channels, {3, 
3}, {1, 1}, {1, 1})); + blocks["conv2"] = std::shared_ptr(new Conv2d(out_channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, true)); if (out_channels != in_channels) { - blocks["nin_shortcut"] = std::shared_ptr(new Conv2dDirect(in_channels, out_channels, {1, 1})); + blocks["nin_shortcut"] = std::shared_ptr(new Conv2d(in_channels, out_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, true)); } } @@ -34,9 +34,9 @@ class ResnetBlock : public UnaryBlock { // x: [N, in_channels, h, w] // t_emb is always None auto norm1 = std::dynamic_pointer_cast(blocks["norm1"]); - auto conv1 = std::dynamic_pointer_cast(blocks["conv1"]); + auto conv1 = std::dynamic_pointer_cast(blocks["conv1"]); auto norm2 = std::dynamic_pointer_cast(blocks["norm2"]); - auto conv2 = std::dynamic_pointer_cast(blocks["conv2"]); + auto conv2 = std::dynamic_pointer_cast(blocks["conv2"]); auto h = x; h = norm1->forward(ctx, h); @@ -51,7 +51,7 @@ class ResnetBlock : public UnaryBlock { // skip connection if (out_channels != in_channels) { - auto nin_shortcut = std::dynamic_pointer_cast(blocks["nin_shortcut"]); + auto nin_shortcut = std::dynamic_pointer_cast(blocks["nin_shortcut"]); x = nin_shortcut->forward(ctx, x); // [N, out_channels, h, w] } @@ -69,20 +69,20 @@ class AttnBlock : public UnaryBlock { AttnBlock(int64_t in_channels) : in_channels(in_channels) { blocks["norm"] = std::shared_ptr(new GroupNorm32(in_channels)); - blocks["q"] = std::shared_ptr(new Conv2dDirect(in_channels, in_channels, {1, 1})); - blocks["k"] = std::shared_ptr(new Conv2dDirect(in_channels, in_channels, {1, 1})); - blocks["v"] = std::shared_ptr(new Conv2dDirect(in_channels, in_channels, {1, 1})); + blocks["q"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, true)); + blocks["k"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, true)); + blocks["v"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, true)); - blocks["proj_out"] = std::shared_ptr(new Conv2dDirect(in_channels, in_channels, {1, 1})); + blocks["proj_out"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, true)); } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { // x: [N, in_channels, h, w] auto norm = std::dynamic_pointer_cast(blocks["norm"]); - auto q_proj = std::dynamic_pointer_cast(blocks["q"]); - auto k_proj = std::dynamic_pointer_cast(blocks["k"]); - auto v_proj = std::dynamic_pointer_cast(blocks["v"]); - auto proj_out = std::dynamic_pointer_cast(blocks["proj_out"]); + auto q_proj = std::dynamic_pointer_cast(blocks["q"]); + auto k_proj = std::dynamic_pointer_cast(blocks["k"]); + auto v_proj = std::dynamic_pointer_cast(blocks["v"]); + auto proj_out = std::dynamic_pointer_cast(blocks["proj_out"]); auto h_ = norm->forward(ctx, x); @@ -114,7 +114,7 @@ class AttnBlock : public UnaryBlock { } }; -class AE3DConv : public Conv2dDirect { +class AE3DConv : public Conv2d { public: AE3DConv(int64_t in_channels, int64_t out_channels, @@ -123,8 +123,9 @@ class AE3DConv : public Conv2dDirect { std::pair stride = {1, 1}, std::pair padding = {0, 0}, std::pair dilation = {1, 1}, - bool bias = true) - : Conv2dDirect(in_channels, out_channels, kernel_size, stride, padding, dilation, bias) { + bool bias = true, + bool direct = false) + : Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias, direct) { int64_t kernel_padding = video_kernel_size / 2; blocks["time_mix_conv"] = 
std::shared_ptr(new Conv3dnx1x1(out_channels, out_channels, @@ -141,7 +142,7 @@ class AE3DConv : public Conv2dDirect { // result: [N, OC, OH, OW] auto time_mix_conv = std::dynamic_pointer_cast(blocks["time_mix_conv"]); - x = Conv2dDirect::forward(ctx, x); + x = Conv2d::forward(ctx, x); // timesteps = x.shape[0] // x = rearrange(x, "(b t) c h w -> b c t h w", t=timesteps) // x = conv3d(x) @@ -240,7 +241,7 @@ class Encoder : public GGMLBlock { in_channels(in_channels), z_channels(z_channels), double_z(double_z) { - blocks["conv_in"] = std::shared_ptr(new Conv2dDirect(in_channels, ch, {3, 3}, {1, 1}, {1, 1})); + blocks["conv_in"] = std::shared_ptr(new Conv2d(in_channels, ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, true)); size_t num_resolutions = ch_mult.size(); @@ -268,18 +269,18 @@ class Encoder : public GGMLBlock { blocks["mid.block_2"] = std::shared_ptr(new ResnetBlock(block_in, block_in)); blocks["norm_out"] = std::shared_ptr(new GroupNorm32(block_in)); - blocks["conv_out"] = std::shared_ptr(new Conv2dDirect(block_in, double_z ? z_channels * 2 : z_channels, {3, 3}, {1, 1}, {1, 1})); + blocks["conv_out"] = std::shared_ptr(new Conv2d(block_in, double_z ? z_channels * 2 : z_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, true)); } virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { // x: [N, in_channels, h, w] - auto conv_in = std::dynamic_pointer_cast(blocks["conv_in"]); + auto conv_in = std::dynamic_pointer_cast(blocks["conv_in"]); auto mid_block_1 = std::dynamic_pointer_cast(blocks["mid.block_1"]); auto mid_attn_1 = std::dynamic_pointer_cast(blocks["mid.attn_1"]); auto mid_block_2 = std::dynamic_pointer_cast(blocks["mid.block_2"]); auto norm_out = std::dynamic_pointer_cast(blocks["norm_out"]); - auto conv_out = std::dynamic_pointer_cast(blocks["conv_out"]); + auto conv_out = std::dynamic_pointer_cast(blocks["conv_out"]); auto h = conv_in->forward(ctx, x); // [N, ch, h, w] @@ -328,11 +329,14 @@ class Decoder : public GGMLBlock { int64_t out_channels, std::pair kernel_size, std::pair stride = {1, 1}, - std::pair padding = {0, 0}) { + std::pair padding = {0, 0}, + std::pair dilation = {1, 1}, + bool bias = true, + bool direct = false){ if (video_decoder) { return std::shared_ptr(new AE3DConv(in_channels, out_channels, kernel_size, video_kernel_size, stride, padding)); } else { - return std::shared_ptr(new Conv2dDirect(in_channels, out_channels, kernel_size, stride, padding)); + return std::shared_ptr(new Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias, direct)); } } @@ -363,7 +367,7 @@ class Decoder : public GGMLBlock { size_t num_resolutions = ch_mult.size(); int block_in = ch * ch_mult[num_resolutions - 1]; - blocks["conv_in"] = std::shared_ptr(new Conv2dDirect(z_channels, block_in, {3, 3}, {1, 1}, {1, 1})); + blocks["conv_in"] = std::shared_ptr(new Conv2d(z_channels, block_in, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, true)); blocks["mid.block_1"] = get_resnet_block(block_in, block_in); blocks["mid.attn_1"] = std::shared_ptr(new AttnBlock(block_in)); @@ -385,7 +389,7 @@ class Decoder : public GGMLBlock { } blocks["norm_out"] = std::shared_ptr(new GroupNorm32(block_in)); - blocks["conv_out"] = get_conv_out(block_in, out_ch, {3, 3}, {1, 1}, {1, 1}); + blocks["conv_out"] = get_conv_out(block_in, out_ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, true); } virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* z) { @@ -394,12 +398,12 @@ class Decoder : public GGMLBlock { // merge_strategy is always learned // 
time_mode is always conv-only, so we need to replace conv_out_op/resnet_op to AE3DConv/VideoResBlock // AttnVideoBlock will not be used - auto conv_in = std::dynamic_pointer_cast(blocks["conv_in"]); + auto conv_in = std::dynamic_pointer_cast(blocks["conv_in"]); auto mid_block_1 = std::dynamic_pointer_cast(blocks["mid.block_1"]); auto mid_attn_1 = std::dynamic_pointer_cast(blocks["mid.attn_1"]); auto mid_block_2 = std::dynamic_pointer_cast(blocks["mid.block_2"]); auto norm_out = std::dynamic_pointer_cast(blocks["norm_out"]); - auto conv_out = std::dynamic_pointer_cast(blocks["conv_out"]); + auto conv_out = std::dynamic_pointer_cast(blocks["conv_out"]); // conv_in auto h = conv_in->forward(ctx, z); // [N, block_in, h, w] @@ -472,9 +476,14 @@ class AutoencodingEngine : public GGMLBlock { dd_config.z_channels, use_video_decoder)); if (use_quant) { - blocks["post_quant_conv"] = std::shared_ptr(new Conv2dDirect(dd_config.z_channels, + blocks["post_quant_conv"] = std::shared_ptr(new Conv2d(dd_config.z_channels, embed_dim, - {1, 1})); + {1, 1}, + {1, 1}, + {0, 0}, + {1, 1}, + true, + true)); } if (!decode_only) { blocks["encoder"] = std::shared_ptr(new Encoder(dd_config.ch, @@ -486,9 +495,14 @@ class AutoencodingEngine : public GGMLBlock { if (use_quant) { int factor = dd_config.double_z ? 2 : 1; - blocks["quant_conv"] = std::shared_ptr(new Conv2dDirect(embed_dim * factor, + blocks["quant_conv"] = std::shared_ptr(new Conv2d(embed_dim * factor, dd_config.z_channels * factor, - {1, 1})); + {1, 1}, + {1, 1}, + {0, 0}, + {1, 1}, + true, + true)); } } } @@ -496,7 +510,7 @@ class AutoencodingEngine : public GGMLBlock { struct ggml_tensor* decode(struct ggml_context* ctx, struct ggml_tensor* z) { // z: [N, z_channels, h, w] if (use_quant) { - auto post_quant_conv = std::dynamic_pointer_cast(blocks["post_quant_conv"]); + auto post_quant_conv = std::dynamic_pointer_cast(blocks["post_quant_conv"]); z = post_quant_conv->forward(ctx, z); // [N, z_channels, h, w] } auto decoder = std::dynamic_pointer_cast(blocks["decoder"]); @@ -513,7 +527,7 @@ class AutoencodingEngine : public GGMLBlock { auto h = encoder->forward(ctx, x); // [N, 2*z_channels, h/8, w/8] if (use_quant) { - auto quant_conv = std::dynamic_pointer_cast(blocks["quant_conv"]); + auto quant_conv = std::dynamic_pointer_cast(blocks["quant_conv"]); h = quant_conv->forward(ctx, h); // [N, 2*embed_dim, h/8, w/8] } return h; From b06ddf9853792db9d73aca2504d454a39c801a7f Mon Sep 17 00:00:00 2001 From: Daniele <57776841+daniandtheweb@users.noreply.github.com> Date: Mon, 28 Jul 2025 19:21:24 +0200 Subject: [PATCH 03/12] Cmake option to use conv2d direct --- CMakeLists.txt | 6 ++++++ ggml_extend.hpp | 10 +++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 06de0d58b..f7b63c9e0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,6 +31,7 @@ option(SD_VULKAN "sd: vulkan backend" OFF) option(SD_OPENCL "sd: opencl backend" OFF) option(SD_SYCL "sd: sycl backend" OFF) option(SD_MUSA "sd: musa backend" OFF) +option(SD_CONV2D_DIRECT "sd: enable conv2d direct support" OFF) option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF) option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF) #option(SD_BUILD_SERVER "sd: build server example" ON) @@ -77,6 +78,11 @@ if(SD_MUSA) endif() endif() +if(SD_CONV2D_DIRECT) + message("-- Use CONV2D Direct for VAE") + add_definitions(-DSD_USE_CONV2D_DIRECT) +endif () + set(SD_LIB stable-diffusion) 
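The SD_CONV2D_DIRECT option added above is consumed at configure time. As a rough sketch (the build directory and Release config are illustrative and not part of the patch), a Vulkan build with the direct convolution path enabled would be configured like:

    cmake -B build -DSD_VULKAN=ON -DSD_CONV2D_DIRECT=ON
    cmake --build build --config Release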
 file(GLOB SD_LIB_SOURCES
diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index 161b48e91..45ad2381d 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -1511,10 +1511,14 @@ class Conv2d : public UnaryBlock {
             b = params["bias"];
         }
         if (direct) {
-            #if defined(SD_USE_CUDA) || defined(SD_USE_SYCL) || defined(SD_USE_METAL) || defined(SD_USE_OPENCL)
-                return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
+            #if defined(SD_USE_CONV2D_DIRECT)
+                #if defined(SD_USE_CUDA) || defined(SD_USE_SYCL) || defined(SD_USE_METAL) || defined(SD_USE_OPENCL)
+                    return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
+                #else
+                    return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
+                #endif
             #else
-                return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
+                return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
             #endif
         } else {
             return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);

From f5b5f5c77463476c6e4e7f059ba4a05ae3da682d Mon Sep 17 00:00:00 2001
From: Daniele <57776841+daniandtheweb@users.noreply.github.com>
Date: Mon, 28 Jul 2025 19:28:08 +0200
Subject: [PATCH 04/12] conv2d direct always on for opencl

---
 CMakeLists.txt  | 1 +
 ggml_extend.hpp | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f7b63c9e0..8bdbcd211 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -58,6 +58,7 @@ if (SD_OPENCL)
     message("-- Use OpenCL as backend stable-diffusion")
     set(GGML_OPENCL ON)
     add_definitions(-DSD_USE_OPENCL)
+    add_definitions(-DSD_USE_CONV2D_DIRECT)
 endif ()

diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index 45ad2381d..d5cfd00f8 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -1510,9 +1510,12 @@ class Conv2d : public UnaryBlock {
         if (bias) {
             b = params["bias"];
         }
+        #if defined(SD_USE_OPENCL)
+        direct = true;
+        #endif
         if (direct) {
             #if defined(SD_USE_CONV2D_DIRECT)
-                #if defined(SD_USE_CUDA) || defined(SD_USE_SYCL) || defined(SD_USE_METAL) || defined(SD_USE_OPENCL)
+                #if defined(SD_USE_CUDA) || defined(SD_USE_SYCL) || defined(SD_USE_METAL)
                     return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
                 #else
                     return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);

From 7ce4f3b031828a8bc7cd6246ab50a686481f8b40 Mon Sep 17 00:00:00 2001
From: Daniele <57776841+daniandtheweb@users.noreply.github.com>
Date: Wed, 30 Jul 2025 20:10:01 +0200
Subject: [PATCH 05/12] conv direct as a flag

---
 CMakeLists.txt        |  7 ----
 README.md             |  2 +
 common.hpp            | 27 +++++-----
 diffusion_model.hpp   |  5 ++-
 examples/cli/main.cpp | 10 +++++
 ggml_extend.hpp       | 10 ++---
 stable-diffusion.cpp  | 15 +++++--
 stable-diffusion.h    |  2 +
 tae.hpp               | 80 ++++++++++++++++++++----------------
 unet.hpp              | 19 +++++----
 vae.hpp               | 95 ++++++++++++++++++++++++++-----------------
 11 files changed, 161 insertions(+), 111 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8bdbcd211..06de0d58b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -31,7 +31,6 @@ option(SD_VULKAN "sd: vulkan backend" OFF)
 option(SD_OPENCL "sd: opencl backend" OFF)
 option(SD_SYCL "sd: sycl backend" OFF)
 option(SD_MUSA "sd: musa backend" OFF)
-option(SD_CONV2D_DIRECT "sd: enable conv2d direct support" OFF)
 option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
 option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
 #option(SD_BUILD_SERVER "sd: build server example" ON)
@@ -58,7 +57,6 @@ if (SD_OPENCL)
     message("-- Use OpenCL as backend stable-diffusion")
     set(GGML_OPENCL ON)
     add_definitions(-DSD_USE_OPENCL)
-    add_definitions(-DSD_USE_CONV2D_DIRECT)
 endif ()

 if (SD_HIPBLAS)
@@ -79,11 +77,6 @@ if(SD_MUSA)
     endif()
 endif()

-if(SD_CONV2D_DIRECT)
-    message("-- Use CONV2D Direct for VAE")
-    add_definitions(-DSD_USE_CONV2D_DIRECT)
-endif ()
-
 set(SD_LIB stable-diffusion)

 file(GLOB SD_LIB_SOURCES
diff --git a/README.md b/README.md
index 89eb095ec..3513b9f42 100644
--- a/README.md
+++ b/README.md
@@ -339,6 +339,8 @@ arguments:
   --vae-on-cpu                       keep vae in cpu (for low vram)
   --clip-on-cpu                      keep clip in cpu (for low vram)
   --diffusion-fa                     use flash attention in the diffusion model (for low vram)
+  --diffusion-conv-direct            use Conv2D direct in the diffusion model
+  --vae-conv-direct                  use Conv2D direct in the vae model (should improve performance)
                                      Might lower quality, since it implies converting k and v to f16.
                                      This might crash if it is not supported by the backend.
   --control-net-cpu                  keep controlnet in cpu (for low vram)
diff --git a/common.hpp b/common.hpp
index c2aa397ca..5dff00eff 100644
--- a/common.hpp
+++ b/common.hpp
@@ -8,18 +8,21 @@ class DownSampleBlock : public GGMLBlock {
     int channels;
     int out_channels;
     bool vae_downsample;
+    bool direct = false;

 public:
     DownSampleBlock(int channels,
                     int out_channels,
-                    bool vae_downsample = false)
+                    bool vae_downsample = false,
+                    bool direct = false)
         : channels(channels),
           out_channels(out_channels),
-          vae_downsample(vae_downsample) {
+          vae_downsample(vae_downsample),
+          direct(direct) {
         if (vae_downsample) {
-            blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {0, 0}, {1, 1}, true, true));
+            blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {0, 0}, {1, 1}, true, direct));
         } else {
-            blocks["op"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {1, 1}));
+            blocks["op"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, true, direct));
         }
     }

@@ -43,13 +46,16 @@ class UpSampleBlock : public GGMLBlock {
 protected:
     int channels;
     int out_channels;
+    bool direct = false;

 public:
     UpSampleBlock(int channels,
-                  int out_channels)
+                  int out_channels,
+                  bool direct = false)
         : channels(channels),
-          out_channels(out_channels) {
-        blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, true));
+          out_channels(out_channels),
+          direct(direct) {
+        blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct));
     }

     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
@@ -381,7 +387,8 @@ class SpatialTransformer : public GGMLBlock {
                        int64_t d_head,
                        int64_t depth,
                        int64_t context_dim,
-                       bool flash_attn = false)
+                       bool flash_attn = false,
+                       bool direct = false)
         : in_channels(in_channels),
           n_head(n_head),
           d_head(d_head),
@@ -391,14 +398,14 @@ class SpatialTransformer : public GGMLBlock {
         // disable_self_attn is always False
         int64_t inner_dim = n_head * d_head;  // in_channels
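As a usage illustration for the README entries introduced by this patch (binary name, model file, and prompt below are placeholders, not taken from the series), the new switches are plain runtime flags; when set, the run should also print the "Using Conv2D direct ..." messages added to stable-diffusion.cpp further down:

    ./sd -m sd-v1-5.safetensors -p "a photo of a cat" --diffusion-conv-direct --vae-conv-direct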
         blocks["norm"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
-        blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, inner_dim, {1, 1}));
+        blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, inner_dim, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct));

         for (int i = 0; i < depth; i++) {
             std::string name = "transformer_blocks." + std::to_string(i);
             blocks[name] = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim, false, flash_attn));
         }

-        blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(inner_dim, in_channels, {1, 1}));
+        blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(inner_dim, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct));
     }

     virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
diff --git a/diffusion_model.hpp b/diffusion_model.hpp
index 5c349439d..fe799251b 100644
--- a/diffusion_model.hpp
+++ b/diffusion_model.hpp
@@ -34,8 +34,9 @@ struct UNetModel : public DiffusionModel {
     UNetModel(ggml_backend_t backend,
               std::map<std::string, enum ggml_type>& tensor_types,
               SDVersion version = VERSION_SD1,
-              bool flash_attn = false)
-        : unet(backend, tensor_types, "model.diffusion_model", version, flash_attn) {
+              bool flash_attn = false,
+              bool direct = false)
+        : unet(backend, tensor_types, "model.diffusion_model", version, flash_attn, direct) {
     }

     void alloc_params_buffer() {
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index 140e3843a..3a13ef128 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -97,6 +97,8 @@ struct SDParams {
     bool clip_on_cpu = false;
     bool vae_on_cpu = false;
     bool diffusion_flash_attn = false;
+    bool diffusion_conv_direct = false;
+    bool vae_conv_direct = false;
     bool canny_preprocess = false;
     bool color = false;
     int upscale_repeats = 1;
@@ -142,6 +144,8 @@ void print_params(SDParams params) {
     printf(" controlnet cpu: %s\n", params.control_net_cpu ? "true" : "false");
     printf(" vae decoder on cpu:%s\n", params.vae_on_cpu ? "true" : "false");
     printf(" diffusion flash attention:%s\n", params.diffusion_flash_attn ? "true" : "false");
+    printf(" diffusion Conv2D direct:%s\n", params.diffusion_conv_direct ? "true" : "false");
+    printf(" vae Conv2D direct:%s\n", params.vae_conv_direct ? "true" : "false");
     printf(" strength(control): %.2f\n", params.control_strength);
     printf(" prompt: %s\n", params.prompt.c_str());
     printf(" negative_prompt: %s\n", params.negative_prompt.c_str());
@@ -232,6 +236,8 @@ void print_usage(int argc, const char* argv[]) {
     printf("  --diffusion-fa                     use flash attention in the diffusion model (for low vram)\n");
     printf("                                     Might lower quality, since it implies converting k and v to f16.\n");
     printf("                                     This might crash if it is not supported by the backend.\n");
+    printf("  --diffusion-conv-direct            use Conv2D direct in the diffusion model\n");
+    printf("  --vae-conv-direct                  use Conv2D direct in the vae model (should improve performance)\n");
     printf("  --control-net-cpu                  keep controlnet in cpu (for low vram)\n");
     printf("  --canny                            apply canny preprocessor (edge detection)\n");
     printf("  --color                            colors the logging tags according to level\n");
@@ -422,6 +428,8 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         {"", "--clip-on-cpu", "", true, &params.clip_on_cpu},
         {"", "--vae-on-cpu", "", true, &params.vae_on_cpu},
         {"", "--diffusion-fa", "", true, &params.diffusion_flash_attn},
+        {"", "--diffusion-conv-direct", "", true, &params.diffusion_conv_direct},
+        {"", "--vae-conv-direct", "", true, &params.vae_conv_direct},
         {"", "--canny", "", true, &params.canny_preprocess},
         {"-v", "--verbos", "", true, &params.verbose},
         {"", "--color", "", true, &params.color},
@@ -901,6 +909,8 @@ int main(int argc, const char* argv[]) {
         params.control_net_cpu,
         params.vae_on_cpu,
         params.diffusion_flash_attn,
+        params.diffusion_conv_direct,
+        params.vae_conv_direct,
         params.chroma_use_dit_mask,
         params.chroma_use_t5_mask,
         params.chroma_t5_mask_pad,
diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index d5cfd00f8..d3d437be8 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -1514,14 +1514,10 @@ class Conv2d : public UnaryBlock {
         direct = true;
         #endif
         if (direct) {
-            #if defined(SD_USE_CONV2D_DIRECT)
-                #if defined(SD_USE_CUDA) || defined(SD_USE_SYCL) || defined(SD_USE_METAL)
-                    return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
-                #else
-                    return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
-                #endif
-            #else
+            #if defined(SD_USE_CUDA) || defined(SD_USE_SYCL) || defined(SD_USE_METAL)
                 return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
+            #else
+                return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
             #endif
         } else {
             return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first);
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 2594ba2b7..7baa74282 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -326,6 +326,12 @@ class StableDiffusionGGML {
             LOG_INFO("CLIP: Using CPU backend");
             clip_backend = ggml_backend_cpu_init();
         }
+        if (sd_ctx_params->diffusion_conv_direct) {
+            LOG_INFO("Using Conv2D direct in the diffusion model");
+        }
+        if (sd_ctx_params->vae_conv_direct) {
+            LOG_INFO("Using Conv2D direct in the vae model");
+        }
         if (sd_ctx_params->diffusion_flash_attn) {
             LOG_INFO("Using flash attention in the diffusion model");
         }
@@ -373,7 +379,8 @@ class StableDiffusionGGML {
             diffusion_model = std::make_shared<UNetModel>(backend,
                                                           model_loader.tensor_storages_types,
                                                           version,
sd_ctx_params->diffusion_flash_attn, + sd_ctx_params->diffusion_conv_direct); } cond_stage_model->alloc_params_buffer(); @@ -394,7 +401,8 @@ class StableDiffusionGGML { "first_stage_model", vae_decode_only, false, - version); + version, + sd_ctx_params->vae_conv_direct); first_stage_model->alloc_params_buffer(); first_stage_model->get_param_tensors(tensors, "first_stage_model"); } else { @@ -402,7 +410,8 @@ class StableDiffusionGGML { model_loader.tensor_storages_types, "decoder.layers", vae_decode_only, - version); + version, + sd_ctx_params->vae_conv_direct); } // first_stage_model->get_param_tensors(tensors, "first_stage_model."); diff --git a/stable-diffusion.h b/stable-diffusion.h index a60325923..fc68f9b13 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -134,6 +134,8 @@ typedef struct { bool keep_control_net_on_cpu; bool keep_vae_on_cpu; bool diffusion_flash_attn; + bool diffusion_conv_direct; + bool vae_conv_direct; bool chroma_use_dit_mask; bool chroma_use_t5_mask; int chroma_t5_mask_pad; diff --git a/tae.hpp b/tae.hpp index 678c44c57..54ec7d659 100644 --- a/tae.hpp +++ b/tae.hpp @@ -17,15 +17,16 @@ class TAEBlock : public UnaryBlock { protected: int n_in; int n_out; + bool direct = false; public: - TAEBlock(int n_in, int n_out) - : n_in(n_in), n_out(n_out) { - blocks["conv.0"] = std::shared_ptr(new Conv2d(n_in, n_out, {3, 3}, {1, 1}, {1, 1})); - blocks["conv.2"] = std::shared_ptr(new Conv2d(n_out, n_out, {3, 3}, {1, 1}, {1, 1})); - blocks["conv.4"] = std::shared_ptr(new Conv2d(n_out, n_out, {3, 3}, {1, 1}, {1, 1})); + TAEBlock(int n_in, int n_out, bool direct = false) + : n_in(n_in), n_out(n_out), direct(direct) { + blocks["conv.0"] = std::shared_ptr(new Conv2d(n_in, n_out, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["conv.2"] = std::shared_ptr(new Conv2d(n_out, n_out, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["conv.4"] = std::shared_ptr(new Conv2d(n_out, n_out, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); if (n_in != n_out) { - blocks["skip"] = std::shared_ptr(new Conv2d(n_in, n_out, {1, 1}, {1, 1}, {1, 1}, {1, 1}, false)); + blocks["skip"] = std::shared_ptr(new Conv2d(n_in, n_out, {1, 1}, {1, 1}, {1, 1}, {1, 1}, false, direct)); } } @@ -60,30 +61,32 @@ class TinyEncoder : public UnaryBlock { int channels = 64; int z_channels = 4; int num_blocks = 3; + bool direct = false; public: - TinyEncoder(int z_channels = 4) - : z_channels(z_channels) { + TinyEncoder(int z_channels = 4, bool direct = false) + : z_channels(z_channels), + direct(direct) { int index = 0; - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(in_channels, channels, {3, 3}, {1, 1}, {1, 1})); - blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(in_channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels, direct)); - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false, direct)); for (int i = 0; i < num_blocks; i++) { - blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels)); + blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels, direct)); } - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, 
{2, 2}, {1, 1}, {1, 1}, false)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false, direct)); for (int i = 0; i < num_blocks; i++) { - blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels)); + blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels, direct)); } - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false, direct)); for (int i = 0; i < num_blocks; i++) { - blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels)); + blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels, direct)); } - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, z_channels, {3, 3}, {1, 1}, {1, 1})); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, z_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { @@ -105,35 +108,37 @@ class TinyDecoder : public UnaryBlock { int channels = 64; int out_channels = 3; int num_blocks = 3; + bool direct = false; public: - TinyDecoder(int z_channels = 4) - : z_channels(z_channels) { + TinyDecoder(int z_channels = 4, bool direct = false) + : z_channels(z_channels), + direct(direct) { int index = 0; - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(z_channels, channels, {3, 3}, {1, 1}, {1, 1})); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(z_channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); index++; // nn.ReLU() for (int i = 0; i < num_blocks; i++) { - blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels)); + blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels, direct)); } index++; // nn.Upsample() - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false, direct)); for (int i = 0; i < num_blocks; i++) { - blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels)); + blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels, direct)); } index++; // nn.Upsample() - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false, direct)); for (int i = 0; i < num_blocks; i++) { - blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels)); + blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels, direct)); } index++; // nn.Upsample() - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false, direct)); - blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels)); - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1})); + blocks[std::to_string(index++)] = 
std::shared_ptr(new TAEBlock(channels, channels, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* z) { @@ -165,18 +170,20 @@ class TinyDecoder : public UnaryBlock { class TAESD : public GGMLBlock { protected: bool decode_only; + bool direct = false; public: - TAESD(bool decode_only = true, SDVersion version = VERSION_SD1) - : decode_only(decode_only) { + TAESD(bool decode_only = true, SDVersion version = VERSION_SD1, bool direct = false) + : decode_only(decode_only), + direct(direct) { int z_channels = 4; if (sd_version_is_dit(version)) { z_channels = 16; } - blocks["decoder.layers"] = std::shared_ptr(new TinyDecoder(z_channels)); + blocks["decoder.layers"] = std::shared_ptr(new TinyDecoder(z_channels, direct)); if (!decode_only) { - blocks["encoder.layers"] = std::shared_ptr(new TinyEncoder(z_channels)); + blocks["encoder.layers"] = std::shared_ptr(new TinyEncoder(z_channels, direct)); } } @@ -194,14 +201,17 @@ class TAESD : public GGMLBlock { struct TinyAutoEncoder : public GGMLRunner { TAESD taesd; bool decode_only = false; + bool direct = false; TinyAutoEncoder(ggml_backend_t backend, std::map& tensor_types, const std::string prefix, bool decoder_only = true, - SDVersion version = VERSION_SD1) + SDVersion version = VERSION_SD1, + bool direct = false) : decode_only(decoder_only), - taesd(decoder_only, version), + taesd(decoder_only, version, direct), + direct(direct), GGMLRunner(backend) { taesd.init(params_ctx, tensor_types, prefix); } @@ -258,4 +268,4 @@ struct TinyAutoEncoder : public GGMLRunner { } }; -#endif // __TAE_HPP__ \ No newline at end of file +#endif // __TAE_HPP__ diff --git a/unet.hpp b/unet.hpp index 9193dcd67..d5db40e8f 100644 --- a/unet.hpp +++ b/unet.hpp @@ -184,7 +184,7 @@ class UnetModelBlock : public GGMLBlock { int model_channels = 320; int adm_in_channels = 2816; // only for VERSION_SDXL/SVD - UnetModelBlock(SDVersion version = VERSION_SD1, std::map& tensor_types = empty_tensor_types, bool flash_attn = false) + UnetModelBlock(SDVersion version = VERSION_SD1, std::map& tensor_types = empty_tensor_types, bool flash_attn = false, bool direct = false) : version(version) { if (sd_version_is_sd2(version)) { context_dim = 1024; @@ -225,7 +225,7 @@ class UnetModelBlock : public GGMLBlock { } // input_blocks - blocks["input_blocks.0.0"] = std::shared_ptr(new Conv2d(in_channels, model_channels, {3, 3}, {1, 1}, {1, 1})); + blocks["input_blocks.0.0"] = std::shared_ptr(new Conv2d(in_channels, model_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); std::vector input_block_chans; input_block_chans.push_back(model_channels); @@ -237,7 +237,7 @@ class UnetModelBlock : public GGMLBlock { if (version == VERSION_SVD) { return new VideoResBlock(channels, emb_channels, out_channels); } else { - return new ResBlock(channels, emb_channels, out_channels); + return new ResBlock(channels, emb_channels, out_channels, {3, 3}); } }; @@ -249,7 +249,7 @@ class UnetModelBlock : public GGMLBlock { if (version == VERSION_SVD) { return new SpatialVideoTransformer(in_channels, n_head, d_head, depth, context_dim); } else { - return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, flash_attn); + return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, flash_attn, direct); } }; @@ -281,7 +281,7 @@ class UnetModelBlock : public GGMLBlock { if (i != len_mults - 1) { input_block_idx 
+= 1; std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0"; - blocks[name] = std::shared_ptr(new DownSampleBlock(ch, ch)); + blocks[name] = std::shared_ptr(new DownSampleBlock(ch, ch, false, direct)); input_block_chans.push_back(ch); ds *= 2; @@ -331,7 +331,7 @@ class UnetModelBlock : public GGMLBlock { if (i > 0 && j == num_res_blocks) { std::string name = "output_blocks." + std::to_string(output_block_idx) + "." + std::to_string(up_sample_idx); - blocks[name] = std::shared_ptr(new UpSampleBlock(ch, ch)); + blocks[name] = std::shared_ptr(new UpSampleBlock(ch, ch, direct)); ds /= 2; } @@ -343,7 +343,7 @@ class UnetModelBlock : public GGMLBlock { // out blocks["out.0"] = std::shared_ptr(new GroupNorm32(ch)); // ch == model_channels // out_1 is nn.SiLU() - blocks["out.2"] = std::shared_ptr(new Conv2d(model_channels, out_channels, {3, 3}, {1, 1}, {1, 1})); + blocks["out.2"] = std::shared_ptr(new Conv2d(model_channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); } struct ggml_tensor* resblock_forward(std::string name, @@ -542,8 +542,9 @@ struct UNetModelRunner : public GGMLRunner { std::map& tensor_types, const std::string prefix, SDVersion version = VERSION_SD1, - bool flash_attn = false) - : GGMLRunner(backend), unet(version, tensor_types, flash_attn) { + bool flash_attn = false, + bool direct = false) + : GGMLRunner(backend), unet(version, tensor_types, flash_attn, direct) { unet.init(params_ctx, tensor_types, prefix); } diff --git a/vae.hpp b/vae.hpp index 42e64a95e..07afe28c0 100644 --- a/vae.hpp +++ b/vae.hpp @@ -12,21 +12,24 @@ class ResnetBlock : public UnaryBlock { protected: int64_t in_channels; int64_t out_channels; + bool direct = false; public: ResnetBlock(int64_t in_channels, - int64_t out_channels) + int64_t out_channels, + bool direct = false) : in_channels(in_channels), - out_channels(out_channels) { + out_channels(out_channels), + direct(direct){ // temb_channels is always 0 blocks["norm1"] = std::shared_ptr(new GroupNorm32(in_channels)); - blocks["conv1"] = std::shared_ptr(new Conv2d(in_channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, true)); + blocks["conv1"] = std::shared_ptr(new Conv2d(in_channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); blocks["norm2"] = std::shared_ptr(new GroupNorm32(out_channels)); - blocks["conv2"] = std::shared_ptr(new Conv2d(out_channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, true)); + blocks["conv2"] = std::shared_ptr(new Conv2d(out_channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); if (out_channels != in_channels) { - blocks["nin_shortcut"] = std::shared_ptr(new Conv2d(in_channels, out_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, true)); + blocks["nin_shortcut"] = std::shared_ptr(new Conv2d(in_channels, out_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct)); } } @@ -64,16 +67,19 @@ class ResnetBlock : public UnaryBlock { class AttnBlock : public UnaryBlock { protected: int64_t in_channels; + bool direct = false; public: - AttnBlock(int64_t in_channels) - : in_channels(in_channels) { + AttnBlock(int64_t in_channels, + bool direct = false) + : in_channels(in_channels), + direct(direct){ blocks["norm"] = std::shared_ptr(new GroupNorm32(in_channels)); - blocks["q"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, true)); - blocks["k"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, true)); - blocks["v"] = std::shared_ptr(new Conv2d(in_channels, 
in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, true)); + blocks["q"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct)); + blocks["k"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct)); + blocks["v"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct)); - blocks["proj_out"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, true)); + blocks["proj_out"] = std::shared_ptr(new Conv2d(in_channels, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct)); } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { @@ -177,8 +183,9 @@ class VideoResnetBlock : public ResnetBlock { public: VideoResnetBlock(int64_t in_channels, int64_t out_channels, - int video_kernel_size = 3) - : ResnetBlock(in_channels, out_channels) { + int video_kernel_size = 3, + bool direct = false) + : ResnetBlock(in_channels, out_channels, direct) { // merge_strategy is always learned blocks["time_stack"] = std::shared_ptr(new ResBlock(out_channels, 0, out_channels, {video_kernel_size, 1}, 3, false, true)); } @@ -227,6 +234,7 @@ class Encoder : public GGMLBlock { int in_channels = 3; int z_channels = 4; bool double_z = true; + bool direct = false; public: Encoder(int ch, @@ -234,14 +242,16 @@ class Encoder : public GGMLBlock { int num_res_blocks, int in_channels, int z_channels, - bool double_z = true) + bool double_z = true, + bool direct = false) : ch(ch), ch_mult(ch_mult), num_res_blocks(num_res_blocks), in_channels(in_channels), z_channels(z_channels), - double_z(double_z) { - blocks["conv_in"] = std::shared_ptr(new Conv2d(in_channels, ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, true)); + double_z(double_z), + direct(direct){ + blocks["conv_in"] = std::shared_ptr(new Conv2d(in_channels, ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); size_t num_resolutions = ch_mult.size(); @@ -255,21 +265,21 @@ class Encoder : public GGMLBlock { int block_out = ch * ch_mult[i]; for (int j = 0; j < num_res_blocks; j++) { std::string name = "down." + std::to_string(i) + ".block." + std::to_string(j); - blocks[name] = std::shared_ptr(new ResnetBlock(block_in, block_out)); + blocks[name] = std::shared_ptr(new ResnetBlock(block_in, block_out, direct)); block_in = block_out; } if (i != num_resolutions - 1) { std::string name = "down." + std::to_string(i) + ".downsample"; - blocks[name] = std::shared_ptr(new DownSampleBlock(block_in, block_in, true)); + blocks[name] = std::shared_ptr(new DownSampleBlock(block_in, block_in, true, direct)); } } - blocks["mid.block_1"] = std::shared_ptr(new ResnetBlock(block_in, block_in)); - blocks["mid.attn_1"] = std::shared_ptr(new AttnBlock(block_in)); - blocks["mid.block_2"] = std::shared_ptr(new ResnetBlock(block_in, block_in)); + blocks["mid.block_1"] = std::shared_ptr(new ResnetBlock(block_in, block_in, direct)); + blocks["mid.attn_1"] = std::shared_ptr(new AttnBlock(block_in, direct)); + blocks["mid.block_2"] = std::shared_ptr(new ResnetBlock(block_in, block_in, direct)); blocks["norm_out"] = std::shared_ptr(new GroupNorm32(block_in)); - blocks["conv_out"] = std::shared_ptr(new Conv2d(block_in, double_z ? z_channels * 2 : z_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, true)); + blocks["conv_out"] = std::shared_ptr(new Conv2d(block_in, double_z ? 
z_channels * 2 : z_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); } virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { @@ -324,6 +334,7 @@ class Decoder : public GGMLBlock { int z_channels = 4; bool video_decoder = false; int video_kernel_size = 3; + bool direct = false; virtual std::shared_ptr get_conv_out(int64_t in_channels, int64_t out_channels, @@ -343,9 +354,9 @@ class Decoder : public GGMLBlock { virtual std::shared_ptr get_resnet_block(int64_t in_channels, int64_t out_channels) { if (video_decoder) { - return std::shared_ptr(new VideoResnetBlock(in_channels, out_channels, video_kernel_size)); + return std::shared_ptr(new VideoResnetBlock(in_channels, out_channels, video_kernel_size, direct)); } else { - return std::shared_ptr(new ResnetBlock(in_channels, out_channels)); + return std::shared_ptr(new ResnetBlock(in_channels, out_channels, direct)); } } @@ -356,21 +367,23 @@ class Decoder : public GGMLBlock { int num_res_blocks, int z_channels, bool video_decoder = false, - int video_kernel_size = 3) + int video_kernel_size = 3, + bool direct = false) : ch(ch), out_ch(out_ch), ch_mult(ch_mult), num_res_blocks(num_res_blocks), z_channels(z_channels), video_decoder(video_decoder), - video_kernel_size(video_kernel_size) { + video_kernel_size(video_kernel_size), + direct(direct) { size_t num_resolutions = ch_mult.size(); int block_in = ch * ch_mult[num_resolutions - 1]; - blocks["conv_in"] = std::shared_ptr(new Conv2d(z_channels, block_in, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, true)); + blocks["conv_in"] = std::shared_ptr(new Conv2d(z_channels, block_in, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); blocks["mid.block_1"] = get_resnet_block(block_in, block_in); - blocks["mid.attn_1"] = std::shared_ptr(new AttnBlock(block_in)); + blocks["mid.attn_1"] = std::shared_ptr(new AttnBlock(block_in, direct)); blocks["mid.block_2"] = get_resnet_block(block_in, block_in); for (int i = num_resolutions - 1; i >= 0; i--) { @@ -384,12 +397,12 @@ class Decoder : public GGMLBlock { } if (i != 0) { std::string name = "up." 
+ std::to_string(i) + ".upsample"; - blocks[name] = std::shared_ptr(new UpSampleBlock(block_in, block_in)); + blocks[name] = std::shared_ptr(new UpSampleBlock(block_in, block_in, direct)); } } blocks["norm_out"] = std::shared_ptr(new GroupNorm32(block_in)); - blocks["conv_out"] = get_conv_out(block_in, out_ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, true); + blocks["conv_out"] = get_conv_out(block_in, out_ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct); } virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* z) { @@ -446,6 +459,7 @@ class AutoencodingEngine : public GGMLBlock { bool use_video_decoder = false; bool use_quant = true; int embed_dim = 4; + bool direct = false; struct { int z_channels = 4; int resolution = 256; @@ -460,8 +474,9 @@ class AutoencodingEngine : public GGMLBlock { public: AutoencodingEngine(bool decode_only = true, bool use_video_decoder = false, - SDVersion version = VERSION_SD1) - : decode_only(decode_only), use_video_decoder(use_video_decoder) { + SDVersion version = VERSION_SD1, + bool direct = false) + : decode_only(decode_only), use_video_decoder(use_video_decoder), direct(direct) { if (sd_version_is_dit(version)) { dd_config.z_channels = 16; use_quant = false; @@ -474,7 +489,9 @@ class AutoencodingEngine : public GGMLBlock { dd_config.ch_mult, dd_config.num_res_blocks, dd_config.z_channels, - use_video_decoder)); + use_video_decoder, + 3, + direct)); if (use_quant) { blocks["post_quant_conv"] = std::shared_ptr(new Conv2d(dd_config.z_channels, embed_dim, @@ -483,7 +500,7 @@ class AutoencodingEngine : public GGMLBlock { {0, 0}, {1, 1}, true, - true)); + direct)); } if (!decode_only) { blocks["encoder"] = std::shared_ptr(new Encoder(dd_config.ch, @@ -491,7 +508,8 @@ class AutoencodingEngine : public GGMLBlock { dd_config.num_res_blocks, dd_config.in_channels, dd_config.z_channels, - dd_config.double_z)); + dd_config.double_z, + direct)); if (use_quant) { int factor = dd_config.double_z ? 
2 : 1; @@ -502,7 +520,7 @@ class AutoencodingEngine : public GGMLBlock { {0, 0}, {1, 1}, true, - true)); + direct)); } } } @@ -543,8 +561,9 @@ struct AutoEncoderKL : public GGMLRunner { const std::string prefix, bool decode_only = false, bool use_video_decoder = false, - SDVersion version = VERSION_SD1) - : decode_only(decode_only), ae(decode_only, use_video_decoder, version), GGMLRunner(backend) { + SDVersion version = VERSION_SD1, + bool direct = false) + : decode_only(decode_only), ae(decode_only, use_video_decoder, version, direct), GGMLRunner(backend) { ae.init(params_ctx, tensor_types, prefix); } From 70cef96a7654afb9efe2535cbd1af5f94df789e4 Mon Sep 17 00:00:00 2001 From: Daniele <57776841+daniandtheweb@users.noreply.github.com> Date: Wed, 30 Jul 2025 20:22:01 +0200 Subject: [PATCH 06/12] fix merge typo --- diffusion_model.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/diffusion_model.hpp b/diffusion_model.hpp index 33175b9f1..de65c1c36 100644 --- a/diffusion_model.hpp +++ b/diffusion_model.hpp @@ -34,7 +34,8 @@ struct UNetModel : public DiffusionModel { UNetModel(ggml_backend_t backend, const String2GGMLType& tensor_types = {}, SDVersion version = VERSION_SD1, - bool flash_attn = false) + bool flash_attn = false, + bool direct = false) : unet(backend, tensor_types, "model.diffusion_model", version, flash_attn, direct) { } From 9bbf53c3c67c1840d813e8c4b907f0e11f59c8f5 Mon Sep 17 00:00:00 2001 From: Daniele <57776841+daniandtheweb@users.noreply.github.com> Date: Thu, 31 Jul 2025 11:24:59 +0200 Subject: [PATCH 07/12] Align conv2d behavior to flash attention's --- examples/cli/main.cpp | 2 ++ ggml_extend.hpp | 9 +-------- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 3a13ef128..f8fedcb69 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -237,7 +237,9 @@ void print_usage(int argc, const char* argv[]) { printf(" Might lower quality, since it implies converting k and v to f16.\n"); printf(" This might crash if it is not supported by the backend.\n"); printf(" --diffusion-conv-direct use Conv2D direct in the diffusion model"); + printf(" This might crash if it is not supported by the backend.\n"); printf(" --vae-conv-direct use Conv2D direct in the vae model (should improve the performance)"); + printf(" This might crash if it is not supported by the backend.\n"); printf(" --control-net-cpu keep controlnet in cpu (for low vram)\n"); printf(" --canny apply canny preprocessor (edge detection)\n"); printf(" --color colors the logging tags according to level\n"); diff --git a/ggml_extend.hpp b/ggml_extend.hpp index ab4926ee1..de7b2bfa1 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -1518,15 +1518,8 @@ class Conv2d : public UnaryBlock { if (bias) { b = params["bias"]; } - #if defined(SD_USE_OPENCL) - direct = true - #endif if (direct) { - #if defined(SD_USE_CUDA) || defined(SD_USE_SYCL) || defined(SD_USE_METAL) - return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first); - #else - return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first); - #endif + return ggml_nn_conv_2d_direct(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first); } else { return ggml_nn_conv_2d(ctx, x, w, b, stride.second, stride.first, padding.second, padding.first, dilation.second, dilation.first); } From 
9a349b2f73969e36b48005887b59041ac89cba2c Mon Sep 17 00:00:00 2001 From: Daniele <57776841+daniandtheweb@users.noreply.github.com> Date: Thu, 31 Jul 2025 12:14:56 +0200 Subject: [PATCH 08/12] fix readme --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 3513b9f42..e07142c00 100644 --- a/README.md +++ b/README.md @@ -339,9 +339,11 @@ arguments: --vae-on-cpu keep vae in cpu (for low vram) --clip-on-cpu keep clip in cpu (for low vram) --diffusion-fa use flash attention in the diffusion model (for low vram) + Might lower quality, since it implies converting k and v to f16. + This might crash if it is not supported by the backend. --diffusion-conv-direct use Conv2D direct in the diffusion model + This might crash if it is not supported by the backend. --vae-conv-direct use Conv2D direct in the vae model (should improve the performance) - Might lower quality, since it implies converting k and v to f16. This might crash if it is not supported by the backend. --control-net-cpu keep controlnet in cpu (for low vram) --canny apply canny preprocessor (edge detection) From 8974ec134faa00cb87a1a13c97869f1d8849becd Mon Sep 17 00:00:00 2001 From: Daniele <57776841+daniandtheweb@users.noreply.github.com> Date: Thu, 31 Jul 2025 12:46:51 +0200 Subject: [PATCH 09/12] add conv2d direct for controlnet --- control.hpp | 34 +++++++++++++++++++--------------- stable-diffusion.cpp | 2 +- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/control.hpp b/control.hpp index d8f81fc0d..af28fcb33 100644 --- a/control.hpp +++ b/control.hpp @@ -27,13 +27,16 @@ class ControlNetBlock : public GGMLBlock { int num_heads = 8; int num_head_channels = -1; // channels // num_heads int context_dim = 768; // 1024 for VERSION_SD2, 2048 for VERSION_SDXL + bool direct = false; public: int model_channels = 320; int adm_in_channels = 2816; // only for VERSION_SDXL - ControlNetBlock(SDVersion version = VERSION_SD1) - : version(version) { + ControlNetBlock(SDVersion version = VERSION_SD1, + bool direct = false) + : version(version), + direct(direct) { if (sd_version_is_sd2(version)) { context_dim = 1024; num_head_channels = 64; @@ -65,7 +68,7 @@ class ControlNetBlock : public GGMLBlock { } // input_blocks - blocks["input_blocks.0.0"] = std::shared_ptr(new Conv2d(in_channels, model_channels, {3, 3}, {1, 1}, {1, 1})); + blocks["input_blocks.0.0"] = std::shared_ptr(new Conv2d(in_channels, model_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); std::vector input_block_chans; input_block_chans.push_back(model_channels); @@ -86,26 +89,26 @@ class ControlNetBlock : public GGMLBlock { }; auto make_zero_conv = [&](int64_t channels) { - return new Conv2d(channels, channels, {1, 1}); + return new Conv2d(channels, channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct); }; blocks["zero_convs.0.0"] = std::shared_ptr(make_zero_conv(model_channels)); - blocks["input_hint_block.0"] = std::shared_ptr(new Conv2d(hint_channels, 16, {3, 3}, {1, 1}, {1, 1})); + blocks["input_hint_block.0"] = std::shared_ptr(new Conv2d(hint_channels, 16, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); // nn.SiLU() - blocks["input_hint_block.2"] = std::shared_ptr(new Conv2d(16, 16, {3, 3}, {1, 1}, {1, 1})); + blocks["input_hint_block.2"] = std::shared_ptr(new Conv2d(16, 16, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); // nn.SiLU() - blocks["input_hint_block.4"] = std::shared_ptr(new Conv2d(16, 32, {3, 3}, {2, 2}, {1, 1})); + blocks["input_hint_block.4"] = std::shared_ptr(new Conv2d(16, 32, {3, 3}, 
{2, 2}, {1, 1}, {1, 1}, true, direct)); // nn.SiLU() - blocks["input_hint_block.6"] = std::shared_ptr(new Conv2d(32, 32, {3, 3}, {1, 1}, {1, 1})); + blocks["input_hint_block.6"] = std::shared_ptr(new Conv2d(32, 32, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); // nn.SiLU() - blocks["input_hint_block.8"] = std::shared_ptr(new Conv2d(32, 96, {3, 3}, {2, 2}, {1, 1})); + blocks["input_hint_block.8"] = std::shared_ptr(new Conv2d(32, 96, {3, 3}, {2, 2}, {1, 1}, {1, 1}, true, direct)); // nn.SiLU() - blocks["input_hint_block.10"] = std::shared_ptr(new Conv2d(96, 96, {3, 3}, {1, 1}, {1, 1})); + blocks["input_hint_block.10"] = std::shared_ptr(new Conv2d(96, 96, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); // nn.SiLU() - blocks["input_hint_block.12"] = std::shared_ptr(new Conv2d(96, 256, {3, 3}, {2, 2}, {1, 1})); + blocks["input_hint_block.12"] = std::shared_ptr(new Conv2d(96, 256, {3, 3}, {2, 2}, {1, 1}, {1, 1}, true, direct)); // nn.SiLU() - blocks["input_hint_block.14"] = std::shared_ptr(new Conv2d(256, model_channels, {3, 3}, {1, 1}, {1, 1})); + blocks["input_hint_block.14"] = std::shared_ptr(new Conv2d(256, model_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); size_t len_mults = channel_mult.size(); for (int i = 0; i < len_mults; i++) { @@ -318,8 +321,9 @@ struct ControlNet : public GGMLRunner { ControlNet(ggml_backend_t backend, const String2GGMLType& tensor_types = {}, - SDVersion version = VERSION_SD1) - : GGMLRunner(backend), control_net(version) { + SDVersion version = VERSION_SD1, + bool direct = false) + : GGMLRunner(backend), control_net(version, direct) { control_net.init(params_ctx, tensor_types, ""); } @@ -455,4 +459,4 @@ struct ControlNet : public GGMLRunner { } }; -#endif // __CONTROL_HPP__ \ No newline at end of file +#endif // __CONTROL_HPP__ diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 7baa74282..245ffea0e 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -423,7 +423,7 @@ class StableDiffusionGGML { } else { controlnet_backend = backend; } - control_net = std::make_shared(controlnet_backend, model_loader.tensor_storages_types, version); + control_net = std::make_shared(controlnet_backend, model_loader.tensor_storages_types, version, sd_ctx_params->diffusion_conv_direct); } if (strstr(SAFE_STR(sd_ctx_params->stacked_id_embed_dir), "v2")) { From 9b6339c915b62a27653b52492e198357636afc7e Mon Sep 17 00:00:00 2001 From: Daniele <57776841+daniandtheweb@users.noreply.github.com> Date: Thu, 31 Jul 2025 13:09:55 +0200 Subject: [PATCH 10/12] add conv2d direct for esrgan --- esrgan.hpp | 44 ++++++++++++++++++++++--------------------- examples/cli/main.cpp | 3 ++- stable-diffusion.h | 3 ++- upscaler.cpp | 14 +++++++++----- 4 files changed, 36 insertions(+), 28 deletions(-) diff --git a/esrgan.hpp b/esrgan.hpp index 4215db192..96997ac1e 100644 --- a/esrgan.hpp +++ b/esrgan.hpp @@ -16,15 +16,16 @@ class ResidualDenseBlock : public GGMLBlock { protected: int num_feat; int num_grow_ch; + bool direct = false; public: - ResidualDenseBlock(int num_feat = 64, int num_grow_ch = 32) - : num_feat(num_feat), num_grow_ch(num_grow_ch) { - blocks["conv1"] = std::shared_ptr(new Conv2d(num_feat, num_grow_ch, {3, 3}, {1, 1}, {1, 1})); - blocks["conv2"] = std::shared_ptr(new Conv2d(num_feat + num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1})); - blocks["conv3"] = std::shared_ptr(new Conv2d(num_feat + 2 * num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1})); - blocks["conv4"] = std::shared_ptr(new Conv2d(num_feat + 3 * num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 
1})); - blocks["conv5"] = std::shared_ptr(new Conv2d(num_feat + 4 * num_grow_ch, num_feat, {3, 3}, {1, 1}, {1, 1})); + ResidualDenseBlock(int num_feat = 64, int num_grow_ch = 32, bool direct = false) + : num_feat(num_feat), num_grow_ch(num_grow_ch), direct(direct) { + blocks["conv1"] = std::shared_ptr(new Conv2d(num_feat, num_grow_ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["conv2"] = std::shared_ptr(new Conv2d(num_feat + num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["conv3"] = std::shared_ptr(new Conv2d(num_feat + 2 * num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["conv4"] = std::shared_ptr(new Conv2d(num_feat + 3 * num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["conv5"] = std::shared_ptr(new Conv2d(num_feat + 4 * num_grow_ch, num_feat, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); } struct ggml_tensor* lrelu(struct ggml_context* ctx, struct ggml_tensor* x) { @@ -58,10 +59,10 @@ class ResidualDenseBlock : public GGMLBlock { class RRDB : public GGMLBlock { public: - RRDB(int num_feat, int num_grow_ch = 32) { - blocks["rdb1"] = std::shared_ptr(new ResidualDenseBlock(num_feat, num_grow_ch)); - blocks["rdb2"] = std::shared_ptr(new ResidualDenseBlock(num_feat, num_grow_ch)); - blocks["rdb3"] = std::shared_ptr(new ResidualDenseBlock(num_feat, num_grow_ch)); + RRDB(int num_feat, int num_grow_ch = 32, bool direct = false) { + blocks["rdb1"] = std::shared_ptr(new ResidualDenseBlock(num_feat, num_grow_ch, direct)); + blocks["rdb2"] = std::shared_ptr(new ResidualDenseBlock(num_feat, num_grow_ch, direct)); + blocks["rdb3"] = std::shared_ptr(new ResidualDenseBlock(num_feat, num_grow_ch, direct)); } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { @@ -89,20 +90,21 @@ class RRDBNet : public GGMLBlock { int num_out_ch = 3; int num_feat = 64; // default RealESRGAN_x4plus_anime_6B int num_grow_ch = 32; // default RealESRGAN_x4plus_anime_6B + bool direct = false; public: - RRDBNet() { - blocks["conv_first"] = std::shared_ptr(new Conv2d(num_in_ch, num_feat, {3, 3}, {1, 1}, {1, 1})); + RRDBNet(bool direct = false) { + blocks["conv_first"] = std::shared_ptr(new Conv2d(num_in_ch, num_feat, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); for (int i = 0; i < num_block; i++) { std::string name = "body." 
+ std::to_string(i); blocks[name] = std::shared_ptr(new RRDB(num_feat, num_grow_ch)); } - blocks["conv_body"] = std::shared_ptr(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1})); + blocks["conv_body"] = std::shared_ptr(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); // upsample - blocks["conv_up1"] = std::shared_ptr(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1})); - blocks["conv_up2"] = std::shared_ptr(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1})); - blocks["conv_hr"] = std::shared_ptr(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1})); - blocks["conv_last"] = std::shared_ptr(new Conv2d(num_feat, num_out_ch, {3, 3}, {1, 1}, {1, 1})); + blocks["conv_up1"] = std::shared_ptr(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["conv_up2"] = std::shared_ptr(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["conv_hr"] = std::shared_ptr(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["conv_last"] = std::shared_ptr(new Conv2d(num_feat, num_out_ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); } struct ggml_tensor* lrelu(struct ggml_context* ctx, struct ggml_tensor* x) { @@ -142,8 +144,8 @@ struct ESRGAN : public GGMLRunner { int scale = 4; int tile_size = 128; // avoid cuda OOM for 4gb VRAM - ESRGAN(ggml_backend_t backend, const String2GGMLType& tensor_types = {}) - : GGMLRunner(backend) { + ESRGAN(ggml_backend_t backend, const String2GGMLType& tensor_types = {}, bool direct = false) + : GGMLRunner(backend), rrdb_net(direct) { rrdb_net.init(params_ctx, tensor_types, ""); } @@ -194,4 +196,4 @@ struct ESRGAN : public GGMLRunner { } }; -#endif // __ESRGAN_HPP__ \ No newline at end of file +#endif // __ESRGAN_HPP__ diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index f8fedcb69..98aadb044 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -1024,7 +1024,8 @@ int main(int argc, const char* argv[]) { int upscale_factor = 4; // unused for RealESRGAN_x4plus_anime_6B.pth if (params.esrgan_path.size() > 0 && params.upscale_repeats > 0) { upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(), - params.n_threads); + params.n_threads, + params.diffusion_conv_direct); if (upscaler_ctx == NULL) { printf("new_upscaler_ctx failed\n"); diff --git a/stable-diffusion.h b/stable-diffusion.h index fc68f9b13..e87ac2ce2 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -238,7 +238,8 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s typedef struct upscaler_ctx_t upscaler_ctx_t; SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path, - int n_threads); + int n_threads, + bool direct); SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx); SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor); diff --git a/upscaler.cpp b/upscaler.cpp index 137213496..69d5ef392 100644 --- a/upscaler.cpp +++ b/upscaler.cpp @@ -9,9 +9,12 @@ struct UpscalerGGML { std::shared_ptr esrgan_upscaler; std::string esrgan_path; int n_threads; + bool direct = false; - UpscalerGGML(int n_threads) - : n_threads(n_threads) { + UpscalerGGML(int n_threads, + bool direct = false) + : n_threads(n_threads), + direct(direct) { } bool load_from_file(const std::string& esrgan_path) { @@ -46,7 +49,7 @@ struct UpscalerGGML { backend = ggml_backend_cpu_init(); } LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type)); - 
esrgan_upscaler = std::make_shared(backend, model_loader.tensor_storages_types); + esrgan_upscaler = std::make_shared(backend, model_loader.tensor_storages_types, direct); if (!esrgan_upscaler->load_from_file(esrgan_path)) { return false; } @@ -104,14 +107,15 @@ struct upscaler_ctx_t { }; upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str, - int n_threads) { + int n_threads, + bool direct = false) { upscaler_ctx_t* upscaler_ctx = (upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t)); if (upscaler_ctx == NULL) { return NULL; } std::string esrgan_path(esrgan_path_c_str); - upscaler_ctx->upscaler = new UpscalerGGML(n_threads); + upscaler_ctx->upscaler = new UpscalerGGML(n_threads, direct); if (upscaler_ctx->upscaler == NULL) { return NULL; } From 2e85d2c4cfd8a8c50c7d6344f1a67e707ffa6cb8 Mon Sep 17 00:00:00 2001 From: leejet Date: Sat, 2 Aug 2025 13:47:41 +0800 Subject: [PATCH 11/12] clean code, use enable_conv2d_direct/get_all_blocks --- README.md | 4 +- common.hpp | 27 ++++----- control.hpp | 45 ++++++++------- diffusion_model.hpp | 5 +- esrgan.hpp | 55 ++++++++++-------- examples/cli/main.cpp | 8 +-- ggml_extend.hpp | 29 ++++++++-- stable-diffusion.cpp | 33 ++++++----- tae.hpp | 91 +++++++++++++++--------------- unet.hpp | 31 ++++++---- upscaler.cpp | 5 +- vae.hpp | 128 +++++++++++++++++------------------------- 12 files changed, 244 insertions(+), 217 deletions(-) diff --git a/README.md b/README.md index e07142c00..5a28052f6 100644 --- a/README.md +++ b/README.md @@ -341,9 +341,9 @@ arguments: --diffusion-fa use flash attention in the diffusion model (for low vram) Might lower quality, since it implies converting k and v to f16. This might crash if it is not supported by the backend. - --diffusion-conv-direct use Conv2D direct in the diffusion model + --diffusion-conv-direct use Conv2d direct in the diffusion model This might crash if it is not supported by the backend. - --vae-conv-direct use Conv2D direct in the vae model (should improve the performance) + --vae-conv-direct use Conv2d direct in the vae model (should improve the performance) This might crash if it is not supported by the backend. 
--control-net-cpu keep controlnet in cpu (for low vram) --canny apply canny preprocessor (edge detection) diff --git a/common.hpp b/common.hpp index b92d31722..3a1307767 100644 --- a/common.hpp +++ b/common.hpp @@ -8,21 +8,18 @@ class DownSampleBlock : public GGMLBlock { int channels; int out_channels; bool vae_downsample; - bool direct = false; public: DownSampleBlock(int channels, int out_channels, - bool vae_downsample = false, - bool direct = false) + bool vae_downsample = false) : channels(channels), out_channels(out_channels), - vae_downsample(vae_downsample), - direct(direct) { + vae_downsample(vae_downsample) { if (vae_downsample) { - blocks["conv"] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {0, 0}, {1, 1}, true, direct)); + blocks["conv"] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {0, 0})); } else { - blocks["op"] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, true, direct)); + blocks["op"] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {1, 1})); } } @@ -46,16 +43,13 @@ class UpSampleBlock : public GGMLBlock { protected: int channels; int out_channels; - bool direct = false; public: UpSampleBlock(int channels, - int out_channels, - bool direct = false) + int out_channels) : channels(channels), - out_channels(out_channels), - direct(direct) { - blocks["conv"] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + out_channels(out_channels) { + blocks["conv"] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1})); } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { @@ -387,8 +381,7 @@ class SpatialTransformer : public GGMLBlock { int64_t d_head, int64_t depth, int64_t context_dim, - bool flash_attn = false, - bool direct = false) + bool flash_attn = false) : in_channels(in_channels), n_head(n_head), d_head(d_head), @@ -398,14 +391,14 @@ class SpatialTransformer : public GGMLBlock { // disable_self_attn is always False int64_t inner_dim = n_head * d_head; // in_channels blocks["norm"] = std::shared_ptr(new GroupNorm32(in_channels)); - blocks["proj_in"] = std::shared_ptr(new Conv2d(in_channels, inner_dim, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct)); + blocks["proj_in"] = std::shared_ptr(new Conv2d(in_channels, inner_dim, {1, 1})); for (int i = 0; i < depth; i++) { std::string name = "transformer_blocks." 
+ std::to_string(i); blocks[name] = std::shared_ptr(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim, false, flash_attn)); } - blocks["proj_out"] = std::shared_ptr(new Conv2d(inner_dim, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct)); + blocks["proj_out"] = std::shared_ptr(new Conv2d(inner_dim, in_channels, {1, 1})); } virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) { diff --git a/control.hpp b/control.hpp index af28fcb33..63fe70455 100644 --- a/control.hpp +++ b/control.hpp @@ -27,16 +27,13 @@ class ControlNetBlock : public GGMLBlock { int num_heads = 8; int num_head_channels = -1; // channels // num_heads int context_dim = 768; // 1024 for VERSION_SD2, 2048 for VERSION_SDXL - bool direct = false; public: int model_channels = 320; int adm_in_channels = 2816; // only for VERSION_SDXL - ControlNetBlock(SDVersion version = VERSION_SD1, - bool direct = false) - : version(version), - direct(direct) { + ControlNetBlock(SDVersion version = VERSION_SD1) + : version(version) { if (sd_version_is_sd2(version)) { context_dim = 1024; num_head_channels = 64; @@ -68,7 +65,7 @@ class ControlNetBlock : public GGMLBlock { } // input_blocks - blocks["input_blocks.0.0"] = std::shared_ptr(new Conv2d(in_channels, model_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["input_blocks.0.0"] = std::shared_ptr(new Conv2d(in_channels, model_channels, {3, 3}, {1, 1}, {1, 1})); std::vector input_block_chans; input_block_chans.push_back(model_channels); @@ -89,26 +86,26 @@ class ControlNetBlock : public GGMLBlock { }; auto make_zero_conv = [&](int64_t channels) { - return new Conv2d(channels, channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct); + return new Conv2d(channels, channels, {1, 1}); }; blocks["zero_convs.0.0"] = std::shared_ptr(make_zero_conv(model_channels)); - blocks["input_hint_block.0"] = std::shared_ptr(new Conv2d(hint_channels, 16, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["input_hint_block.0"] = std::shared_ptr(new Conv2d(hint_channels, 16, {3, 3}, {1, 1}, {1, 1})); // nn.SiLU() - blocks["input_hint_block.2"] = std::shared_ptr(new Conv2d(16, 16, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["input_hint_block.2"] = std::shared_ptr(new Conv2d(16, 16, {3, 3}, {1, 1}, {1, 1})); // nn.SiLU() - blocks["input_hint_block.4"] = std::shared_ptr(new Conv2d(16, 32, {3, 3}, {2, 2}, {1, 1}, {1, 1}, true, direct)); + blocks["input_hint_block.4"] = std::shared_ptr(new Conv2d(16, 32, {3, 3}, {2, 2}, {1, 1})); // nn.SiLU() - blocks["input_hint_block.6"] = std::shared_ptr(new Conv2d(32, 32, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["input_hint_block.6"] = std::shared_ptr(new Conv2d(32, 32, {3, 3}, {1, 1}, {1, 1})); // nn.SiLU() - blocks["input_hint_block.8"] = std::shared_ptr(new Conv2d(32, 96, {3, 3}, {2, 2}, {1, 1}, {1, 1}, true, direct)); + blocks["input_hint_block.8"] = std::shared_ptr(new Conv2d(32, 96, {3, 3}, {2, 2}, {1, 1})); // nn.SiLU() - blocks["input_hint_block.10"] = std::shared_ptr(new Conv2d(96, 96, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["input_hint_block.10"] = std::shared_ptr(new Conv2d(96, 96, {3, 3}, {1, 1}, {1, 1})); // nn.SiLU() - blocks["input_hint_block.12"] = std::shared_ptr(new Conv2d(96, 256, {3, 3}, {2, 2}, {1, 1}, {1, 1}, true, direct)); + blocks["input_hint_block.12"] = std::shared_ptr(new Conv2d(96, 256, {3, 3}, {2, 2}, {1, 1})); // nn.SiLU() - blocks["input_hint_block.14"] = std::shared_ptr(new Conv2d(256, 
model_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["input_hint_block.14"] = std::shared_ptr(new Conv2d(256, model_channels, {3, 3}, {1, 1}, {1, 1})); size_t len_mults = channel_mult.size(); for (int i = 0; i < len_mults; i++) { @@ -321,12 +318,22 @@ struct ControlNet : public GGMLRunner { ControlNet(ggml_backend_t backend, const String2GGMLType& tensor_types = {}, - SDVersion version = VERSION_SD1, - bool direct = false) - : GGMLRunner(backend), control_net(version, direct) { + SDVersion version = VERSION_SD1) + : GGMLRunner(backend), control_net(version) { control_net.init(params_ctx, tensor_types, ""); } + void enable_conv2d_direct() { + std::vector blocks; + control_net.get_all_blocks(blocks); + for (auto block : blocks) { + if (block->get_desc() == "Conv2d") { + auto conv_block = (Conv2d*)block; + conv_block->enable_direct(); + } + } + } + ~ControlNet() { free_control_ctx(); } @@ -459,4 +466,4 @@ struct ControlNet : public GGMLRunner { } }; -#endif // __CONTROL_HPP__ +#endif // __CONTROL_HPP__ \ No newline at end of file diff --git a/diffusion_model.hpp b/diffusion_model.hpp index de65c1c36..787a4fa79 100644 --- a/diffusion_model.hpp +++ b/diffusion_model.hpp @@ -34,9 +34,8 @@ struct UNetModel : public DiffusionModel { UNetModel(ggml_backend_t backend, const String2GGMLType& tensor_types = {}, SDVersion version = VERSION_SD1, - bool flash_attn = false, - bool direct = false) - : unet(backend, tensor_types, "model.diffusion_model", version, flash_attn, direct) { + bool flash_attn = false) + : unet(backend, tensor_types, "model.diffusion_model", version, flash_attn) { } void alloc_params_buffer() { diff --git a/esrgan.hpp b/esrgan.hpp index 96997ac1e..3e41a8871 100644 --- a/esrgan.hpp +++ b/esrgan.hpp @@ -16,16 +16,15 @@ class ResidualDenseBlock : public GGMLBlock { protected: int num_feat; int num_grow_ch; - bool direct = false; public: - ResidualDenseBlock(int num_feat = 64, int num_grow_ch = 32, bool direct = false) - : num_feat(num_feat), num_grow_ch(num_grow_ch), direct(direct) { - blocks["conv1"] = std::shared_ptr(new Conv2d(num_feat, num_grow_ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); - blocks["conv2"] = std::shared_ptr(new Conv2d(num_feat + num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); - blocks["conv3"] = std::shared_ptr(new Conv2d(num_feat + 2 * num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); - blocks["conv4"] = std::shared_ptr(new Conv2d(num_feat + 3 * num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); - blocks["conv5"] = std::shared_ptr(new Conv2d(num_feat + 4 * num_grow_ch, num_feat, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + ResidualDenseBlock(int num_feat = 64, int num_grow_ch = 32) + : num_feat(num_feat), num_grow_ch(num_grow_ch) { + blocks["conv1"] = std::shared_ptr(new Conv2d(num_feat, num_grow_ch, {3, 3}, {1, 1}, {1, 1})); + blocks["conv2"] = std::shared_ptr(new Conv2d(num_feat + num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1})); + blocks["conv3"] = std::shared_ptr(new Conv2d(num_feat + 2 * num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1})); + blocks["conv4"] = std::shared_ptr(new Conv2d(num_feat + 3 * num_grow_ch, num_grow_ch, {3, 3}, {1, 1}, {1, 1})); + blocks["conv5"] = std::shared_ptr(new Conv2d(num_feat + 4 * num_grow_ch, num_feat, {3, 3}, {1, 1}, {1, 1})); } struct ggml_tensor* lrelu(struct ggml_context* ctx, struct ggml_tensor* x) { @@ -59,10 +58,10 @@ class ResidualDenseBlock : public GGMLBlock { class RRDB : public GGMLBlock { public: - RRDB(int 
num_feat, int num_grow_ch = 32, bool direct = false) { - blocks["rdb1"] = std::shared_ptr(new ResidualDenseBlock(num_feat, num_grow_ch, direct)); - blocks["rdb2"] = std::shared_ptr(new ResidualDenseBlock(num_feat, num_grow_ch, direct)); - blocks["rdb3"] = std::shared_ptr(new ResidualDenseBlock(num_feat, num_grow_ch, direct)); + RRDB(int num_feat, int num_grow_ch = 32) { + blocks["rdb1"] = std::shared_ptr(new ResidualDenseBlock(num_feat, num_grow_ch)); + blocks["rdb2"] = std::shared_ptr(new ResidualDenseBlock(num_feat, num_grow_ch)); + blocks["rdb3"] = std::shared_ptr(new ResidualDenseBlock(num_feat, num_grow_ch)); } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { @@ -90,21 +89,20 @@ class RRDBNet : public GGMLBlock { int num_out_ch = 3; int num_feat = 64; // default RealESRGAN_x4plus_anime_6B int num_grow_ch = 32; // default RealESRGAN_x4plus_anime_6B - bool direct = false; public: - RRDBNet(bool direct = false) { - blocks["conv_first"] = std::shared_ptr(new Conv2d(num_in_ch, num_feat, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + RRDBNet() { + blocks["conv_first"] = std::shared_ptr(new Conv2d(num_in_ch, num_feat, {3, 3}, {1, 1}, {1, 1})); for (int i = 0; i < num_block; i++) { std::string name = "body." + std::to_string(i); blocks[name] = std::shared_ptr(new RRDB(num_feat, num_grow_ch)); } - blocks["conv_body"] = std::shared_ptr(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["conv_body"] = std::shared_ptr(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1})); // upsample - blocks["conv_up1"] = std::shared_ptr(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); - blocks["conv_up2"] = std::shared_ptr(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); - blocks["conv_hr"] = std::shared_ptr(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); - blocks["conv_last"] = std::shared_ptr(new Conv2d(num_feat, num_out_ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["conv_up1"] = std::shared_ptr(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1})); + blocks["conv_up2"] = std::shared_ptr(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1})); + blocks["conv_hr"] = std::shared_ptr(new Conv2d(num_feat, num_feat, {3, 3}, {1, 1}, {1, 1})); + blocks["conv_last"] = std::shared_ptr(new Conv2d(num_feat, num_out_ch, {3, 3}, {1, 1}, {1, 1})); } struct ggml_tensor* lrelu(struct ggml_context* ctx, struct ggml_tensor* x) { @@ -144,11 +142,22 @@ struct ESRGAN : public GGMLRunner { int scale = 4; int tile_size = 128; // avoid cuda OOM for 4gb VRAM - ESRGAN(ggml_backend_t backend, const String2GGMLType& tensor_types = {}, bool direct = false) - : GGMLRunner(backend), rrdb_net(direct) { + ESRGAN(ggml_backend_t backend, const String2GGMLType& tensor_types = {}) + : GGMLRunner(backend) { rrdb_net.init(params_ctx, tensor_types, ""); } + void enable_conv2d_direct() { + std::vector blocks; + rrdb_net.get_all_blocks(blocks); + for (auto block : blocks) { + if (block->get_desc() == "Conv2d") { + auto conv_block = (Conv2d*)block; + conv_block->enable_direct(); + } + } + } + std::string get_desc() { return "esrgan"; } @@ -196,4 +205,4 @@ struct ESRGAN : public GGMLRunner { } }; -#endif // __ESRGAN_HPP__ +#endif // __ESRGAN_HPP__ \ No newline at end of file diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 98aadb044..ec04dfde3 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -144,8 +144,8 @@ void print_params(SDParams params) 
{ printf(" controlnet cpu: %s\n", params.control_net_cpu ? "true" : "false"); printf(" vae decoder on cpu:%s\n", params.vae_on_cpu ? "true" : "false"); printf(" diffusion flash attention:%s\n", params.diffusion_flash_attn ? "true" : "false"); - printf(" diffusion Conv2D direct:%s\n", params.diffusion_conv_direct ? "true" : "false"); - printf(" vae Conv2D direct:%s\n", params.vae_conv_direct ? "true" : "false"); + printf(" diffusion Conv2d direct:%s\n", params.diffusion_conv_direct ? "true" : "false"); + printf(" vae Conv2d direct:%s\n", params.vae_conv_direct ? "true" : "false"); printf(" strength(control): %.2f\n", params.control_strength); printf(" prompt: %s\n", params.prompt.c_str()); printf(" negative_prompt: %s\n", params.negative_prompt.c_str()); @@ -236,9 +236,9 @@ void print_usage(int argc, const char* argv[]) { printf(" --diffusion-fa use flash attention in the diffusion model (for low vram)\n"); printf(" Might lower quality, since it implies converting k and v to f16.\n"); printf(" This might crash if it is not supported by the backend.\n"); - printf(" --diffusion-conv-direct use Conv2D direct in the diffusion model"); + printf(" --diffusion-conv-direct use Conv2d direct in the diffusion model"); printf(" This might crash if it is not supported by the backend.\n"); - printf(" --vae-conv-direct use Conv2D direct in the vae model (should improve the performance)"); + printf(" --vae-conv-direct use Conv2d direct in the vae model (should improve the performance)"); printf(" This might crash if it is not supported by the backend.\n"); printf(" --control-net-cpu keep controlnet in cpu (for low vram)\n"); printf(" --canny apply canny preprocessor (edge detection)\n"); diff --git a/ggml_extend.hpp b/ggml_extend.hpp index de7b2bfa1..3c6ebd10e 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -1394,6 +1394,19 @@ class GGMLBlock { tensors[prefix + pair.first] = pair.second; } } + + virtual std::string get_desc() { + return "GGMLBlock"; + } + + void get_all_blocks(std::vector& result) { + result.push_back(this); + for (auto& block_iter : blocks) { + if (block_iter.second) { + block_iter.second->get_all_blocks(result); + } + } + } }; class UnaryBlock : public GGMLBlock { @@ -1483,7 +1496,7 @@ class Conv2d : public UnaryBlock { std::pair padding; std::pair dilation; bool bias; - bool direct; + bool direct = false; void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types, const std::string prefix = "") { enum ggml_type wtype = GGML_TYPE_F16; @@ -1501,16 +1514,22 @@ class Conv2d : public UnaryBlock { std::pair stride = {1, 1}, std::pair padding = {0, 0}, std::pair dilation = {1, 1}, - bool bias = true, - bool direct = false) + bool bias = true) : in_channels(in_channels), out_channels(out_channels), kernel_size(kernel_size), stride(stride), padding(padding), dilation(dilation), - bias(bias), - direct(direct) {} + bias(bias) {} + + void enable_direct() { + direct = true; + } + + std::string get_desc() { + return "Conv2d"; + } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { struct ggml_tensor* w = params["weight"]; diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 245ffea0e..f0d9c05af 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -326,12 +326,6 @@ class StableDiffusionGGML { LOG_INFO("CLIP: Using CPU backend"); clip_backend = ggml_backend_cpu_init(); } - if (sd_ctx_params->diffusion_conv_direct) { - LOG_INFO("Using Conv2D direct in the diffusion model"); - } - if (sd_ctx_params->vae_conv_direct){ - 
LOG_INFO("Using Conv2D direct in the vae model"); - } if (sd_ctx_params->diffusion_flash_attn) { LOG_INFO("Using flash attention in the diffusion model"); } @@ -379,8 +373,11 @@ class StableDiffusionGGML { diffusion_model = std::make_shared(backend, model_loader.tensor_storages_types, version, - sd_ctx_params->diffusion_flash_attn, - sd_ctx_params->diffusion_conv_direct); + sd_ctx_params->diffusion_flash_attn); + if (sd_ctx_params->diffusion_conv_direct) { + LOG_INFO("Using Conv2d direct in the diffusion model"); + std::dynamic_pointer_cast(diffusion_model)->unet.enable_conv2d_direct(); + } } cond_stage_model->alloc_params_buffer(); @@ -401,8 +398,11 @@ class StableDiffusionGGML { "first_stage_model", vae_decode_only, false, - version, - sd_ctx_params->vae_conv_direct); + version); + if (sd_ctx_params->vae_conv_direct){ + LOG_INFO("Using Conv2d direct in the vae model"); + first_stage_model->enable_conv2d_direct(); + } first_stage_model->alloc_params_buffer(); first_stage_model->get_param_tensors(tensors, "first_stage_model"); } else { @@ -410,8 +410,11 @@ class StableDiffusionGGML { model_loader.tensor_storages_types, "decoder.layers", vae_decode_only, - version, - sd_ctx_params->vae_conv_direct); + version); + if (sd_ctx_params->vae_conv_direct){ + LOG_INFO("Using Conv2d direct in the tae model"); + tae_first_stage->enable_conv2d_direct(); + } } // first_stage_model->get_param_tensors(tensors, "first_stage_model."); @@ -423,7 +426,11 @@ class StableDiffusionGGML { } else { controlnet_backend = backend; } - control_net = std::make_shared(controlnet_backend, model_loader.tensor_storages_types, version, sd_ctx_params->diffusion_conv_direct); + control_net = std::make_shared(controlnet_backend, model_loader.tensor_storages_types, version); + if (sd_ctx_params->diffusion_conv_direct) { + LOG_INFO("Using Conv2d direct in the control net"); + control_net->enable_conv2d_direct(); + } } if (strstr(SAFE_STR(sd_ctx_params->stacked_id_embed_dir), "v2")) { diff --git a/tae.hpp b/tae.hpp index fa83200b2..4959bbd08 100644 --- a/tae.hpp +++ b/tae.hpp @@ -17,16 +17,15 @@ class TAEBlock : public UnaryBlock { protected: int n_in; int n_out; - bool direct = false; public: - TAEBlock(int n_in, int n_out, bool direct = false) - : n_in(n_in), n_out(n_out), direct(direct) { - blocks["conv.0"] = std::shared_ptr(new Conv2d(n_in, n_out, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); - blocks["conv.2"] = std::shared_ptr(new Conv2d(n_out, n_out, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); - blocks["conv.4"] = std::shared_ptr(new Conv2d(n_out, n_out, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + TAEBlock(int n_in, int n_out) + : n_in(n_in), n_out(n_out) { + blocks["conv.0"] = std::shared_ptr(new Conv2d(n_in, n_out, {3, 3}, {1, 1}, {1, 1})); + blocks["conv.2"] = std::shared_ptr(new Conv2d(n_out, n_out, {3, 3}, {1, 1}, {1, 1})); + blocks["conv.4"] = std::shared_ptr(new Conv2d(n_out, n_out, {3, 3}, {1, 1}, {1, 1})); if (n_in != n_out) { - blocks["skip"] = std::shared_ptr(new Conv2d(n_in, n_out, {1, 1}, {1, 1}, {1, 1}, {1, 1}, false, direct)); + blocks["skip"] = std::shared_ptr(new Conv2d(n_in, n_out, {1, 1}, {1, 1}, {1, 1}, {1, 1}, false)); } } @@ -61,32 +60,30 @@ class TinyEncoder : public UnaryBlock { int channels = 64; int z_channels = 4; int num_blocks = 3; - bool direct = false; public: - TinyEncoder(int z_channels = 4, bool direct = false) - : z_channels(z_channels), - direct(direct) { + TinyEncoder(int z_channels = 4) + : z_channels(z_channels) { int index = 0; - blocks[std::to_string(index++)] = 
std::shared_ptr(new Conv2d(in_channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); - blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(in_channels, channels, {3, 3}, {1, 1}, {1, 1})); + blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels)); - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false)); for (int i = 0; i < num_blocks; i++) { - blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels)); } - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false)); for (int i = 0; i < num_blocks; i++) { - blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels)); } - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {2, 2}, {1, 1}, {1, 1}, false)); for (int i = 0; i < num_blocks; i++) { - blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels)); } - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, z_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, z_channels, {3, 3}, {1, 1}, {1, 1})); } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { @@ -108,37 +105,35 @@ class TinyDecoder : public UnaryBlock { int channels = 64; int out_channels = 3; int num_blocks = 3; - bool direct = false; public: - TinyDecoder(int z_channels = 4, bool direct = false) - : z_channels(z_channels), - direct(direct) { + TinyDecoder(int z_channels = 4) + : z_channels(z_channels) { int index = 0; - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(z_channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(z_channels, channels, {3, 3}, {1, 1}, {1, 1})); index++; // nn.ReLU() for (int i = 0; i < num_blocks; i++) { - blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels)); } index++; // nn.Upsample() - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false)); for (int i = 0; i < num_blocks; i++) { - blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels)); } index++; // nn.Upsample() - blocks[std::to_string(index++)] = 
std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false)); for (int i = 0; i < num_blocks; i++) { - blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels)); } index++; // nn.Upsample() - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, false)); - blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels, direct)); - blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks[std::to_string(index++)] = std::shared_ptr(new TAEBlock(channels, channels)); + blocks[std::to_string(index++)] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1})); } struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* z) { @@ -170,20 +165,18 @@ class TinyDecoder : public UnaryBlock { class TAESD : public GGMLBlock { protected: bool decode_only; - bool direct = false; public: - TAESD(bool decode_only = true, SDVersion version = VERSION_SD1, bool direct = false) - : decode_only(decode_only), - direct(direct) { + TAESD(bool decode_only = true, SDVersion version = VERSION_SD1) + : decode_only(decode_only) { int z_channels = 4; if (sd_version_is_dit(version)) { z_channels = 16; } - blocks["decoder.layers"] = std::shared_ptr(new TinyDecoder(z_channels, direct)); + blocks["decoder.layers"] = std::shared_ptr(new TinyDecoder(z_channels)); if (!decode_only) { - blocks["encoder.layers"] = std::shared_ptr(new TinyEncoder(z_channels, direct)); + blocks["encoder.layers"] = std::shared_ptr(new TinyEncoder(z_channels)); } } @@ -201,21 +194,29 @@ class TAESD : public GGMLBlock { struct TinyAutoEncoder : public GGMLRunner { TAESD taesd; bool decode_only = false; - bool direct = false; TinyAutoEncoder(ggml_backend_t backend, const String2GGMLType& tensor_types, const std::string prefix, bool decoder_only = true, - SDVersion version = VERSION_SD1, - bool direct = false) + SDVersion version = VERSION_SD1) : decode_only(decoder_only), - taesd(decoder_only, version, direct), - direct(direct), + taesd(decoder_only, version), GGMLRunner(backend) { taesd.init(params_ctx, tensor_types, prefix); } + void enable_conv2d_direct() { + std::vector blocks; + taesd.get_all_blocks(blocks); + for (auto block : blocks) { + if (block->get_desc() == "Conv2d") { + auto conv_block = (Conv2d*)block; + conv_block->enable_direct(); + } + } + } + std::string get_desc() { return "taesd"; } @@ -268,4 +269,4 @@ struct TinyAutoEncoder : public GGMLRunner { } }; -#endif // __TAE_HPP__ +#endif // __TAE_HPP__ \ No newline at end of file diff --git a/unet.hpp b/unet.hpp index 7196fb03c..696bc6dfa 100644 --- a/unet.hpp +++ b/unet.hpp @@ -183,7 +183,7 @@ class UnetModelBlock : public GGMLBlock { int model_channels = 320; int adm_in_channels = 2816; // only for VERSION_SDXL/SVD - UnetModelBlock(SDVersion version = VERSION_SD1, const String2GGMLType& tensor_types = {}, bool flash_attn = false, bool direct = false) + UnetModelBlock(SDVersion version = VERSION_SD1, const String2GGMLType& tensor_types = {}, bool flash_attn = false) : version(version) { if 
(sd_version_is_sd2(version)) { context_dim = 1024; @@ -224,7 +224,7 @@ class UnetModelBlock : public GGMLBlock { } // input_blocks - blocks["input_blocks.0.0"] = std::shared_ptr(new Conv2d(in_channels, model_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["input_blocks.0.0"] = std::shared_ptr(new Conv2d(in_channels, model_channels, {3, 3}, {1, 1}, {1, 1})); std::vector input_block_chans; input_block_chans.push_back(model_channels); @@ -236,7 +236,7 @@ class UnetModelBlock : public GGMLBlock { if (version == VERSION_SVD) { return new VideoResBlock(channels, emb_channels, out_channels); } else { - return new ResBlock(channels, emb_channels, out_channels, {3, 3}); + return new ResBlock(channels, emb_channels, out_channels); } }; @@ -248,7 +248,7 @@ class UnetModelBlock : public GGMLBlock { if (version == VERSION_SVD) { return new SpatialVideoTransformer(in_channels, n_head, d_head, depth, context_dim); } else { - return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, flash_attn, direct); + return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, flash_attn); } }; @@ -280,7 +280,7 @@ class UnetModelBlock : public GGMLBlock { if (i != len_mults - 1) { input_block_idx += 1; std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0"; - blocks[name] = std::shared_ptr(new DownSampleBlock(ch, ch, false, direct)); + blocks[name] = std::shared_ptr(new DownSampleBlock(ch, ch)); input_block_chans.push_back(ch); ds *= 2; @@ -330,7 +330,7 @@ class UnetModelBlock : public GGMLBlock { if (i > 0 && j == num_res_blocks) { std::string name = "output_blocks." + std::to_string(output_block_idx) + "." + std::to_string(up_sample_idx); - blocks[name] = std::shared_ptr(new UpSampleBlock(ch, ch, direct)); + blocks[name] = std::shared_ptr(new UpSampleBlock(ch, ch)); ds /= 2; } @@ -342,7 +342,7 @@ class UnetModelBlock : public GGMLBlock { // out blocks["out.0"] = std::shared_ptr(new GroupNorm32(ch)); // ch == model_channels // out_1 is nn.SiLU() - blocks["out.2"] = std::shared_ptr(new Conv2d(model_channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct)); + blocks["out.2"] = std::shared_ptr(new Conv2d(model_channels, out_channels, {3, 3}, {1, 1}, {1, 1})); } struct ggml_tensor* resblock_forward(std::string name, @@ -541,12 +541,23 @@ struct UNetModelRunner : public GGMLRunner { const String2GGMLType& tensor_types, const std::string prefix, SDVersion version = VERSION_SD1, - bool flash_attn = false, - bool direct = false) - : GGMLRunner(backend), unet(version, tensor_types, flash_attn, direct) { + bool flash_attn = false) + : GGMLRunner(backend), unet(version, tensor_types, flash_attn) { unet.init(params_ctx, tensor_types, prefix); } + void enable_conv2d_direct() { + std::vector blocks; + unet.get_all_blocks(blocks); + for (auto block : blocks) { + if (block->get_desc() == "Conv2d") { + LOG_DEBUG("block %s", block->get_desc().c_str()); + auto conv_block = (Conv2d*)block; + conv_block->enable_direct(); + } + } + } + std::string get_desc() { return "unet"; } diff --git a/upscaler.cpp b/upscaler.cpp index 69d5ef392..599f263f9 100644 --- a/upscaler.cpp +++ b/upscaler.cpp @@ -49,7 +49,10 @@ struct UpscalerGGML { backend = ggml_backend_cpu_init(); } LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type)); - esrgan_upscaler = std::make_shared(backend, model_loader.tensor_storages_types, direct); + esrgan_upscaler = std::make_shared(backend, model_loader.tensor_storages_types); + if (direct) { + 
+        }
         if (!esrgan_upscaler->load_from_file(esrgan_path)) {
             return false;
         }
diff --git a/vae.hpp b/vae.hpp
index 6f435fd6b..bdf160bb8 100644
--- a/vae.hpp
+++ b/vae.hpp
@@ -12,24 +12,21 @@ class ResnetBlock : public UnaryBlock {
 protected:
     int64_t in_channels;
     int64_t out_channels;
-    bool direct = false;
 
 public:
     ResnetBlock(int64_t in_channels,
-                int64_t out_channels,
-                bool direct = false)
+                int64_t out_channels)
         : in_channels(in_channels),
-          out_channels(out_channels),
-          direct(direct){
+          out_channels(out_channels) {
         // temb_channels is always 0
         blocks["norm1"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
-        blocks["conv1"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct));
+        blocks["conv1"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
 
         blocks["norm2"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(out_channels));
-        blocks["conv2"] = std::shared_ptr<GGMLBlock>(new Conv2d(out_channels, out_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct));
+        blocks["conv2"] = std::shared_ptr<GGMLBlock>(new Conv2d(out_channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
 
         if (out_channels != in_channels) {
-            blocks["nin_shortcut"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct));
+            blocks["nin_shortcut"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, {1, 1}));
         }
     }
@@ -67,19 +64,16 @@ class ResnetBlock : public UnaryBlock {
 class AttnBlock : public UnaryBlock {
 protected:
     int64_t in_channels;
-    bool direct = false;
 
 public:
-    AttnBlock(int64_t in_channels,
-              bool direct = false)
-        : in_channels(in_channels),
-          direct(direct){
+    AttnBlock(int64_t in_channels)
+        : in_channels(in_channels) {
         blocks["norm"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
-        blocks["q"]    = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct));
-        blocks["k"]    = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct));
-        blocks["v"]    = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct));
+        blocks["q"]    = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}));
+        blocks["k"]    = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}));
+        blocks["v"]    = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}));
 
-        blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}, {1, 1}, {0, 0}, {1, 1}, true, direct));
+        blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, in_channels, {1, 1}));
     }
 
     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
@@ -129,9 +123,8 @@ class AE3DConv : public Conv2d {
             std::pair<int, int> stride   = {1, 1},
             std::pair<int, int> padding  = {0, 0},
             std::pair<int, int> dilation = {1, 1},
-            bool bias   = true,
-            bool direct = false)
-        : Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias, direct) {
+            bool bias = true)
+        : Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias) {
         int64_t kernel_padding = video_kernel_size / 2;
 
         blocks["time_mix_conv"] = std::shared_ptr<GGMLBlock>(new Conv3dnx1x1(out_channels,
                                                                              out_channels,
@@ -183,9 +176,8 @@ class VideoResnetBlock : public ResnetBlock {
 public:
     VideoResnetBlock(int64_t in_channels,
                      int64_t out_channels,
-                     int video_kernel_size = 3,
-                     bool direct           = false)
-        : ResnetBlock(in_channels, out_channels, direct) {
+                     int video_kernel_size = 3)
+        : ResnetBlock(in_channels, out_channels) {
         // merge_strategy is always learned
         blocks["time_stack"] = std::shared_ptr<GGMLBlock>(new ResBlock(out_channels, 0, out_channels, {video_kernel_size, 1}, 3, false, true));
     }
@@ -234,7 +226,6 @@ class Encoder : public GGMLBlock {
     int in_channels = 3;
     int z_channels  = 4;
     bool double_z   = true;
-    bool direct     = false;
 
 public:
     Encoder(int ch,
@@ -242,16 +233,14 @@
             int num_res_blocks,
             int in_channels,
             int z_channels,
-            bool double_z = true,
-            bool direct   = false)
+            bool double_z = true)
         : ch(ch),
           ch_mult(ch_mult),
           num_res_blocks(num_res_blocks),
           in_channels(in_channels),
           z_channels(z_channels),
-          double_z(double_z),
-          direct(direct){
+          double_z(double_z) {
-        blocks["conv_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct));
+        blocks["conv_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, ch, {3, 3}, {1, 1}, {1, 1}));
 
         size_t num_resolutions = ch_mult.size();
@@ -265,21 +254,21 @@
             int block_out = ch * ch_mult[i];
             for (int j = 0; j < num_res_blocks; j++) {
                 std::string name = "down." + std::to_string(i) + ".block." + std::to_string(j);
-                blocks[name]     = std::shared_ptr<GGMLBlock>(new ResnetBlock(block_in, block_out, direct));
+                blocks[name]     = std::shared_ptr<GGMLBlock>(new ResnetBlock(block_in, block_out));
                 block_in         = block_out;
             }
             if (i != num_resolutions - 1) {
                 std::string name = "down." + std::to_string(i) + ".downsample";
-                blocks[name]     = std::shared_ptr<GGMLBlock>(new DownSampleBlock(block_in, block_in, true, direct));
+                blocks[name]     = std::shared_ptr<GGMLBlock>(new DownSampleBlock(block_in, block_in, true));
             }
         }
 
-        blocks["mid.block_1"] = std::shared_ptr<GGMLBlock>(new ResnetBlock(block_in, block_in, direct));
-        blocks["mid.attn_1"]  = std::shared_ptr<GGMLBlock>(new AttnBlock(block_in, direct));
-        blocks["mid.block_2"] = std::shared_ptr<GGMLBlock>(new ResnetBlock(block_in, block_in, direct));
+        blocks["mid.block_1"] = std::shared_ptr<GGMLBlock>(new ResnetBlock(block_in, block_in));
+        blocks["mid.attn_1"]  = std::shared_ptr<GGMLBlock>(new AttnBlock(block_in));
+        blocks["mid.block_2"] = std::shared_ptr<GGMLBlock>(new ResnetBlock(block_in, block_in));
 
         blocks["norm_out"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(block_in));
-        blocks["conv_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(block_in, double_z ? z_channels * 2 : z_channels, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct));
+        blocks["conv_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(block_in, double_z ? z_channels * 2 : z_channels, {3, 3}, {1, 1}, {1, 1}));
     }
 
     virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
@@ -334,29 +323,25 @@ class Decoder : public GGMLBlock {
     int z_channels        = 4;
     bool video_decoder    = false;
     int video_kernel_size = 3;
-    bool direct           = false;
 
     virtual std::shared_ptr<GGMLBlock> get_conv_out(int64_t in_channels,
                                                     int64_t out_channels,
                                                     std::pair<int, int> kernel_size,
                                                     std::pair<int, int> stride   = {1, 1},
-                                                    std::pair<int, int> padding  = {0, 0},
-                                                    std::pair<int, int> dilation = {1, 1},
-                                                    bool bias   = true,
-                                                    bool direct = false){
+                                                    std::pair<int, int> padding  = {0, 0}) {
         if (video_decoder) {
             return std::shared_ptr<GGMLBlock>(new AE3DConv(in_channels, out_channels, kernel_size, video_kernel_size, stride, padding));
         } else {
-            return std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias, direct));
+            return std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, kernel_size, stride, padding));
         }
     }
 
     virtual std::shared_ptr<GGMLBlock> get_resnet_block(int64_t in_channels, int64_t out_channels) {
         if (video_decoder) {
-            return std::shared_ptr<GGMLBlock>(new VideoResnetBlock(in_channels, out_channels, video_kernel_size, direct));
+            return std::shared_ptr<GGMLBlock>(new VideoResnetBlock(in_channels, out_channels, video_kernel_size));
         } else {
-            return std::shared_ptr<GGMLBlock>(new ResnetBlock(in_channels, out_channels, direct));
+            return std::shared_ptr<GGMLBlock>(new ResnetBlock(in_channels, out_channels));
         }
     }
@@ -367,23 +352,21 @@
             int num_res_blocks,
             int z_channels,
             bool video_decoder    = false,
-            int video_kernel_size = 3,
-            bool direct           = false)
+            int video_kernel_size = 3)
         : ch(ch),
           out_ch(out_ch),
           ch_mult(ch_mult),
           num_res_blocks(num_res_blocks),
           z_channels(z_channels),
           video_decoder(video_decoder),
-          video_kernel_size(video_kernel_size),
-          direct(direct) {
+          video_kernel_size(video_kernel_size) {
         size_t num_resolutions = ch_mult.size();
         int block_in           = ch * ch_mult[num_resolutions - 1];
 
-        blocks["conv_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(z_channels, block_in, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct));
+        blocks["conv_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(z_channels, block_in, {3, 3}, {1, 1}, {1, 1}));
 
         blocks["mid.block_1"] = get_resnet_block(block_in, block_in);
-        blocks["mid.attn_1"]  = std::shared_ptr<GGMLBlock>(new AttnBlock(block_in, direct));
+        blocks["mid.attn_1"]  = std::shared_ptr<GGMLBlock>(new AttnBlock(block_in));
         blocks["mid.block_2"] = get_resnet_block(block_in, block_in);
 
         for (int i = num_resolutions - 1; i >= 0; i--) {
@@ -397,12 +380,12 @@
             }
             if (i != 0) {
                 std::string name = "up." + std::to_string(i) + ".upsample";
-                blocks[name]     = std::shared_ptr<GGMLBlock>(new UpSampleBlock(block_in, block_in, direct));
+                blocks[name]     = std::shared_ptr<GGMLBlock>(new UpSampleBlock(block_in, block_in));
             }
         }
 
         blocks["norm_out"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(block_in));
-        blocks["conv_out"] = get_conv_out(block_in, out_ch, {3, 3}, {1, 1}, {1, 1}, {1, 1}, true, direct);
+        blocks["conv_out"] = get_conv_out(block_in, out_ch, {3, 3}, {1, 1}, {1, 1});
     }
 
     virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* z) {
@@ -459,7 +442,6 @@ class AutoencodingEngine : public GGMLBlock {
     bool use_video_decoder = false;
     bool use_quant         = true;
     int embed_dim          = 4;
-    bool direct            = false;
     struct {
         int z_channels = 4;
         int resolution = 256;
@@ -474,9 +456,8 @@
 public:
     AutoencodingEngine(bool decode_only       = true,
                        bool use_video_decoder = false,
-                       SDVersion version      = VERSION_SD1,
-                       bool direct            = false)
-        : decode_only(decode_only), use_video_decoder(use_video_decoder), direct(direct) {
+                       SDVersion version      = VERSION_SD1)
+        : decode_only(decode_only), use_video_decoder(use_video_decoder) {
         if (sd_version_is_dit(version)) {
             dd_config.z_channels = 16;
             use_quant            = false;
@@ -489,18 +470,11 @@
                                                                    dd_config.ch_mult,
                                                                    dd_config.num_res_blocks,
                                                                    dd_config.z_channels,
-                                                                   use_video_decoder,
-                                                                   3,
-                                                                   direct));
+                                                                   use_video_decoder));
         if (use_quant) {
             blocks["post_quant_conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(dd_config.z_channels,
                                                                               embed_dim,
-                                                                              {1, 1},
-                                                                              {1, 1},
-                                                                              {0, 0},
-                                                                              {1, 1},
-                                                                              true,
-                                                                              direct));
+                                                                              {1, 1}));
         }
         if (!decode_only) {
             blocks["encoder"] = std::shared_ptr<GGMLBlock>(new Encoder(dd_config.ch,
@@ -508,19 +482,13 @@
                                                                        dd_config.num_res_blocks,
                                                                        dd_config.in_channels,
                                                                        dd_config.z_channels,
-                                                                       dd_config.double_z,
-                                                                       direct));
+                                                                       dd_config.double_z));
             if (use_quant) {
                 int factor = dd_config.double_z ? 2 : 1;
                 blocks["quant_conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(embed_dim * factor,
                                                                              dd_config.z_channels * factor,
-                                                                             {1, 1},
-                                                                             {1, 1},
-                                                                             {0, 0},
-                                                                             {1, 1},
-                                                                             true,
-                                                                             direct));
+                                                                             {1, 1}));
             }
         }
     }
@@ -561,12 +529,22 @@ struct AutoEncoderKL : public GGMLRunner {
                   const std::string prefix,
                   bool decode_only       = false,
                   bool use_video_decoder = false,
-                  SDVersion version      = VERSION_SD1,
-                  bool direct            = false)
-        : decode_only(decode_only), ae(decode_only, use_video_decoder, version, direct), GGMLRunner(backend) {
+                  SDVersion version      = VERSION_SD1)
+        : decode_only(decode_only), ae(decode_only, use_video_decoder, version), GGMLRunner(backend) {
         ae.init(params_ctx, tensor_types, prefix);
     }
 
+    void enable_conv2d_direct() {
+        std::vector<GGMLBlock*> blocks;
+        ae.get_all_blocks(blocks);
+        for (auto block : blocks) {
+            if (block->get_desc() == "Conv2d") {
+                auto conv_block = (Conv2d*)block;
+                conv_block->enable_direct();
+            }
+        }
+    }
+
     std::string get_desc() {
         return "vae";
     }

From f053f736241a9d9a2327daed83543813a3f1f67e Mon Sep 17 00:00:00 2001
From: leejet
Date: Sat, 2 Aug 2025 13:52:43 +0800
Subject: [PATCH 12/12] format code

---
 ggml_extend.hpp      | 18 +++++++++---------
 stable-diffusion.cpp |  4 ++--
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index 3c6ebd10e..57c1e8aa6 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -707,15 +707,15 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d(struct ggml_context* ctx,
 }
 
 __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d_direct(struct ggml_context* ctx,
-                                                      struct ggml_tensor* x,
-                                                      struct ggml_tensor* w,
-                                                      struct ggml_tensor* b,
-                                                      int s0 = 1,
-                                                      int s1 = 1,
-                                                      int p0 = 0,
-                                                      int p1 = 0,
-                                                      int d0 = 1,
-                                                      int d1 = 1) {
+                                                             struct ggml_tensor* x,
+                                                             struct ggml_tensor* w,
+                                                             struct ggml_tensor* b,
+                                                             int s0 = 1,
+                                                             int s1 = 1,
+                                                             int p0 = 0,
+                                                             int p1 = 0,
+                                                             int d0 = 1,
+                                                             int d1 = 1) {
     x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1);
     if (b != NULL) {
         b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index f0d9c05af..c5448f927 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -399,7 +399,7 @@ class StableDiffusionGGML {
                                                            vae_decode_only,
                                                            false,
                                                            version);
-            if (sd_ctx_params->vae_conv_direct){
+            if (sd_ctx_params->vae_conv_direct) {
                 LOG_INFO("Using Conv2d direct in the vae model");
                 first_stage_model->enable_conv2d_direct();
             }
@@ -411,7 +411,7 @@ class StableDiffusionGGML {
                                                           "decoder.layers",
                                                           vae_decode_only,
                                                           version);
-            if (sd_ctx_params->vae_conv_direct){
+            if (sd_ctx_params->vae_conv_direct) {
                 LOG_INFO("Using Conv2d direct in the tae model");
                 tae_first_stage->enable_conv2d_direct();
             }
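// ---------------------------------------------------------------------------
// Note (illustrative sketch, not part of the patch series): the commits above
// drop the per-constructor `direct` flag and instead let each runner walk its
// block tree after construction and flip a per-Conv2d flag via
// enable_conv2d_direct(). The self-contained C++ below models that pattern
// with simplified stand-in types; Block, Conv2dSketch and VaeSketch are
// invented names, not the real GGMLBlock/Conv2d/AutoEncoderKL classes.
// ---------------------------------------------------------------------------
#include <cstdio>
#include <map>
#include <memory>
#include <string>
#include <vector>

struct Block {
    std::map<std::string, std::shared_ptr<Block>> blocks;
    virtual ~Block() = default;
    virtual std::string get_desc() { return "Block"; }
    // Collect this block and every nested block, depth-first.
    void get_all_blocks(std::vector<Block*>& out) {
        out.push_back(this);
        for (auto& kv : blocks) {
            kv.second->get_all_blocks(out);
        }
    }
};

struct Conv2dSketch : Block {
    bool direct = false;  // when true, the graph would use ggml_conv_2d_direct
    std::string get_desc() override { return "Conv2d"; }
    void enable_direct() { direct = true; }
};

struct VaeSketch : Block {
    VaeSketch() {
        blocks["conv_in"]  = std::make_shared<Conv2dSketch>();
        blocks["conv_out"] = std::make_shared<Conv2dSketch>();
    }
    // Same shape as the enable_conv2d_direct() methods added in the patch:
    // find every Conv2d in the tree and switch it to the direct kernel.
    void enable_conv2d_direct() {
        std::vector<Block*> all;
        get_all_blocks(all);
        for (auto* b : all) {
            if (b->get_desc() == "Conv2d") {
                static_cast<Conv2dSketch*>(b)->enable_direct();
            }
        }
    }
};

int main() {
    VaeSketch vae;
    bool vae_conv_direct = true;  // stands in for sd_ctx_params->vae_conv_direct
    if (vae_conv_direct) {
        vae.enable_conv2d_direct();
    }
    std::printf("conv_in uses direct conv: %d\n",
                static_cast<Conv2dSketch*>(vae.blocks["conv_in"].get())->direct);
    return 0;
}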