
Commit 2f198d3

Merge branch 'leejet:master' into master

2 parents: 1505d5a + 8847114

8 files changed, +80 -41 lines

assets/flux/flux1-dev-q4_k.png

New binary file (468 KB): the q4_k sample image referenced by the updated comparison table in docs/flux.md.

docs/flux.md

9 additions, 6 deletions
````diff
@@ -4,14 +4,17 @@ You can run Flux using stable-diffusion.cpp with a GPU that has 6GB or even 4GB
 
 ## Download weights
 
-- Download flux-dev from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/flux1-dev.safetensors
-- Download flux-schnell from https://huggingface.co/black-forest-labs/FLUX.1-schnell/blob/main/flux1-schnell.safetensors
+- Download flux
+    - If you don't want to do the conversion yourself, download the preconverted gguf model from [FLUX.1-dev-gguf](https://huggingface.co/leejet/FLUX.1-dev-gguf) or [FLUX.1-schnell-gguf](https://huggingface.co/leejet/FLUX.1-schnell-gguf)
+    - Otherwise, download flux-dev from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/flux1-dev.safetensors or flux-schnell from https://huggingface.co/black-forest-labs/FLUX.1-schnell/blob/main/flux1-schnell.safetensors
 - Download vae from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
 - Download clip_l from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/clip_l.safetensors
 - Download t5xxl from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
 
 ## Convert flux weights
 
+You can download the preconverted gguf weights from [FLUX.1-dev-gguf](https://huggingface.co/leejet/FLUX.1-dev-gguf) or [FLUX.1-schnell-gguf](https://huggingface.co/leejet/FLUX.1-schnell-gguf); this way you don't have to do the conversion yourself.
+
 Using fp16 will lead to overflow, but ggml's support for bf16 is not yet fully developed. Therefore, we need to convert flux to gguf format here, which also saves VRAM. For example:
 ```
 .\bin\Release\sd.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-dev.sft -o ..\models\flux1-dev-q8_0.gguf -v --type q8_0
@@ -30,10 +33,10 @@ For example:
 
 Using formats of different precisions will yield results of varying quality.
 
-| Type | q8_0 | q4_0 | q3_k | q2_k |
-|---- | ---- | ---- | ---- | ---- |
-| **Memory** | 12068.09 MB | 6394.53 MB | 4888.16 MB | 3735.73 MB |
-| **Result** | ![](../assets/flux/flux1-dev-q8_0.png) | ![](../assets/flux/flux1-dev-q4_0.png) | ![](../assets/flux/flux1-dev-q3_k.png) | ![](../assets/flux/flux1-dev-q2_k.png) |
+| Type | q8_0 | q4_0 | q4_k | q3_k | q2_k |
+|---- | ---- | ---- | ---- | ---- | ---- |
+| **Memory** | 12068.09 MB | 6394.53 MB | 6395.17 MB | 4888.16 MB | 3735.73 MB |
+| **Result** | ![](../assets/flux/flux1-dev-q8_0.png) | ![](../assets/flux/flux1-dev-q4_0.png) | ![](../assets/flux/flux1-dev-q4_k.png) | ![](../assets/flux/flux1-dev-q3_k.png) | ![](../assets/flux/flux1-dev-q2_k.png) |
 
 
 
````
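Once a gguf is in hand (converted or preconverted), the rest of flux.md runs it directly. For orientation, an invocation looks roughly like the following; this is a sketch based on the stable-diffusion.cpp CLI flags (--diffusion-model, --vae, --clip_l, --t5xxl), with placeholder paths and prompt rather than lines from this diff:

```
.\bin\Release\sd.exe --diffusion-model ..\models\flux1-dev-q8_0.gguf --vae ..\models\ae.safetensors --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v
```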

flux.hpp

2 additions, 2 deletions
```diff
@@ -634,13 +634,13 @@ namespace Flux {
         int64_t out_channels = params.in_channels;
         int64_t pe_dim       = params.hidden_size / params.num_heads;
 
-        blocks["img_in"]    = std::shared_ptr<GGMLBlock>(new Linear(params.in_channels, params.hidden_size));
+        blocks["img_in"]    = std::shared_ptr<GGMLBlock>(new Linear(params.in_channels, params.hidden_size, true));
         blocks["time_in"]   = std::shared_ptr<GGMLBlock>(new MLPEmbedder(256, params.hidden_size));
         blocks["vector_in"] = std::shared_ptr<GGMLBlock>(new MLPEmbedder(params.vec_in_dim, params.hidden_size));
         if (params.guidance_embed) {
             blocks["guidance_in"] = std::shared_ptr<GGMLBlock>(new MLPEmbedder(256, params.hidden_size));
         }
-        blocks["txt_in"] = std::shared_ptr<GGMLBlock>(new Linear(params.context_in_dim, params.hidden_size));
+        blocks["txt_in"] = std::shared_ptr<GGMLBlock>(new Linear(params.context_in_dim, params.hidden_size, true));
 
         for (int i = 0; i < params.depth; i++) {
             blocks["double_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new DoubleStreamBlock(params.hidden_size,
```

ggml_extend.hpp

6 additions, 3 deletions
```diff
@@ -1187,9 +1187,10 @@ class Linear : public UnaryBlock {
     int64_t in_features;
     int64_t out_features;
     bool bias;
+    bool force_f32;
 
     void init_params(struct ggml_context* ctx, ggml_type wtype) {
-        if (in_features % ggml_blck_size(wtype) != 0) {
+        if (in_features % ggml_blck_size(wtype) != 0 || force_f32) {
             wtype = GGML_TYPE_F32;
         }
         params["weight"] = ggml_new_tensor_2d(ctx, wtype, in_features, out_features);
@@ -1201,10 +1202,12 @@ class Linear : public UnaryBlock {
 public:
     Linear(int64_t in_features,
            int64_t out_features,
-           bool bias = true)
+           bool bias      = true,
+           bool force_f32 = false)
         : in_features(in_features),
           out_features(out_features),
-          bias(bias) {}
+          bias(bias),
+          force_f32(force_f32) {}
 
     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
         struct ggml_tensor* w = params["weight"];
```
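In isolation, the new flag means a Linear weight is kept in f32 either when its row length does not divide into whole quantization blocks or when the layer opts out via force_f32. The sketch below restates that rule outside the class; pick_weight_type is a hypothetical name, not repo code. Note that the flux.hpp calls above pass only three arguments, so those layers keep force_f32 at its default of false and are instead shielded from quantization by the loader-side exclusion list added in model.cpp below.

```cpp
#include "ggml.h"

// Hypothetical restatement of Linear::init_params' type choice (not repo code):
// a weight falls back to f32 when its rows cannot be split into whole
// quantization blocks, or when the layer explicitly opts out via force_f32.
static ggml_type pick_weight_type(ggml_type wtype, int64_t in_features, bool force_f32) {
    if (in_features % ggml_blck_size(wtype) != 0 || force_f32) {
        return GGML_TYPE_F32;
    }
    return wtype;
}

// pick_weight_type(GGML_TYPE_Q4_K, 4096, false) -> GGML_TYPE_Q4_K (4096 % 256 == 0)
// pick_weight_type(GGML_TYPE_Q4_K, 4096, true)  -> GGML_TYPE_F32  (forced)
// pick_weight_type(GGML_TYPE_Q4_K,  100, false) -> GGML_TYPE_F32  (partial block)
```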

lora.hpp

1 addition, 0 deletions
```diff
@@ -82,6 +82,7 @@ struct LoraModel : public GGMLRunner {
 
         zero_index = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, 1);
         set_backend_tensor_data(zero_index, zero_index_vec.data());
+        ggml_build_forward_expand(gf, zero_index);
 
         std::set<std::string> applied_lora_tensors;
         for (auto it : model_tensors) {
```
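The one-line lora.hpp fix registers zero_index with the compute graph up front. A plausible reading, offered here as interpretation rather than the author's stated rationale: ggml's graph-based allocators and schedulers only account for tensors reachable from the graph, so a leaf that no LoRA operation has consumed yet could otherwise be skipped when buffers are allocated and data is uploaded. A minimal standalone sketch of the call, assuming only the ggml headers:

```cpp
#include "ggml.h"

int main() {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context* ctx = ggml_init(ip);
    struct ggml_cgraph* gf   = ggml_new_graph(ctx);

    // A freshly created tensor is only a context allocation; the graph does not
    // know about it until an op consumes it or it is expanded explicitly.
    struct ggml_tensor* zero_index = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
    ggml_build_forward_expand(gf, zero_index);  // register the leaf with gf

    ggml_free(ctx);
    return 0;
}
```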

mmdit.hpp

6 additions, 6 deletions
```diff
@@ -101,8 +101,8 @@ struct TimestepEmbedder : public GGMLBlock {
     TimestepEmbedder(int64_t hidden_size,
                      int64_t frequency_embedding_size = 256)
         : frequency_embedding_size(frequency_embedding_size) {
-        blocks["mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(frequency_embedding_size, hidden_size));
-        blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size));
+        blocks["mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(frequency_embedding_size, hidden_size, true, true));
+        blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size, true, true));
     }
 
     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* t) {
@@ -125,8 +125,8 @@ struct VectorEmbedder : public GGMLBlock {
 public:
     VectorEmbedder(int64_t input_dim,
                    int64_t hidden_size) {
-        blocks["mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(input_dim, hidden_size));
-        blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size));
+        blocks["mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(input_dim, hidden_size, true, true));
+        blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size, true, true));
     }
 
     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
@@ -423,7 +423,7 @@ struct FinalLayer : public GGMLBlock {
                int64_t out_channels) {
         // total_out_channels is always None
         blocks["norm_final"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-06f, false));
-        blocks["linear"]     = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, patch_size * patch_size * out_channels));
+        blocks["linear"]     = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, patch_size * patch_size * out_channels, true, true));
         blocks["adaLN_modulation.1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, 2 * hidden_size));
     }
 
@@ -510,7 +510,7 @@ struct MMDiT : public GGMLBlock {
             blocks["y_embedder"] = std::shared_ptr<GGMLBlock>(new VectorEmbedder(adm_in_channels, hidden_size));
         }
 
-        blocks["context_embedder"] = std::shared_ptr<GGMLBlock>(new Linear(4096, 1536));
+        blocks["context_embedder"] = std::shared_ptr<GGMLBlock>(new Linear(4096, 1536, true, true));
 
         for (int i = 0; i < depth; i++) {
             blocks["joint_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new JointBlock(hidden_size,
```

model.cpp

55 additions, 24 deletions
```diff
@@ -1397,10 +1397,11 @@ ggml_type ModelLoader::get_sd_wtype() {
             continue;
         }
 
-        if (tensor_storage.name.find(".weight") != std::string::npos &&
-            (tensor_storage.name.find("time_embed") != std::string::npos ||
-             tensor_storage.name.find("context_embedder") != std::string::npos ||
-             tensor_storage.name.find("time_in") != std::string::npos)) {
+        if (ggml_is_quantized(tensor_storage.type)) {
+            return tensor_storage.type;
+        }
+
+        if (tensor_should_be_converted(tensor_storage, GGML_TYPE_Q4_K)) {
             return tensor_storage.type;
         }
     }
@@ -1420,7 +1421,11 @@ ggml_type ModelLoader::get_conditioner_wtype() {
             continue;
         }
 
-        if (tensor_storage.name.find(".weight") != std::string::npos) {
+        if (ggml_is_quantized(tensor_storage.type)) {
+            return tensor_storage.type;
+        }
+
+        if (tensor_should_be_converted(tensor_storage, GGML_TYPE_Q4_K)) {
             return tensor_storage.type;
         }
     }
@@ -1437,10 +1442,11 @@ ggml_type ModelLoader::get_diffusion_model_wtype() {
             continue;
         }
 
-        if (tensor_storage.name.find(".weight") != std::string::npos &&
-            (tensor_storage.name.find("time_embed") != std::string::npos ||
-             tensor_storage.name.find("context_embedder") != std::string::npos ||
-             tensor_storage.name.find("time_in") != std::string::npos)) {
+        if (ggml_is_quantized(tensor_storage.type)) {
+            return tensor_storage.type;
+        }
+
+        if (tensor_should_be_converted(tensor_storage, GGML_TYPE_Q4_K)) {
             return tensor_storage.type;
         }
     }
@@ -1458,7 +1464,11 @@ ggml_type ModelLoader::get_vae_wtype() {
             continue;
         }
 
-        if (tensor_storage.name.find(".weight")) {
+        if (ggml_is_quantized(tensor_storage.type)) {
+            return tensor_storage.type;
+        }
+
+        if (tensor_should_be_converted(tensor_storage, GGML_TYPE_Q4_K)) {
             return tensor_storage.type;
         }
     }
@@ -1723,6 +1733,37 @@ bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tenso
     return true;
 }
 
+bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type) {
+    const std::string& name = tensor_storage.name;
+    if (type != GGML_TYPE_COUNT) {
+        if (ggml_is_quantized(type) && tensor_storage.ne[0] % ggml_blck_size(type) != 0) {
+            // Pass, do not convert
+        } else if (ends_with(name, ".bias")) {
+            // Pass, do not convert
+        } else if (ends_with(name, ".scale")) {
+            // Pass, do not convert
+        } else if (contains(name, "img_in.") ||
+                   contains(name, "txt_in.") ||
+                   contains(name, "time_in.") ||
+                   contains(name, "vector_in.") ||
+                   contains(name, "guidance_in.") ||
+                   contains(name, "final_layer.")) {
+            // Pass, do not convert. For FLUX
+        } else if (contains(name, "x_embedder.") ||
+                   contains(name, "t_embedder.") ||
+                   contains(name, "y_embedder.") ||
+                   contains(name, "pos_embed") ||
+                   contains(name, "context_embedder.")) {
+            // Pass, do not convert. For MMDiT
+        } else if (contains(name, "time_embed.") || contains(name, "label_emb.")) {
+            // Pass, do not convert. For Unet
+        } else {
+            return true;
+        }
+    }
+    return false;
+}
+
 bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type) {
     auto backend    = ggml_backend_cpu_init();
     size_t mem_size = 1 * 1024 * 1024;  // for padding
@@ -1737,12 +1778,8 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type
         const std::string& name = tensor_storage.name;
 
         ggml_type tensor_type = tensor_storage.type;
-        if (type != GGML_TYPE_COUNT) {
-            if (ggml_is_quantized(type) && tensor_storage.ne[0] % ggml_blck_size(type) != 0) {
-                tensor_type = GGML_TYPE_F16;
-            } else {
-                tensor_type = type;
-            }
+        if (tensor_should_be_converted(tensor_storage, type)) {
+            tensor_type = type;
         }
 
         ggml_tensor* tensor = ggml_new_tensor(ggml_ctx, tensor_type, tensor_storage.n_dims, tensor_storage.ne);
@@ -1792,15 +1829,9 @@ int64_t ModelLoader::get_params_mem_size(ggml_backend_t backend, ggml_type type)
     }
 
     for (auto& tensor_storage : processed_tensor_storages) {
-        ggml_type tensor_type = tensor_storage.type;
-        if (type != GGML_TYPE_COUNT) {
-            if (ggml_is_quantized(type) && tensor_storage.ne[0] % 32 != 0) {
-                tensor_type = GGML_TYPE_F16;
-            } else {
-                tensor_type = type;
-            }
+        if (tensor_should_be_converted(tensor_storage, type)) {
+            tensor_storage.type = type;
         }
-        tensor_storage.type = tensor_type;
         mem_size += tensor_storage.nbytes() + alignment;
     }
 
```
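tensor_should_be_converted centralizes the convert-or-keep decision that save_to_gguf_file and get_params_mem_size previously duplicated (the old get_params_mem_size even hard-coded a block size of 32), and the rewritten wtype getters drop a truthiness bug in get_vae_wtype, where name.find(".weight") was tested without comparing against std::string::npos. To illustrate the intended behavior, a hypothetical snippet, assuming TensorStorage exposes name and ne as declared in model.h:

```cpp
// Hypothetical usage, not repo code; target type is GGML_TYPE_Q4_K.
ModelLoader loader;
TensorStorage ts;
ts.ne[0] = 3072;  // row length divisible by the q4_k block size (256)

ts.name = "model.diffusion_model.img_in.weight";  // FLUX input projection
loader.tensor_should_be_converted(ts, GGML_TYPE_Q4_K);  // false: stays at stored precision

ts.name = "model.diffusion_model.double_blocks.0.img_mlp.0.weight";
loader.tensor_should_be_converted(ts, GGML_TYPE_Q4_K);  // true: gets quantized
```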

model.h

1 addition, 0 deletions
```diff
@@ -157,6 +157,7 @@ class ModelLoader {
                      ggml_backend_t backend,
                      std::set<std::string> ignore_tensors = {});
     bool save_to_gguf_file(const std::string& file_path, ggml_type type);
+    bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type);
     int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT);
     ~ModelLoader() = default;
 
```