
Commit 2f198d3

Merge branch 'leejet:master' into master

2 parents: 1505d5a + 8847114

8 files changed, +80 -41 lines

assets/flux/flux1-dev-q4_k.png

New binary file (468 KB): the q4_k sample image referenced by the updated comparison table in docs/flux.md.

docs/flux.md

9 additions, 6 deletions
````diff
@@ -4,14 +4,17 @@ You can run Flux using stable-diffusion.cpp with a GPU that has 6GB or even 4GB
 
 ## Download weights
 
-- Download flux-dev from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/flux1-dev.safetensors
-- Download flux-schnell from https://huggingface.co/black-forest-labs/FLUX.1-schnell/blob/main/flux1-schnell.safetensors
+- Download flux
+    - If you don't want to do the conversion yourself, download the preconverted gguf model from [FLUX.1-dev-gguf](https://huggingface.co/leejet/FLUX.1-dev-gguf) or [FLUX.1-schnell-gguf](https://huggingface.co/leejet/FLUX.1-schnell-gguf)
+    - Otherwise, download flux-dev from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/flux1-dev.safetensors or flux-schnell from https://huggingface.co/black-forest-labs/FLUX.1-schnell/blob/main/flux1-schnell.safetensors
 - Download vae from https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/ae.safetensors
 - Download clip_l from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/clip_l.safetensors
 - Download t5xxl from https://huggingface.co/comfyanonymous/flux_text_encoders/blob/main/t5xxl_fp16.safetensors
 
 ## Convert flux weights
 
+You can download the preconverted gguf weights from [FLUX.1-dev-gguf](https://huggingface.co/leejet/FLUX.1-dev-gguf) or [FLUX.1-schnell-gguf](https://huggingface.co/leejet/FLUX.1-schnell-gguf); this way you don't have to do the conversion yourself.
+
 Using fp16 will lead to overflow, but ggml's support for bf16 is not yet fully developed. Therefore, we need to convert flux to gguf format here, which also saves VRAM. For example:
 ```
 .\bin\Release\sd.exe -M convert -m ..\..\ComfyUI\models\unet\flux1-dev.sft -o ..\models\flux1-dev-q8_0.gguf -v --type q8_0
@@ -30,10 +33,10 @@ For example:
 
 Using formats of different precisions will yield results of varying quality.
 
-| Type | q8_0 | q4_0 | q3_k | q2_k |
-|---- | ---- | ---- | ---- | ---- |
-| **Memory** | 12068.09 MB | 6394.53 MB | 4888.16 MB | 3735.73 MB |
-| **Result** | ![](../assets/flux/flux1-dev-q8_0.png) | ![](../assets/flux/flux1-dev-q4_0.png) | ![](../assets/flux/flux1-dev-q3_k.png) | ![](../assets/flux/flux1-dev-q2_k.png) |
+| Type | q8_0 | q4_0 | q4_k | q3_k | q2_k |
+|---- | ---- | ---- | ---- | ---- | ---- |
+| **Memory** | 12068.09 MB | 6394.53 MB | 6395.17 MB | 4888.16 MB | 3735.73 MB |
+| **Result** | ![](../assets/flux/flux1-dev-q8_0.png) | ![](../assets/flux/flux1-dev-q4_0.png) | ![](../assets/flux/flux1-dev-q4_k.png) | ![](../assets/flux/flux1-dev-q3_k.png) | ![](../assets/flux/flux1-dev-q2_k.png) |
 
 
 
````
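Once a gguf is in hand (converted or preconverted), the rest of flux.md runs it directly. For orientation, an invocation looks roughly like the following; this is a sketch based on the stable-diffusion.cpp CLI flags (--diffusion-model, --vae, --clip_l, --t5xxl), with placeholder paths and prompt rather than lines from this diff:

```
.\bin\Release\sd.exe --diffusion-model ..\models\flux1-dev-q8_0.gguf --vae ..\models\ae.safetensors --clip_l ..\models\clip_l.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v
```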

flux.hpp

2 additions, 2 deletions
```diff
@@ -634,13 +634,13 @@ namespace Flux {
         int64_t out_channels = params.in_channels;
         int64_t pe_dim       = params.hidden_size / params.num_heads;
 
-        blocks["img_in"]    = std::shared_ptr<GGMLBlock>(new Linear(params.in_channels, params.hidden_size));
+        blocks["img_in"]    = std::shared_ptr<GGMLBlock>(new Linear(params.in_channels, params.hidden_size, true));
         blocks["time_in"]   = std::shared_ptr<GGMLBlock>(new MLPEmbedder(256, params.hidden_size));
         blocks["vector_in"] = std::shared_ptr<GGMLBlock>(new MLPEmbedder(params.vec_in_dim, params.hidden_size));
         if (params.guidance_embed) {
             blocks["guidance_in"] = std::shared_ptr<GGMLBlock>(new MLPEmbedder(256, params.hidden_size));
         }
-        blocks["txt_in"] = std::shared_ptr<GGMLBlock>(new Linear(params.context_in_dim, params.hidden_size));
+        blocks["txt_in"] = std::shared_ptr<GGMLBlock>(new Linear(params.context_in_dim, params.hidden_size, true));
 
         for (int i = 0; i < params.depth; i++) {
             blocks["double_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new DoubleStreamBlock(params.hidden_size,
```

ggml_extend.hpp

6 additions, 3 deletions
```diff
@@ -1187,9 +1187,10 @@ class Linear : public UnaryBlock {
     int64_t in_features;
     int64_t out_features;
     bool bias;
+    bool force_f32;
 
     void init_params(struct ggml_context* ctx, ggml_type wtype) {
-        if (in_features % ggml_blck_size(wtype) != 0) {
+        if (in_features % ggml_blck_size(wtype) != 0 || force_f32) {
             wtype = GGML_TYPE_F32;
         }
         params["weight"] = ggml_new_tensor_2d(ctx, wtype, in_features, out_features);
@@ -1201,10 +1202,12 @@ class Linear : public UnaryBlock {
 public:
     Linear(int64_t in_features,
            int64_t out_features,
-           bool bias = true)
+           bool bias      = true,
+           bool force_f32 = false)
         : in_features(in_features),
           out_features(out_features),
-          bias(bias) {}
+          bias(bias),
+          force_f32(force_f32) {}
 
     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
         struct ggml_tensor* w = params["weight"];
```
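In isolation, the new flag means a Linear weight is kept in f32 either when its row length does not divide into whole quantization blocks or when the layer opts out via force_f32. The sketch below restates that rule outside the class; pick_weight_type is a hypothetical name, not repo code. Note that the flux.hpp calls above pass only three arguments, so those layers keep force_f32 at its default of false and are instead shielded from quantization by the loader-side exclusion list added in model.cpp below.

```cpp
#include "ggml.h"

// Hypothetical restatement of Linear::init_params' type choice (not repo code):
// a weight falls back to f32 when its rows cannot be split into whole
// quantization blocks, or when the layer explicitly opts out via force_f32.
static ggml_type pick_weight_type(ggml_type wtype, int64_t in_features, bool force_f32) {
    if (in_features % ggml_blck_size(wtype) != 0 || force_f32) {
        return GGML_TYPE_F32;
    }
    return wtype;
}

// pick_weight_type(GGML_TYPE_Q4_K, 4096, false) -> GGML_TYPE_Q4_K (4096 % 256 == 0)
// pick_weight_type(GGML_TYPE_Q4_K, 4096, true)  -> GGML_TYPE_F32  (forced)
// pick_weight_type(GGML_TYPE_Q4_K,  100, false) -> GGML_TYPE_F32  (partial block)
```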

lora.hpp

1 addition, 0 deletions
```diff
@@ -82,6 +82,7 @@ struct LoraModel : public GGMLRunner {
 
         zero_index = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, 1);
         set_backend_tensor_data(zero_index, zero_index_vec.data());
+        ggml_build_forward_expand(gf, zero_index);
 
         std::set<std::string> applied_lora_tensors;
         for (auto it : model_tensors) {
```
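The one-line lora.hpp fix registers zero_index with the compute graph up front. A plausible reading, offered here as interpretation rather than the author's stated rationale: ggml's graph-based allocators and schedulers only account for tensors reachable from the graph, so a leaf that no LoRA operation has consumed yet could otherwise be skipped when buffers are allocated and data is uploaded. A minimal standalone sketch of the call, assuming only the ggml headers:

```cpp
#include "ggml.h"

int main() {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context* ctx = ggml_init(ip);
    struct ggml_cgraph* gf   = ggml_new_graph(ctx);

    // A freshly created tensor is only a context allocation; the graph does not
    // know about it until an op consumes it or it is expanded explicitly.
    struct ggml_tensor* zero_index = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
    ggml_build_forward_expand(gf, zero_index);  // register the leaf with gf

    ggml_free(ctx);
    return 0;
}
```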

mmdit.hpp

6 additions, 6 deletions
```diff
@@ -101,8 +101,8 @@ struct TimestepEmbedder : public GGMLBlock {
     TimestepEmbedder(int64_t hidden_size,
                      int64_t frequency_embedding_size = 256)
         : frequency_embedding_size(frequency_embedding_size) {
-        blocks["mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(frequency_embedding_size, hidden_size));
-        blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size));
+        blocks["mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(frequency_embedding_size, hidden_size, true, true));
+        blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size, true, true));
     }
 
     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* t) {
@@ -125,8 +125,8 @@ struct VectorEmbedder : public GGMLBlock {
 public:
     VectorEmbedder(int64_t input_dim,
                    int64_t hidden_size) {
-        blocks["mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(input_dim, hidden_size));
-        blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size));
+        blocks["mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(input_dim, hidden_size, true, true));
+        blocks["mlp.2"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, hidden_size, true, true));
     }
 
     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
@@ -423,7 +423,7 @@ struct FinalLayer : public GGMLBlock {
                int64_t out_channels) {
         // total_out_channels is always None
         blocks["norm_final"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-06f, false));
-        blocks["linear"]     = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, patch_size * patch_size * out_channels));
+        blocks["linear"]     = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, patch_size * patch_size * out_channels, true, true));
         blocks["adaLN_modulation.1"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, 2 * hidden_size));
     }
 
@@ -510,7 +510,7 @@ struct MMDiT : public GGMLBlock {
             blocks["y_embedder"] = std::shared_ptr<GGMLBlock>(new VectorEmbedder(adm_in_channels, hidden_size));
         }
 
-        blocks["context_embedder"] = std::shared_ptr<GGMLBlock>(new Linear(4096, 1536));
+        blocks["context_embedder"] = std::shared_ptr<GGMLBlock>(new Linear(4096, 1536, true, true));
 
         for (int i = 0; i < depth; i++) {
             blocks["joint_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new JointBlock(hidden_size,
```

model.cpp

55 additions, 24 deletions
```diff
@@ -1397,10 +1397,11 @@ ggml_type ModelLoader::get_sd_wtype() {
             continue;
         }
 
-        if (tensor_storage.name.find(".weight") != std::string::npos &&
-            (tensor_storage.name.find("time_embed") != std::string::npos ||
-             tensor_storage.name.find("context_embedder") != std::string::npos ||
-             tensor_storage.name.find("time_in") != std::string::npos)) {
+        if (ggml_is_quantized(tensor_storage.type)) {
+            return tensor_storage.type;
+        }
+
+        if (tensor_should_be_converted(tensor_storage, GGML_TYPE_Q4_K)) {
             return tensor_storage.type;
         }
     }
@@ -1420,7 +1421,11 @@ ggml_type ModelLoader::get_conditioner_wtype() {
             continue;
         }
 
-        if (tensor_storage.name.find(".weight") != std::string::npos) {
+        if (ggml_is_quantized(tensor_storage.type)) {
+            return tensor_storage.type;
+        }
+
+        if (tensor_should_be_converted(tensor_storage, GGML_TYPE_Q4_K)) {
             return tensor_storage.type;
         }
     }
@@ -1437,10 +1442,11 @@ ggml_type ModelLoader::get_diffusion_model_wtype() {
             continue;
         }
 
-        if (tensor_storage.name.find(".weight") != std::string::npos &&
-            (tensor_storage.name.find("time_embed") != std::string::npos ||
-             tensor_storage.name.find("context_embedder") != std::string::npos ||
-             tensor_storage.name.find("time_in") != std::string::npos)) {
+        if (ggml_is_quantized(tensor_storage.type)) {
+            return tensor_storage.type;
+        }
+
+        if (tensor_should_be_converted(tensor_storage, GGML_TYPE_Q4_K)) {
             return tensor_storage.type;
         }
     }
@@ -1458,7 +1464,11 @@ ggml_type ModelLoader::get_vae_wtype() {
             continue;
         }
 
-        if (tensor_storage.name.find(".weight")) {
+        if (ggml_is_quantized(tensor_storage.type)) {
+            return tensor_storage.type;
+        }
+
+        if (tensor_should_be_converted(tensor_storage, GGML_TYPE_Q4_K)) {
             return tensor_storage.type;
         }
     }
@@ -1723,6 +1733,37 @@ bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tenso
     return true;
 }
 
+bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type) {
+    const std::string& name = tensor_storage.name;
+    if (type != GGML_TYPE_COUNT) {
+        if (ggml_is_quantized(type) && tensor_storage.ne[0] % ggml_blck_size(type) != 0) {
+            // Pass, do not convert
+        } else if (ends_with(name, ".bias")) {
+            // Pass, do not convert
+        } else if (ends_with(name, ".scale")) {
+            // Pass, do not convert
+        } else if (contains(name, "img_in.") ||
+                   contains(name, "txt_in.") ||
+                   contains(name, "time_in.") ||
+                   contains(name, "vector_in.") ||
+                   contains(name, "guidance_in.") ||
+                   contains(name, "final_layer.")) {
+            // Pass, do not convert. For FLUX
+        } else if (contains(name, "x_embedder.") ||
+                   contains(name, "t_embedder.") ||
+                   contains(name, "y_embedder.") ||
+                   contains(name, "pos_embed") ||
+                   contains(name, "context_embedder.")) {
+            // Pass, do not convert. For MMDiT
+        } else if (contains(name, "time_embed.") || contains(name, "label_emb.")) {
+            // Pass, do not convert. For Unet
+        } else {
+            return true;
+        }
+    }
+    return false;
+}
+
 bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type) {
     auto backend    = ggml_backend_cpu_init();
     size_t mem_size = 1 * 1024 * 1024;  // for padding
@@ -1737,12 +1778,8 @@ bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type
         const std::string& name = tensor_storage.name;
 
         ggml_type tensor_type = tensor_storage.type;
-        if (type != GGML_TYPE_COUNT) {
-            if (ggml_is_quantized(type) && tensor_storage.ne[0] % ggml_blck_size(type) != 0) {
-                tensor_type = GGML_TYPE_F16;
-            } else {
-                tensor_type = type;
-            }
+        if (tensor_should_be_converted(tensor_storage, type)) {
+            tensor_type = type;
         }
 
         ggml_tensor* tensor = ggml_new_tensor(ggml_ctx, tensor_type, tensor_storage.n_dims, tensor_storage.ne);
@@ -1792,15 +1829,9 @@ int64_t ModelLoader::get_params_mem_size(ggml_backend_t backend, ggml_type type)
     }
 
     for (auto& tensor_storage : processed_tensor_storages) {
-        ggml_type tensor_type = tensor_storage.type;
-        if (type != GGML_TYPE_COUNT) {
-            if (ggml_is_quantized(type) && tensor_storage.ne[0] % 32 != 0) {
-                tensor_type = GGML_TYPE_F16;
-            } else {
-                tensor_type = type;
-            }
+        if (tensor_should_be_converted(tensor_storage, type)) {
+            tensor_storage.type = type;
         }
-        tensor_storage.type = tensor_type;
         mem_size += tensor_storage.nbytes() + alignment;
     }
 
```
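tensor_should_be_converted centralizes the convert-or-keep decision that save_to_gguf_file and get_params_mem_size previously duplicated (the old get_params_mem_size even hard-coded a block size of 32), and the rewritten wtype getters drop a truthiness bug in get_vae_wtype, where name.find(".weight") was tested without comparing against std::string::npos. To illustrate the intended behavior, a hypothetical snippet, assuming TensorStorage exposes name and ne as declared in model.h:

```cpp
// Hypothetical usage, not repo code; target type is GGML_TYPE_Q4_K.
ModelLoader loader;
TensorStorage ts;
ts.ne[0] = 3072;  // row length divisible by the q4_k block size (256)

ts.name = "model.diffusion_model.img_in.weight";  // FLUX input projection
loader.tensor_should_be_converted(ts, GGML_TYPE_Q4_K);  // false: stays at stored precision

ts.name = "model.diffusion_model.double_blocks.0.img_mlp.0.weight";
loader.tensor_should_be_converted(ts, GGML_TYPE_Q4_K);  // true: gets quantized
```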

model.h

1 addition, 0 deletions
```diff
@@ -157,6 +157,7 @@ class ModelLoader {
                      ggml_backend_t backend,
                      std::set<std::string> ignore_tensors = {});
     bool save_to_gguf_file(const std::string& file_path, ggml_type type);
+    bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type);
     int64_t get_params_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT);
     ~ModelLoader() = default;
 
```