Commit 489069c

feat: reduce CLIP memory usage with no embeddings
The CLIP weights need to be converted to f32 for textual inversions (fbd42b6), but that increases the amount of allocated VRAM even when embeddings aren't being used. With this change, the token embedding weights keep the checkpoint's tensor type unless an embeddings directory is provided, so the f32 conversion only happens when textual inversion can actually be used.
Parent: 73b1ad5

2 files changed: 21 additions, 10 deletions

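For a sense of scale (my numbers, not the commit's): token_embedding.weight holds vocab_size × embed_dim entries, so forcing it to f32 roughly doubles its footprint versus an f16 checkpoint and nearly quadruples it versus q8_0. A minimal sketch for the SD1 text encoder (CLIP ViT-L/14: vocab 49408, dim 768, both visible in the clip.hpp diff below), assuming ggml's q8_0 block layout of 34 bytes per 32 elements:

// Rough size sketch (not part of the patch): footprint of
// token_embedding.weight for CLIP ViT-L/14 at different tensor types.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n   = 49408LL * 768;     // vocab_size * embed_dim
    const double  MiB = 1024.0 * 1024.0;
    std::printf("f32 : %.1f MiB\n", n * 4.0 / MiB);            // forced for textual inversion
    std::printf("f16 : %.1f MiB\n", n * 2.0 / MiB);            // common checkpoint type
    std::printf("q8_0: %.1f MiB\n", n * (34.0 / 32.0) / MiB);  // quantized checkpoint type
    return 0;
}

That works out to roughly 145 MiB at f32 versus about 38 MiB at q8_0 for this single tensor, which is the allocation the new flag avoids when no embeddings are in play.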

clip.hpp (16 additions, 6 deletions)
@@ -548,9 +548,15 @@ class CLIPEmbeddings : public GGMLBlock {
     int64_t embed_dim;
     int64_t vocab_size;
     int64_t num_positions;
+    bool force_clip_f32;
 
     void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
         enum ggml_type token_wtype = GGML_TYPE_F32;
+        if (!force_clip_f32) {
+            auto tensor_type = tensor_types.find(prefix + "token_embedding.weight");
+            if (tensor_type != tensor_types.end())
+                token_wtype = tensor_type->second;
+        }
         enum ggml_type position_wtype = GGML_TYPE_F32;
 
         params["token_embedding.weight"] = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
@@ -560,10 +566,12 @@ class CLIPEmbeddings : public GGMLBlock {
 public:
     CLIPEmbeddings(int64_t embed_dim,
                    int64_t vocab_size = 49408,
-                   int64_t num_positions = 77)
+                   int64_t num_positions = 77,
+                   bool force_clip_f32 = false)
         : embed_dim(embed_dim),
           vocab_size(vocab_size),
-          num_positions(num_positions) {
+          num_positions(num_positions),
+          force_clip_f32(force_clip_f32) {
     }
 
     struct ggml_tensor* get_token_embed_weight() {
@@ -681,7 +689,8 @@ class CLIPTextModel : public GGMLBlock {
     bool with_final_ln = true;
 
     CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
-                  bool with_final_ln = true)
+                  bool with_final_ln = true,
+                  bool force_clip_f32 = false)
         : version(version), with_final_ln(with_final_ln) {
         if (version == OPEN_CLIP_VIT_H_14) {
             hidden_size = 1024;
@@ -695,7 +704,7 @@ class CLIPTextModel : public GGMLBlock {
             n_layer = 32;
         }
 
-        blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token));
+        blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token, force_clip_f32));
         blocks["encoder"] = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size));
         blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
     }
@@ -879,8 +888,9 @@ struct CLIPTextModelRunner : public GGMLRunner {
                         const String2GGMLType& tensor_types,
                         const std::string prefix,
                         CLIPVersion version = OPENAI_CLIP_VIT_L_14,
-                        bool with_final_ln = true)
-        : GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln) {
+                        bool with_final_ln = true,
+                        bool force_clip_f32 = false)
+        : GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln, force_clip_f32) {
         model.init(params_ctx, tensor_types, prefix);
     }
 
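Pulled out of the class for readability, the new type-selection logic in init_params() amounts to the helper below. String2GGMLType, the tensor name, and the prefix convention come straight from the patch; the enum here is a stand-in for ggml's real ggml_type, so treat this as a sketch rather than the library's API:

#include <map>
#include <string>

// Stand-in for enum ggml_type / GGML_TYPE_F32 in the real code.
enum wtype { WTYPE_F32, WTYPE_F16, WTYPE_Q8_0 };
using String2Type = std::map<std::string, wtype>;

wtype token_weight_type(const String2Type& tensor_types,
                        const std::string& prefix,
                        bool force_clip_f32) {
    wtype token_wtype = WTYPE_F32;  // f32 is required once embedding vectors get merged in
    if (!force_clip_f32) {
        // No textual inversion in play: reuse whatever type the checkpoint
        // stores, so the table is not up-converted to f32 in VRAM.
        auto it = tensor_types.find(prefix + "token_embedding.weight");
        if (it != tensor_types.end())
            token_wtype = it->second;
    }
    return token_wtype;
}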
conditioner.hpp (5 additions, 4 deletions)
@@ -63,13 +63,14 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                       SDVersion version = VERSION_SD1,
                                       PMVersion pv = PM_VERSION_1)
         : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
+        bool force_clip_f32 = embd_dir.size() > 0;
         if (sd_version_is_sd1(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
         } else if (sd_version_is_sd2(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32);
         } else if (sd_version_is_sdxl(version)) {
-            text_model  = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
-            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
+            text_model  = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32);
+            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32);
         }
     }
 
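The policy rests on one call-site decision in the constructor above: f32 is forced only when an embeddings directory was supplied, since that is the only path through which textual-inversion vectors get merged into the token table. Restated as a tiny helper (names mirror the diff):

#include <string>

// Mirrors `bool force_clip_f32 = embd_dir.size() > 0;` from the diff:
// an empty embd_dir means no textual inversion can be loaded, so the
// checkpoint's (possibly quantized) token embedding type is kept.
bool should_force_clip_f32(const std::string& embd_dir) {
    return !embd_dir.empty();
}

Note the heuristic is deliberately coarse: pointing at a directory forces f32 even if the prompt never references an embedding, presumably because the weight type must be fixed at allocation time, before any prompt is seen.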
