@@ -544,9 +544,15 @@ class CLIPEmbeddings : public GGMLBlock {
     int64_t embed_dim;
     int64_t vocab_size;
     int64_t num_positions;
+    bool force_clip_f32;
 
     void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
         enum ggml_type token_wtype = GGML_TYPE_F32;
+        if (!force_clip_f32) {
+            auto tensor_type = tensor_types.find(prefix + "token_embedding.weight");
+            if (tensor_type != tensor_types.end())
+                token_wtype = tensor_type->second;
+        }
         enum ggml_type position_wtype = GGML_TYPE_F32;
 
         params["token_embedding.weight"] = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
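The hunk above is the core of the change: with force_clip_f32 set, the token embedding weight stays at GGML_TYPE_F32 no matter what the checkpoint stores; otherwise the type recorded for that tensor in tensor_types wins. A minimal standalone sketch of that decision follows. The enum values and the String2GGMLType alias are stand-ins mirroring their use in this file, and pick_token_wtype is a hypothetical helper, not part of the patch:

    #include <map>
    #include <string>

    // Stand-in for ggml's type enum; only the values used here are listed.
    enum ggml_type { GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q8_0 };
    using String2GGMLType = std::map<std::string, enum ggml_type>;

    // Hypothetical helper reproducing the selection logic in init_params.
    static enum ggml_type pick_token_wtype(const String2GGMLType& tensor_types,
                                           const std::string& prefix,
                                           bool force_clip_f32) {
        enum ggml_type token_wtype = GGML_TYPE_F32;  // default: full precision
        if (!force_clip_f32) {
            // Honor the type the checkpoint recorded, e.g. a quantized type.
            auto it = tensor_types.find(prefix + "token_embedding.weight");
            if (it != tensor_types.end())
                token_wtype = it->second;
        }
        return token_wtype;  // always F32 when force_clip_f32 is set
    }
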
@@ -556,10 +562,12 @@ class CLIPEmbeddings : public GGMLBlock {
 public:
     CLIPEmbeddings(int64_t embed_dim,
                    int64_t vocab_size    = 49408,
-                   int64_t num_positions = 77)
+                   int64_t num_positions = 77,
+                   bool force_clip_f32   = false)
         : embed_dim(embed_dim),
           vocab_size(vocab_size),
-          num_positions(num_positions) {
+          num_positions(num_positions),
+          force_clip_f32(force_clip_f32) {
     }
 
     struct ggml_tensor* get_token_embed_weight() {
@@ -677,7 +685,8 @@ class CLIPTextModel : public GGMLBlock {
     bool with_final_ln = true;
 
     CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
-                  bool with_final_ln = true)
+                  bool with_final_ln  = true,
+                  bool force_clip_f32 = false)
         : version(version), with_final_ln(with_final_ln) {
         if (version == OPEN_CLIP_VIT_H_14) {
             hidden_size = 1024;
@@ -691,7 +700,7 @@ class CLIPTextModel : public GGMLBlock {
             n_layer = 32;
         }
 
-        blocks["embeddings"]       = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token));
+        blocks["embeddings"]       = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token, force_clip_f32));
         blocks["encoder"]          = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size));
         blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
     }
@@ -862,8 +871,9 @@ struct CLIPTextModelRunner : public GGMLRunner {
                         const String2GGMLType& tensor_types,
                         const std::string prefix,
                         CLIPVersion version = OPENAI_CLIP_VIT_L_14,
-                        bool with_final_ln = true)
-        : GGMLRunner(backend), model(version, with_final_ln) {
+                        bool with_final_ln  = true,
+                        bool force_clip_f32 = false)
+        : GGMLRunner(backend), model(version, with_final_ln, force_clip_f32) {
         model.init(params_ctx, tensor_types, prefix);
     }
 
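Taken together, the new flag threads CLIPTextModelRunner -> CLIPTextModel -> CLIPEmbeddings. A hedged usage sketch, assuming the constructor argument order shown in this diff; the prefix string is a placeholder, the tensor_types map would normally be filled while scanning the checkpoint, and ggml_backend_cpu_init() is the stock ggml CPU backend:

    // Illustrative wiring only, not part of the patch.
    String2GGMLType tensor_types;                      // tensor name -> stored type
    ggml_backend_t backend = ggml_backend_cpu_init();  // ggml CPU backend

    CLIPTextModelRunner runner(backend,
                               tensor_types,
                               "text_model.",          // placeholder prefix
                               OPENAI_CLIP_VIT_L_14,
                               true,                   // with_final_ln
                               true);                  // force_clip_f32: keep token embedding in F32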