@@ -548,9 +548,15 @@ class CLIPEmbeddings : public GGMLBlock {
     int64_t embed_dim;
     int64_t vocab_size;
     int64_t num_positions;
+    bool force_clip_f32;

     void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
         enum ggml_type token_wtype = GGML_TYPE_F32;
+        if (!force_clip_f32) {
+            auto tensor_type = tensor_types.find(prefix + "token_embedding.weight");
+            if (tensor_type != tensor_types.end())
+                token_wtype = tensor_type->second;
+        }
         enum ggml_type position_wtype = GGML_TYPE_F32;

         params["token_embedding.weight"] = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
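The hunk above changes how init_params picks the storage type for token_embedding.weight: by default it now adopts whatever type the tensor_types map reports for that tensor (keyed by its prefixed name), and force_clip_f32 opts out, pinning the embedding to F32. Below is a minimal self-contained sketch of just that selection logic, assuming String2GGMLType is a map from tensor names to ggml types; the stub enum stands in for ggml's real type enum and is not part of this commit.

#include <iostream>
#include <map>
#include <string>

// Stub standing in for ggml's real type enum (assumption for this sketch).
enum ggml_type { GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q8_0 };
using String2GGMLType = std::map<std::string, ggml_type>;

// Mirrors the dtype selection in init_params: keep the F32 default when
// force_clip_f32 is set, otherwise use the checkpoint's type if known.
ggml_type pick_token_wtype(const String2GGMLType& tensor_types,
                           const std::string& prefix,
                           bool force_clip_f32) {
    ggml_type token_wtype = GGML_TYPE_F32;
    if (!force_clip_f32) {
        auto it = tensor_types.find(prefix + "token_embedding.weight");
        if (it != tensor_types.end())
            token_wtype = it->second;
    }
    return token_wtype;
}

int main() {
    String2GGMLType types = {{"te.token_embedding.weight", GGML_TYPE_Q8_0}};
    std::cout << pick_token_wtype(types, "te.", false) << "\n";  // follows checkpoint: Q8_0
    std::cout << pick_token_wtype(types, "te.", true) << "\n";   // forced: F32
}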
@@ -560,10 +566,12 @@ class CLIPEmbeddings : public GGMLBlock {
 public:
     CLIPEmbeddings(int64_t embed_dim,
                    int64_t vocab_size    = 49408,
-                   int64_t num_positions = 77)
+                   int64_t num_positions = 77,
+                   bool force_clip_f32   = false)
         : embed_dim(embed_dim),
           vocab_size(vocab_size),
-          num_positions(num_positions) {
+          num_positions(num_positions),
+          force_clip_f32(force_clip_f32) {
     }

     struct ggml_tensor* get_token_embed_weight() {
@@ -681,7 +689,8 @@ class CLIPTextModel : public GGMLBlock {
     bool with_final_ln = true;

     CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
-                  bool with_final_ln = true)
+                  bool with_final_ln  = true,
+                  bool force_clip_f32 = false)
         : version(version), with_final_ln(with_final_ln) {
         if (version == OPEN_CLIP_VIT_H_14) {
             hidden_size = 1024;
@@ -695,7 +704,7 @@ class CLIPTextModel : public GGMLBlock {
             n_layer           = 32;
         }

-        blocks["embeddings"]       = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token));
+        blocks["embeddings"]       = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token, force_clip_f32));
         blocks["encoder"]          = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size));
         blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
     }
@@ -879,8 +888,9 @@ struct CLIPTextModelRunner : public GGMLRunner {
                         const String2GGMLType& tensor_types,
                         const std::string prefix,
                         CLIPVersion version = OPENAI_CLIP_VIT_L_14,
-                        bool with_final_ln  = true)
-        : GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln) {
+                        bool with_final_ln  = true,
+                        bool force_clip_f32 = false)
+        : GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln, force_clip_f32) {
         model.init(params_ctx, tensor_types, prefix);
     }

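End to end, the flag now travels CLIPTextModelRunner → CLIPTextModel → CLIPEmbeddings, so a caller can keep the token embedding in full precision even when the checkpoint stores it quantized. A hypothetical call site follows; the backend, tensor_types, and prefix values are placeholders, not part of this commit.

// Hypothetical usage; backend, tensor_types, and the prefix string are
// placeholders assumed for this sketch.
CLIPTextModelRunner runner(backend,
                           /*offload_params_to_cpu=*/false,
                           tensor_types,
                           "cond_stage_model.transformer.text_model",
                           OPENAI_CLIP_VIT_L_14,
                           /*with_final_ln=*/true,
                           /*force_clip_f32=*/true);  // token embedding stays F32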