@@ -488,14 +488,14 @@ struct CLIPLayer : public GGMLBlock {
488488 blocks[" mlp" ] = std::shared_ptr<GGMLBlock>(new CLIPMLP (d_model, intermediate_size));
489489 }
490490
491- struct ggml_tensor * forward (struct ggml_context * ctx, struct ggml_tensor * x, bool mask = true ) {
491+ struct ggml_tensor * forward (struct ggml_context * ctx, ggml_backend_t backend, struct ggml_tensor * x, bool mask = true ) {
492492 // x: [N, n_token, d_model]
493493 auto self_attn = std::dynamic_pointer_cast<MultiheadAttention>(blocks[" self_attn" ]);
494494 auto layer_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks[" layer_norm1" ]);
495495 auto layer_norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks[" layer_norm2" ]);
496496 auto mlp = std::dynamic_pointer_cast<CLIPMLP>(blocks[" mlp" ]);
497497
498- x = ggml_add (ctx, x, self_attn->forward (ctx, layer_norm1->forward (ctx, x), mask));
498+ x = ggml_add (ctx, x, self_attn->forward (ctx, backend, layer_norm1->forward (ctx, x), mask));
499499 x = ggml_add (ctx, x, mlp->forward (ctx, layer_norm2->forward (ctx, x)));
500500 return x;
501501 }
@@ -517,7 +517,11 @@ struct CLIPEncoder : public GGMLBlock {
517517 }
518518 }
519519
520- struct ggml_tensor * forward (struct ggml_context * ctx, struct ggml_tensor * x, int clip_skip = -1 , bool mask = true ) {
520+ struct ggml_tensor * forward (struct ggml_context * ctx,
521+ ggml_backend_t backend,
522+ struct ggml_tensor * x,
523+ int clip_skip = -1 ,
524+ bool mask = true ) {
521525 // x: [N, n_token, d_model]
522526 int layer_idx = n_layer - 1 ;
523527 // LOG_DEBUG("clip_skip %d", clip_skip);
@@ -532,7 +536,7 @@ struct CLIPEncoder : public GGMLBlock {
532536 }
533537 std::string name = " layers." + std::to_string (i);
534538 auto layer = std::dynamic_pointer_cast<CLIPLayer>(blocks[name]);
535- x = layer->forward (ctx, x, mask); // [N, n_token, d_model]
539+ x = layer->forward (ctx, backend, x, mask); // [N, n_token, d_model]
536540 // LOG_DEBUG("layer %d", i);
537541 }
538542 return x;
@@ -712,6 +716,7 @@ class CLIPTextModel : public GGMLBlock {
712716 }
713717
714718 struct ggml_tensor * forward (struct ggml_context * ctx,
719+ ggml_backend_t backend,
715720 struct ggml_tensor * input_ids,
716721 struct ggml_tensor * tkn_embeddings,
717722 size_t max_token_idx = 0 ,
@@ -722,7 +727,7 @@ class CLIPTextModel : public GGMLBlock {
722727 auto final_layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks[" final_layer_norm" ]);
723728
724729 auto x = embeddings->forward (ctx, input_ids, tkn_embeddings); // [N, n_token, hidden_size]
725- x = encoder->forward (ctx, x, return_pooled ? -1 : clip_skip, true );
730+ x = encoder->forward (ctx, backend, x, return_pooled ? -1 : clip_skip, true );
726731 if (return_pooled || with_final_ln) {
727732 x = final_layer_norm->forward (ctx, x);
728733 }
@@ -775,6 +780,7 @@ class CLIPVisionModel : public GGMLBlock {
775780 }
776781
777782 struct ggml_tensor * forward (struct ggml_context * ctx,
783+ ggml_backend_t backend,
778784 struct ggml_tensor * pixel_values,
779785 bool return_pooled = true ,
780786 int clip_skip = -1 ) {
@@ -786,7 +792,7 @@ class CLIPVisionModel : public GGMLBlock {
786792
787793 auto x = embeddings->forward (ctx, pixel_values); // [N, num_positions, embed_dim]
788794 x = pre_layernorm->forward (ctx, x);
789- x = encoder->forward (ctx, x, clip_skip, false );
795+ x = encoder->forward (ctx, backend, x, clip_skip, false );
790796 // print_ggml_tensor(x, true, "ClipVisionModel x: ");
791797 auto last_hidden_state = x;
792798 x = post_layernorm->forward (ctx, x); // [N, n_token, hidden_size]
@@ -855,6 +861,7 @@ class CLIPVisionModelProjection : public GGMLBlock {
855861 }
856862
857863 struct ggml_tensor * forward (struct ggml_context * ctx,
864+ ggml_backend_t backend,
858865 struct ggml_tensor * pixel_values,
859866 bool return_pooled = true ,
860867 int clip_skip = -1 ) {
@@ -863,7 +870,7 @@ class CLIPVisionModelProjection : public GGMLBlock {
863870 auto vision_model = std::dynamic_pointer_cast<CLIPVisionModel>(blocks[" vision_model" ]);
864871 auto visual_projection = std::dynamic_pointer_cast<CLIPProjection>(blocks[" visual_projection" ]);
865872
866- auto x = vision_model->forward (ctx, pixel_values, return_pooled, clip_skip); // [N, hidden_size] or [N, n_token, hidden_size]
873+ auto x = vision_model->forward (ctx, backend, pixel_values, return_pooled, clip_skip); // [N, hidden_size] or [N, n_token, hidden_size]
867874
868875 if (return_pooled) {
869876 x = visual_projection->forward (ctx, x); // [N, projection_dim]
@@ -900,6 +907,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
900907 }
901908
902909 struct ggml_tensor * forward (struct ggml_context * ctx,
910+ ggml_backend_t backend,
903911 struct ggml_tensor * input_ids,
904912 struct ggml_tensor * embeddings,
905913 size_t max_token_idx = 0 ,
@@ -911,7 +919,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
911919 input_ids = ggml_reshape_2d (ctx, input_ids, model.n_token , input_ids->ne [0 ] / model.n_token );
912920 }
913921
914- return model.forward (ctx, input_ids, embeddings, max_token_idx, return_pooled);
922+ return model.forward (ctx, backend, input_ids, embeddings, max_token_idx, return_pooled);
915923 }
916924
917925 struct ggml_cgraph * build_graph (struct ggml_tensor * input_ids,
@@ -937,7 +945,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
937945 embeddings = ggml_concat (compute_ctx, token_embed_weight, custom_embeddings, 1 );
938946 }
939947
940- struct ggml_tensor * hidden_states = forward (compute_ctx, input_ids, embeddings, max_token_idx, return_pooled);
948+ struct ggml_tensor * hidden_states = forward (compute_ctx, runtime_backend, input_ids, embeddings, max_token_idx, return_pooled);
941949
942950 ggml_build_forward_expand (gf, hidden_states);
943951
0 commit comments