feat: support mm embedding service and vlm embedding model factory.

Xianzhe Dong · Xianzhe Dong · commit f2b00889aedd · 2025-11-18T15:20:54.000+08:00
diff --git a/xllm/api_service/api_service.cpp b/xllm/api_service/api_service.cpp
@@ -64,6 +64,8 @@ APIService::APIService(Master* master,
     auto vlm_master = dynamic_cast<VLMMaster*>(master);
     mm_chat_service_impl_ =
         std::make_unique<MMChatServiceImpl>(vlm_master, model_names);
+    mm_embedding_service_impl_ =
+        std::make_unique<MMEmbeddingServiceImpl>(vlm_master, model_names);
   } else if (FLAGS_backend == "dit") {
     image_generation_service_impl_ =
         std::make_unique<ImageGenerationServiceImpl>(
@@ -190,10 +192,13 @@ void APIService::Embeddings(::google::protobuf::RpcController* controller,
   // TODO with xllm-service
 }
 
-void APIService::EmbeddingsHttp(::google::protobuf::RpcController* controller,
-                                const proto::HttpRequest* request,
-                                proto::HttpResponse* response,
-                                ::google::protobuf::Closure* done) {
+namespace {
+template <typename EmbeddingCall, typename Service>
+void EmbeddingsImpl(std::unique_ptr<Service>& embedding_service_impl_,
+                    ::google::protobuf::RpcController* controller,
+                    const proto::HttpRequest* request,
+                    proto::HttpResponse* response,
+                    ::google::protobuf::Closure* done) {
   xllm::ClosureGuard done_guard(
       done,
       std::bind(request_in_metric, nullptr),
@@ -202,12 +207,13 @@ void APIService::EmbeddingsHttp(::google::protobuf::RpcController* controller,
     LOG(ERROR) << "brpc request | respose | controller is null";
     return;
   }
-
   auto arena = response->GetArena();
   auto req_pb =
-      google::protobuf::Arena::CreateMessage<proto::EmbeddingRequest>(arena);
+      google::protobuf::Arena::CreateMessage<typename EmbeddingCall::ReqType>(
+          arena);
   auto resp_pb =
-      google::protobuf::Arena::CreateMessage<proto::EmbeddingResponse>(arena);
+      google::protobuf::Arena::CreateMessage<typename EmbeddingCall::ResType>(
+          arena);
 
   auto ctrl = reinterpret_cast<brpc::Controller*>(controller);
   std::string error;
@@ -230,6 +236,22 @@ void APIService::EmbeddingsHttp(::google::protobuf::RpcController* controller,
       ctrl, done_guard.release(), req_pb, resp_pb);
   embedding_service_impl_->process_async(call);
 }
+}  // namespace
+
+void APIService::EmbeddingsHttp(::google::protobuf::RpcController* controller,
+                                const proto::HttpRequest* request,
+                                proto::HttpResponse* response,
+                                ::google::protobuf::Closure* done) {
+  if (FLAGS_backend == "llm") {  // text-only embedding service
+    CHECK(embedding_service_impl_) << " embedding service is invalid.";
+    EmbeddingsImpl<EmbeddingCall, EmbeddingServiceImpl>(
+        embedding_service_impl_, controller, request, response, done);
+  } else if (FLAGS_backend == "vlm") {  // multimodal embedding service
+    CHECK(mm_embedding_service_impl_) << " mm embedding service is invalid.";
+    EmbeddingsImpl<MMEmbeddingCall, MMEmbeddingServiceImpl>(
+        mm_embedding_service_impl_, controller, request, response, done);
+  }  // NOTE(review): other backends return here without running `done`.
+}
 
 void APIService::ImageGeneration(::google::protobuf::RpcController* controller,
                                  const proto::ImageGenerationRequest* request,
diff --git a/xllm/api_service/api_service.h b/xllm/api_service/api_service.h
@@ -120,6 +120,7 @@ class APIService : public proto::XllmAPIService {
   std::unique_ptr<ChatServiceImpl> chat_service_impl_;
   std::unique_ptr<MMChatServiceImpl> mm_chat_service_impl_;
   std::unique_ptr<EmbeddingServiceImpl> embedding_service_impl_;
+  std::unique_ptr<MMEmbeddingServiceImpl> mm_embedding_service_impl_;
   std::unique_ptr<ModelsServiceImpl> models_service_impl_;
   std::unique_ptr<ImageGenerationServiceImpl> image_generation_service_impl_;
   std::unique_ptr<RerankServiceImpl> rerank_service_impl_;
diff --git a/xllm/api_service/embedding_service_impl.cpp b/xllm/api_service/embedding_service_impl.cpp
@@ -28,6 +28,7 @@ limitations under the License.
 namespace xllm {
 namespace {
 
+template <typename EmbeddingCall>
 bool send_result_to_client_brpc(std::shared_ptr<EmbeddingCall> call,
                                 const std::string& request_id,
                                 int64_t created_time,
@@ -113,9 +114,64 @@ void EmbeddingServiceImpl::process_async_impl(
           }
         }
 
-        return send_result_to_client_brpc(
+        return send_result_to_client_brpc<EmbeddingCall>(
             call, request_id, created_time, model, req_output);
       });
 }
 
+MMEmbeddingServiceImpl::MMEmbeddingServiceImpl(
+    VLMMaster* master,
+    const std::vector<std::string>& models)
+    : APIServiceImpl(models), master_(master) {
+  CHECK(master_ != nullptr);  // dereferenced later by process_async_impl
+}
+
+// Validates the request, converts its inputs and schedules it on the master.
+void MMEmbeddingServiceImpl::process_async_impl(
+    std::shared_ptr<MMEmbeddingCall> call) {
+  const auto& rpc_request = call->request();
+  // check if model is supported
+  const auto& model = rpc_request.model();
+  if (!models_.contains(model)) {
+    call->finish_with_error(StatusCode::UNKNOWN, "Model not supported");
+    return;
+  }
+
+  // RequestParams for embeddings: is_embeddings, max_tokens = 1 (one step).
+  RequestParams request_params(
+      rpc_request, call->get_x_request_id(), call->get_x_request_time());
+  // Copy the id now: request_params is moved in the call below, and with
+  // unspecified argument order a capture there may read a moved-from value.
+  auto request_id = request_params.request_id;
+
+  // Build engine inputs from the request; trans() returning false rejects it.
+  std::vector<Message> messages;
+  MMInput mm_inputs;
+  static MMInputHelper helper;
+  if (!helper.trans(rpc_request.messages(), messages, mm_inputs.items_)) {
+    call->finish_with_error(StatusCode::INVALID_ARGUMENT,
+                            "inputs argument is invalid.");
+    return;
+  }
+
+  // schedule the request
+  master_->handle_request(
+      std::move(messages),
+      std::move(mm_inputs),
+      std::move(request_params),
+      [call,
+       model,
+       request_id,
+       created_time = absl::ToUnixSeconds(absl::Now())](
+          const RequestOutput& req_output) -> bool {
+        if (req_output.status.has_value()) {
+          const auto& status = req_output.status.value();
+          if (!status.ok()) {
+            return call->finish_with_error(status.code(), status.message());
+          }
+        }
+        return send_result_to_client_brpc<MMEmbeddingCall>(
+            call, request_id, created_time, model, req_output);
+      });
+}
 }  // namespace xllm
diff --git a/xllm/api_service/embedding_service_impl.h b/xllm/api_service/embedding_service_impl.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include "api_service/api_service_impl.h"
 #include "api_service/call.h"
 #include "api_service/non_stream_call.h"
+#include "core/runtime/vlm_master.h"
 #include "embedding.pb.h"
 
 namespace xllm {
@@ -40,4 +41,18 @@ class EmbeddingServiceImpl final : public APIServiceImpl<EmbeddingCall> {
   LLMMaster* master_ = nullptr;
 };
 
+using MMEmbeddingCall =
+    NonStreamCall<proto::MMEmbeddingRequest, proto::EmbeddingResponse>;
+// Serves multimodal embedding requests by scheduling them on a VLMMaster.
+class MMEmbeddingServiceImpl final : public APIServiceImpl<MMEmbeddingCall> {
+ public:
+  MMEmbeddingServiceImpl(VLMMaster* master,
+                         const std::vector<std::string>& models);
+  // brpc call_data needs to use shared_ptr
+  void process_async_impl(std::shared_ptr<MMEmbeddingCall> call);
+ private:
+  DISALLOW_COPY_AND_ASSIGN(MMEmbeddingServiceImpl);
+  VLMMaster* master_ = nullptr;
+};
+
 }  // namespace xllm
diff --git a/xllm/api_service/non_stream_call.h b/xllm/api_service/non_stream_call.h
@@ -33,6 +33,8 @@ namespace xllm {
 template <typename Request, typename Response>
 class NonStreamCall : public Call {
  public:
+  using ReqType = Request;
+  using ResType = Response;
   NonStreamCall(brpc::Controller* controller,
                 ::google::protobuf::Closure* done,
                 Request* request,
diff --git a/xllm/core/framework/model/CMakeLists.txt b/xllm/core/framework/model/CMakeLists.txt
@@ -32,6 +32,7 @@ cc_library(
     causal_vlm.h
     dit_model.h
     embedding_lm.h
+    embedding_vlm.h
     model_args.h
     npu_dp_ep_padding.h
     model_input_params.h
diff --git a/xllm/core/framework/model/embedding_vlm.h b/xllm/core/framework/model/embedding_vlm.h
@@ -0,0 +1,73 @@
+/* Copyright 2025 The xLLM Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://github.com/jd-opensource/xllm/blob/main/LICENSE
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#pragma once
+
+#include <c10/core/Device.h>
+#include <torch/torch.h>
+
+#include <vector>
+
+#include "causal_vlm.h"
+#include "core/framework/kv_cache/kv_cache.h"
+#include "core/framework/quant_args.h"
+#include "core/framework/state_dict/state_dict.h"
+#include "model_args.h"
+#include "model_input_params.h"
+
+namespace xllm {
+
+class EmbeddingVLM : public CausalVLM {
+ public:
+  ~EmbeddingVLM() override = default;
+  // Pooling step that concrete models implement to produce embeddings.
+  // hidden_states: [num_tokens, hidden_size]
+  // seleted_idxes: [num_tokens] ("seleted" spelling matches logits())
+  // returns: [num_seqs, hidden_size]
+  virtual torch::Tensor pooler(const torch::Tensor& hidden_states,
+                               const torch::Tensor& seleted_idxes) = 0;
+};
+
+template <typename Model>
+class EmbeddingVLMImpl : public EmbeddingVLM {  // adapter around Model
+ public:
+  EmbeddingVLMImpl(Model model, const torch::TensorOptions& options)
+      : model_(std::move(model)), options_(options) {}
+  // Delegates to the wrapped model's logits().
+  torch::Tensor logits(const torch::Tensor& hidden_states,
+                       const torch::Tensor& seleted_idxes) override {
+    return model_->logits(hidden_states, seleted_idxes);
+  }
+  // Delegates to the wrapped model's pooler().
+  torch::Tensor pooler(const torch::Tensor& hidden_states,
+                       const torch::Tensor& seleted_idxes) override {
+    return model_->pooler(hidden_states, seleted_idxes);
+  }
+  // Moves the loader into the wrapped model's load_model().
+  void load_model(std::unique_ptr<ModelLoader> loader) override {
+    model_->load_model(std::move(loader));
+  }
+  // Device taken from the stored tensor options.
+  torch::Device device() const override { return options_.device(); }
+  // Tensor options passed to the constructor.
+  const torch::TensorOptions& options() const override { return options_; }
+
+ private:
+  Model model_;
+  // Copy of the constructor's tensor options; backs device() and options().
+  torch::TensorOptions options_;
+};
+
+}  // namespace xllm
diff --git a/xllm/core/framework/request/request_params.cpp b/xllm/core/framework/request/request_params.cpp
@@ -337,6 +337,18 @@ RequestParams::RequestParams(const proto::EmbeddingRequest& request,
   max_tokens = 1;
   streaming = false;
 }
+RequestParams::RequestParams(const proto::MMEmbeddingRequest& request,
+                             const std::string& x_rid,
+                             const std::string& x_rtime) {
+  is_embeddings = true;
+  streaming = false;
+  max_tokens = 1;  // a single engine step is enough for embeddings
+  x_request_id = x_rid;
+  x_request_time = x_rtime;
+  if (request.has_service_request_id()) {
+    service_request_id = request.service_request_id();
+  }
+}
 
 RequestParams::RequestParams(const proto::RerankRequest& request,
                              const std::string& x_rid,
diff --git a/xllm/core/framework/request/request_params.h b/xllm/core/framework/request/request_params.h
@@ -49,6 +49,9 @@ struct RequestParams {
   RequestParams(const proto::EmbeddingRequest& request,
                 const std::string& x_rid,
                 const std::string& x_rtime);
+  RequestParams(const proto::MMEmbeddingRequest& request,
+                const std::string& x_rid,
+                const std::string& x_rtime);
   RequestParams(const proto::RerankRequest& request,
                 const std::string& x_rid,
                 const std::string& x_rtime);
diff --git a/xllm/core/runtime/embed_vlm_worker_impl.cpp b/xllm/core/runtime/embed_vlm_worker_impl.cpp
@@ -44,7 +44,7 @@ bool EmbedVLMWorkerImpl::init_model(ModelContext& context) {
   CHECK(model_ == nullptr) << "Model is already initialized.";
 
   context.set_image_embedding_mode(true);
-  model_ = create_vlm_model(context);
+  model_ = create_embeddingvlm_model(context);
   CHECK(model_ != nullptr) << "Failed to create model.";
   model_executor_ = std::make_unique<Executor>(
       model_.get(), context.get_model_args(), device_, options_);
@@ -80,7 +80,18 @@ std::optional<ForwardOutput> EmbedVLMWorkerImpl::step(
 
   // driver prepare model output
   ForwardOutput output;
-  output.embedding = hidden_states;
+  SampleOutput sample_output;
+
+  if (sampling_params.selected_token_idxes.defined() &&
+      inputs.micro_inputs[0].sampling_params.is_embeddings) {
+    EmbeddingVLM* em_model = dynamic_cast<EmbeddingVLM*>(model_.get());
+    auto embeddings =
+        em_model->pooler(hidden_states, sampling_params.selected_token_idxes);
+    sample_output.embeddings = embeddings;
+    output.sample_output = sample_output;
+    output.embedding = embeddings;
+  }
+
   return output;
 }
 
diff --git a/xllm/core/runtime/vlm_master.cpp b/xllm/core/runtime/vlm_master.cpp
@@ -355,6 +355,7 @@ std::shared_ptr<Request> VLMMaster::generate_request(std::string prompt,
   sampling_param.top_k = sp.top_k;
   sampling_param.logprobs = sp.logprobs;
   sampling_param.top_logprobs = sp.top_logprobs;
+  sampling_param.is_embeddings = sp.is_embeddings;
   if (best_of > sp.n) {
     // enable logprobs for best_of to generate sequence logprob
     sampling_param.logprobs = true;
diff --git a/xllm/models/model_registry.cpp b/xllm/models/model_registry.cpp
@@ -101,6 +101,19 @@ void ModelRegistry::register_embeddinglm_factory(const std::string& name,
   }
 }
 
+void ModelRegistry::register_embeddingvlm_factory(const std::string& name,
+                                                  EmbeddingVLMFactory factory) {
+  ModelRegistry* registry = get_instance();
+  auto& entry = registry->model_registry_[name];
+  if (entry.embedding_vlm_factory != nullptr) {
+    SAFE_LOG_WARNING("embedding vlm factory for " << name
+                                                  << " already registered.");
+    return;  // keep the factory that was registered first
+  }
+  entry.embedding_vlm_factory = factory;
+  registry->model_backend_[name] = "vlm";
+}
+
 void ModelRegistry::register_dit_model_factory(const std::string& name,
                                                DiTModelFactory factory) {
   ModelRegistry* instance = get_instance();
@@ -195,6 +208,13 @@ EmbeddingLMFactory ModelRegistry::get_embeddinglm_factory(
   return instance->model_registry_[name].embedding_lm_factory;
 }
 
+EmbeddingVLMFactory ModelRegistry::get_embeddingvlm_factory(
+    const std::string& name) {
+  // The result is expected to be empty when nothing was registered under
+  // `name`; create_embeddingvlm_model() tests it before calling.
+  return get_instance()->model_registry_[name].embedding_vlm_factory;
+}
+
 DiTModelFactory ModelRegistry::get_dit_model_factory(const std::string& name) {
   ModelRegistry* instance = get_instance();
   return instance->model_registry_[name].dit_model_factory;
@@ -281,6 +301,21 @@ std::unique_ptr<EmbeddingLM> create_embeddinglm_model(
   return nullptr;
 }
 
+std::unique_ptr<EmbeddingVLM> create_embeddingvlm_model(
+    const ModelContext& context) {
+  // Resolve the factory registered for this context's model type.
+  const std::string model_type = context.get_model_args().model_type();
+  auto factory = ModelRegistry::get_embeddingvlm_factory(model_type);
+
+  // Without a factory the model type cannot be built as an embedding VLM.
+  if (!factory) {
+    LOG(ERROR) << "Unsupported model type: " << model_type;
+    return nullptr;
+  }
+
+  return factory(context);
+}
+
 std::unique_ptr<DiTModel> create_dit_model(const DiTModelContext& context) {
   // get the factory function for the model type from model registry
   auto factory = ModelRegistry::get_dit_model_factory(context.model_type());
diff --git a/xllm/models/model_registry.h b/xllm/models/model_registry.h
diff --git a/xllm/proto/embedding.proto b/xllm/proto/embedding.proto