This repository was archived by the owner on Jul 4, 2025. It is now read-only.
Merged
48 commits
cc2e093
feat: use llama.cpp server
sangjanai Mar 11, 2025
70caa83
Merge branch 'dev' of https://github.com/janhq/cortex.cpp into s/feat…
sangjanai Mar 17, 2025
0968abe
chore: cleanup
sangjanai Mar 17, 2025
668af84
Merge branch 'dev' of https://github.com/janhq/cortex.cpp into s/feat…
sangjanai Mar 17, 2025
219d460
feat: OAI
sangjanai Mar 18, 2025
bb5cc35
Merge branch 'dev' of https://github.com/janhq/cortex.cpp into s/feat…
sangjanai Mar 18, 2025
8c4ca06
fix: wait for child process up
sangjanai Mar 19, 2025
220a974
Merge branch 'dev' of https://github.com/janhq/cortex.cpp into s/feat…
sangjanai Mar 19, 2025
0f2fa6e
chore: cleanup
sangjanai Mar 19, 2025
6dd7f7c
chore: cleanup
sangjanai Mar 19, 2025
bc92732
chore: fix unit tests
sangjanai Mar 19, 2025
5a68356
Merge branch 's/feat/spawn-llama-cpp' of https://github.com/janhq/cor…
sangjanai Mar 19, 2025
7c4d964
chore: cleanup
sangjanai Mar 19, 2025
1e5beaf
chore: cleanup
sangjanai Mar 19, 2025
b7b772b
fix: unit tests
sangjanai Mar 19, 2025
7c66135
fix: e2e tests
sangjanai Mar 19, 2025
af28b07
Merge branch 's/feat/spawn-llama-cpp' of https://github.com/janhq/cor…
sangjanai Mar 19, 2025
4705a1b
chore: cleanup
sangjanai Mar 19, 2025
6454eca
fix: e2e tests
sangjanai Mar 19, 2025
efe5a08
fix: validation
sangjanai Mar 19, 2025
a2886d2
fix: change GH user agent
sangjanai Mar 19, 2025
cf1a2ca
chore: cleanup
sangjanai Mar 19, 2025
282aae3
chore: fix unit tests
sangjanai Mar 19, 2025
e3dfea3
fix: e2e tests
sangjanai Mar 19, 2025
7438f94
fix: validation
sangjanai Mar 19, 2025
bf16c81
fix: change GH user agent
sangjanai Mar 19, 2025
f7cb0ae
chore: windows patch
vansangpfiev Mar 20, 2025
886222d
fix: windows
vansangpfiev Mar 20, 2025
87b2aa4
chore: log
vansangpfiev Mar 20, 2025
6d0215d
Merge branch 'dev' of https://github.com/menloresearch/cortex.cpp int…
sangjanai Mar 21, 2025
f866d5f
fix: handle macos 12 variants
sangjanai Mar 21, 2025
3d0b847
Merge branch 's/feat/spawn-llama-cpp' of https://github.com/menlorese…
sangjanai Mar 21, 2025
18e958e
Merge branch 'dev' of https://github.com/menloresearch/cortex.cpp int…
sangjanai Mar 21, 2025
de0de94
chore: e2e tests
sangjanai Mar 21, 2025
b783b22
chore: major version macos
sangjanai Mar 21, 2025
7832a5a
fix: windows e2e tests
vansangpfiev Mar 21, 2025
86cc89c
Merge branch 's/feat/spawn-llama-cpp' of github.com:janhq/nitro into …
vansangpfiev Mar 21, 2025
1c1146f
fix: engine list
sangjanai Mar 21, 2025
7487304
fix: macos filter
sangjanai Mar 21, 2025
1e99861
chore: skips some tests for linux arm
sangjanai Mar 21, 2025
70adc6e
fix: terminate process windows
vansangpfiev Mar 24, 2025
9846ad8
chore: release CIs (#2171)
vansangpfiev Mar 24, 2025
aba833e
fix: remove v in the llama-engine wget
sangjanai Mar 24, 2025
b239378
fix: add start time for model
sangjanai Mar 24, 2025
ca2180c
Merge branch 'dev' of https://github.com/menloresearch/cortex.cpp int…
sangjanai Mar 27, 2025
8ce2ac6
Merge branch 'dev' into s/feat/spawn-llama-cpp
vansangpfiev Apr 2, 2025
5f36501
chore: quality gate
sangjanai Apr 2, 2025
300f368
Merge branch 'dev' into s/feat/spawn-llama-cpp
vansangpfiev Apr 3, 2025
Binary file modified .github/patches/windows/msvcp140.dll
Binary file not shown.
Binary file modified .github/patches/windows/vcruntime140.dll
Binary file not shown.
Binary file modified .github/patches/windows/vcruntime140_1.dll
Binary file not shown.
3 changes: 0 additions & 3 deletions docs/docs/engines/engine-extension.mdx
@@ -71,9 +71,6 @@ class EngineI {
std::shared_ptr<Json::Value> json_body,
std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;

// Compatibility and model management
virtual bool IsSupported(const std::string& f) = 0;

virtual void GetModels(
std::shared_ptr<Json::Value> jsonBody,
std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;
12 changes: 6 additions & 6 deletions docs/static/openapi/cortex.json
@@ -2754,7 +2754,7 @@
},
"version": {
"type": "string",
"example": "0.1.35-28.10.24"
"example": "b4920"
}
}
}
@@ -2763,11 +2763,11 @@
{
"engine": "llama-cpp",
"name": "mac-arm64",
"version": "0.1.35-28.10.24"
"version": "b4920"
},
{
"engine": "llama-cpp",
"name": "linux-amd64-avx",
"name": "linux-avx-x64",
"version": "0.1.35-27.10.24"
}
]
@@ -2901,7 +2901,7 @@
"name": {
"type": "string",
"description": "The name of the variant, including OS, architecture, and capabilities",
"example": "linux-amd64-avx-cuda-11-7"
"example": "linux-avx-x64-cuda-11-7"
},
"created_at": {
"type": "string",
@@ -2973,7 +2973,7 @@
},
"name": {
"type": "string",
"example": "0.1.39-linux-amd64-avx-cuda-11-7"
"example": "llama-b4920-bin-linux-avx-cuda-cu11.7"
},
"size": {
"type": "integer",
@@ -3250,7 +3250,7 @@
},
"version": {
"type": "string",
"example": "0.1.35-28.10.24"
"example": "b4920"
}
}
}
10 changes: 10 additions & 0 deletions engine/CMakeLists.txt
@@ -177,6 +177,7 @@ add_executable(${TARGET_NAME} main.cc
${CMAKE_CURRENT_SOURCE_DIR}/utils/process/utils.cc

${CMAKE_CURRENT_SOURCE_DIR}/extensions/remote-engine/remote_engine.cc
${CMAKE_CURRENT_SOURCE_DIR}/extensions/local-engine/local_engine.cc

)

@@ -222,3 +223,12 @@ set_target_properties(${TARGET_NAME} PROPERTIES
RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR}
RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}
)

if(MSVC)
add_custom_command(
TARGET ${TARGET_NAME} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_directory
${CMAKE_CURRENT_SOURCE_DIR}/../.github/patches/windows
${CMAKE_BINARY_DIR}/
)
endif()
2 changes: 1 addition & 1 deletion engine/cli/CMakeLists.txt
@@ -86,7 +86,7 @@ add_executable(${TARGET_NAME} main.cc
${CMAKE_CURRENT_SOURCE_DIR}/../services/hardware_service.cc
${CMAKE_CURRENT_SOURCE_DIR}/../services/database_service.cc
${CMAKE_CURRENT_SOURCE_DIR}/../extensions/remote-engine/remote_engine.cc

${CMAKE_CURRENT_SOURCE_DIR}/../extensions/local-engine/local_engine.cc
${CMAKE_CURRENT_SOURCE_DIR}/../extensions/template_renderer.cc

${CMAKE_CURRENT_SOURCE_DIR}/utils/easywsclient.cc
4 changes: 2 additions & 2 deletions engine/cli/command_line_parser.cc
@@ -33,6 +33,7 @@
#include "services/engine_service.h"
#include "utils/file_manager_utils.h"
#include "utils/logging_utils.h"
#include "utils/task_queue.h"

namespace {
constexpr const auto kCommonCommandsGroup = "Common Commands";
@@ -50,8 +51,7 @@ CommandLineParser::CommandLineParser()
download_service_{std::make_shared<DownloadService>()},
dylib_path_manager_{std::make_shared<cortex::DylibPathManager>()},
db_service_{std::make_shared<DatabaseService>()},
engine_service_{std::make_shared<EngineService>(
download_service_, dylib_path_manager_, db_service_)} {}
engine_service_{std::make_shared<EngineService>(dylib_path_manager_)} {}

bool CommandLineParser::SetupCommand(int argc, char** argv) {
app_.usage("Usage:\n" + commands::GetCortexBinary() +
4 changes: 2 additions & 2 deletions engine/cli/commands/cortex_upd_cmd.cc
@@ -532,10 +532,10 @@ bool CortexUpdCmd::GetLinuxInstallScript(const std::string& v,
const std::string& channel) {
std::vector<std::string> path_list;
if (channel == "nightly") {
path_list = {"menloresearch", "cortex.cpp", "dev", "engine",
path_list = {kMenloOrg, "cortex.cpp", "dev", "engine",
"templates", "linux", "install.sh"};
} else {
path_list = {"menloresearch", "cortex.cpp", "main", "engine",
path_list = {kMenloOrg, "cortex.cpp", "main", "engine",
"templates", "linux", "install.sh"};
}
auto url_obj = url_parser::Url{
4 changes: 2 additions & 2 deletions engine/cli/commands/cortex_upd_cmd.h
@@ -79,9 +79,9 @@ inline std::vector<std::string> GetReleasePath() {
if (CORTEX_VARIANT == file_manager_utils::kNightlyVariant) {
return {"cortex", "latest", "version.json"};
} else if (CORTEX_VARIANT == file_manager_utils::kBetaVariant) {
return {"repos", "menloresearch", "cortex.cpp", "releases"};
return {"repos", kMenloOrg, "cortex.cpp", "releases"};
} else {
return {"repos", "menloresearch", "cortex.cpp", "releases", "latest"};
return {"repos", kMenloOrg, "cortex.cpp", "releases", "latest"};
}
}

5 changes: 4 additions & 1 deletion engine/cli/commands/engine_install_cmd.cc
@@ -92,7 +92,10 @@ bool EngineInstallCmd::Exec(const std::string& engine,
std::vector<std::string> variant_selections;
for (const auto& variant : variant_result.value()) {
auto v_name = variant["name"].asString();
if (string_utils::StringContainsIgnoreCase(v_name, hw_inf_.sys_inf->os) &&
if ((string_utils::StringContainsIgnoreCase(v_name,
hw_inf_.sys_inf->os) ||
(hw_inf_.sys_inf->os == kLinuxOs &&
string_utils::StringContainsIgnoreCase(v_name, kUbuntuOs))) &&
string_utils::StringContainsIgnoreCase(v_name,
hw_inf_.sys_inf->arch)) {
variant_selections.push_back(variant["name"].asString());
6 changes: 2 additions & 4 deletions engine/cli/commands/server_start_cmd.cc
@@ -106,10 +106,8 @@ bool ServerStartCmd::Exec(const std::string& host, int port,
#else
std::vector<std::string> commands;
// Some engines requires to add lib search path before process being created
auto download_srv = std::make_shared<DownloadService>();
auto dylib_path_mng = std::make_shared<cortex::DylibPathManager>();
auto db_srv = std::make_shared<DatabaseService>();
EngineService(download_srv, dylib_path_mng, db_srv).RegisterEngineLibPath();
EngineService(std::make_shared<cortex::DylibPathManager>())
.RegisterEngineLibPath();

std::string p = cortex_utils::GetCurrentPath() + "/" + exe;
commands.push_back(p);
2 changes: 1 addition & 1 deletion engine/cli/main.cc
@@ -155,7 +155,7 @@ int main(int argc, char* argv[]) {
auto get_latest_version = []() -> cpp::result<std::string, std::string> {
try {
auto res = github_release_utils::GetReleaseByVersion(
"menloresearch", "cortex.llamacpp", "latest");
kGgmlOrg, kLlamaRepo, "latest");
if (res.has_error()) {
CTL_ERR("Failed to get latest llama.cpp version: " << res.error());
return cpp::fail("Failed to get latest llama.cpp version: " +
4 changes: 2 additions & 2 deletions engine/cli/utils/download_progress.cc
@@ -83,8 +83,8 @@ bool DownloadProgress::Handle(
size_t max_length = 20) -> std::string {
// Check the length of the input string
if (str.length() >= max_length) {
return str.substr(
0, max_length); // Return truncated string if it's too long
return str.substr(0, max_length - 3) +
".. "; // Return truncated string if it's too long
}

// Calculate the number of spaces needed
3 changes: 3 additions & 0 deletions engine/controllers/engines.cc
@@ -155,6 +155,7 @@ void Engines::GetEngineVariants(
releases.append(json.value());
}
}
CTL_INF(releases.toStyledString());
auto resp = cortex_utils::CreateCortexHttpJsonResponse(releases);
resp->setStatusCode(k200OK);
callback(resp);
@@ -177,6 +178,8 @@ void Engines::InstallEngine(
}
norm_version = version;
}
CTL_INF("version: " << norm_version
<< ", norm_variant: " << norm_variant.value_or(""));

auto result =
engine_service_->InstallEngineAsync(engine, norm_version, norm_variant);
2 changes: 1 addition & 1 deletion engine/controllers/server.cc
@@ -138,7 +138,7 @@ void server::ProcessStreamRes(std::function<void(const HttpResponsePtr&)> cb,
auto err_or_done = std::make_shared<std::atomic_bool>(false);
auto chunked_content_provider = [this, q, err_or_done, engine_type, model_id](
char* buf,
std::size_t buf_size) -> std::size_t {
std::size_t buf_size) -> std::size_t {
if (buf == nullptr) {
LOG_TRACE << "Buf is null";
if (!(*err_or_done)) {
3 changes: 0 additions & 3 deletions engine/cortex-common/EngineI.h
@@ -47,9 +47,6 @@ class EngineI {
std::shared_ptr<Json::Value> json_body,
std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;

// For backward compatible checking
virtual bool IsSupported(const std::string& f) = 0;

// Get list of running models
virtual void GetModels(
std::shared_ptr<Json::Value> jsonBody,
2 changes: 0 additions & 2 deletions engine/cortex-common/remote_enginei.h
@@ -1,7 +1,5 @@
#pragma once

#pragma once

#include <functional>
#include <memory>

12 changes: 6 additions & 6 deletions engine/e2e-test/api/engines/test_api_engine.py
@@ -28,14 +28,14 @@ def test_engines_get_llamacpp_should_be_successful(self):

# engines install
def test_engines_install_llamacpp_specific_version_and_variant(self):
data = {"version": "v0.1.40-b4354", "variant": "linux-amd64-avx"}
data = {"version": "b4932", "variant": "linux-avx-x64"}
response = requests.post(
"http://localhost:3928/v1/engines/llama-cpp/install", json=data
)
assert response.status_code == 200

def test_engines_install_llamacpp_specific_version_and_null_variant(self):
data = {"version": "v0.1.40-b4354"}
data = {"version": "b4932"}
response = requests.post(
"http://localhost:3928/v1/engines/llama-cpp/install", json=data
)
@@ -55,14 +55,14 @@ async def test_engines_install_uninstall_llamacpp_should_be_successful(self):
@pytest.mark.asyncio
async def test_engines_install_uninstall_llamacpp_with_only_version_should_be_failed(self):
# install first
data = {"variant": "mac-arm64"}
data = {"variant": "linux-avx-x64"}
install_response = requests.post(
"http://127.0.0.1:3928/v1/engines/llama-cpp/install", json=data
)
await wait_for_websocket_download_success_event(timeout=120)
assert install_response.status_code == 200

data = {"version": "v0.1.35"}
data = {"version": "b4932"}
response = requests.delete(
"http://localhost:3928/v1/engines/llama-cpp/install", json=data
)
@@ -72,7 +72,7 @@ async def test_engines_install_uninstall_llamacpp_with_only_version_should_be_fa
@pytest.mark.asyncio
async def test_engines_install_uninstall_llamacpp_with_variant_should_be_successful(self):
# install first
data = {"variant": "mac-arm64"}
data = {"variant": "linux-avx-x64"}
install_response = requests.post(
"http://127.0.0.1:3928/v1/engines/llama-cpp/install", json=data
)
@@ -85,7 +85,7 @@ async def test_engines_install_uninstall_llamacpp_with_variant_should_be_success
def test_engines_install_uninstall_llamacpp_with_specific_variant_and_version_should_be_successful(
self,
):
data = {"variant": "mac-arm64", "version": "v0.1.35"}
data = {"variant": "linux-avx-x64", "version": "b4932"}
# install first
install_response = requests.post(
"http://localhost:3928/v1/engines/llama-cpp/install", json=data
@@ -2,7 +2,7 @@
import requests
from utils.test_runner import start_server, stop_server, get_latest_pre_release_tag

latest_pre_release_tag = get_latest_pre_release_tag("menloresearch", "cortex.llamacpp")
latest_pre_release_tag = get_latest_pre_release_tag("menloresearch", "llama.cpp")

class TestApiEngineInstall:

@@ -23,7 +23,7 @@ def test_engines_install_llamacpp_should_be_successful(self):
assert response.status_code == 200

def test_engines_install_llamacpp_specific_version_and_variant(self):
data = {"version": latest_pre_release_tag, "variant": "linux-amd64-avx"}
data = {"version": latest_pre_release_tag, "variant": "linux-avx-x64"}
response = requests.post(
"http://localhost:3928/v1/engines/llama-cpp/install", json=data
)
4 changes: 2 additions & 2 deletions engine/e2e-test/api/engines/test_api_get_default_engine.py
@@ -24,8 +24,8 @@ def setup_and_teardown(self):
def test_api_get_default_engine_successfully(self):
# Data test
engine= "llama-cpp"
name= "linux-amd64-avx"
version= "v0.1.35-27.10.24"
name= "linux-avx-x64"
version= "b4932"

data = {"version": version, "variant": name}
post_install_url = f"http://localhost:3928/v1/engines/{engine}/install"
4 changes: 2 additions & 2 deletions engine/e2e-test/api/engines/test_api_get_list_engine.py
@@ -24,8 +24,8 @@ def setup_and_teardown(self):
def test_api_get_list_engines_successfully(self):
# Data test
engine= "llama-cpp"
name= "linux-amd64-avx"
version= "v0.1.35-27.10.24"
name= "linux-avx-x64"
version= "b4932"

post_install_url = f"http://localhost:3928/v1/engines/{engine}/install"
response = requests.delete(
4 changes: 2 additions & 2 deletions engine/e2e-test/api/engines/test_api_post_default_engine.py
@@ -23,8 +23,8 @@ def setup_and_teardown(self):
def test_api_set_default_engine_successfully(self):
# Data test
engine= "llama-cpp"
name= "linux-amd64-avx"
version= "v0.1.35-27.10.24"
name= "linux-avx-x64"
version= "b4932"

data = {"version": version, "variant": name}
post_install_url = f"http://localhost:3928/v1/engines/{engine}/install"
20 changes: 0 additions & 20 deletions engine/e2e-test/api/hardware/test_api_get_hardware.py
@@ -88,25 +88,6 @@ def test_api_get_hardware_successfully(self):
"example": True,
"description": "Indicates if the GPU is currently activated."
},
"additional_information": {
"type": "object",
"properties": {
"compute_cap": {
"type": "string",
"example": "8.6",
"description": "The compute capability of the GPU."
},
"driver_version": {
"type": "string",
"example": "535.183",
"description": "The version of the installed driver."
}
},
"required": [
"compute_cap",
"driver_version"
]
},
"free_vram": {
"type": "integer",
"example": 23983,
@@ -140,7 +121,6 @@
},
"required": [
"activated",
"additional_information",
"free_vram",
"id",
"name",
1 change: 1 addition & 0 deletions engine/e2e-test/api/model/test_api_model.py
@@ -95,6 +95,7 @@ async def test_models_start_stop_should_be_successful(self):
time.sleep(30)

print("Pull model")
requests.delete("http://localhost:3928/v1/models/tinyllama:1b")
json_body = {"model": "tinyllama:1b"}
response = requests.post("http://localhost:3928/v1/models/pull", json=json_body)
assert response.status_code == 200, f"Failed to pull model: tinyllama:1b"
18 changes: 1 addition & 17 deletions engine/e2e-test/cli/engines/test_cli_engine_install.py
@@ -31,25 +31,9 @@ def test_engines_install_llamacpp_should_be_successfully(self):
assert len(response.json()) > 0
assert exit_code == 0, f"Install engine failed with error: {error}"

@pytest.mark.skipif(reason="Ignore onnx-runtime test")
def test_engines_install_onnx_on_macos_should_be_failed(self):
exit_code, output, error = run(
"Install Engine", ["engines", "install", "onnxruntime"]
)
assert "is not supported on" in output, "Should display error message"
assert exit_code == 0, f"Install engine failed with error: {error}"

@pytest.mark.skipif(reason="Ignore tensorrt-llm test")
def test_engines_install_onnx_on_tensorrt_should_be_failed(self):
exit_code, output, error = run(
"Install Engine", ["engines", "install", "tensorrt-llm"]
)
assert "is not supported on" in output, "Should display error message"
assert exit_code == 0, f"Install engine failed with error: {error}"

@pytest.mark.skipif(platform.system() == "Windows", reason="Progress bar log issue on Windows")
def test_engines_install_pre_release_llamacpp(self):
engine_version = "v0.1.43"
engine_version = "b4932"
exit_code, output, error = run(
"Install Engine",
["engines", "install", "llama-cpp", "-v", engine_version],