4 changes: 4 additions & 0 deletions cpp/tensorrt_llm/CMakeLists.txt
@@ -263,3 +263,7 @@ if(BUILD_PYBIND)
endif()

add_subdirectory(plugins)

if(BUILD_CORTEX_TENSORRT-LLM)
add_subdirectory(cortex.tensorrt-llm)
endif()
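
The new subdirectory is opt-in. A typical configure invocation that enables it (mirroring the flags used by the cortex.tensorrt-llm Makefile below) might look like the following sketch, with paths and flags adapted to your setup:

```zsh
# Illustrative only; these flags mirror the build-engine target in cortex.tensorrt-llm/Makefile.
cd cpp && mkdir -p build && cd build
cmake .. -DBUILD_CORTEX_TENSORRT-LLM=ON -DBUILD_TESTS=OFF -DBUILD_BENCHMARKS=OFF \
         -DCMAKE_BUILD_TYPE=Release
make -j "$(nproc)"
```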
95 changes: 95 additions & 0 deletions cpp/tensorrt_llm/cortex.tensorrt-llm/CMakeLists.txt
@@ -0,0 +1,95 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION &
# AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
# C++17
# engine init
include(CheckIncludeFileCXX)

check_include_file_cxx(any HAS_ANY)
check_include_file_cxx(string_view HAS_STRING_VIEW)
check_include_file_cxx(coroutine HAS_COROUTINE)
if(HAS_ANY
AND HAS_STRING_VIEW
AND HAS_COROUTINE)
set(CMAKE_CXX_STANDARD 20)
elseif(HAS_ANY AND HAS_STRING_VIEW)
set(CMAKE_CXX_STANDARD 17)
else()
set(CMAKE_CXX_STANDARD 14)
endif()


set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_PREFIX_PATH ${CMAKE_CURRENT_SOURCE_DIR}/build_deps/_install)

message(STATUS "Current Source Directory CORTEX: ${CMAKE_CURRENT_SOURCE_DIR}")
message(STATUS "Current Cmake Prefix Path of CORTEX: ${CMAKE_PREFIX_PATH}")


set(OPENSSL_USE_STATIC_LIBS TRUE)


# Enable pkg-config support in CMake
find_package(PkgConfig REQUIRED)
find_library(TRANTOR
NAMES trantor
HINTS "${CMAKE_PREFIX_PATH}/lib"
)
find_library(JSONCPP
NAMES jsoncpp
HINTS "${CMAKE_PREFIX_PATH}/lib"
)

# Use pkg-config to find the SentencePiece library

if(NOT WIN32) # Linux
# Use pkg-config to find the SentencePiece library
pkg_search_module(SENTENCEPIECE REQUIRED sentencepiece)
else() # Windows
set(SENTENCEPIECE_INCLUDE_DIRS "${CMAKE_PREFIX_PATH}/include")
set(SENTENCEPIECE_LIBRARY_DIRS "${CMAKE_PREFIX_PATH}/lib")
endif()

message(STATUS "SentencePiece library dirs: ${SENTENCEPIECE_LIBRARY_DIRS}")
message(STATUS "SentencePiece header dirs: ${SENTENCEPIECE_INCLUDE_DIRS}")

include_directories(${PROJECT_SOURCE_DIR}/include ${SENTENCEPIECE_INCLUDE_DIRS})

link_directories(${SENTENCEPIECE_LIBRARY_DIRS})

set(TOP_LEVEL_DIR "${PROJECT_SOURCE_DIR}/..")

add_custom_target(engine_proj)

set(CXXOPTS_SRC_DIR ${PROJECT_SOURCE_DIR}/../3rdparty/cxxopts)
add_subdirectory(${CXXOPTS_SRC_DIR} ${CMAKE_CURRENT_BINARY_DIR}/cxxopts)

# main
# add_executable(engine main.cc)
add_library(engine SHARED src/tensorrt-llm_engine.cc)
target_link_libraries(
  engine
  PUBLIC ${SHARED_TARGET} nvinfer_plugin_tensorrt_llm cxxopts::cxxopts sentencepiece
  PRIVATE ${JSONCPP} ${TRANTOR} ${CMAKE_THREAD_LIBS_INIT})

target_compile_features(engine PRIVATE cxx_std_17)
target_compile_definitions(engine PUBLIC TOP_LEVEL_DIR="${TOP_LEVEL_DIR}")

aux_source_directory(src SRC)

target_include_directories(engine PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
target_sources(engine PRIVATE ${SRC})


add_dependencies(engine_proj engine)
66 changes: 66 additions & 0 deletions cpp/tensorrt_llm/cortex.tensorrt-llm/Makefile
@@ -0,0 +1,66 @@
CMAKE_EXTRA_FLAGS ?= ""
RUN_TESTS ?= true

# Default target, does nothing
all:
@echo "Specify a target to run"

install-dependencies:
ifeq ($(OS),Windows_NT) # Windows
cmd /C install_deps.bat
else # Unix-like systems (Linux and MacOS)
bash ./install_deps.sh
endif

build-engine:
ifeq ($(OS),Windows_NT)
@powershell -Command "mkdir -p build; cd build; cmake .. $(CMAKE_EXTRA_FLAGS); cmake --build . --config Release;"
else
# Go to cpp/ dir
@cd ../../ && \
mkdir -p build && \
cd build && \
cmake .. -DBUILD_TESTS=OFF -DBUILD_BENCHMARKS=OFF -DBUILD_CORTEX_TENSORRT-LLM=ON -DBUILD_BATCH_MANAGER_DEFAULT=OFF -DCMAKE_CUDA_ARCHITECTURES=89-real -DTRT_LIB_DIR=/usr/local/tensorrt/lib -DTRT_INCLUDE_DIR=/usr/local/tensorrt/include -DCMAKE_BUILD_TYPE=Release && \
make -j $(nproc)
endif

build-example-server:
ifeq ($(OS),Windows_NT)
else
@cd examples/server && \
mkdir -p build && cd build && \
cmake .. && cmake --build . --config Release -j12
endif

package:
ifeq ($(OS),Windows_NT)
else
@mkdir -p cortex.tensorrt-llm && \
cp ../../build/tensorrt_llm/cortex.tensorrt-llm/libengine.$(shell uname | tr '[:upper:]' '[:lower:]' | sed 's/darwin/dylib/;s/linux/so/') cortex.tensorrt-llm && \
cp /usr/local/cuda-12.4/targets/x86_64-linux/lib/libcublas.so.12 cortex.tensorrt-llm && \
cp /usr/local/cuda-12.4/targets/x86_64-linux/lib/libcublas.so.12.4.2.65 cortex.tensorrt-llm && \
cp /usr/local/cuda-12.4/targets/x86_64-linux/lib/libcublasLt.so.12 cortex.tensorrt-llm && \
cp /usr/local/cuda-12.4/targets/x86_64-linux/lib/libcublasLt.so.12.4.2.65 cortex.tensorrt-llm && \
cp /usr/local/tensorrt/targets/x86_64-linux-gnu/lib/libnvinfer.so.10 cortex.tensorrt-llm && \
cp /usr/local/tensorrt/targets/x86_64-linux-gnu/lib/libnvinfer.so.10.0.1 cortex.tensorrt-llm && \
cp /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so cortex.tensorrt-llm && \
cp /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so.10 cortex.tensorrt-llm && \
cp /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libtensorrt_llm.so cortex.tensorrt-llm && \
cp /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libtensorrt_llm_nvrtc_wrapper.so cortex.tensorrt-llm && \
cp /usr/lib/x86_64-linux-gnu/libnccl.so.2 cortex.tensorrt-llm && \
cp /usr/lib/x86_64-linux-gnu/libnccl.so.2.20.5 cortex.tensorrt-llm && \
tar -czvf cortex.tensorrt-llm.tar.gz cortex.tensorrt-llm
endif

run-e2e-test:
ifeq ($(RUN_TESTS),false)
@echo "Skipping tests"
else
ifeq ($(OS),Windows_NT)
else
mkdir -p examples/server/build/engines/cortex.tensorrt-llm;
cp ../../build/tensorrt_llm/cortex.tensorrt-llm/libengine.so examples/server/build/engines/cortex.tensorrt-llm;
@cd ../../../ && \
bash ./.github/scripts/e2e-test-server-linux-and-mac.sh "$$(pwd)"
endif
endif
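
Taken together, a typical local flow with these targets would look roughly like this (a sketch; it assumes the CUDA and TensorRT paths hard-coded above exist on your machine):

```zsh
# Run from cpp/tensorrt_llm/cortex.tensorrt-llm.
make install-dependencies    # runs install_deps.sh to fetch third-party deps (jsoncpp, trantor, sentencepiece, ...)
make build-engine            # configures and builds from the cpp/ build dir with the flags above
make build-example-server    # builds examples/server
make run-e2e-test            # copies libengine.so next to the example server and runs the e2e script
make package                 # bundles libengine.so and runtime libs into cortex.tensorrt-llm.tar.gz
```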
105 changes: 105 additions & 0 deletions cpp/tensorrt_llm/cortex.tensorrt-llm/README.md
@@ -0,0 +1,105 @@
# How to set up a dev env for nitro-tensorrt-llm (and the future cortex.tensorrt-llm)

Follow the steps below:

1. Get a machine with an NVIDIA GPU (Ampere generation or newer is recommended)

2. Clone this repo (the upstream TensorRT-LLM repo also works, but its commit must match the one this repo tracks)
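For example (the same clone is repeated inside the container in step 6):
```zsh
git clone --recurse https://github.com/janhq/nitro-tensorrt-llm
cd nitro-tensorrt-llm
```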

3. Make sure the following are installed on your machine:
   - Install the latest CUDA toolkit, available from [Download CUDA](https://developer.nvidia.com/cuda-downloads)
   - Install the NVIDIA Container Toolkit: [Installing with Apt](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installing-with-apt)
   - Install the latest NVIDIA driver
   - Install git-lfs (`apt install git-lfs`); see the quick check right after this list
   - Ubuntu or Debian is recommended as the host OS
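
A quick sanity check for the driver and git-lfs parts (the CUDA toolkit and container toolkit installs follow the linked guides):
```zsh
nvidia-smi                         # confirms the NVIDIA driver is installed and sees the GPU
apt update && apt install git-lfs  # git-lfs, as used in the steps below
git lfs install
```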

4. Build the TensorRT-LLM image using the commands below:
```zsh
cd nitro-tensorrt-llm
git lfs install
git lfs pull
make -C docker release_build
```
After the build completes you will have an image tagged `tensorrt_llm/release:latest`
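You can confirm the image is present before continuing:
```zsh
docker images tensorrt_llm/release
```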

5. How to start the dev environment properly
Use this docker-compose.yaml template for the image:
```yaml
services:
  ......
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
```
You can fill in the rest of the service definition however you like (personally I use a Neovim image built on top of the tensorrt_llm image), but the `deploy` section above is required. If you have two or more GPUs, increase `count` or adjust the device settings; as written it uses the first GPU on a single-GPU machine.
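With the compose file filled in, starting and entering the container could look like this (`devcontainer` is a placeholder for whatever service name you used):
```zsh
# "devcontainer" is a hypothetical service name; substitute the one from your docker-compose.yaml.
docker compose up -d devcontainer
docker compose exec devcontainer bash
```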

Once the container is running, you can either use VS Code to SSH into the container or develop directly inside it with Neovim; your choice.

6. Install or build nitro-tensorrt-llm for the first time
Now that you are inside the container, clone nitro-tensorrt-llm again:
```zsh
apt update && apt install git-lfs
git clone --recurse https://github.com/janhq/nitro-tensorrt-llm
cd nitro-tensorrt-llm
git lfs install
git lfs pull
```
After that, install uuid-dev:
```zsh
apt install uuid-dev
```
Next, install the nitro-tensorrt-llm dependencies:
```zsh
cd cpp/tensorrt_llm/nitro
./install_deps.sh
```
After the dependencies are installed, go back to the main cpp folder and build nitro:
```zsh
cd ../../
./build_nitro.sh
```

**Note**: inside the build_nitro.sh script there is a parameter for the GPU architecture; it is set to 89-real for Ada Lovelace. You can change it to match your GPU, per this guide: [Arch](https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/)
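For example, an Ampere consumer card (RTX 30xx) uses 86-real. The sketch below shows the equivalent CMake flag that this PR's Makefile passes directly; the exact edit inside build_nitro.sh depends on how that script forwards the value:
```zsh
# Common CMAKE_CUDA_ARCHITECTURES values (see the build-engine target in the Makefile above):
#   80-real -> A100 (Ampere datacenter)
#   86-real -> RTX 30xx (Ampere consumer)
#   89-real -> RTX 40xx (Ada Lovelace), the value used in this PR
# Illustrative configure call for an RTX 3090:
cmake .. -DCMAKE_CUDA_ARCHITECTURES=86-real -DCMAKE_BUILD_TYPE=Release
```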

7. Build an engine to test with
The binary is already built, but to check that it runs properly you need a TensorRT engine (in this context, an engine is a model compiled for TensorRT).

Go to the repo root and run `cd examples/llama`.

Make sure the TensorRT library path is set:
```zsh
export LD_LIBRARY_PATH=/usr/local/tensorrt/lib
```

Clone a model (it needs to be compatible with the ChatML template); I use Hermes:
```zsh
git clone https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B
```

First, I recommend quantizing it to FP8 to make it smaller:
```zsh
python ../quantization/quantize.py --model_dir ./Hermes-2-Pro-Mistral-7B \
--dtype float16 \
--qformat fp8 \
--kv_cache_dtype fp8 \
--output_dir ./tllm_checkpoint_1gpu_fp8_hermes \
--calib_size 512 \
--tp_size 1
```

Once quantization is done, you can build the engine:
```zsh
trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_fp8_hermes \
--output_dir ./tllm_checkpoint_1gpu_fp8_hermes_engine \
--gemm_plugin float16 \
--strongly_typed \
--workers 1
```

`./tllm_checkpoint_1gpu_fp8_hermes_engine` is now the path to the "engine" that you can load with your freshly built nitro binary.
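A quick way to confirm the build produced an engine (exact file names can vary between TensorRT-LLM versions):
```zsh
ls ./tllm_checkpoint_1gpu_fp8_hermes_engine
# expect a config.json plus one engine file per rank, e.g. rank0.engine
```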

Go to the main README to follow the process of testing with the engine.
@@ -0,0 +1,21 @@
#pragma once

#include <functional>
#include <memory>

#include "json/value.h"

class CortexTensorrtLlmEngineI {
public:
virtual ~CortexTensorrtLlmEngineI() {}

virtual void HandleChatCompletion(
std::shared_ptr<Json::Value> jsonBody,
std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;
virtual void LoadModel(
std::shared_ptr<Json::Value> jsonBody,
std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;
virtual void Destroy(
std::shared_ptr<Json::Value> jsonBody,
std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;
};
@@ -0,0 +1,68 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION &
# AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
# C++17
# cortex.tensorrt-llm init
cmake_minimum_required(VERSION 3.5)
project(server)
find_package(Threads REQUIRED)

if(UNIX AND NOT APPLE)
set(LINKER_FLAGS -ldl)
endif()

include(CheckIncludeFileCXX)
# CPP version
check_include_file_cxx(any HAS_ANY)
check_include_file_cxx(string_view HAS_STRING_VIEW)
check_include_file_cxx(coroutine HAS_COROUTINE)
if(HAS_ANY
AND HAS_STRING_VIEW
AND HAS_COROUTINE)
set(CMAKE_CXX_STANDARD 20)
elseif(HAS_ANY AND HAS_STRING_VIEW)
set(CMAKE_CXX_STANDARD 17)
else()
set(CMAKE_CXX_STANDARD 14)
endif()

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

add_executable(${PROJECT_NAME}
server.cc
dylib.h
httplib.h
)

set(THIRD_PARTY_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../../build_deps/_install)
set(CORTEX_COMMON_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../../base/)

find_library(JSONCPP
NAMES jsoncpp
HINTS "${THIRD_PARTY_PATH}/lib"
)

find_library(TRANTOR
NAMES trantor
HINTS "${THIRD_PARTY_PATH}/lib"
)

target_link_libraries(${PROJECT_NAME} PRIVATE ${JSONCPP} ${TRANTOR} ${LINKER_FLAGS}
${CMAKE_THREAD_LIBS_INIT})

target_include_directories(${PROJECT_NAME} PRIVATE
${CORTEX_COMMON_PATH}
${THIRD_PARTY_PATH}/include)