4 changes: 4 additions & 0 deletions cpp/tensorrt_llm/CMakeLists.txt
@@ -263,3 +263,7 @@ if(BUILD_PYBIND)
endif()

add_subdirectory(plugins)

if(BUILD_CORTEX_TENSORRT-LLM)
add_subdirectory(cortex.tensorrt-llm)
endif()
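
The new subdirectory is opt-in. A typical configure invocation that enables it (mirroring the flags used by the cortex.tensorrt-llm Makefile below) might look like the following sketch, with paths and flags adapted to your setup:

```zsh
# Illustrative only; these flags mirror the build-engine target in cortex.tensorrt-llm/Makefile.
cd cpp && mkdir -p build && cd build
cmake .. -DBUILD_CORTEX_TENSORRT-LLM=ON -DBUILD_TESTS=OFF -DBUILD_BENCHMARKS=OFF \
         -DCMAKE_BUILD_TYPE=Release
make -j "$(nproc)"
```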
95 changes: 95 additions & 0 deletions cpp/tensorrt_llm/cortex.tensorrt-llm/CMakeLists.txt
@@ -0,0 +1,95 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION &
# AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
# C++17
# engine init
include(CheckIncludeFileCXX)

check_include_file_cxx(any HAS_ANY)
check_include_file_cxx(string_view HAS_STRING_VIEW)
check_include_file_cxx(coroutine HAS_COROUTINE)
if(HAS_ANY
AND HAS_STRING_VIEW
AND HAS_COROUTINE)
set(CMAKE_CXX_STANDARD 20)
elseif(HAS_ANY AND HAS_STRING_VIEW)
set(CMAKE_CXX_STANDARD 17)
else()
set(CMAKE_CXX_STANDARD 14)
endif()


set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_PREFIX_PATH ${CMAKE_CURRENT_SOURCE_DIR}/build_deps/_install)

message(STATUS "Current Source Directory CORTEX: ${CMAKE_CURRENT_SOURCE_DIR}")
message(STATUS "Current Cmake Prefix Path of CORTEX: ${CMAKE_PREFIX_PATH}")


set(OPENSSL_USE_STATIC_LIBS TRUE)


# Enable pkg-config support in CMake
find_package(PkgConfig REQUIRED)
find_library(TRANTOR
NAMES trantor
HINTS "${CMAKE_PREFIX_PATH}/lib"
)
find_library(JSONCPP
NAMES jsoncpp
HINTS "${CMAKE_PREFIX_PATH}/lib"
)

# Use pkg-config to find the SentencePiece library

if(NOT WIN32) # Linux
# Use pkg-config to find the SentencePiece library
pkg_search_module(SENTENCEPIECE REQUIRED sentencepiece)
else() # Windows
set(SENTENCEPIECE_INCLUDE_DIRS "${CMAKE_PREFIX_PATH}/include")
set(SENTENCEPIECE_LIBRARY_DIRS "${CMAKE_PREFIX_PATH}/lib")
endif()

message(STATUS "SentencePiece library dirs: ${SENTENCEPIECE_LIBRARY_DIRS}")
message(STATUS "SentencePiece header dirs: ${SENTENCEPIECE_INCLUDE_DIRS}")

include_directories(${PROJECT_SOURCE_DIR}/include ${SENTENCEPIECE_INCLUDE_DIRS})

link_directories(${SENTENCEPIECE_LIBRARY_DIRS})

set(TOP_LEVEL_DIR "${PROJECT_SOURCE_DIR}/..")

add_custom_target(engine_proj)

set(CXXOPTS_SRC_DIR ${PROJECT_SOURCE_DIR}/../3rdparty/cxxopts)
add_subdirectory(${CXXOPTS_SRC_DIR} ${CMAKE_CURRENT_BINARY_DIR}/cxxopts)

# main
# add_executable(engine main.cc)
add_library(engine SHARED src/tensorrt-llm_engine.cc)
target_link_libraries(
  engine
  PUBLIC ${SHARED_TARGET} nvinfer_plugin_tensorrt_llm cxxopts::cxxopts sentencepiece
  PRIVATE ${JSONCPP} ${TRANTOR} ${CMAKE_THREAD_LIBS_INIT})

target_compile_features(engine PRIVATE cxx_std_17)
target_compile_definitions(engine PUBLIC TOP_LEVEL_DIR="${TOP_LEVEL_DIR}")

aux_source_directory(src SRC)

target_include_directories(engine PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
target_sources(engine PRIVATE ${SRC})


add_dependencies(engine_proj engine)
66 changes: 66 additions & 0 deletions cpp/tensorrt_llm/cortex.tensorrt-llm/Makefile
@@ -0,0 +1,66 @@
CMAKE_EXTRA_FLAGS ?= ""
RUN_TESTS ?= true

# Default target, does nothing
all:
@echo "Specify a target to run"

install-dependencies:
ifeq ($(OS),Windows_NT) # Windows
cmd /C install_deps.bat
else # Unix-like systems (Linux and MacOS)
bash ./install_deps.sh
endif

build-engine:
ifeq ($(OS),Windows_NT)
@powershell -Command "mkdir -p build; cd build; cmake .. $(CMAKE_EXTRA_FLAGS); cmake --build . --config Release;"
else
# Go to cpp/ dir
@cd ../../ && \
mkdir -p build && \
cd build && \
cmake .. -DBUILD_TESTS=OFF -DBUILD_BENCHMARKS=OFF -DBUILD_CORTEX_TENSORRT-LLM=ON -DBUILD_BATCH_MANAGER_DEFAULT=OFF -DCMAKE_CUDA_ARCHITECTURES=89-real -DTRT_LIB_DIR=/usr/local/tensorrt/lib -DTRT_INCLUDE_DIR=/usr/local/tensorrt/include -DCMAKE_BUILD_TYPE=Release && \
make -j $(nproc)
endif

build-example-server:
ifeq ($(OS),Windows_NT)
else
@cd examples/server && \
mkdir -p build && cd build && \
cmake .. && cmake --build . --config Release -j12
endif

package:
ifeq ($(OS),Windows_NT)
else
@mkdir -p cortex.tensorrt-llm && \
cp ../../build/tensorrt_llm/cortex.tensorrt-llm/libengine.$(shell uname | tr '[:upper:]' '[:lower:]' | sed 's/darwin/dylib/;s/linux/so/') cortex.tensorrt-llm && \
cp /usr/local/cuda-12.4/targets/x86_64-linux/lib/libcublas.so.12 cortex.tensorrt-llm && \
cp /usr/local/cuda-12.4/targets/x86_64-linux/lib/libcublas.so.12.4.2.65 cortex.tensorrt-llm && \
cp /usr/local/cuda-12.4/targets/x86_64-linux/lib/libcublasLt.so.12 cortex.tensorrt-llm && \
cp /usr/local/cuda-12.4/targets/x86_64-linux/lib/libcublasLt.so.12.4.2.65 cortex.tensorrt-llm && \
cp /usr/local/tensorrt/targets/x86_64-linux-gnu/lib/libnvinfer.so.10 cortex.tensorrt-llm && \
cp /usr/local/tensorrt/targets/x86_64-linux-gnu/lib/libnvinfer.so.10.0.1 cortex.tensorrt-llm && \
cp /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so cortex.tensorrt-llm && \
cp /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so.10 cortex.tensorrt-llm && \
cp /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libtensorrt_llm.so cortex.tensorrt-llm && \
cp /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/libtensorrt_llm_nvrtc_wrapper.so cortex.tensorrt-llm && \
cp /usr/lib/x86_64-linux-gnu/libnccl.so.2 cortex.tensorrt-llm && \
cp /usr/lib/x86_64-linux-gnu/libnccl.so.2.20.5 cortex.tensorrt-llm && \
tar -czvf cortex.tensorrt-llm.tar.gz cortex.tensorrt-llm
endif

run-e2e-test:
ifeq ($(RUN_TESTS),false)
@echo "Skipping tests"
else
ifeq ($(OS),Windows_NT)
else
mkdir -p examples/server/build/engines/cortex.tensorrt-llm;
cp ../../build/tensorrt_llm/cortex.tensorrt-llm/libengine.so examples/server/build/engines/cortex.tensorrt-llm;
@cd ../../../ && \
bash ./.github/scripts/e2e-test-server-linux-and-mac.sh "$$(pwd)"
endif
endif
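
Taken together, a typical local flow with these targets would look roughly like this (a sketch; it assumes the CUDA and TensorRT paths hard-coded above exist on your machine):

```zsh
# Run from cpp/tensorrt_llm/cortex.tensorrt-llm.
make install-dependencies    # runs install_deps.sh to fetch third-party deps (jsoncpp, trantor, sentencepiece, ...)
make build-engine            # configures and builds from the cpp/ build dir with the flags above
make build-example-server    # builds examples/server
make run-e2e-test            # copies libengine.so next to the example server and runs the e2e script
make package                 # bundles libengine.so and runtime libs into cortex.tensorrt-llm.tar.gz
```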
105 changes: 105 additions & 0 deletions cpp/tensorrt_llm/cortex.tensorrt-llm/README.md
@@ -0,0 +1,105 @@
# How to set up a dev env for nitro-tensorrt-llm (and the future cortex.tensorrt-llm)

Follow the steps below:

1. Get a machine with an NVIDIA GPU (Ampere generation or newer is recommended)

2. Clone this repo (the upstream TensorRT-LLM repo also works, but its commit must match the one this repo tracks)
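For example (the same clone is repeated inside the container in step 6):
```zsh
git clone --recurse https://github.com/janhq/nitro-tensorrt-llm
cd nitro-tensorrt-llm
```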

3. Make sure the following are installed on your machine:
   - Install the latest CUDA toolkit, available from [Download CUDA](https://developer.nvidia.com/cuda-downloads)
   - Install the NVIDIA Container Toolkit: [Installing with Apt](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installing-with-apt)
   - Install the latest NVIDIA driver
   - Install git-lfs (`apt install git-lfs`); see the quick check right after this list
   - Ubuntu or Debian is recommended as the host OS
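
A quick sanity check for the driver and git-lfs parts (the CUDA toolkit and container toolkit installs follow the linked guides):
```zsh
nvidia-smi                         # confirms the NVIDIA driver is installed and sees the GPU
apt update && apt install git-lfs  # git-lfs, as used in the steps below
git lfs install
```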

4. Build the TensorRT-LLM image using the commands below:
```zsh
cd nitro-tensorrt-llm
git lfs install
git lfs pull
make -C docker release_build
```
After the build completes you will have an image tagged `tensorrt_llm/release:latest`
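You can confirm the image is present before continuing:
```zsh
docker images tensorrt_llm/release
```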

5. How to start the dev environment properly
Use this docker-compose.yaml template for the image:
```yaml
services:
  ......
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
```
You can fill in the rest of the service definition however you like (personally I use a Neovim image built on top of the tensorrt_llm image), but the `deploy` section above is required. If you have two or more GPUs, increase `count` or adjust the device settings; as written it uses the first GPU on a single-GPU machine.
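With the compose file filled in, starting and entering the container could look like this (`devcontainer` is a placeholder for whatever service name you used):
```zsh
# "devcontainer" is a hypothetical service name; substitute the one from your docker-compose.yaml.
docker compose up -d devcontainer
docker compose exec devcontainer bash
```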

Once the container is running, you can either use VS Code to SSH into the container or develop directly inside it with Neovim; your choice.

6. Install or build nitro-tensorrt-llm for the first time
Now that you are inside the container, clone nitro-tensorrt-llm again:
```zsh
apt update && apt install git-lfs
git clone --recurse https://github.com/janhq/nitro-tensorrt-llm
cd nitro-tensorrt-llm
git lfs install
git lfs pull
```
After that, install uuid-dev:
```zsh
apt install uuid-dev
```
Next, install the nitro-tensorrt-llm dependencies:
```zsh
cd cpp/tensorrt_llm/nitro
./install_deps.sh
```
After the dependencies are installed, go back to the main cpp folder and build nitro:
```zsh
cd ../../
./build_nitro.sh
```

**Note**: inside the build_nitro.sh script there is a parameter for the GPU architecture; it is set to 89-real for Ada Lovelace. You can change it to match your GPU, per this guide: [Arch](https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/)
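For example, an Ampere consumer card (RTX 30xx) uses 86-real. The sketch below shows the equivalent CMake flag that this PR's Makefile passes directly; the exact edit inside build_nitro.sh depends on how that script forwards the value:
```zsh
# Common CMAKE_CUDA_ARCHITECTURES values (see the build-engine target in the Makefile above):
#   80-real -> A100 (Ampere datacenter)
#   86-real -> RTX 30xx (Ampere consumer)
#   89-real -> RTX 40xx (Ada Lovelace), the value used in this PR
# Illustrative configure call for an RTX 3090:
cmake .. -DCMAKE_CUDA_ARCHITECTURES=86-real -DCMAKE_BUILD_TYPE=Release
```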

7. Build an engine to test with
The binary is already built, but to check that it runs properly you need a TensorRT engine (in this context, an engine is a model compiled for TensorRT).

Go to the repo root and run `cd examples/llama`.

Make sure the TensorRT library path is set:
```zsh
export LD_LIBRARY_PATH=/usr/local/tensorrt/lib
```

Clone a model (it needs to be compatible with the ChatML template); I use Hermes:
```zsh
git clone https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B
```

First, I recommend quantizing it to FP8 to make it smaller:
```zsh
python ../quantization/quantize.py --model_dir ./Hermes-2-Pro-Mistral-7B \
--dtype float16 \
--qformat fp8 \
--kv_cache_dtype fp8 \
--output_dir ./tllm_checkpoint_1gpu_fp8_hermes \
--calib_size 512 \
--tp_size 1
```

Once quantization is done, you can build the engine:
```zsh
trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_fp8_hermes \
--output_dir ./tllm_checkpoint_1gpu_fp8_hermes_engine \
--gemm_plugin float16 \
--strongly_typed \
--workers 1
```

`./tllm_checkpoint_1gpu_fp8_hermes_engine` is now the path to the "engine" that you can load with your freshly built nitro binary.
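A quick way to confirm the build produced an engine (exact file names can vary between TensorRT-LLM versions):
```zsh
ls ./tllm_checkpoint_1gpu_fp8_hermes_engine
# expect a config.json plus one engine file per rank, e.g. rank0.engine
```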

Go to the main README to follow the process of testing with the engine.
@@ -0,0 +1,21 @@
#pragma once

#include <functional>
#include <memory>

#include "json/value.h"

class CortexTensorrtLlmEngineI {
public:
virtual ~CortexTensorrtLlmEngineI() {}

virtual void HandleChatCompletion(
std::shared_ptr<Json::Value> jsonBody,
std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;
virtual void LoadModel(
std::shared_ptr<Json::Value> jsonBody,
std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;
virtual void Destroy(
std::shared_ptr<Json::Value> jsonBody,
std::function<void(Json::Value&&, Json::Value&&)>&& callback) = 0;
};
@@ -0,0 +1,68 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION &
# AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
# C++17
# cortex.tensorrt-llm init
cmake_minimum_required(VERSION 3.5)
project(server)
find_package(Threads REQUIRED)

if(UNIX AND NOT APPLE)
set(LINKER_FLAGS -ldl)
endif()

include(CheckIncludeFileCXX)
# CPP version
check_include_file_cxx(any HAS_ANY)
check_include_file_cxx(string_view HAS_STRING_VIEW)
check_include_file_cxx(coroutine HAS_COROUTINE)
if(HAS_ANY
AND HAS_STRING_VIEW
AND HAS_COROUTINE)
set(CMAKE_CXX_STANDARD 20)
elseif(HAS_ANY AND HAS_STRING_VIEW)
set(CMAKE_CXX_STANDARD 17)
else()
set(CMAKE_CXX_STANDARD 14)
endif()

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

add_executable(${PROJECT_NAME}
server.cc
dylib.h
httplib.h
)

set(THIRD_PARTY_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../../build_deps/_install)
set(CORTEX_COMMON_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../../base/)

find_library(JSONCPP
NAMES jsoncpp
HINTS "${THIRD_PARTY_PATH}/lib"
)

find_library(TRANTOR
NAMES trantor
HINTS "${THIRD_PARTY_PATH}/lib"
)

target_link_libraries(${PROJECT_NAME} PRIVATE ${JSONCPP} ${TRANTOR} ${LINKER_FLAGS}
${CMAKE_THREAD_LIBS_INIT})

target_include_directories(${PROJECT_NAME} PRIVATE
${CORTEX_COMMON_PATH}
${THIRD_PARTY_PATH}/include)