This repository was archived by the owner on Sep 23, 2025. It is now read-only.
Closed
67 commits
d2d1f20  add benchmark run script, visualize script (KepingYan, Apr 17, 2024)
88cc01e  upd (KepingYan, Apr 26, 2024)
083ae60  update multi replicas (KepingYan, May 7, 2024)
4c6fa74  use --result-dir to parse results (KepingYan, May 8, 2024)
1b3b13a  fix ci proxy (KepingYan, May 8, 2024)
184e00e  add test ci (KepingYan, May 9, 2024)
bd85b7d  add license (KepingYan, May 9, 2024)
38c52ed  fix (KepingYan, May 9, 2024)
78dc091  fix (KepingYan, May 9, 2024)
7cc0de0  add autoscaling config (KepingYan, May 10, 2024)
e241b25  fix ci (KepingYan, May 10, 2024)
3eb1c08  fix ci (KepingYan, May 10, 2024)
882ff4d  add package matplotlib (KepingYan, May 10, 2024)
21994cd  verify CI test (KepingYan, May 10, 2024)
d688804  verify CI test (KepingYan, May 11, 2024)
c8eabbc  create assets folder to place pictures (KepingYan, May 13, 2024)
3905082  verify CI test (KepingYan, May 13, 2024)
97ec06a  support openai autoscaling (KepingYan, May 13, 2024)
606f286  remove (KepingYan, May 13, 2024)
55c1dd1  integrate vllm and ns (jiafuzha, May 16, 2024)
e709010  update config file (KepingYan, May 17, 2024)
5b1bd85  integrate vllm and ns (jiafuzha, May 17, 2024)
eb71ace  integrate vllm and ns (jiafuzha, May 17, 2024)
a969f7f  remove .eggs (jiafuzha, May 17, 2024)
1b6aba3  integration adjustment (jiafuzha, May 17, 2024)
ce3ac61  llm on ray deployed (jiafuzha, May 20, 2024)
213ad89  llm on ray deployed (jiafuzha, May 20, 2024)
9b4884f  llm on ray deployed (jiafuzha, May 21, 2024)
3cb6f64  more doc (jiafuzha, May 21, 2024)
3f9ba62  merge with master (jiafuzha, May 21, 2024)
f6d60be  more doc for installing vllm ext (jiafuzha, May 21, 2024)
04cddcf  Merge remote-tracking branch 'keping/test_benchmark_script' into vllm… (jiafuzha, May 21, 2024)
d0d40dd  Merge remote-tracking branch 'keping/autoscaling_config' into vllm-ns… (jiafuzha, May 21, 2024)
24cc480  bug fix (jiafuzha, May 24, 2024)
295186e  save (jiafuzha, May 27, 2024)
875aa89  add vllm-ext/requirements.txt (jiafuzha, May 27, 2024)
2a462ea  add CMakeLists.txt (jiafuzha, May 27, 2024)
a105321  changed benchmarks (jiafuzha, May 27, 2024)
6aa0540  tuned graph build (jiafuzha, May 30, 2024)
7d6d3b4  graph build time reduced (jiafuzha, May 31, 2024)
473671e  graph build time reduced (jiafuzha, May 31, 2024)
1a88edd  configurable perf stats and copy quant config automatically (jiafuzha, Jun 4, 2024)
dfd26b0  save test script (jiafuzha, Jun 5, 2024)
65c816f  add max_batched_tokens parameter (jiafuzha, Jun 6, 2024)
89936d3  adjustment and ray-vllm-examples (jiafuzha, Jun 12, 2024)
4f088e2  perf tuned and improved by disable mmap for multiple instances (jiafuzha, Jun 17, 2024)
597d83d  remove unnecessary thread sync in kernels (jiafuzha, Jun 19, 2024)
b093d3f  change order of loop, batch size first, then iteration (jiafuzha, Jun 25, 2024)
f1d06d9  add more parameters for vllm-ns test (JoshuaL3000, Jun 26, 2024)
34664ed  add more parameters for vllm-ns test (JoshuaL3000, Jun 26, 2024)
4782617  merged with master (jiafuzha, Jun 27, 2024)
04b7582  prevent quantization being messed-up with multiple processes (jiafuzha, Jun 27, 2024)
b791a1d  fix merge error (jiafuzha, Jun 27, 2024)
79e5daf  rename py to sh (jiafuzha, Jun 27, 2024)
2c9b287  fix formatting issue (jiafuzha, Jun 27, 2024)
5ac7907  fix formatting issue (jiafuzha, Jun 27, 2024)
19fc069  fix merge error (JoshuaL3000, Jun 27, 2024)
76fe811  Merge remote-tracking branch 'refs/remotes/origin/vllm-ns-perf-test' … (jiafuzha, Jun 27, 2024)
5760c65  add vllm-ns ci (jiafuzha, Jun 28, 2024)
30efd3f  remove unnecessary logs (jiafuzha, Jun 28, 2024)
1d9b4e3  remove some debug code (jiafuzha, Jun 28, 2024)
a14a146  add '--privileged' to docker run (jiafuzha, Jun 28, 2024)
4f59cb8  set unlimited max lock memory for neural speed engine (jiafuzha, Jun 28, 2024)
c6a9149  extend token length limit to 8192 for mha (jiafuzha, Jul 5, 2024)
d1ca69e  extend token length limit to 8192 for mha (jiafuzha, Jul 5, 2024)
e9ed9af  extend token length limit to 8192 for mha (fix) and support different… (jiafuzha, Jul 5, 2024)
cbcccc0  extend token length limit to 8192 for mha (fix) and support different… (jiafuzha, Jul 5, 2024)
1 change: 1 addition & 0 deletions .github/license/header_exclude_files.txt
@@ -0,0 +1 @@
vllm-ext/vllm/extension/ns/__init__.py
8 changes: 6 additions & 2 deletions .github/workflows/workflow_inference.yml
@@ -34,7 +34,7 @@ jobs:
name: inference
strategy:
matrix:
model: [ gpt-j-6b, gpt2, bloom-560m, opt-125m, mpt-7b, mistral-7b-v0.1, mpt-7b-ipex-llm, neural-chat-7b-v3-1, CodeLlama-7b-hf, falcon-7b, starcoder, llama-2-7b-chat-hf, llama-2-7b-chat-hf-vllm, gemma-2b, deepseek-coder-33b-instruct]
model: [ gpt-j-6b, gpt2, bloom-560m, opt-125m, mpt-7b, mistral-7b-v0.1, mpt-7b-ipex-llm, neural-chat-7b-v3-1, CodeLlama-7b-hf, falcon-7b, starcoder, llama-2-7b-chat-hf, llama-2-7b-chat-hf-vllm, llama-2-7b-chat-hf-vllm-ns, gemma-2b, deepseek-coder-33b-instruct]
isPR:
- ${{inputs.ci_type == 'pr'}}

@@ -97,7 +97,11 @@ jobs:
run: |
TARGET=${{steps.target.outputs.target}}
source dev/scripts/ci-functions.sh
strat_ray ${TARGET}
if [[ "$TARGET" == *ns ]]; then
start_ray ${TARGET} 1
else
start_ray ${TARGET}
fi

- name: Run Inference Test
run: |
2 changes: 1 addition & 1 deletion .github/workflows/workflow_inference_gaudi2.yml
@@ -94,7 +94,7 @@ jobs:
# check and remove exited container
cid=$(docker ps -a -q --filter "name=${TARGET}")
if [[ ! -z "$cid" ]]; then docker rm $cid; fi
docker run -tid --name="${TARGET}" --hostname="${TARGET}-container" --runtime=habana -v /home/yizhong/Model-References:/root/Model-References -v ${{ inputs.code_checkout_path }}:/root/llm-on-ray -v ${{ inputs.model_cache_path }}:/root/.cache/huggingface/hub/ -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --cap-add sys_ptrace --net=host --ipc=host ${TARGET}:habana
docker run -tid --privileged --name="${TARGET}" --hostname="${TARGET}-container" --runtime=habana -v /home/yizhong/Model-References:/root/Model-References -v ${{ inputs.code_checkout_path }}:/root/llm-on-ray -v ${{ inputs.model_cache_path }}:/root/.cache/huggingface/hub/ -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --cap-add sys_ptrace --net=host --ipc=host ${TARGET}:habana
- name: Start Ray Cluster
run: |
TARGET=${{steps.target.outputs.target}}
2 changes: 1 addition & 1 deletion .github/workflows/workflow_test_benchmark.yml
@@ -80,7 +80,7 @@ jobs:
# check and remove exited container
cid=$(docker ps -a -q --filter "name=${TARGET}")
if [[ ! -z "$cid" ]]; then docker rm $cid; fi
docker run -tid -v ${{ inputs.model_cache_path }}:/root/.cache/huggingface/hub -v ${{ inputs.code_checkout_path }}:/root/llm-on-ray -e http_proxy=${{ inputs.http_proxy }} -e https_proxy=${{ inputs.https_proxy }} --name="${TARGET}" --hostname="${TARGET}-container" ${TARGET}:latest
docker run -tid --privileged -v ${{ inputs.model_cache_path }}:/root/.cache/huggingface/hub -v ${{ inputs.code_checkout_path }}:/root/llm-on-ray -e http_proxy=${{ inputs.http_proxy }} -e https_proxy=${{ inputs.https_proxy }} --name="${TARGET}" --hostname="${TARGET}-container" ${TARGET}:latest

- name: Start Ray Cluster
run: |
2 changes: 1 addition & 1 deletion .github/workflows/workflow_tests.yml
@@ -176,7 +176,7 @@ jobs:
run: |
TARGET=${{steps.target.outputs.target}}
source dev/scripts/ci-functions.sh
strat_ray ${TARGET}
start_ray ${TARGET}

- name: Run Tests
run: |
6 changes: 6 additions & 0 deletions .gitignore
@@ -5,3 +5,9 @@ build/lib/
*.json
*.txt
*.egg-info
.eggs
*.log
*.so
*.ninja_log
build/
runtime_outs/
19 changes: 18 additions & 1 deletion .pre-commit-config.yaml
@@ -7,6 +7,12 @@ repos:
hooks:
- id: ruff
args: [ --fix, --exit-non-zero-on-fix, --ignore=E402, --ignore=E501, --ignore=E731, --ignore=F401]
exclude: |
(?x)^(
examples/inference/vllm/ray-vllm-examples/llm.py|
vllm-ext/vllm/extension/ns/__init__.py|
)$


# Black needs to be ran after ruff with --fix
- repo: https://github.com/psf/black
@@ -18,7 +24,18 @@
rev: "v0.981"
hooks:
- id: mypy
exclude: tests
exclude: |
(?x)^(
tests|
vllm-ext/vllm/extension/ns/model/ns_loader.py|
vllm-ext/vllm/extension/ns/kv_cache/ns_cache.py|
vllm-ext/inference_engine/python/inference_engine/|
vllm-ext/setup.py|
examples/inference/vllm/ray-vllm-examples/llm.py|
llm_on_ray/inference/inference_config.py|
vllm-ext/vllm/extension/ns/
)

additional_dependencies:
- mypy-extensions
- pydantic==1.10.0
2 changes: 1 addition & 1 deletion benchmarks/benchmark_serving.py
@@ -284,7 +284,7 @@ async def send_request(

token_latencies_per_request: List[float] = []

timeout = aiohttp.ClientTimeout(total=3 * 3600)
timeout = aiohttp.ClientTimeout(total=5 * 3600)
async with aiohttp.ClientSession(timeout=timeout) as session:
while True:
async with session.post(api_url, headers=headers, json=pload) as response:
48 changes: 39 additions & 9 deletions benchmarks/run_benchmark.sh
@@ -12,11 +12,23 @@
echo "Please pass in the value of parameter RUN_MODE, which can be 'test' or 'benchmark'."
fi
VALUE_INF=2000

MAX_NUM_SEQS=$VALUE_INF
DYNAMIC_BATCH_SIZE=0
if [ "$#" -gt 2 ]
then
MAX_NUM_SEQS=${3}
fi
if [ "$#" -gt 3 ]
then
DYNAMIC_BATCH_SIZE=${4}
fi

MODEL_ENDPOINT="http://localhost:8000/llama-2-7b-chat-hf"
MODEL_NAME="llama-2-7b-chat-hf"
SHELL_FOLDER=$(cd "$(dirname "$0")";pwd)
BENCHMARK_SCRIPT=$SHELL_FOLDER"/benchmark_serving.py"
WITH_VLLM_CONFIG_FILE=$SHELL_FOLDER"/../llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml"
WITH_VLLM_CONFIG_FILE=$SHELL_FOLDER"/../llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm-ns.yaml"
WO_VLLM_CONFIG_FILE=$SHELL_FOLDER"/../llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml"
DATASET_PATH=$SHELL_FOLDER"/../dataset"
DATASET_SHAREGPT_PATH=$SHELL_FOLDER"/../dataset/ShareGPT_V3_unfiltered_cleaned_split.json"
@@ -107,19 +119,37 @@ latency_throughput(){
tokens_dir=$choice_dir"/tokens_"$input_tokens_length"_"$output_tokens_length

# server
$NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_ongoing_requests $VALUE_INF --max_num_seqs $VALUE_INF
#$numa_server_command llm_on_ray-serve --config_file $with_vllm_config_file --simple --max_concurrent_queries $VALUE_INF --vllm_max_num_seqs $VALUE_INF

# client
for i in $(seq 1 $num_iter)
for num_prompts in ${query_num}
do
echo "Run iter $i"
iter_dir=$tokens_dir"/iter_"$i
for num_prompts in ${query_num}
max_con_q=$VALUE_INF
if [ ! "$DYNAMIC_BATCH_SIZE" = "0" ]
then
if [ "$num_prompts" -lt "$NUM_REPLICA" ] || [ "$num_prompts" -eq "$NUM_REPLICA" ]
then
max_con_q=1
else
max_con_q=$((num_prompts/NUM_REPLICA))
fi
fi
echo "Run num_prompts ${num_prompts} ======================="
echo "deploying model with --max_concurrent_queries $max_con_q --vllm_max_num_seqs $MAX_NUM_SEQS ..."
$NUMA_SERVER_COMMAND llm_on_ray-serve --config_file $WITH_VLLM_CONFIG_FILE --simple --max_ongoing_requests $max_con_q --max_num_seqs $MAX_NUM_SEQS
sleep 1
for i in $(seq 0 $num_iter)
do
if [ $i = 0 ]; then
iter_dir="$tokens_dir/warmup"
echo "Run warmup"
else
iter_dir=$tokens_dir"/iter_"$i
echo "Run iter $i"
fi
results_dir=$iter_dir"/num_prompts_"$num_prompts
echo "Run num_prompts ${num_prompts}"
echo "results_dir: ${results_dir}"
$NUMA_CLIENT_COMMAND python $BENCHMARK_SCRIPT --model-endpoint-base $MODEL_ENDPOINT --model-name $MODEL_NAME --dataset $DATASET_IPEX_PATH --num-prompts $num_prompts --dataset-format IPEX --input-tokens $input_tokens_length --max-new-tokens $output_tokens_length --track-token-latency --vllm-engine --simple --results-dir $results_dir
$NUMA_CLIENT_COMMAND python $BENCHMARK_SCRIPT --model-endpoint-base $MODEL_ENDPOINT --model-name $MODEL_NAME --dataset $DATASET_IPEX_PATH --num-prompts $num_prompts --dataset-format IPEX --input-tokens $input_tokens_length --track-token-latency --max-new-tokens $output_tokens_length --vllm-engine --simple --results-dir $results_dir
done
done
echo "CHOICE 3 generation completed"
@@ -229,4 +259,4 @@
fi
output_tokens_length=32
get_best_latency $iter "${input_tokens_length[*]}" $output_tokens_length $benchmark_dir
fi
fi
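For reference, a hedged invocation of the updated benchmark script. The script's existing leading arguments are kept as placeholders, since only the two new trailing arguments are defined in this diff; the values chosen for them below are illustrative:

```bash
# This PR adds two optional trailing arguments:
#   $3 = MAX_NUM_SEQS (defaults to VALUE_INF), $4 = DYNAMIC_BATCH_SIZE (0 = off).
# <existing-arg-1>/<existing-arg-2> stand in for the script's pre-existing positional
# parameters (benchmark choice and RUN_MODE, i.e. 'test' or 'benchmark').
bash benchmarks/run_benchmark.sh <existing-arg-1> <existing-arg-2> 64 1
```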
42 changes: 42 additions & 0 deletions dev/docker/Dockerfile.vllm_ns
@@ -0,0 +1,42 @@
# syntax=docker/dockerfile:1
FROM ubuntu:22.04

ENV LANG C.UTF-8

WORKDIR /root/llm-on-ray

RUN --mount=type=cache,target=/var/cache/apt apt-get update -y \
&& apt-get install -y build-essential cmake wget curl git vim htop ssh net-tools \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

ENV CONDA_DIR /opt/conda
RUN wget --quiet https://github.com/conda-forge/miniforge/releases/download/23.3.1-1/Miniforge3-Linux-x86_64.sh -O ~/miniforge.sh && \
/bin/bash ~/miniforge.sh -b -p /opt/conda
ENV PATH $CONDA_DIR/bin:$PATH

# setup env
SHELL ["/bin/bash", "--login", "-c"]

RUN --mount=type=cache,target=/opt/conda/pkgs conda init bash && \
unset -f conda && \
export PATH=$CONDA_DIR/bin/:${PATH} && \
mamba config --add channels intel && \
mamba install -y -c conda-forge python==3.9 gxx=12.3 gxx_linux-64=12.3 libxcrypt

COPY ./pyproject.toml .
COPY ./MANIFEST.in .


# Install llm_on_ray
# Create llm_on_ray package directory to bypass the following 'pip install -e' command
RUN mkdir ./llm_on_ray
RUN --mount=type=cache,target=/root/.cache/pip pip install -e .[vllm-cpu] --extra-index-url https://download.pytorch.org/whl/cpu \
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/

# Install vllm-ext
# We cannot make empty folder here like './llm_on_ray' since vllm-ext has cpp files to be compiled
COPY ./vllm-ext ./vllm-ext
COPY ./dev/scripts/check-vllm-cpu-build-env.sh ./dev/scripts/check-vllm-cpu-build-env.sh
RUN --mount=type=cache,target=/root/.cache/pip \
source /opt/conda/bin/activate base && cd vllm-ext && pip install . && pip install --upgrade protobuf
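A minimal sketch of building and starting a container from this Dockerfile outside of CI; the image tag, container name, and mount paths are assumptions, while `--privileged` mirrors the flag this PR adds to the CI docker run commands:

```bash
# Build from the repository root so the COPY paths in the Dockerfile resolve;
# BuildKit is needed for the --mount=type=cache instructions.
DOCKER_BUILDKIT=1 docker build -f dev/docker/Dockerfile.vllm_ns -t llm-on-ray:vllm-ns .

# Run with --privileged, matching the CI changes elsewhere in this PR; mounts are illustrative.
docker run -tid --privileged \
  -v "$HOME/.cache/huggingface/hub:/root/.cache/huggingface/hub" \
  -v "$PWD:/root/llm-on-ray" \
  --name llm-on-ray-vllm-ns llm-on-ray:vllm-ns
```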
17 changes: 17 additions & 0 deletions dev/scripts/check-vllm-cpu-build-env.sh
@@ -0,0 +1,17 @@
#!/usr/bin/env bash

# Check tools
[[ -n $(which g++) ]] || { echo "GNU C++ Compiler (g++) is not found!"; exit 1; }
[[ -n $(which pip) ]] || { echo "pip command is not found!"; exit 1; }

# g++ version should be >=12.3. You can run the following to install GCC 12.3 and dependencies on conda:
# conda install -y -c conda-forge gxx=12.3 gxx_linux-64=12.3 libxcrypt
version_greater_equal()
{
printf '%s\n%s\n' "$2" "$1" | sort --check=quiet --version-sort
}
gcc_version=$(g++ --version | grep -o -E '[0-9]+\.[0-9]+\.[0-9]+' | head -n1)
echo
echo Current GNU C++ Compiler version: $gcc_version
echo
version_greater_equal "${gcc_version}" 12.3.0 || { echo "GNU C++ Compiler 12.3.0 or above is required!"; exit 1; }
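The same check can also be run by hand before attempting the vllm-ext build; a small sketch, where the fallback conda install simply mirrors the hint in the script's own comment:

```bash
# Exits non-zero if g++ >= 12.3 or pip is missing; the conda fallback follows the script's hint.
bash dev/scripts/check-vllm-cpu-build-env.sh || \
  conda install -y -c conda-forge gxx=12.3 gxx_linux-64=12.3 libxcrypt
```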
17 changes: 14 additions & 3 deletions dev/scripts/ci-functions.sh
@@ -64,7 +64,7 @@ start_docker() {
docker_args+=("-e=https_proxy=${HTTPS_PROXY}")
fi

echo "docker run -tid "${docker_args[@]}" "${TARGET}:latest""
echo "docker run -tid --privileged "${docker_args[@]}" "${TARGET}:latest""
docker run -tid "${docker_args[@]}" "${TARGET}:latest"
}

@@ -75,11 +75,19 @@ install_dependencies(){
docker exec "${TARGET}" bash -c "pip install -r ./tests/requirements.txt"
}

strat_ray(){
start_ray(){
local TARGET=$1
local UNLIMITED_MAXLOCKMEM=0
if [ "$2" == "1" ]; then
UNLIMITED_MAXLOCKMEM=1
fi

# Start Ray Cluster
docker exec "${TARGET}" bash -c "./dev/scripts/start-ray-cluster.sh"
if [ "$UNLIMITED_MAXLOCKMEM" == "1" ]; then
docker exec "${TARGET}" bash -c "ulimit -l unlimited; ./dev/scripts/start-ray-cluster.sh"
else
docker exec "${TARGET}" bash -c "./dev/scripts/start-ray-cluster.sh"
fi
}

stop_ray(){
@@ -111,6 +119,7 @@ declare -A DF_SUFFIX_MAPPER
DF_SUFFIX_MAPPER=(
["mpt-7b-ipex-llm"]=".ipex-llm"
["llama-2-7b-chat-hf-vllm"]=".vllm"
["llama-2-7b-chat-hf-vllm-ns"]=".vllm_ns"
["gpt-j-6b"]=".cpu_and_deepspeed.pip_non_editable"
)

@@ -128,6 +137,7 @@ declare -A TARGET_SUFFIX_MAPPER
TARGET_SUFFIX_MAPPER=(
["mpt-7b-ipex-llm"]="_ipex-llm"
["llama-2-7b-chat-hf-vllm"]="_vllm"
["llama-2-7b-chat-hf-vllm-ns"]="_vllm-ns"
)

get_TARGET_SUFFIX() {
@@ -143,6 +153,7 @@ declare -A INFERENCE_MAPPER
INFERENCE_MAPPER=(
["mpt-7b-ipex-llm"]="llm_on_ray-serve --config_file llm_on_ray/inference/models/ipex-llm/mpt-7b-ipex-llm.yaml --simple"
["llama-2-7b-chat-hf-vllm"]="llm_on_ray-serve --config_file .github/workflows/config/llama-2-7b-chat-hf-vllm-fp32.yaml --simple"
["llama-2-7b-chat-hf-vllm-ns"]="llm_on_ray-serve --config_file llm_on_ray/inference/models/vllm/llama2-7b-chat-hf-vllm-ns.yaml --simple --max_ongoing_requests 1 --max_num_seqs 1"
["default"]="llm_on_ray-serve --simple --models ${model}"
)

Binary file modified docs/assets/choice3_tokens_32_64.png
33 changes: 33 additions & 0 deletions docs/vllm.md
@@ -24,10 +24,30 @@ Then please run the following script to install vLLM for CPU into your LLM-on-Ra
dev/scripts/install-vllm-cpu.sh
```

## Install vLLM Extension for Quantization (Optional)
To further speed up quantized model inference on Intel CPUs, we extend vLLM to run model decoding in our own inference engine, which is based on [neural-speed](https://github.com/intel/neural-speed).
Neural Speed is an innovative library designed to support efficient inference of large language models (LLMs) on Intel platforms through state-of-the-art (SOTA) low-bit quantization powered by
[Intel Neural Compressor](https://github.com/intel/neural-compressor). The work is inspired by [llama.cpp](https://github.com/ggerganov/llama.cpp) and further optimized for Intel platforms with our
innovations published at [NeurIPS 2023](https://arxiv.org/abs/2311.00502).

You first need to install llm-on-ray with the "vllm-cpu" extra.

```bash
pip install .[vllm-cpu] --extra-index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
```

Then, install the vLLM extension and the inference engine.
```bash
cd vllm-ext
pip install .

```
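If the install succeeds, the extension package added by this PR should be discoverable under `vllm.extension.ns`; a hedged sanity check (assumes a working `vllm` install in the same environment):

```bash
# Locate the installed extension without executing its module body.
python -c "import importlib.util as u; print(u.find_spec('vllm.extension.ns').origin)"
```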

## Run

#### Serving

* Vanilla vLLM
To serve a model with vLLM and the simple protocol, run the following:

```bash
@@ -36,6 +56,19 @@ llm_on_ray-serve --config_file llm_on_ray/inference/models/vllm/llama-2-7b-chat-

In the above example, `vllm` property is set to `true` in the config file for enabling vLLM.

* vLLM Extension
To serve a model with the vLLM extension and the Intel inference engine, run the following (Note: only Llama-2-7b-chat-hf is supported for now):

```bash
# Copy the quantization config file to your model's snapshot dir, for example .../snapshots/f5db02db7.../
# If you don't copy one manually, a default quant_ns_config.json is copied from the llm_on_ray package.
cp llm_on_ray/inference/models/vllm/quantization/quant_ns_config.json <your model snapshot dir>
# Deploy model serving. Note: if the model has not been quantized yet, it is quantized on the fly based on quant_ns_config.json.
llm_on_ray-serve --config_file llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm-ns.yaml --simple --keep_serve_terminal --max_num_seqs 64
```

For now, only the Llama-2 model is supported.
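Once the endpoint is up, the benchmark client touched by this PR can double as a smoke test. A hedged sketch follows; the endpoint path matches the default used in benchmarks/run_benchmark.sh (it may differ for the -ns config), and the dataset path, token lengths, and prompt count are placeholders:

```bash
# Flags mirror the client invocation in benchmarks/run_benchmark.sh; values are illustrative.
python benchmarks/benchmark_serving.py \
  --model-endpoint-base http://localhost:8000/llama-2-7b-chat-hf \
  --model-name llama-2-7b-chat-hf \
  --dataset ./dataset/prompt.json --dataset-format IPEX \
  --input-tokens 32 --max-new-tokens 32 --num-prompts 4 \
  --track-token-latency --vllm-engine --simple \
  --results-dir ./benchmark_results
```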

#### Querying

To start a non-streaming query, run the following: