From ff52ed5699510323cc35052d5b6027f6d3507549 Mon Sep 17 00:00:00 2001 From: Deegue Date: Thu, 16 May 2024 15:00:37 +0000 Subject: [PATCH 01/26] add validated models for Gaudi --- .../workflows/workflow_inference_gaudi2.yml | 47 +++++++++---------- .../models/hpu/CodeLlama-7b-hf-hpu.yaml | 16 +++++++ .../models/hpu/MindChat-Qwen2-4B-hpu.yaml | 16 +++++++ .../inference/models/hpu/bloom-560m-hpu.yaml | 20 ++++++++ .../inference/models/hpu/bloom-7b1-hpu.yaml | 18 +++++++ .../inference/models/hpu/falcon-40b-hpu.yaml | 20 ++++++++ .../inference/models/hpu/falcon-7b-hpu.yaml | 18 +++++++ .../inference/models/hpu/gemma-2b-hpu.yaml | 20 ++++++++ .../inference/models/hpu/gpt-j-6b-hpu.yaml | 24 ++++++++++ llm_on_ray/inference/models/hpu/gpt2-hpu.yaml | 17 +++++++ .../hpu/meta-llama-3-70b-instruct-hpu.yaml | 21 +++++++++ .../hpu/meta-llama-3-8b-instruct-hpu.yaml | 12 +++++ .../models/hpu/mistral-7b-v0.1-hpu.yaml | 20 ++++++++ .../inference/models/hpu/mpt-7b-hpu.yaml | 25 ++++++++++ .../models/hpu/neural-chat-7b-v3-3-hpu.yaml | 17 +++++++ 15 files changed, 285 insertions(+), 26 deletions(-) create mode 100644 llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml create mode 100644 llm_on_ray/inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml create mode 100644 llm_on_ray/inference/models/hpu/bloom-560m-hpu.yaml create mode 100644 llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml create mode 100644 llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml create mode 100644 llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml create mode 100644 llm_on_ray/inference/models/hpu/gemma-2b-hpu.yaml create mode 100644 llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml create mode 100644 llm_on_ray/inference/models/hpu/gpt2-hpu.yaml create mode 100644 llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml create mode 100644 llm_on_ray/inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml create mode 100644 llm_on_ray/inference/models/hpu/mistral-7b-v0.1-hpu.yaml create mode 100644 llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml create mode 100644 llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3-hpu.yaml diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml index 794b83181..139d3a21c 100644 --- a/.github/workflows/workflow_inference_gaudi2.yml +++ b/.github/workflows/workflow_inference_gaudi2.yml @@ -28,7 +28,7 @@ jobs: name: inference strategy: matrix: - model: [ llama-2-7b-chat-hf, llama-2-70b-chat-hf ] + model: [ bloom-7b1, bloom-560m, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, MindChat-Qwen2-4B, mistral-7b-v0.1, mpt-7b ] isPR: - ${{inputs.ci_type == 'pr'}} @@ -36,8 +36,20 @@ jobs: - { isPR: true } include: + - { model: "bloom-7b1"} + - { model: "bloom-560m"} + - { model: "CodeLlama-7b-hf"} + - { model: "falcon-7b"} + - { model: "falcon-40b"} + - { model: "gemma-2b"} + - { model: "gpt-j-6b"} - { model: "llama-2-7b-chat-hf"} - { model: "llama-2-70b-chat-hf"} + - { model: "meta-llama-3-8b-instruct"} + - { model: "meta-llama-3-70b-instruct"} + - { model: "MindChat-Qwen2-4B"} + - { model: "mistral-7b-v0.1"} + - { model: "mpt-7b"} runs-on: gaudi2 @@ -59,11 +71,7 @@ jobs: id: "target" run: | target="inference" - if [[ ${{ matrix.model }} == "llama-2-7b-chat-hf" ]]; then - target="${target}_gaudi2" - elif [[ ${{ matrix.model }} == "llama-2-70b-chat-hf" ]]; then - target="${target}_gaudi2" - fi + target="${target}_gaudi2" echo "target is 
${target}" echo "target=$target" >> $GITHUB_OUTPUT @@ -101,28 +109,15 @@ jobs: TARGET=${{steps.target.outputs.target}} CMD=$(cat << EOF import yaml - if ("${{ matrix.model }}" == "llama-2-7b-chat-hf"): - conf_path = "llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml" - with open(conf_path, encoding="utf-8") as reader: - result = yaml.load(reader, Loader=yaml.FullLoader) - result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}" - with open(conf_path, 'w') as output: - yaml.dump(result, output, sort_keys=False) - elif ("${{ matrix.model }}" == "llama-2-70b-chat-hf"): - conf_path = "llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml" - with open(conf_path, encoding="utf-8") as reader: - result = yaml.load(reader, Loader=yaml.FullLoader) - result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}" - with open(conf_path, 'w') as output: - yaml.dump(result, output, sort_keys=False) - EOF + conf_path = "llm_on_ray/inference/models/hpu/" + "${{ matrix.model }}" + "-hpu.yaml" + with open(conf_path, encoding="utf-8") as reader: + result = yaml.load(reader, Loader=yaml.FullLoader) + result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}" + with open(conf_path, 'w') as output: + yaml.dump(result, output, sort_keys=False) ) docker exec "${TARGET}" python -c "$CMD" - if [[ ${{ matrix.model }} == "llama-2-7b-chat-hf" ]]; then - docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml --keep_serve_terminal" - elif [[ ${{ matrix.model }} == "llama-2-70b-chat-hf" ]]; then - docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml --keep_serve_terminal" - fi + docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/${{ matrix.model }}-hpu.yaml --keep_serve_terminal" echo Streaming query: docker exec "${TARGET}" bash -c "python examples/inference/api_server_openai/query_http_requests.py --model_name ${{ matrix.model }} --streaming_response" diff --git a/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml b/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml new file mode 100644 index 000000000..fa794db98 --- /dev/null +++ b/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml @@ -0,0 +1,16 @@ +port: 8000 +name: CodeLlama-7b-hf +route_prefix: /CodeLlama-7b-hf +num_replicas: 1 +cpus_per_worker: 8 +hpus_per_worker: 1 +device: hpu +model_description: + model_id_or_path: codellama/CodeLlama-7b-hf + tokenizer_name_or_path: codellama/CodeLlama-7b-hf + chat_processor: ChatModelGptJ + prompt: + intro: '' + human_id: '' + bot_id: '' + stop_words: [] diff --git a/llm_on_ray/inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml b/llm_on_ray/inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml new file mode 100644 index 000000000..1e5779dd8 --- /dev/null +++ b/llm_on_ray/inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml @@ -0,0 +1,16 @@ +port: 8000 +name: MindChat-Qwen2-4B +route_prefix: /MindChat-Qwen2-4B +num_replicas: 1 +cpus_per_worker: 8 +hpus_per_worker: 1 +device: hpu +model_description: + model_id_or_path: X-D-Lab/MindChat-Qwen2-4B + tokenizer_name_or_path: X-D-Lab/MindChat-Qwen2-4B + chat_processor: ChatModelGptJ + prompt: + intro: '' + human_id: '' + bot_id: '' + stop_words: [] diff --git a/llm_on_ray/inference/models/hpu/bloom-560m-hpu.yaml b/llm_on_ray/inference/models/hpu/bloom-560m-hpu.yaml new file mode 100644 index 
000000000..9fcd0f193 --- /dev/null +++ b/llm_on_ray/inference/models/hpu/bloom-560m-hpu.yaml @@ -0,0 +1,20 @@ +port: 8000 +name: bloom-560m +route_prefix: /bloom-560m +num_replicas: 1 +cpus_per_worker: 8 +hpus_per_worker: 1 +deepspeed: true +workers_per_group: 8 +device: hpu +model_description: + model_id_or_path: bigscience/bloom-560m + tokenizer_name_or_path: bigscience/bloom-560m + chat_processor: ChatModelGptJ + prompt: + intro: '' + human_id: '' + bot_id: '' + stop_words: [] + config: + use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml b/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml new file mode 100644 index 000000000..fc7f4217b --- /dev/null +++ b/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml @@ -0,0 +1,18 @@ +port: 8000 +name: bloom-7b1 +route_prefix: /bloom-7b1 +num_replicas: 1 +cpus_per_worker: 8 +hpus_per_worker: 1 +device: hpu +model_description: + model_id_or_path: bigscience/bloom-7b1 + tokenizer_name_or_path: bigscience/bloom-7b1 + chat_processor: ChatModelGptJ + prompt: + intro: '' + human_id: '' + bot_id: '' + stop_words: [] + config: + use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml b/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml new file mode 100644 index 000000000..50f9e3ec3 --- /dev/null +++ b/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml @@ -0,0 +1,20 @@ +port: 8000 +name: falcon-40b +route_prefix: /falcon-40b +num_replicas: 1 +cpus_per_worker: 8 +hpus_per_worker: 1 +deepspeed: true +workers_per_group: 8 +device: hpu +model_description: + model_id_or_path: tiiuae/falcon-40b + tokenizer_name_or_path: tiiuae/falcon-40b + chat_processor: ChatModelGptJ + prompt: + intro: '' + human_id: '' + bot_id: '' + stop_words: [] + config: + use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml b/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml new file mode 100644 index 000000000..e450a066a --- /dev/null +++ b/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml @@ -0,0 +1,18 @@ +port: 8000 +name: falcon-7b +route_prefix: /falcon-7b +num_replicas: 1 +cpus_per_worker: 8 +hpus_per_worker: 1 +device: hpu +model_description: + model_id_or_path: tiiuae/falcon-7b + tokenizer_name_or_path: tiiuae/falcon-7b + chat_processor: ChatModelGptJ + prompt: + intro: '' + human_id: '' + bot_id: '' + stop_words: [] + config: + use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/gemma-2b-hpu.yaml b/llm_on_ray/inference/models/hpu/gemma-2b-hpu.yaml new file mode 100644 index 000000000..5e84eb2e4 --- /dev/null +++ b/llm_on_ray/inference/models/hpu/gemma-2b-hpu.yaml @@ -0,0 +1,20 @@ +port: 8000 +name: gemma-2b +route_prefix: /gemma-2b +num_replicas: 1 +cpus_per_worker: 8 +hpus_per_worker: 1 +device: hpu +model_description: + model_id_or_path: google/gemma-2b + tokenizer_name_or_path: google/gemma-2b + chat_processor: ChatModelGemma + prompt: + intro: '' + human_id: 'user + {msg}' + bot_id: 'model + {msg}' + stop_words: [] + config: + use_auth_token: ' ' diff --git a/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml b/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml new file mode 100644 index 000000000..203763fa9 --- /dev/null +++ b/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml @@ -0,0 +1,24 @@ +port: 8000 +name: gpt-j-6b +route_prefix: /gpt-j-6b +num_replicas: 1 +cpus_per_worker: 8 +hpus_per_worker: 1 +device: hpu +model_description: + model_id_or_path: EleutherAI/gpt-j-6b + tokenizer_name_or_path: EleutherAI/gpt-j-6b + chat_processor: ChatModelGptJ + gpt_base_model: 
true + prompt: + intro: 'Below is an instruction that describes a task. Write a response that appropriately + completes the request. + + ' + human_id: ' + + ### Instruction' + bot_id: ' + + ### Response' + stop_words: [] diff --git a/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml b/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml new file mode 100644 index 000000000..c22b97288 --- /dev/null +++ b/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml @@ -0,0 +1,17 @@ +port: 8000 +name: gpt2 +route_prefix: /gpt2 +num_replicas: 1 +cpus_per_worker: 8 +hpus_per_worker: 1 +device: hpu +model_description: + model_id_or_path: gpt2 + tokenizer_name_or_path: gpt2 + chat_processor: ChatModelGptJ + gpt_base_model: true + prompt: + intro: '' + human_id: '' + bot_id: '' + stop_words: [] diff --git a/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml b/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml new file mode 100644 index 000000000..6976eafb3 --- /dev/null +++ b/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml @@ -0,0 +1,21 @@ +port: 8000 +name: meta-llama-3-70b-instruct +route_prefix: /meta-llama-3-70b-instruct +cpus_per_worker: 8 +hpus_per_worker: 1 +deepspeed: true +workers_per_group: 8 +device: hpu +model_description: + model_id_or_path: meta-llama/Meta-Llama-3-70b-Instruct + tokenizer_name_or_path: meta-llama/Meta-Llama-3-70b-Instruct + chat_processor: ChatModelLLama + prompt: + intro: '' + human_id: '[INST] {msg} [/INST] + + ' + bot_id: '' + stop_words: [] + config: + use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml b/llm_on_ray/inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml new file mode 100644 index 000000000..d57ffcc22 --- /dev/null +++ b/llm_on_ray/inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml @@ -0,0 +1,12 @@ +port: 8000 +name: meta-llama-3-8b-instruct +route_prefix: /meta-llama-3-8b-instruct +num_replicas: 1 +cpus_per_worker: 8 +hpus_per_worker: 1 +device: hpu +model_description: + model_id_or_path: meta-llama/Meta-Llama-3-8b-Instruct + tokenizer_name_or_path: meta-llama/Meta-Llama-3-8b-Instruct + config: + use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/mistral-7b-v0.1-hpu.yaml b/llm_on_ray/inference/models/hpu/mistral-7b-v0.1-hpu.yaml new file mode 100644 index 000000000..e91f08e8a --- /dev/null +++ b/llm_on_ray/inference/models/hpu/mistral-7b-v0.1-hpu.yaml @@ -0,0 +1,20 @@ +port: 8000 +name: mistral-7b-v0.1 +route_prefix: /mistral-7b-v0.1 +num_replicas: 1 +cpus_per_worker: 8 +hpus_per_worker: 1 +device: hpu +model_description: + model_id_or_path: mistralai/Mistral-7B-v0.1 + tokenizer_name_or_path: mistralai/Mistral-7B-v0.1 + chat_processor: ChatModelLLama + prompt: + intro: '' + human_id: '[INST] {msg} [/INST] + + ' + bot_id: '' + stop_words: [] + config: + use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml b/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml new file mode 100644 index 000000000..5d6b36dc1 --- /dev/null +++ b/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml @@ -0,0 +1,25 @@ +port: 8000 +name: mpt-7b +route_prefix: /mpt-7b +num_replicas: 1 +cpus_per_worker: 8 +hpus_per_worker: 1 +device: hpu +model_description: + model_id_or_path: EleutherAI/gpt-neox-20b + tokenizer_name_or_path: EleutherAI/gpt-neox-20b + chat_processor: ChatModelLLama + prompt: + intro: 'Below is an instruction that describes a task, paired with an input that + provides further context. Write a response that appropriately completes the request. 
+ + ' + human_id: ' + + ### Instruction' + bot_id: ' + + ### Response' + stop_words: [] + config: + trust_remote_code: true diff --git a/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3-hpu.yaml b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3-hpu.yaml new file mode 100644 index 000000000..64566a6d8 --- /dev/null +++ b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3-hpu.yaml @@ -0,0 +1,17 @@ +port: 8000 +name: neural-chat-7b-v3-3 +route_prefix: /neural-chat-7b-v3-3 +num_replicas: 1 +cpus_per_worker: 0 +gpus_per_worker: 0 +hpus_per_worker: 1 +deepspeed: false +workers_per_group: 2 +device: hpu +ipex: + enabled: false + precision: bf16 +model_description: + model_id_or_path: Intel/neural-chat-7b-v3-3 + tokenizer_name_or_path: Intel/neural-chat-7b-v3-3 + chat_template: "llm_on_ray/inference/models/templates/template_neuralchat.jinja" From cd543a5f92db697f89a2e5ae14ba1a940ad5eb19 Mon Sep 17 00:00:00 2001 From: Deegue Date: Thu, 16 May 2024 15:50:33 +0000 Subject: [PATCH 02/26] nit --- .github/workflows/workflow_inference_gaudi2.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml index 139d3a21c..1bb5d6278 100644 --- a/.github/workflows/workflow_inference_gaudi2.yml +++ b/.github/workflows/workflow_inference_gaudi2.yml @@ -109,6 +109,7 @@ jobs: TARGET=${{steps.target.outputs.target}} CMD=$(cat << EOF import yaml + model = ${{ matrix.model }} conf_path = "llm_on_ray/inference/models/hpu/" + "${{ matrix.model }}" + "-hpu.yaml" with open(conf_path, encoding="utf-8") as reader: result = yaml.load(reader, Loader=yaml.FullLoader) From 55595f7169600e2ddf76555df4ebda53f6d01627 Mon Sep 17 00:00:00 2001 From: Deegue Date: Fri, 17 May 2024 01:46:47 +0000 Subject: [PATCH 03/26] fix --- .github/workflows/workflow_inference_gaudi2.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml index 1bb5d6278..3e40ab11e 100644 --- a/.github/workflows/workflow_inference_gaudi2.yml +++ b/.github/workflows/workflow_inference_gaudi2.yml @@ -116,6 +116,7 @@ jobs: result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}" with open(conf_path, 'w') as output: yaml.dump(result, output, sort_keys=False) + EOF ) docker exec "${TARGET}" python -c "$CMD" docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/${{ matrix.model }}-hpu.yaml --keep_serve_terminal" From 0a72f39ed5bb2874c9ff30a8a754aa06aacafc62 Mon Sep 17 00:00:00 2001 From: Deegue Date: Fri, 17 May 2024 01:51:19 +0000 Subject: [PATCH 04/26] remove --- .github/workflows/workflow_inference_gaudi2.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml index 3e40ab11e..0f94960f0 100644 --- a/.github/workflows/workflow_inference_gaudi2.yml +++ b/.github/workflows/workflow_inference_gaudi2.yml @@ -109,7 +109,6 @@ jobs: TARGET=${{steps.target.outputs.target}} CMD=$(cat << EOF import yaml - model = ${{ matrix.model }} conf_path = "llm_on_ray/inference/models/hpu/" + "${{ matrix.model }}" + "-hpu.yaml" with open(conf_path, encoding="utf-8") as reader: result = yaml.load(reader, Loader=yaml.FullLoader) From 0b17988cf743127445266677a0415d393e4f5b1e Mon Sep 17 00:00:00 2001 From: Deegue Date: Fri, 17 May 2024 02:19:56 +0000 Subject: [PATCH 05/26] add config --- 
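Note: this patch adds an empty "use_auth_token" entry under "config" in each HPU model YAML so that the Gaudi CI job introduced in PATCH 01 has a field to overwrite with the Hugging Face token before serving. A minimal sketch of that consumer, assuming PyYAML is installed; the path points at one of the YAMLs created in this series, and the token string is a placeholder rather than a real secret:

    import yaml

    # Load one model config, set the auth token, and write it back
    # (mirrors the CI heredoc step from PATCH 01).
    conf_path = "llm_on_ray/inference/models/hpu/gpt2-hpu.yaml"  # example config from this series
    with open(conf_path, encoding="utf-8") as reader:
        conf = yaml.load(reader, Loader=yaml.FullLoader)
    conf["model_description"]["config"]["use_auth_token"] = "<HF_ACCESS_TOKEN>"  # placeholder value
    with open(conf_path, "w") as output:
        yaml.dump(conf, output, sort_keys=False)
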
llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml | 2 ++ llm_on_ray/inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml | 2 ++ llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml | 2 ++ llm_on_ray/inference/models/hpu/gpt2-hpu.yaml | 2 ++ llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml | 1 + llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3-hpu.yaml | 2 ++ 6 files changed, 11 insertions(+) diff --git a/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml b/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml index fa794db98..6d5780a9d 100644 --- a/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml @@ -14,3 +14,5 @@ model_description: human_id: '' bot_id: '' stop_words: [] + config: + use_auth_token: '' \ No newline at end of file diff --git a/llm_on_ray/inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml b/llm_on_ray/inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml index 1e5779dd8..e84f7126f 100644 --- a/llm_on_ray/inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml @@ -14,3 +14,5 @@ model_description: human_id: '' bot_id: '' stop_words: [] + config: + use_auth_token: '' \ No newline at end of file diff --git a/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml b/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml index 203763fa9..cfde9a781 100644 --- a/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml @@ -22,3 +22,5 @@ model_description: ### Response' stop_words: [] + config: + use_auth_token: '' \ No newline at end of file diff --git a/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml b/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml index c22b97288..3c1cf1913 100644 --- a/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml @@ -15,3 +15,5 @@ model_description: human_id: '' bot_id: '' stop_words: [] + config: + use_auth_token: '' \ No newline at end of file diff --git a/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml b/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml index 5d6b36dc1..e45a64ea6 100644 --- a/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml @@ -22,4 +22,5 @@ model_description: ### Response' stop_words: [] config: + use_auth_token: '' trust_remote_code: true diff --git a/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3-hpu.yaml b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3-hpu.yaml index 64566a6d8..bc147f105 100644 --- a/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3-hpu.yaml @@ -15,3 +15,5 @@ model_description: model_id_or_path: Intel/neural-chat-7b-v3-3 tokenizer_name_or_path: Intel/neural-chat-7b-v3-3 chat_template: "llm_on_ray/inference/models/templates/template_neuralchat.jinja" + config: + use_auth_token: '' \ No newline at end of file From ef337637603a7b099097797d91cecc543ce8377b Mon Sep 17 00:00:00 2001 From: Deegue Date: Fri, 17 May 2024 02:20:09 +0000 Subject: [PATCH 06/26] nit --- llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml index 64566a6d8..bc147f105 100644 --- a/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml +++ b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml @@ -15,3 +15,5 @@ model_description: 
model_id_or_path: Intel/neural-chat-7b-v3-3 tokenizer_name_or_path: Intel/neural-chat-7b-v3-3 chat_template: "llm_on_ray/inference/models/templates/template_neuralchat.jinja" + config: + use_auth_token: '' \ No newline at end of file From c1b2a2d6512e0a2586b84c350ca9065f5f7088ba Mon Sep 17 00:00:00 2001 From: Deegue Date: Fri, 17 May 2024 03:45:59 +0000 Subject: [PATCH 07/26] remove prompt and add gpt2 --- .github/workflows/workflow_inference_gaudi2.yml | 3 ++- .../inference/models/hpu/CodeLlama-7b-hf-hpu.yaml | 5 ----- .../inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml | 5 ----- llm_on_ray/inference/models/hpu/bloom-560m-hpu.yaml | 5 ----- llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml | 5 ----- llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml | 5 ----- llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml | 5 ----- llm_on_ray/inference/models/hpu/gemma-2b-hpu.yaml | 7 ------- llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml | 12 ------------ llm_on_ray/inference/models/hpu/gpt2-hpu.yaml | 5 ----- .../models/hpu/llama-3-70b-chat-hf-hpu.yaml | 13 ------------- .../models/hpu/llama-3-8b-instruct-hpu.yaml | 12 ------------ .../models/hpu/meta-llama-3-70b-instruct-hpu.yaml | 7 ------- .../inference/models/hpu/mistral-7b-v0.1-hpu.yaml | 7 ------- llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml | 12 ------------ 15 files changed, 2 insertions(+), 106 deletions(-) delete mode 100644 llm_on_ray/inference/models/hpu/llama-3-70b-chat-hf-hpu.yaml delete mode 100644 llm_on_ray/inference/models/hpu/llama-3-8b-instruct-hpu.yaml diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml index 0f94960f0..3e2c2fa51 100644 --- a/.github/workflows/workflow_inference_gaudi2.yml +++ b/.github/workflows/workflow_inference_gaudi2.yml @@ -28,7 +28,7 @@ jobs: name: inference strategy: matrix: - model: [ bloom-7b1, bloom-560m, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, MindChat-Qwen2-4B, mistral-7b-v0.1, mpt-7b ] + model: [ bloom-7b1, bloom-560m, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, MindChat-Qwen2-4B, mistral-7b-v0.1, mpt-7b ] isPR: - ${{inputs.ci_type == 'pr'}} @@ -43,6 +43,7 @@ jobs: - { model: "falcon-40b"} - { model: "gemma-2b"} - { model: "gpt-j-6b"} + - { model: "gpt2"} - { model: "llama-2-7b-chat-hf"} - { model: "llama-2-70b-chat-hf"} - { model: "meta-llama-3-8b-instruct"} diff --git a/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml b/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml index 6d5780a9d..3f93b2c92 100644 --- a/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml @@ -9,10 +9,5 @@ model_description: model_id_or_path: codellama/CodeLlama-7b-hf tokenizer_name_or_path: codellama/CodeLlama-7b-hf chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] config: use_auth_token: '' \ No newline at end of file diff --git a/llm_on_ray/inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml b/llm_on_ray/inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml index e84f7126f..2f26846cf 100644 --- a/llm_on_ray/inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml @@ -9,10 +9,5 @@ model_description: model_id_or_path: X-D-Lab/MindChat-Qwen2-4B 
tokenizer_name_or_path: X-D-Lab/MindChat-Qwen2-4B chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] config: use_auth_token: '' \ No newline at end of file diff --git a/llm_on_ray/inference/models/hpu/bloom-560m-hpu.yaml b/llm_on_ray/inference/models/hpu/bloom-560m-hpu.yaml index 9fcd0f193..a72fe4f72 100644 --- a/llm_on_ray/inference/models/hpu/bloom-560m-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/bloom-560m-hpu.yaml @@ -11,10 +11,5 @@ model_description: model_id_or_path: bigscience/bloom-560m tokenizer_name_or_path: bigscience/bloom-560m chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml b/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml index fc7f4217b..464540aae 100644 --- a/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml @@ -9,10 +9,5 @@ model_description: model_id_or_path: bigscience/bloom-7b1 tokenizer_name_or_path: bigscience/bloom-7b1 chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml b/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml index 50f9e3ec3..c900af72a 100644 --- a/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml @@ -11,10 +11,5 @@ model_description: model_id_or_path: tiiuae/falcon-40b tokenizer_name_or_path: tiiuae/falcon-40b chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml b/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml index e450a066a..17c6cf85c 100644 --- a/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml @@ -9,10 +9,5 @@ model_description: model_id_or_path: tiiuae/falcon-7b tokenizer_name_or_path: tiiuae/falcon-7b chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/gemma-2b-hpu.yaml b/llm_on_ray/inference/models/hpu/gemma-2b-hpu.yaml index 5e84eb2e4..15f13c890 100644 --- a/llm_on_ray/inference/models/hpu/gemma-2b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/gemma-2b-hpu.yaml @@ -9,12 +9,5 @@ model_description: model_id_or_path: google/gemma-2b tokenizer_name_or_path: google/gemma-2b chat_processor: ChatModelGemma - prompt: - intro: '' - human_id: 'user - {msg}' - bot_id: 'model - {msg}' - stop_words: [] config: use_auth_token: ' ' diff --git a/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml b/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml index cfde9a781..3177ad69a 100644 --- a/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml @@ -10,17 +10,5 @@ model_description: tokenizer_name_or_path: EleutherAI/gpt-j-6b chat_processor: ChatModelGptJ gpt_base_model: true - prompt: - intro: 'Below is an instruction that describes a task. Write a response that appropriately - completes the request. 
- - ' - human_id: ' - - ### Instruction' - bot_id: ' - - ### Response' - stop_words: [] config: use_auth_token: '' \ No newline at end of file diff --git a/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml b/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml index 3c1cf1913..4d03d004f 100644 --- a/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml @@ -10,10 +10,5 @@ model_description: tokenizer_name_or_path: gpt2 chat_processor: ChatModelGptJ gpt_base_model: true - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] config: use_auth_token: '' \ No newline at end of file diff --git a/llm_on_ray/inference/models/hpu/llama-3-70b-chat-hf-hpu.yaml b/llm_on_ray/inference/models/hpu/llama-3-70b-chat-hf-hpu.yaml deleted file mode 100644 index 32cf9bb4e..000000000 --- a/llm_on_ray/inference/models/hpu/llama-3-70b-chat-hf-hpu.yaml +++ /dev/null @@ -1,13 +0,0 @@ -port: 8000 -name: meta-llama-3-70b-instruct -route_prefix: /meta-llama-3-70b-instruct -cpus_per_worker: 8 -hpus_per_worker: 1 -deepspeed: true -workers_per_group: 8 -device: hpu -model_description: - model_id_or_path: meta-llama/Meta-Llama-3-70b-Instruct - tokenizer_name_or_path: meta-llama/Meta-Llama-3-70b-Instruct - config: - use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/llama-3-8b-instruct-hpu.yaml b/llm_on_ray/inference/models/hpu/llama-3-8b-instruct-hpu.yaml deleted file mode 100644 index d57ffcc22..000000000 --- a/llm_on_ray/inference/models/hpu/llama-3-8b-instruct-hpu.yaml +++ /dev/null @@ -1,12 +0,0 @@ -port: 8000 -name: meta-llama-3-8b-instruct -route_prefix: /meta-llama-3-8b-instruct -num_replicas: 1 -cpus_per_worker: 8 -hpus_per_worker: 1 -device: hpu -model_description: - model_id_or_path: meta-llama/Meta-Llama-3-8b-Instruct - tokenizer_name_or_path: meta-llama/Meta-Llama-3-8b-Instruct - config: - use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml b/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml index 6976eafb3..d8bdb1d7d 100644 --- a/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml @@ -10,12 +10,5 @@ model_description: model_id_or_path: meta-llama/Meta-Llama-3-70b-Instruct tokenizer_name_or_path: meta-llama/Meta-Llama-3-70b-Instruct chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/mistral-7b-v0.1-hpu.yaml b/llm_on_ray/inference/models/hpu/mistral-7b-v0.1-hpu.yaml index e91f08e8a..ad3bcbaa7 100644 --- a/llm_on_ray/inference/models/hpu/mistral-7b-v0.1-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/mistral-7b-v0.1-hpu.yaml @@ -9,12 +9,5 @@ model_description: model_id_or_path: mistralai/Mistral-7B-v0.1 tokenizer_name_or_path: mistralai/Mistral-7B-v0.1 chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml b/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml index e45a64ea6..bcc2225b4 100644 --- a/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml @@ -9,18 +9,6 @@ model_description: model_id_or_path: EleutherAI/gpt-neox-20b tokenizer_name_or_path: EleutherAI/gpt-neox-20b chat_processor: ChatModelLLama - prompt: - intro: 'Below is an instruction that describes a task, 
paired with an input that - provides further context. Write a response that appropriately completes the request. - - ' - human_id: ' - - ### Instruction' - bot_id: ' - - ### Response' - stop_words: [] config: use_auth_token: '' trust_remote_code: true From 28acd73642fc8bc389d3b750c8b165bbf2e55a2e Mon Sep 17 00:00:00 2001 From: Deegue Date: Mon, 20 May 2024 07:41:21 +0000 Subject: [PATCH 08/26] check and add all template, remove bloom-560m, add mixtral, change Qwen to version 1.5 --- .github/workflows/workflow_inference_gaudi2.yml | 4 ++-- .../inference/models/hpu/CodeLlama-7b-hf-hpu.yaml | 2 +- .../models/hpu/MindChat-Qwen2-4B-hpu.yaml | 13 ------------- .../hpu/Mixtral-8x7B-Instruct-v0.1-hpu.yaml | 12 ++++++++++++ .../inference/models/hpu/Qwen1.5-110B-hpu.yaml | 12 ++++++++++++ .../inference/models/hpu/bloom-560m-hpu.yaml | 15 --------------- .../inference/models/hpu/bloom-7b1-hpu.yaml | 2 +- .../inference/models/hpu/falcon-40b-hpu.yaml | 2 +- .../inference/models/hpu/falcon-7b-hpu.yaml | 2 +- llm_on_ray/inference/models/hpu/gemma-2b-hpu.yaml | 2 +- llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml | 4 ++-- llm_on_ray/inference/models/hpu/gpt2-hpu.yaml | 2 +- .../models/hpu/llama-2-70b-chat-hf-hpu.yaml | 1 + .../models/hpu/llama-2-7b-chat-hf-hpu.yaml | 1 + .../models/hpu/meta-llama-3-70b-instruct-hpu.yaml | 2 +- .../models/hpu/meta-llama-3-8b-instruct-hpu.yaml | 1 + .../inference/models/hpu/mistral-7b-v0.1-hpu.yaml | 2 +- llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml | 2 +- 18 files changed, 40 insertions(+), 41 deletions(-) delete mode 100644 llm_on_ray/inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml create mode 100644 llm_on_ray/inference/models/hpu/Mixtral-8x7B-Instruct-v0.1-hpu.yaml create mode 100644 llm_on_ray/inference/models/hpu/Qwen1.5-110B-hpu.yaml delete mode 100644 llm_on_ray/inference/models/hpu/bloom-560m-hpu.yaml diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml index 3e2c2fa51..35aaf0a7b 100644 --- a/.github/workflows/workflow_inference_gaudi2.yml +++ b/.github/workflows/workflow_inference_gaudi2.yml @@ -28,7 +28,7 @@ jobs: name: inference strategy: matrix: - model: [ bloom-7b1, bloom-560m, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, MindChat-Qwen2-4B, mistral-7b-v0.1, mpt-7b ] + model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, MindChat-Qwen2-4B, mistral-7b-v0.1, Mixtral-8x7B-Instruct-v0.1, mpt-7b ] isPR: - ${{inputs.ci_type == 'pr'}} @@ -37,7 +37,6 @@ jobs: include: - { model: "bloom-7b1"} - - { model: "bloom-560m"} - { model: "CodeLlama-7b-hf"} - { model: "falcon-7b"} - { model: "falcon-40b"} @@ -50,6 +49,7 @@ jobs: - { model: "meta-llama-3-70b-instruct"} - { model: "MindChat-Qwen2-4B"} - { model: "mistral-7b-v0.1"} + - { model: "Mixtral-8x7B-Instruct-v0.1"} - { model: "mpt-7b"} runs-on: gaudi2 diff --git a/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml b/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml index 3f93b2c92..1f9d40c2b 100644 --- a/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml @@ -8,6 +8,6 @@ device: hpu model_description: model_id_or_path: codellama/CodeLlama-7b-hf tokenizer_name_or_path: codellama/CodeLlama-7b-hf - chat_processor: ChatModelGptJ + chat_template: 
"llm_on_ray/inference/models/templates/template_codellama.jinja" config: use_auth_token: '' \ No newline at end of file diff --git a/llm_on_ray/inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml b/llm_on_ray/inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml deleted file mode 100644 index 2f26846cf..000000000 --- a/llm_on_ray/inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml +++ /dev/null @@ -1,13 +0,0 @@ -port: 8000 -name: MindChat-Qwen2-4B -route_prefix: /MindChat-Qwen2-4B -num_replicas: 1 -cpus_per_worker: 8 -hpus_per_worker: 1 -device: hpu -model_description: - model_id_or_path: X-D-Lab/MindChat-Qwen2-4B - tokenizer_name_or_path: X-D-Lab/MindChat-Qwen2-4B - chat_processor: ChatModelGptJ - config: - use_auth_token: '' \ No newline at end of file diff --git a/llm_on_ray/inference/models/hpu/Mixtral-8x7B-Instruct-v0.1-hpu.yaml b/llm_on_ray/inference/models/hpu/Mixtral-8x7B-Instruct-v0.1-hpu.yaml new file mode 100644 index 000000000..da331511e --- /dev/null +++ b/llm_on_ray/inference/models/hpu/Mixtral-8x7B-Instruct-v0.1-hpu.yaml @@ -0,0 +1,12 @@ +port: 8000 +name: Mixtral-8x7B-Instruct-v0.1 +route_prefix: /Mixtral-8x7B-Instruct-v0.1 +num_replicas: 1 +cpus_per_worker: 8 +hpus_per_worker: 1 +device: hpu +model_description: + model_id_or_path: mistralai/Mixtral-8x7B-Instruct-v0.1 + tokenizer_name_or_path: bmistralai/Mixtral-8x7B-Instruct-v0.1 + config: + use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/Qwen1.5-110B-hpu.yaml b/llm_on_ray/inference/models/hpu/Qwen1.5-110B-hpu.yaml new file mode 100644 index 000000000..f8836e4b6 --- /dev/null +++ b/llm_on_ray/inference/models/hpu/Qwen1.5-110B-hpu.yaml @@ -0,0 +1,12 @@ +port: 8000 +name: Qwen1.5-110B +route_prefix: /Qwen1.5-110B +num_replicas: 1 +cpus_per_worker: 8 +hpus_per_worker: 1 +device: hpu +model_description: + model_id_or_path: Qwen/Qwen1.5-110B + tokenizer_name_or_path: Qwen/Qwen1.5-110B + config: + use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/bloom-560m-hpu.yaml b/llm_on_ray/inference/models/hpu/bloom-560m-hpu.yaml deleted file mode 100644 index a72fe4f72..000000000 --- a/llm_on_ray/inference/models/hpu/bloom-560m-hpu.yaml +++ /dev/null @@ -1,15 +0,0 @@ -port: 8000 -name: bloom-560m -route_prefix: /bloom-560m -num_replicas: 1 -cpus_per_worker: 8 -hpus_per_worker: 1 -deepspeed: true -workers_per_group: 8 -device: hpu -model_description: - model_id_or_path: bigscience/bloom-560m - tokenizer_name_or_path: bigscience/bloom-560m - chat_processor: ChatModelGptJ - config: - use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml b/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml index 464540aae..9b2b69861 100644 --- a/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml @@ -8,6 +8,6 @@ device: hpu model_description: model_id_or_path: bigscience/bloom-7b1 tokenizer_name_or_path: bigscience/bloom-7b1 - chat_processor: ChatModelGptJ + chat_template: "llm_on_ray/inference/models/templates/default_codellama.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml b/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml index c900af72a..a017ca465 100644 --- a/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml @@ -10,6 +10,6 @@ device: hpu model_description: model_id_or_path: tiiuae/falcon-40b tokenizer_name_or_path: tiiuae/falcon-40b - chat_processor: ChatModelGptJ + chat_template: 
"llm_on_ray/inference/models/templates/default_codellama.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml b/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml index 17c6cf85c..9e0a8931f 100644 --- a/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml @@ -8,6 +8,6 @@ device: hpu model_description: model_id_or_path: tiiuae/falcon-7b tokenizer_name_or_path: tiiuae/falcon-7b - chat_processor: ChatModelGptJ + chat_template: "llm_on_ray/inference/models/templates/default_codellama.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/gemma-2b-hpu.yaml b/llm_on_ray/inference/models/hpu/gemma-2b-hpu.yaml index 15f13c890..3b7e6d582 100644 --- a/llm_on_ray/inference/models/hpu/gemma-2b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/gemma-2b-hpu.yaml @@ -8,6 +8,6 @@ device: hpu model_description: model_id_or_path: google/gemma-2b tokenizer_name_or_path: google/gemma-2b - chat_processor: ChatModelGemma + chat_template: "llm_on_ray/inference/models/templates/template_gemma.jinja" config: use_auth_token: ' ' diff --git a/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml b/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml index 3177ad69a..47abb8829 100644 --- a/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml @@ -8,7 +8,7 @@ device: hpu model_description: model_id_or_path: EleutherAI/gpt-j-6b tokenizer_name_or_path: EleutherAI/gpt-j-6b - chat_processor: ChatModelGptJ + chat_template: "llm_on_ray/inference/models/templates/default_codellama.jinja" gpt_base_model: true config: - use_auth_token: '' \ No newline at end of file + use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml b/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml index 4d03d004f..34803ad2d 100644 --- a/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml @@ -8,7 +8,7 @@ device: hpu model_description: model_id_or_path: gpt2 tokenizer_name_or_path: gpt2 - chat_processor: ChatModelGptJ + chat_template: "llm_on_ray/inference/models/templates/template_gpt2.jinja" gpt_base_model: true config: use_auth_token: '' \ No newline at end of file diff --git a/llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml b/llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml index ab411ff0e..776a1b76a 100644 --- a/llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml @@ -10,5 +10,6 @@ device: hpu model_description: model_id_or_path: meta-llama/Llama-2-70b-chat-hf tokenizer_name_or_path: meta-llama/Llama-2-70b-chat-hf + chat_template: "llm_on_ray/inference/models/templates/template_llama2.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml b/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml index b7b19f02a..fb2027d12 100644 --- a/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml @@ -8,5 +8,6 @@ device: hpu model_description: model_id_or_path: meta-llama/Llama-2-7b-chat-hf tokenizer_name_or_path: meta-llama/Llama-2-7b-chat-hf + chat_template: "llm_on_ray/inference/models/templates/template_llama2.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml b/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml index 
d8bdb1d7d..f1b6fac9c 100644 --- a/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml @@ -9,6 +9,6 @@ device: hpu model_description: model_id_or_path: meta-llama/Meta-Llama-3-70b-Instruct tokenizer_name_or_path: meta-llama/Meta-Llama-3-70b-Instruct - chat_processor: ChatModelLLama + chat_template: "llm_on_ray/inference/models/templates/default_codellama.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml b/llm_on_ray/inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml index d57ffcc22..cb3d3ada6 100644 --- a/llm_on_ray/inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml @@ -8,5 +8,6 @@ device: hpu model_description: model_id_or_path: meta-llama/Meta-Llama-3-8b-Instruct tokenizer_name_or_path: meta-llama/Meta-Llama-3-8b-Instruct + chat_template: "llm_on_ray/inference/models/templates/default_codellama.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/mistral-7b-v0.1-hpu.yaml b/llm_on_ray/inference/models/hpu/mistral-7b-v0.1-hpu.yaml index ad3bcbaa7..738ad49b2 100644 --- a/llm_on_ray/inference/models/hpu/mistral-7b-v0.1-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/mistral-7b-v0.1-hpu.yaml @@ -8,6 +8,6 @@ device: hpu model_description: model_id_or_path: mistralai/Mistral-7B-v0.1 tokenizer_name_or_path: mistralai/Mistral-7B-v0.1 - chat_processor: ChatModelLLama + chat_template: "llm_on_ray/inference/models/templates/template_mistral.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml b/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml index bcc2225b4..f9021b85f 100644 --- a/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml @@ -8,7 +8,7 @@ device: hpu model_description: model_id_or_path: EleutherAI/gpt-neox-20b tokenizer_name_or_path: EleutherAI/gpt-neox-20b - chat_processor: ChatModelLLama + chat_template: "llm_on_ray/inference/models/templates/default_codellama.jinja" config: use_auth_token: '' trust_remote_code: true From ecf40e68fae314ed2b6b2f84d0f414f583b9c711 Mon Sep 17 00:00:00 2001 From: Deegue Date: Tue, 21 May 2024 05:03:26 +0000 Subject: [PATCH 09/26] nit --- llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml | 2 +- llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml | 2 +- llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml | 2 +- llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml | 2 +- .../inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml | 2 +- .../inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml | 2 +- llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml b/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml index 9b2b69861..b6f4d949e 100644 --- a/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml @@ -8,6 +8,6 @@ device: hpu model_description: model_id_or_path: bigscience/bloom-7b1 tokenizer_name_or_path: bigscience/bloom-7b1 - chat_template: "llm_on_ray/inference/models/templates/default_codellama.jinja" + chat_template: "llm_on_ray/inference/models/templates/default_template.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml b/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml index a017ca465..ecf0cd303 
100644 --- a/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml @@ -10,6 +10,6 @@ device: hpu model_description: model_id_or_path: tiiuae/falcon-40b tokenizer_name_or_path: tiiuae/falcon-40b - chat_template: "llm_on_ray/inference/models/templates/default_codellama.jinja" + chat_template: "llm_on_ray/inference/models/templates/default_template.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml b/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml index 9e0a8931f..86b7efaae 100644 --- a/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml @@ -8,6 +8,6 @@ device: hpu model_description: model_id_or_path: tiiuae/falcon-7b tokenizer_name_or_path: tiiuae/falcon-7b - chat_template: "llm_on_ray/inference/models/templates/default_codellama.jinja" + chat_template: "llm_on_ray/inference/models/templates/default_template.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml b/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml index 47abb8829..6f675f05e 100644 --- a/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml @@ -8,7 +8,7 @@ device: hpu model_description: model_id_or_path: EleutherAI/gpt-j-6b tokenizer_name_or_path: EleutherAI/gpt-j-6b - chat_template: "llm_on_ray/inference/models/templates/default_codellama.jinja" + chat_template: "llm_on_ray/inference/models/templates/default_template.jinja" gpt_base_model: true config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml b/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml index f1b6fac9c..58f96b110 100644 --- a/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml @@ -9,6 +9,6 @@ device: hpu model_description: model_id_or_path: meta-llama/Meta-Llama-3-70b-Instruct tokenizer_name_or_path: meta-llama/Meta-Llama-3-70b-Instruct - chat_template: "llm_on_ray/inference/models/templates/default_codellama.jinja" + chat_template: "llm_on_ray/inference/models/templates/default_template.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml b/llm_on_ray/inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml index cb3d3ada6..b2b31221c 100644 --- a/llm_on_ray/inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml @@ -8,6 +8,6 @@ device: hpu model_description: model_id_or_path: meta-llama/Meta-Llama-3-8b-Instruct tokenizer_name_or_path: meta-llama/Meta-Llama-3-8b-Instruct - chat_template: "llm_on_ray/inference/models/templates/default_codellama.jinja" + chat_template: "llm_on_ray/inference/models/templates/default_template.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml b/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml index f9021b85f..6f6701b97 100644 --- a/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml @@ -8,7 +8,7 @@ device: hpu model_description: model_id_or_path: EleutherAI/gpt-neox-20b tokenizer_name_or_path: EleutherAI/gpt-neox-20b - chat_template: "llm_on_ray/inference/models/templates/default_codellama.jinja" + chat_template: "llm_on_ray/inference/models/templates/default_template.jinja" config: use_auth_token: '' 
trust_remote_code: true From f4e02ff40600e71ccf4de3593d456f84418c6009 Mon Sep 17 00:00:00 2001 From: Deegue Date: Tue, 21 May 2024 07:53:37 +0000 Subject: [PATCH 10/26] fix --- .github/workflows/workflow_inference_gaudi2.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml index 35aaf0a7b..700ab65e4 100644 --- a/.github/workflows/workflow_inference_gaudi2.yml +++ b/.github/workflows/workflow_inference_gaudi2.yml @@ -28,7 +28,7 @@ jobs: name: inference strategy: matrix: - model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, MindChat-Qwen2-4B, mistral-7b-v0.1, Mixtral-8x7B-Instruct-v0.1, mpt-7b ] + model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, Qwen1.5-110B, mistral-7b-v0.1, Mixtral-8x7B-Instruct-v0.1, mpt-7b ] isPR: - ${{inputs.ci_type == 'pr'}} @@ -47,7 +47,7 @@ jobs: - { model: "llama-2-70b-chat-hf"} - { model: "meta-llama-3-8b-instruct"} - { model: "meta-llama-3-70b-instruct"} - - { model: "MindChat-Qwen2-4B"} + - { model: "Qwen1.5-110B"} - { model: "mistral-7b-v0.1"} - { model: "Mixtral-8x7B-Instruct-v0.1"} - { model: "mpt-7b"} From 8218531270cd513207533177c5e67f14d2463780 Mon Sep 17 00:00:00 2001 From: Deegue Date: Tue, 21 May 2024 08:25:09 +0000 Subject: [PATCH 11/26] fix --- .../inference/models/hpu/Mixtral-8x7B-Instruct-v0.1-hpu.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_on_ray/inference/models/hpu/Mixtral-8x7B-Instruct-v0.1-hpu.yaml b/llm_on_ray/inference/models/hpu/Mixtral-8x7B-Instruct-v0.1-hpu.yaml index da331511e..e8424b60e 100644 --- a/llm_on_ray/inference/models/hpu/Mixtral-8x7B-Instruct-v0.1-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/Mixtral-8x7B-Instruct-v0.1-hpu.yaml @@ -7,6 +7,6 @@ hpus_per_worker: 1 device: hpu model_description: model_id_or_path: mistralai/Mixtral-8x7B-Instruct-v0.1 - tokenizer_name_or_path: bmistralai/Mixtral-8x7B-Instruct-v0.1 + tokenizer_name_or_path: mistralai/Mixtral-8x7B-Instruct-v0.1 config: use_auth_token: '' From 0baa303fb386a4942109e85eb67c3b09618a5f23 Mon Sep 17 00:00:00 2001 From: Deegue Date: Wed, 22 May 2024 02:18:30 +0000 Subject: [PATCH 12/26] fix --- .github/workflows/workflow_inference_gaudi2.yml | 4 ++-- ...ixtral-8x7B-Instruct-v0.1-hpu.yaml => Mixtral-7B-hpu.yaml} | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) rename llm_on_ray/inference/models/hpu/{Mixtral-8x7B-Instruct-v0.1-hpu.yaml => Mixtral-7B-hpu.yaml} (76%) diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml index 700ab65e4..b4d8f691f 100644 --- a/.github/workflows/workflow_inference_gaudi2.yml +++ b/.github/workflows/workflow_inference_gaudi2.yml @@ -28,7 +28,7 @@ jobs: name: inference strategy: matrix: - model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, Qwen1.5-110B, mistral-7b-v0.1, Mixtral-8x7B-Instruct-v0.1, mpt-7b ] + model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, Qwen1.5-110B, mistral-7b-v0.1, Mixtral-7B, mpt-7b ] isPR: - 
${{inputs.ci_type == 'pr'}} @@ -49,7 +49,7 @@ jobs: - { model: "meta-llama-3-70b-instruct"} - { model: "Qwen1.5-110B"} - { model: "mistral-7b-v0.1"} - - { model: "Mixtral-8x7B-Instruct-v0.1"} + - { model: "Mixtral-7B"} - { model: "mpt-7b"} runs-on: gaudi2 diff --git a/llm_on_ray/inference/models/hpu/Mixtral-8x7B-Instruct-v0.1-hpu.yaml b/llm_on_ray/inference/models/hpu/Mixtral-7B-hpu.yaml similarity index 76% rename from llm_on_ray/inference/models/hpu/Mixtral-8x7B-Instruct-v0.1-hpu.yaml rename to llm_on_ray/inference/models/hpu/Mixtral-7B-hpu.yaml index e8424b60e..fbd6ccaeb 100644 --- a/llm_on_ray/inference/models/hpu/Mixtral-8x7B-Instruct-v0.1-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/Mixtral-7B-hpu.yaml @@ -1,6 +1,6 @@ port: 8000 -name: Mixtral-8x7B-Instruct-v0.1 -route_prefix: /Mixtral-8x7B-Instruct-v0.1 +name: Mixtral-7B +route_prefix: /Mixtral-7B num_replicas: 1 cpus_per_worker: 8 hpus_per_worker: 1 From 96b36bd3139e73d4dbe0ae8392d667866bafc936 Mon Sep 17 00:00:00 2001 From: Deegue Date: Wed, 22 May 2024 02:40:00 +0000 Subject: [PATCH 13/26] remove default template --- llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml | 1 - llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml | 1 - llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml | 1 - llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml | 1 - .../inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml | 1 - .../inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml | 1 - llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml | 1 - 7 files changed, 7 deletions(-) diff --git a/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml b/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml index b6f4d949e..5de2a3e30 100644 --- a/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml @@ -8,6 +8,5 @@ device: hpu model_description: model_id_or_path: bigscience/bloom-7b1 tokenizer_name_or_path: bigscience/bloom-7b1 - chat_template: "llm_on_ray/inference/models/templates/default_template.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml b/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml index ecf0cd303..dd6bd2acf 100644 --- a/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml @@ -10,6 +10,5 @@ device: hpu model_description: model_id_or_path: tiiuae/falcon-40b tokenizer_name_or_path: tiiuae/falcon-40b - chat_template: "llm_on_ray/inference/models/templates/default_template.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml b/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml index 86b7efaae..e21110c08 100644 --- a/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml @@ -8,6 +8,5 @@ device: hpu model_description: model_id_or_path: tiiuae/falcon-7b tokenizer_name_or_path: tiiuae/falcon-7b - chat_template: "llm_on_ray/inference/models/templates/default_template.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml b/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml index 6f675f05e..8260a6445 100644 --- a/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml @@ -8,7 +8,6 @@ device: hpu model_description: model_id_or_path: EleutherAI/gpt-j-6b tokenizer_name_or_path: EleutherAI/gpt-j-6b - chat_template: "llm_on_ray/inference/models/templates/default_template.jinja" gpt_base_model: true config: 
use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml b/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml index 58f96b110..32cf9bb4e 100644 --- a/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml @@ -9,6 +9,5 @@ device: hpu model_description: model_id_or_path: meta-llama/Meta-Llama-3-70b-Instruct tokenizer_name_or_path: meta-llama/Meta-Llama-3-70b-Instruct - chat_template: "llm_on_ray/inference/models/templates/default_template.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml b/llm_on_ray/inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml index b2b31221c..d57ffcc22 100644 --- a/llm_on_ray/inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml @@ -8,6 +8,5 @@ device: hpu model_description: model_id_or_path: meta-llama/Meta-Llama-3-8b-Instruct tokenizer_name_or_path: meta-llama/Meta-Llama-3-8b-Instruct - chat_template: "llm_on_ray/inference/models/templates/default_template.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml b/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml index 6f6701b97..df4908376 100644 --- a/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml @@ -8,7 +8,6 @@ device: hpu model_description: model_id_or_path: EleutherAI/gpt-neox-20b tokenizer_name_or_path: EleutherAI/gpt-neox-20b - chat_template: "llm_on_ray/inference/models/templates/default_template.jinja" config: use_auth_token: '' trust_remote_code: true From 1de2ebb6c047d06e2afec13bbd711ca26b497ca6 Mon Sep 17 00:00:00 2001 From: Deegue Date: Thu, 23 May 2024 02:31:26 +0000 Subject: [PATCH 14/26] fix when list length is 1 --- llm_on_ray/inference/predictor_deployment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index f5ac35d80..319eb5e09 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -335,7 +335,7 @@ def preprocess_prompts( HTTPException: If the input prompt format is invalid or not supported. """ - if isinstance(input, str): + if isinstance(input, str) or (isinstance(input, List) and len(input) == 1): return input elif isinstance(input, List): prompts = [] From 9b8e57d7fa5ea944619e7cd27e2679233e7a7964 Mon Sep 17 00:00:00 2001 From: Deegue Date: Thu, 23 May 2024 02:50:53 +0000 Subject: [PATCH 15/26] fix --- llm_on_ray/inference/predictor_deployment.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index 319eb5e09..3bb6a4ffc 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -335,8 +335,10 @@ def preprocess_prompts( HTTPException: If the input prompt format is invalid or not supported. 
""" - if isinstance(input, str) or (isinstance(input, List) and len(input) == 1): + if isinstance(input, str): return input + elif isinstance(input, List) and len(input) == 1: + return input[0] elif isinstance(input, List): prompts = [] images = [] From 8940d0de5af44e0d05f8b0958470327f6a471799 Mon Sep 17 00:00:00 2001 From: Deegue Date: Mon, 27 May 2024 01:43:47 +0000 Subject: [PATCH 16/26] fix target --- .github/workflows/workflow_inference_gaudi2.yml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml index cc678f361..fb22630cd 100644 --- a/.github/workflows/workflow_inference_gaudi2.yml +++ b/.github/workflows/workflow_inference_gaudi2.yml @@ -72,13 +72,11 @@ jobs: - name: Determine Target id: "target" run: | - target="${target}_gaudi2" - if [[ ${{ matrix.model }} == "llama-2-7b-chat-hf" ]]; then - target="${target}_gaudi2" - elif [[ ${{ matrix.model }} == "llama-2-70b-chat-hf" ]]; then - target="${target}_gaudi2" - elif [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then + target="inference" + if [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then target="${target}_vllm_gaudi2" + else + target="${target}_gaudi2" fi echo "target is ${target}" echo "target=$target" >> $GITHUB_OUTPUT From 50c4988fc007c83764a6a3594dd4fcd40cf454e1 Mon Sep 17 00:00:00 2001 From: Deegue Date: Mon, 27 May 2024 12:20:20 +0000 Subject: [PATCH 17/26] change cache dir --- .github/workflows/workflow_inference_gaudi2.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml index fb22630cd..47878a205 100644 --- a/.github/workflows/workflow_inference_gaudi2.yml +++ b/.github/workflows/workflow_inference_gaudi2.yml @@ -17,7 +17,7 @@ on: default: '/home/ci/actions-runner/_work/llm-on-ray/llm-on-ray' model_cache_path: type: string - default: '/mnt/DP_disk1/huggingface/cache' + default: '/scratch-2/huggingface/cache' concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-inf-gaudi2 From 762e84c1a099b381e405bf8a3fd4ebb68c7b4252 Mon Sep 17 00:00:00 2001 From: Deegue Date: Wed, 29 May 2024 01:09:19 +0000 Subject: [PATCH 18/26] remove Mixtral --- .github/workflows/workflow_inference_gaudi2.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml index 47878a205..bbdd9ac8a 100644 --- a/.github/workflows/workflow_inference_gaudi2.yml +++ b/.github/workflows/workflow_inference_gaudi2.yml @@ -28,7 +28,7 @@ jobs: name: inference strategy: matrix: - model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, Qwen1.5-110B, mistral-7b-v0.1, Mixtral-7B, mpt-7b, llama-2-7b-chat-hf-vllm ] + model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, Qwen1.5-110B, mistral-7b-v0.1, mpt-7b, llama-2-7b-chat-hf-vllm ] isPR: - ${{inputs.ci_type == 'pr'}} @@ -49,7 +49,6 @@ jobs: - { model: "meta-llama-3-70b-instruct"} - { model: "Qwen1.5-110B"} - { model: "mistral-7b-v0.1"} - - { model: "Mixtral-7B"} - { model: "mpt-7b"} - { model: "llama-2-7b-chat-hf-vllm"} From 012bac2c2950fc781b21ba4d62dc8f751ab6f47e Mon 
From 012bac2c2950fc781b21ba4d62dc8f751ab6f47e Mon Sep 17 00:00:00 2001
From: Deegue
Date: Wed, 29 May 2024 03:49:20 +0000
Subject: [PATCH 19/26] change to 8 cards

---
 llm_on_ray/inference/models/hpu/Qwen1.5-110B-hpu.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llm_on_ray/inference/models/hpu/Qwen1.5-110B-hpu.yaml b/llm_on_ray/inference/models/hpu/Qwen1.5-110B-hpu.yaml
index f8836e4b6..5086c3d2e 100644
--- a/llm_on_ray/inference/models/hpu/Qwen1.5-110B-hpu.yaml
+++ b/llm_on_ray/inference/models/hpu/Qwen1.5-110B-hpu.yaml
@@ -4,6 +4,8 @@ route_prefix: /Qwen1.5-110B
 num_replicas: 1
 cpus_per_worker: 8
 hpus_per_worker: 1
+deepspeed: true
+workers_per_group: 8
 device: hpu
 model_description:
   model_id_or_path: Qwen/Qwen1.5-110B

From 4496e73d1a38ae254fc68eabbf549e028a57bb8b Mon Sep 17 00:00:00 2001
From: Deegue
Date: Thu, 30 May 2024 02:00:59 +0000
Subject: [PATCH 20/26] remove Qwen and fix

---
 .github/workflows/workflow_inference_gaudi2.yml              | 3 +--
 llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml     | 2 +-
 llm_on_ray/inference/models/hpu/gpt2-hpu.yaml                | 2 +-
 llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3-hpu.yaml | 2 +-
 llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml     | 2 +-
 5 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml
index bbdd9ac8a..eea90fb8d 100644
--- a/.github/workflows/workflow_inference_gaudi2.yml
+++ b/.github/workflows/workflow_inference_gaudi2.yml
@@ -28,7 +28,7 @@ jobs:
     name: inference
     strategy:
       matrix:
-        model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, Qwen1.5-110B, mistral-7b-v0.1, mpt-7b, llama-2-7b-chat-hf-vllm ]
+        model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, Qwen1.5-110B, mistral-7b-v0.1, Mixtral-7B, mpt-7b, llama-2-7b-chat-hf-vllm ]
         isPR:
           - ${{inputs.ci_type == 'pr'}}
@@ -47,7 +47,6 @@ jobs:
           - { model: "llama-2-70b-chat-hf"}
           - { model: "meta-llama-3-8b-instruct"}
           - { model: "meta-llama-3-70b-instruct"}
-          - { model: "Qwen1.5-110B"}
           - { model: "mistral-7b-v0.1"}
           - { model: "mpt-7b"}
           - { model: "llama-2-7b-chat-hf-vllm"}
diff --git a/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml b/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml
index 1f9d40c2b..717627563 100644
--- a/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml
+++ b/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml
@@ -10,4 +10,4 @@ model_description:
   tokenizer_name_or_path: codellama/CodeLlama-7b-hf
   chat_template: "llm_on_ray/inference/models/templates/template_codellama.jinja"
   config:
-    use_auth_token: ''
\ No newline at end of file
+    use_auth_token: ''
diff --git a/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml b/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml
index 34803ad2d..b25903cf7 100644
--- a/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml
+++ b/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml
@@ -11,4 +11,4 @@ model_description:
   chat_template: "llm_on_ray/inference/models/templates/template_gpt2.jinja"
   gpt_base_model: true
   config:
-    use_auth_token: ''
\ No newline at end of file
+    use_auth_token: ''
diff --git a/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3-hpu.yaml b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3-hpu.yaml
index bc147f105..1973ae1a2 100644
--- a/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3-hpu.yaml
+++ b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3-hpu.yaml
@@ -16,4 +16,4 @@ model_description:
   tokenizer_name_or_path: Intel/neural-chat-7b-v3-3
   chat_template: "llm_on_ray/inference/models/templates/template_neuralchat.jinja"
   config:
-    use_auth_token: ''
\ No newline at end of file
+    use_auth_token: ''
diff --git a/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml
index bc147f105..1973ae1a2 100644
--- a/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml
+++ b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml
@@ -16,4 +16,4 @@ model_description:
   tokenizer_name_or_path: Intel/neural-chat-7b-v3-3
   chat_template: "llm_on_ray/inference/models/templates/template_neuralchat.jinja"
   config:
-    use_auth_token: ''
\ No newline at end of file
+    use_auth_token: ''

From 33a1478479237f78ef4b194515a4187b4e1c3c72 Mon Sep 17 00:00:00 2001
From: Deegue
Date: Tue, 4 Jun 2024 07:11:18 +0000
Subject: [PATCH 21/26] revert and add Qwen&Mixtral back

---
 .github/workflows/workflow_inference_gaudi2.yml     | 2 ++
 llm_on_ray/inference/models/hpu/Mixtral-7B-hpu.yaml | 2 ++
 llm_on_ray/inference/predictor_deployment.py        | 2 --
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml
index eea90fb8d..63fc34b1e 100644
--- a/.github/workflows/workflow_inference_gaudi2.yml
+++ b/.github/workflows/workflow_inference_gaudi2.yml
@@ -48,6 +48,8 @@ jobs:
           - { model: "meta-llama-3-8b-instruct"}
           - { model: "meta-llama-3-70b-instruct"}
           - { model: "mistral-7b-v0.1"}
+          - { model: "Qwen1.5-110B"}
+          - { model: "Mixtral-7B"}
           - { model: "mpt-7b"}
           - { model: "llama-2-7b-chat-hf-vllm"}
diff --git a/llm_on_ray/inference/models/hpu/Mixtral-7B-hpu.yaml b/llm_on_ray/inference/models/hpu/Mixtral-7B-hpu.yaml
index fbd6ccaeb..2d52e217b 100644
--- a/llm_on_ray/inference/models/hpu/Mixtral-7B-hpu.yaml
+++ b/llm_on_ray/inference/models/hpu/Mixtral-7B-hpu.yaml
@@ -4,6 +4,8 @@ route_prefix: /Mixtral-7B
 num_replicas: 1
 cpus_per_worker: 8
 hpus_per_worker: 1
+deepspeed: true
+workers_per_group: 8
 device: hpu
 model_description:
   model_id_or_path: mistralai/Mixtral-8x7B-Instruct-v0.1
diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py
index 44b11f20b..a1055915d 100644
--- a/llm_on_ray/inference/predictor_deployment.py
+++ b/llm_on_ray/inference/predictor_deployment.py
@@ -337,8 +337,6 @@ def preprocess_prompts(
         if isinstance(input, str):
             return input
-        elif isinstance(input, List) and len(input) == 1:
-            return input[0]
         elif isinstance(input, List):
             prompts = []
             images = []

From 43a75bcaffbadc5d86f110e6d314ea643926b122 Mon Sep 17 00:00:00 2001
From: Deegue
Date: Wed, 5 Jun 2024 07:13:41 +0000
Subject: [PATCH 22/26] nit

---
 .github/workflows/workflow_inference_gaudi2.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml
index 63fc34b1e..eea90fb8d 100644
--- a/.github/workflows/workflow_inference_gaudi2.yml
+++ b/.github/workflows/workflow_inference_gaudi2.yml
@@ -48,8 +48,6 @@ jobs:
           - { model: "meta-llama-3-8b-instruct"}
           - { model: "meta-llama-3-70b-instruct"}
           - { model: "mistral-7b-v0.1"}
-          - { model: "Qwen1.5-110B"}
-          - { model: "Mixtral-7B"}
           - { model: "mpt-7b"}
           - { model: "llama-2-7b-chat-hf-vllm"}
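PATCH 19 and PATCH 21 apply the same serving pattern to the large checkpoints: deepspeed: true plus workers_per_group: 8 with hpus_per_worker: 1, so one Ray worker is pinned per Gaudi card and DeepSpeed inference shards the model across the group. A rough sanity-check sketch (the helper is illustrative and assumes one replica occupies workers_per_group times hpus_per_worker cards when DeepSpeed is enabled):

    import yaml

    def hpus_per_replica(conf_path: str) -> int:
        # Cards one replica will occupy under the deepspeed/workers_per_group pattern.
        with open(conf_path, encoding="utf-8") as reader:
            conf = yaml.load(reader, Loader=yaml.FullLoader)
        workers = conf.get("workers_per_group", 1) if conf.get("deepspeed") else 1
        return workers * conf.get("hpus_per_worker", 1)

    # Qwen1.5-110B-hpu.yaml and Mixtral-7B-hpu.yaml above -> 8 HPUs per replica.
    print(hpus_per_replica("llm_on_ray/inference/models/hpu/Qwen1.5-110B-hpu.yaml"))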
From 2b868ca7c20402aa188becaa74aabbf9c395aaa5 Mon Sep 17 00:00:00 2001
From: Deegue
Date: Wed, 12 Jun 2024 05:49:02 +0000
Subject: [PATCH 23/26] add Qwen1.5-7B-Chat

---
 .github/workflows/workflow_inference_gaudi2.yml    |  3 ++-
 .../inference/models/hpu/Qwen1.5-7B-Chat-hpu.yaml  | 12 ++++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)
 create mode 100644 llm_on_ray/inference/models/hpu/Qwen1.5-7B-Chat-hpu.yaml

diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml
index eea90fb8d..2bb6ee9b3 100644
--- a/.github/workflows/workflow_inference_gaudi2.yml
+++ b/.github/workflows/workflow_inference_gaudi2.yml
@@ -28,7 +28,7 @@ jobs:
     name: inference
     strategy:
       matrix:
-        model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, Qwen1.5-110B, mistral-7b-v0.1, Mixtral-7B, mpt-7b, llama-2-7b-chat-hf-vllm ]
+        model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, Qwen1.5-110B, mistral-7b-v0.1, Mixtral-7B, mpt-7b, Qwen1.5-7B-Chat, llama-2-7b-chat-hf-vllm ]
         isPR:
           - ${{inputs.ci_type == 'pr'}}
@@ -49,6 +49,7 @@ jobs:
           - { model: "meta-llama-3-70b-instruct"}
           - { model: "mistral-7b-v0.1"}
           - { model: "mpt-7b"}
+          - { model: "Qwen1.5-7B-Chat"}
           - { model: "llama-2-7b-chat-hf-vllm"}

     runs-on: gaudi2
diff --git a/llm_on_ray/inference/models/hpu/Qwen1.5-7B-Chat-hpu.yaml b/llm_on_ray/inference/models/hpu/Qwen1.5-7B-Chat-hpu.yaml
new file mode 100644
index 000000000..15680b4fb
--- /dev/null
+++ b/llm_on_ray/inference/models/hpu/Qwen1.5-7B-Chat-hpu.yaml
@@ -0,0 +1,12 @@
+port: 8000
+name: Qwen1.5-7B-Chat
+route_prefix: /Qwen1.5-7B-Chat
+num_replicas: 1
+cpus_per_worker: 8
+hpus_per_worker: 1
+device: hpu
+model_description:
+  model_id_or_path: Qwen/Qwen1.5-7B-Chat
+  tokenizer_name_or_path: Qwen/Qwen1.5-7B-Chat
+  config:
+    use_auth_token: ''

From 755593501be92f01f67774ab7c6a8d14b258a1ae Mon Sep 17 00:00:00 2001
From: Deegue
Date: Wed, 12 Jun 2024 05:56:46 +0000
Subject: [PATCH 24/26] add Qwen2-7B-Instruct

---
 .github/workflows/workflow_inference_gaudi2.yml      |  3 ++-
 .../inference/models/hpu/Qwen2-7B-Instruct-hpu.yaml  | 12 ++++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)
 create mode 100644 llm_on_ray/inference/models/hpu/Qwen2-7B-Instruct-hpu.yaml

diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml
index 2bb6ee9b3..0847eeefb 100644
--- a/.github/workflows/workflow_inference_gaudi2.yml
+++ b/.github/workflows/workflow_inference_gaudi2.yml
@@ -28,7 +28,7 @@ jobs:
     name: inference
     strategy:
       matrix:
-        model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, Qwen1.5-110B, mistral-7b-v0.1, Mixtral-7B, mpt-7b, Qwen1.5-7B-Chat, llama-2-7b-chat-hf-vllm ]
+        model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, Qwen1.5-110B, mistral-7b-v0.1, Mixtral-7B, mpt-7b, Qwen1.5-7B-Chat, Qwen2-7B-Instruct, llama-2-7b-chat-hf-vllm ]
         isPR:
           - ${{inputs.ci_type == 'pr'}}
@@ -50,6 +50,7 @@ jobs:
           - { model: "meta-llama-3-70b-instruct"}
           - { model: "mistral-7b-v0.1"}
           - { model: "mpt-7b"}
           - { model: "Qwen1.5-7B-Chat"}
+          - { model: "Qwen2-7B-Instruct"}
           - { model: "llama-2-7b-chat-hf-vllm"}

     runs-on: gaudi2
diff --git a/llm_on_ray/inference/models/hpu/Qwen2-7B-Instruct-hpu.yaml b/llm_on_ray/inference/models/hpu/Qwen2-7B-Instruct-hpu.yaml
new file mode 100644
index 000000000..09f705f12
--- /dev/null
+++ b/llm_on_ray/inference/models/hpu/Qwen2-7B-Instruct-hpu.yaml
@@ -0,0 +1,12 @@
+port: 8000
+name: Qwen2-7B-Instruct
+route_prefix: /Qwen2-7B-Instruct
+num_replicas: 1
+cpus_per_worker: 8
+hpus_per_worker: 1
+device: hpu
+model_description:
+  model_id_or_path: Qwen/Qwen2-7B-Instruct
+  tokenizer_name_or_path: Qwen/Qwen2-7B-Instruct
+  config:
+    use_auth_token: ''

From 53187b598ba931ae77f6d87caf7fd5c2509d835f Mon Sep 17 00:00:00 2001
From: Deegue
Date: Thu, 13 Jun 2024 06:02:07 +0000
Subject: [PATCH 25/26] remove several models

---
 .github/workflows/workflow_inference_gaudi2.yml    |  3 +--
 .../inference/models/hpu/Mixtral-7B-hpu.yaml       | 14 --------------
 .../inference/models/hpu/Qwen1.5-110B-hpu.yaml     | 14 --------------
 .../inference/models/hpu/Qwen1.5-7B-Chat-hpu.yaml  | 12 ------------
 4 files changed, 1 insertion(+), 42 deletions(-)
 delete mode 100644 llm_on_ray/inference/models/hpu/Mixtral-7B-hpu.yaml
 delete mode 100644 llm_on_ray/inference/models/hpu/Qwen1.5-110B-hpu.yaml
 delete mode 100644 llm_on_ray/inference/models/hpu/Qwen1.5-7B-Chat-hpu.yaml

diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml
index 0847eeefb..055d59026 100644
--- a/.github/workflows/workflow_inference_gaudi2.yml
+++ b/.github/workflows/workflow_inference_gaudi2.yml
@@ -28,7 +28,7 @@ jobs:
     name: inference
     strategy:
       matrix:
-        model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, Qwen1.5-110B, mistral-7b-v0.1, Mixtral-7B, mpt-7b, Qwen1.5-7B-Chat, Qwen2-7B-Instruct, llama-2-7b-chat-hf-vllm ]
+        model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, mistral-7b-v0.1, mpt-7b, Qwen2-7B-Instruct, llama-2-7b-chat-hf-vllm ]
         isPR:
           - ${{inputs.ci_type == 'pr'}}
@@ -49,7 +49,6 @@ jobs:
           - { model: "meta-llama-3-70b-instruct"}
           - { model: "mistral-7b-v0.1"}
           - { model: "mpt-7b"}
-          - { model: "Qwen1.5-7B-Chat"}
           - { model: "Qwen2-7B-Instruct"}
           - { model: "llama-2-7b-chat-hf-vllm"}
diff --git a/llm_on_ray/inference/models/hpu/Mixtral-7B-hpu.yaml b/llm_on_ray/inference/models/hpu/Mixtral-7B-hpu.yaml
deleted file mode 100644
index 2d52e217b..000000000
--- a/llm_on_ray/inference/models/hpu/Mixtral-7B-hpu.yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-port: 8000
-name: Mixtral-7B
-route_prefix: /Mixtral-7B
-num_replicas: 1
-cpus_per_worker: 8
-hpus_per_worker: 1
-deepspeed: true
-workers_per_group: 8
-device: hpu
-model_description:
-  model_id_or_path: mistralai/Mixtral-8x7B-Instruct-v0.1
-  tokenizer_name_or_path: mistralai/Mixtral-8x7B-Instruct-v0.1
-  config:
-    use_auth_token: ''
diff --git a/llm_on_ray/inference/models/hpu/Qwen1.5-110B-hpu.yaml b/llm_on_ray/inference/models/hpu/Qwen1.5-110B-hpu.yaml
deleted file mode 100644
index 5086c3d2e..000000000
--- a/llm_on_ray/inference/models/hpu/Qwen1.5-110B-hpu.yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-port: 8000
-name: Qwen1.5-110B
-route_prefix: /Qwen1.5-110B
-num_replicas: 1
-cpus_per_worker: 8
-hpus_per_worker: 1
-deepspeed: true
-workers_per_group: 8
-device: hpu
-model_description:
-  model_id_or_path: Qwen/Qwen1.5-110B
-  tokenizer_name_or_path: Qwen/Qwen1.5-110B
-  config:
-    use_auth_token: ''
diff --git a/llm_on_ray/inference/models/hpu/Qwen1.5-7B-Chat-hpu.yaml b/llm_on_ray/inference/models/hpu/Qwen1.5-7B-Chat-hpu.yaml
deleted file mode 100644
index 15680b4fb..000000000
--- a/llm_on_ray/inference/models/hpu/Qwen1.5-7B-Chat-hpu.yaml
+++ /dev/null
@@ -1,12 +0,0 @@
-port: 8000
-name: Qwen1.5-7B-Chat
-route_prefix: /Qwen1.5-7B-Chat
-num_replicas: 1
-cpus_per_worker: 8
-hpus_per_worker: 1
-device: hpu
-model_description:
-  model_id_or_path: Qwen/Qwen1.5-7B-Chat
-  tokenizer_name_or_path: Qwen/Qwen1.5-7B-Chat
-  config:
-    use_auth_token: ''

From 6d16dd4fe91390f6c63c4b8d2160403cc0a0b19f Mon Sep 17 00:00:00 2001
From: Deegue
Date: Tue, 18 Jun 2024 05:19:28 +0000
Subject: [PATCH 26/26] add falcon qwen linear all reduce to hpu_predictor

---
 llm_on_ray/inference/predictors/hpu_predictor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llm_on_ray/inference/predictors/hpu_predictor.py b/llm_on_ray/inference/predictors/hpu_predictor.py
index 4710e0bf9..5292a2ca8 100644
--- a/llm_on_ray/inference/predictors/hpu_predictor.py
+++ b/llm_on_ray/inference/predictors/hpu_predictor.py
@@ -337,7 +337,7 @@ def load_model(self):
             engine = deepspeed.init_inference(model, **ds_inference_kwargs)
             self.model = engine.module
-            if self.model.config.model_type == "llama":
+            if self.model.config.model_type in ["llama", "falcon", "qwen2"]:

                 def patch_scoped_linear_all_reduce(model):
                     from deepspeed.module_inject.layers import LinearAllreduce
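The final hunk is cut off after the LinearAllreduce import, so the body of patch_scoped_linear_all_reduce is not shown above. As a hedged sketch of the usual pattern in Gaudi DeepSpeed inference code (the ScopedLinearAllReduce import path follows optimum-habana and is an assumption, not taken from this patch), the helper walks the module tree and swaps each LinearAllreduce for a scoped variant; PATCH 26 only extends the set of model types that take this path from llama to falcon and qwen2 as well:

    # Sketch only: modelled on the optimum-habana helper, not copied from this PR.
    def patch_scoped_linear_all_reduce(model):
        from deepspeed.module_inject.layers import LinearAllreduce
        from optimum.habana.transformers.models.modeling_all_models import (
            ScopedLinearAllReduce,  # assumed import location
        )

        for name, module in model.named_children():
            if type(module) is LinearAllreduce:
                # Replace the plain all-reduce linear with a scoped variant so the
                # collective can be deferred and issued once per decoder block.
                setattr(model, name, ScopedLinearAllReduce(mod=module))
            patch_scoped_linear_all_reduce(module)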