From ff52ed5699510323cc35052d5b6027f6d3507549 Mon Sep 17 00:00:00 2001 From: Deegue Date: Thu, 16 May 2024 15:00:37 +0000 Subject: [PATCH 01/26] add validated models for Gaudi --- .../workflows/workflow_inference_gaudi2.yml | 47 +++++++++---------- .../models/hpu/CodeLlama-7b-hf-hpu.yaml | 16 +++++++ .../models/hpu/MindChat-Qwen2-4B-hpu.yaml | 16 +++++++ .../inference/models/hpu/bloom-560m-hpu.yaml | 20 ++++++++ .../inference/models/hpu/bloom-7b1-hpu.yaml | 18 +++++++ .../inference/models/hpu/falcon-40b-hpu.yaml | 20 ++++++++ .../inference/models/hpu/falcon-7b-hpu.yaml | 18 +++++++ .../inference/models/hpu/gemma-2b-hpu.yaml | 20 ++++++++ .../inference/models/hpu/gpt-j-6b-hpu.yaml | 24 ++++++++++ llm_on_ray/inference/models/hpu/gpt2-hpu.yaml | 17 +++++++ .../hpu/meta-llama-3-70b-instruct-hpu.yaml | 21 +++++++++ .../hpu/meta-llama-3-8b-instruct-hpu.yaml | 12 +++++ .../models/hpu/mistral-7b-v0.1-hpu.yaml | 20 ++++++++ .../inference/models/hpu/mpt-7b-hpu.yaml | 25 ++++++++++ .../models/hpu/neural-chat-7b-v3-3-hpu.yaml | 17 +++++++ 15 files changed, 285 insertions(+), 26 deletions(-) create mode 100644 llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml create mode 100644 llm_on_ray/inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml create mode 100644 llm_on_ray/inference/models/hpu/bloom-560m-hpu.yaml create mode 100644 llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml create mode 100644 llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml create mode 100644 llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml create mode 100644 llm_on_ray/inference/models/hpu/gemma-2b-hpu.yaml create mode 100644 llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml create mode 100644 llm_on_ray/inference/models/hpu/gpt2-hpu.yaml create mode 100644 llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml create mode 100644 llm_on_ray/inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml create mode 100644 llm_on_ray/inference/models/hpu/mistral-7b-v0.1-hpu.yaml create mode 100644 llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml create mode 100644 llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3-hpu.yaml diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml index 794b83181..139d3a21c 100644 --- a/.github/workflows/workflow_inference_gaudi2.yml +++ b/.github/workflows/workflow_inference_gaudi2.yml @@ -28,7 +28,7 @@ jobs: name: inference strategy: matrix: - model: [ llama-2-7b-chat-hf, llama-2-70b-chat-hf ] + model: [ bloom-7b1, bloom-560m, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, MindChat-Qwen2-4B, mistral-7b-v0.1, mpt-7b ] isPR: - ${{inputs.ci_type == 'pr'}} @@ -36,8 +36,20 @@ jobs: - { isPR: true } include: + - { model: "bloom-7b1"} + - { model: "bloom-560m"} + - { model: "CodeLlama-7b-hf"} + - { model: "falcon-7b"} + - { model: "falcon-40b"} + - { model: "gemma-2b"} + - { model: "gpt-j-6b"} - { model: "llama-2-7b-chat-hf"} - { model: "llama-2-70b-chat-hf"} + - { model: "meta-llama-3-8b-instruct"} + - { model: "meta-llama-3-70b-instruct"} + - { model: "MindChat-Qwen2-4B"} + - { model: "mistral-7b-v0.1"} + - { model: "mpt-7b"} runs-on: gaudi2 @@ -59,11 +71,7 @@ jobs: id: "target" run: | target="inference" - if [[ ${{ matrix.model }} == "llama-2-7b-chat-hf" ]]; then - target="${target}_gaudi2" - elif [[ ${{ matrix.model }} == "llama-2-70b-chat-hf" ]]; then - target="${target}_gaudi2" - fi + target="${target}_gaudi2" echo "target is 
${target}" echo "target=$target" >> $GITHUB_OUTPUT @@ -101,28 +109,15 @@ jobs: TARGET=${{steps.target.outputs.target}} CMD=$(cat << EOF import yaml - if ("${{ matrix.model }}" == "llama-2-7b-chat-hf"): - conf_path = "llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml" - with open(conf_path, encoding="utf-8") as reader: - result = yaml.load(reader, Loader=yaml.FullLoader) - result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}" - with open(conf_path, 'w') as output: - yaml.dump(result, output, sort_keys=False) - elif ("${{ matrix.model }}" == "llama-2-70b-chat-hf"): - conf_path = "llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml" - with open(conf_path, encoding="utf-8") as reader: - result = yaml.load(reader, Loader=yaml.FullLoader) - result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}" - with open(conf_path, 'w') as output: - yaml.dump(result, output, sort_keys=False) - EOF + conf_path = "llm_on_ray/inference/models/hpu/" + "${{ matrix.model }}" + "-hpu.yaml" + with open(conf_path, encoding="utf-8") as reader: + result = yaml.load(reader, Loader=yaml.FullLoader) + result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}" + with open(conf_path, 'w') as output: + yaml.dump(result, output, sort_keys=False) ) docker exec "${TARGET}" python -c "$CMD" - if [[ ${{ matrix.model }} == "llama-2-7b-chat-hf" ]]; then - docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml --keep_serve_terminal" - elif [[ ${{ matrix.model }} == "llama-2-70b-chat-hf" ]]; then - docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml --keep_serve_terminal" - fi + docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/${{ matrix.model }}-hpu.yaml --keep_serve_terminal" echo Streaming query: docker exec "${TARGET}" bash -c "python examples/inference/api_server_openai/query_http_requests.py --model_name ${{ matrix.model }} --streaming_response" diff --git a/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml b/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml new file mode 100644 index 000000000..fa794db98 --- /dev/null +++ b/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml @@ -0,0 +1,16 @@ +port: 8000 +name: CodeLlama-7b-hf +route_prefix: /CodeLlama-7b-hf +num_replicas: 1 +cpus_per_worker: 8 +hpus_per_worker: 1 +device: hpu +model_description: + model_id_or_path: codellama/CodeLlama-7b-hf + tokenizer_name_or_path: codellama/CodeLlama-7b-hf + chat_processor: ChatModelGptJ + prompt: + intro: '' + human_id: '' + bot_id: '' + stop_words: [] diff --git a/llm_on_ray/inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml b/llm_on_ray/inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml new file mode 100644 index 000000000..1e5779dd8 --- /dev/null +++ b/llm_on_ray/inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml @@ -0,0 +1,16 @@ +port: 8000 +name: MindChat-Qwen2-4B +route_prefix: /MindChat-Qwen2-4B +num_replicas: 1 +cpus_per_worker: 8 +hpus_per_worker: 1 +device: hpu +model_description: + model_id_or_path: X-D-Lab/MindChat-Qwen2-4B + tokenizer_name_or_path: X-D-Lab/MindChat-Qwen2-4B + chat_processor: ChatModelGptJ + prompt: + intro: '' + human_id: '' + bot_id: '' + stop_words: [] diff --git a/llm_on_ray/inference/models/hpu/bloom-560m-hpu.yaml b/llm_on_ray/inference/models/hpu/bloom-560m-hpu.yaml new file mode 100644 index 
000000000..9fcd0f193 --- /dev/null +++ b/llm_on_ray/inference/models/hpu/bloom-560m-hpu.yaml @@ -0,0 +1,20 @@ +port: 8000 +name: bloom-560m +route_prefix: /bloom-560m +num_replicas: 1 +cpus_per_worker: 8 +hpus_per_worker: 1 +deepspeed: true +workers_per_group: 8 +device: hpu +model_description: + model_id_or_path: bigscience/bloom-560m + tokenizer_name_or_path: bigscience/bloom-560m + chat_processor: ChatModelGptJ + prompt: + intro: '' + human_id: '' + bot_id: '' + stop_words: [] + config: + use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml b/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml new file mode 100644 index 000000000..fc7f4217b --- /dev/null +++ b/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml @@ -0,0 +1,18 @@ +port: 8000 +name: bloom-7b1 +route_prefix: /bloom-7b1 +num_replicas: 1 +cpus_per_worker: 8 +hpus_per_worker: 1 +device: hpu +model_description: + model_id_or_path: bigscience/bloom-7b1 + tokenizer_name_or_path: bigscience/bloom-7b1 + chat_processor: ChatModelGptJ + prompt: + intro: '' + human_id: '' + bot_id: '' + stop_words: [] + config: + use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml b/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml new file mode 100644 index 000000000..50f9e3ec3 --- /dev/null +++ b/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml @@ -0,0 +1,20 @@ +port: 8000 +name: falcon-40b +route_prefix: /falcon-40b +num_replicas: 1 +cpus_per_worker: 8 +hpus_per_worker: 1 +deepspeed: true +workers_per_group: 8 +device: hpu +model_description: + model_id_or_path: tiiuae/falcon-40b + tokenizer_name_or_path: tiiuae/falcon-40b + chat_processor: ChatModelGptJ + prompt: + intro: '' + human_id: '' + bot_id: '' + stop_words: [] + config: + use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml b/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml new file mode 100644 index 000000000..e450a066a --- /dev/null +++ b/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml @@ -0,0 +1,18 @@ +port: 8000 +name: falcon-7b +route_prefix: /falcon-7b +num_replicas: 1 +cpus_per_worker: 8 +hpus_per_worker: 1 +device: hpu +model_description: + model_id_or_path: tiiuae/falcon-7b + tokenizer_name_or_path: tiiuae/falcon-7b + chat_processor: ChatModelGptJ + prompt: + intro: '' + human_id: '' + bot_id: '' + stop_words: [] + config: + use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/gemma-2b-hpu.yaml b/llm_on_ray/inference/models/hpu/gemma-2b-hpu.yaml new file mode 100644 index 000000000..5e84eb2e4 --- /dev/null +++ b/llm_on_ray/inference/models/hpu/gemma-2b-hpu.yaml @@ -0,0 +1,20 @@ +port: 8000 +name: gemma-2b +route_prefix: /gemma-2b +num_replicas: 1 +cpus_per_worker: 8 +hpus_per_worker: 1 +device: hpu +model_description: + model_id_or_path: google/gemma-2b + tokenizer_name_or_path: google/gemma-2b + chat_processor: ChatModelGemma + prompt: + intro: '' + human_id: 'user + {msg}' + bot_id: 'model + {msg}' + stop_words: [] + config: + use_auth_token: ' ' diff --git a/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml b/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml new file mode 100644 index 000000000..203763fa9 --- /dev/null +++ b/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml @@ -0,0 +1,24 @@ +port: 8000 +name: gpt-j-6b +route_prefix: /gpt-j-6b +num_replicas: 1 +cpus_per_worker: 8 +hpus_per_worker: 1 +device: hpu +model_description: + model_id_or_path: EleutherAI/gpt-j-6b + tokenizer_name_or_path: EleutherAI/gpt-j-6b + chat_processor: ChatModelGptJ + gpt_base_model: 
true + prompt: + intro: 'Below is an instruction that describes a task. Write a response that appropriately + completes the request. + + ' + human_id: ' + + ### Instruction' + bot_id: ' + + ### Response' + stop_words: [] diff --git a/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml b/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml new file mode 100644 index 000000000..c22b97288 --- /dev/null +++ b/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml @@ -0,0 +1,17 @@ +port: 8000 +name: gpt2 +route_prefix: /gpt2 +num_replicas: 1 +cpus_per_worker: 8 +hpus_per_worker: 1 +device: hpu +model_description: + model_id_or_path: gpt2 + tokenizer_name_or_path: gpt2 + chat_processor: ChatModelGptJ + gpt_base_model: true + prompt: + intro: '' + human_id: '' + bot_id: '' + stop_words: [] diff --git a/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml b/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml new file mode 100644 index 000000000..6976eafb3 --- /dev/null +++ b/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml @@ -0,0 +1,21 @@ +port: 8000 +name: meta-llama-3-70b-instruct +route_prefix: /meta-llama-3-70b-instruct +cpus_per_worker: 8 +hpus_per_worker: 1 +deepspeed: true +workers_per_group: 8 +device: hpu +model_description: + model_id_or_path: meta-llama/Meta-Llama-3-70b-Instruct + tokenizer_name_or_path: meta-llama/Meta-Llama-3-70b-Instruct + chat_processor: ChatModelLLama + prompt: + intro: '' + human_id: '[INST] {msg} [/INST] + + ' + bot_id: '' + stop_words: [] + config: + use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml b/llm_on_ray/inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml new file mode 100644 index 000000000..d57ffcc22 --- /dev/null +++ b/llm_on_ray/inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml @@ -0,0 +1,12 @@ +port: 8000 +name: meta-llama-3-8b-instruct +route_prefix: /meta-llama-3-8b-instruct +num_replicas: 1 +cpus_per_worker: 8 +hpus_per_worker: 1 +device: hpu +model_description: + model_id_or_path: meta-llama/Meta-Llama-3-8b-Instruct + tokenizer_name_or_path: meta-llama/Meta-Llama-3-8b-Instruct + config: + use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/mistral-7b-v0.1-hpu.yaml b/llm_on_ray/inference/models/hpu/mistral-7b-v0.1-hpu.yaml new file mode 100644 index 000000000..e91f08e8a --- /dev/null +++ b/llm_on_ray/inference/models/hpu/mistral-7b-v0.1-hpu.yaml @@ -0,0 +1,20 @@ +port: 8000 +name: mistral-7b-v0.1 +route_prefix: /mistral-7b-v0.1 +num_replicas: 1 +cpus_per_worker: 8 +hpus_per_worker: 1 +device: hpu +model_description: + model_id_or_path: mistralai/Mistral-7B-v0.1 + tokenizer_name_or_path: mistralai/Mistral-7B-v0.1 + chat_processor: ChatModelLLama + prompt: + intro: '' + human_id: '[INST] {msg} [/INST] + + ' + bot_id: '' + stop_words: [] + config: + use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml b/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml new file mode 100644 index 000000000..5d6b36dc1 --- /dev/null +++ b/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml @@ -0,0 +1,25 @@ +port: 8000 +name: mpt-7b +route_prefix: /mpt-7b +num_replicas: 1 +cpus_per_worker: 8 +hpus_per_worker: 1 +device: hpu +model_description: + model_id_or_path: EleutherAI/gpt-neox-20b + tokenizer_name_or_path: EleutherAI/gpt-neox-20b + chat_processor: ChatModelLLama + prompt: + intro: 'Below is an instruction that describes a task, paired with an input that + provides further context. Write a response that appropriately completes the request. 
+ + ' + human_id: ' + + ### Instruction' + bot_id: ' + + ### Response' + stop_words: [] + config: + trust_remote_code: true diff --git a/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3-hpu.yaml b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3-hpu.yaml new file mode 100644 index 000000000..64566a6d8 --- /dev/null +++ b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3-hpu.yaml @@ -0,0 +1,17 @@ +port: 8000 +name: neural-chat-7b-v3-3 +route_prefix: /neural-chat-7b-v3-3 +num_replicas: 1 +cpus_per_worker: 0 +gpus_per_worker: 0 +hpus_per_worker: 1 +deepspeed: false +workers_per_group: 2 +device: hpu +ipex: + enabled: false + precision: bf16 +model_description: + model_id_or_path: Intel/neural-chat-7b-v3-3 + tokenizer_name_or_path: Intel/neural-chat-7b-v3-3 + chat_template: "llm_on_ray/inference/models/templates/template_neuralchat.jinja" From cd543a5f92db697f89a2e5ae14ba1a940ad5eb19 Mon Sep 17 00:00:00 2001 From: Deegue Date: Thu, 16 May 2024 15:50:33 +0000 Subject: [PATCH 02/26] nit --- .github/workflows/workflow_inference_gaudi2.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml index 139d3a21c..1bb5d6278 100644 --- a/.github/workflows/workflow_inference_gaudi2.yml +++ b/.github/workflows/workflow_inference_gaudi2.yml @@ -109,6 +109,7 @@ jobs: TARGET=${{steps.target.outputs.target}} CMD=$(cat << EOF import yaml + model = ${{ matrix.model }} conf_path = "llm_on_ray/inference/models/hpu/" + "${{ matrix.model }}" + "-hpu.yaml" with open(conf_path, encoding="utf-8") as reader: result = yaml.load(reader, Loader=yaml.FullLoader) From 55595f7169600e2ddf76555df4ebda53f6d01627 Mon Sep 17 00:00:00 2001 From: Deegue Date: Fri, 17 May 2024 01:46:47 +0000 Subject: [PATCH 03/26] fix --- .github/workflows/workflow_inference_gaudi2.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml index 1bb5d6278..3e40ab11e 100644 --- a/.github/workflows/workflow_inference_gaudi2.yml +++ b/.github/workflows/workflow_inference_gaudi2.yml @@ -116,6 +116,7 @@ jobs: result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}" with open(conf_path, 'w') as output: yaml.dump(result, output, sort_keys=False) + EOF ) docker exec "${TARGET}" python -c "$CMD" docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/${{ matrix.model }}-hpu.yaml --keep_serve_terminal" From 0a72f39ed5bb2874c9ff30a8a754aa06aacafc62 Mon Sep 17 00:00:00 2001 From: Deegue Date: Fri, 17 May 2024 01:51:19 +0000 Subject: [PATCH 04/26] remove --- .github/workflows/workflow_inference_gaudi2.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml index 3e40ab11e..0f94960f0 100644 --- a/.github/workflows/workflow_inference_gaudi2.yml +++ b/.github/workflows/workflow_inference_gaudi2.yml @@ -109,7 +109,6 @@ jobs: TARGET=${{steps.target.outputs.target}} CMD=$(cat << EOF import yaml - model = ${{ matrix.model }} conf_path = "llm_on_ray/inference/models/hpu/" + "${{ matrix.model }}" + "-hpu.yaml" with open(conf_path, encoding="utf-8") as reader: result = yaml.load(reader, Loader=yaml.FullLoader) From 0b17988cf743127445266677a0415d393e4f5b1e Mon Sep 17 00:00:00 2001 From: Deegue Date: Fri, 17 May 2024 02:19:56 +0000 Subject: [PATCH 05/26] add config --- 
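Note: this patch adds an empty "use_auth_token" entry under "config" in each HPU model YAML so that the Gaudi CI job introduced in PATCH 01 has a field to overwrite with the Hugging Face token before serving. A minimal sketch of that consumer, assuming PyYAML is installed; the path points at one of the YAMLs created in this series, and the token string is a placeholder rather than a real secret:

    import yaml

    # Load one model config, set the auth token, and write it back
    # (mirrors the CI heredoc step from PATCH 01).
    conf_path = "llm_on_ray/inference/models/hpu/gpt2-hpu.yaml"  # example config from this series
    with open(conf_path, encoding="utf-8") as reader:
        conf = yaml.load(reader, Loader=yaml.FullLoader)
    conf["model_description"]["config"]["use_auth_token"] = "<HF_ACCESS_TOKEN>"  # placeholder value
    with open(conf_path, "w") as output:
        yaml.dump(conf, output, sort_keys=False)
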
llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml | 2 ++ llm_on_ray/inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml | 2 ++ llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml | 2 ++ llm_on_ray/inference/models/hpu/gpt2-hpu.yaml | 2 ++ llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml | 1 + llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3-hpu.yaml | 2 ++ 6 files changed, 11 insertions(+) diff --git a/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml b/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml index fa794db98..6d5780a9d 100644 --- a/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml @@ -14,3 +14,5 @@ model_description: human_id: '' bot_id: '' stop_words: [] + config: + use_auth_token: '' \ No newline at end of file diff --git a/llm_on_ray/inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml b/llm_on_ray/inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml index 1e5779dd8..e84f7126f 100644 --- a/llm_on_ray/inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml @@ -14,3 +14,5 @@ model_description: human_id: '' bot_id: '' stop_words: [] + config: + use_auth_token: '' \ No newline at end of file diff --git a/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml b/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml index 203763fa9..cfde9a781 100644 --- a/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml @@ -22,3 +22,5 @@ model_description: ### Response' stop_words: [] + config: + use_auth_token: '' \ No newline at end of file diff --git a/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml b/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml index c22b97288..3c1cf1913 100644 --- a/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml @@ -15,3 +15,5 @@ model_description: human_id: '' bot_id: '' stop_words: [] + config: + use_auth_token: '' \ No newline at end of file diff --git a/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml b/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml index 5d6b36dc1..e45a64ea6 100644 --- a/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml @@ -22,4 +22,5 @@ model_description: ### Response' stop_words: [] config: + use_auth_token: '' trust_remote_code: true diff --git a/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3-hpu.yaml b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3-hpu.yaml index 64566a6d8..bc147f105 100644 --- a/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3-hpu.yaml @@ -15,3 +15,5 @@ model_description: model_id_or_path: Intel/neural-chat-7b-v3-3 tokenizer_name_or_path: Intel/neural-chat-7b-v3-3 chat_template: "llm_on_ray/inference/models/templates/template_neuralchat.jinja" + config: + use_auth_token: '' \ No newline at end of file From ef337637603a7b099097797d91cecc543ce8377b Mon Sep 17 00:00:00 2001 From: Deegue Date: Fri, 17 May 2024 02:20:09 +0000 Subject: [PATCH 06/26] nit --- llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml index 64566a6d8..bc147f105 100644 --- a/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml +++ b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml @@ -15,3 +15,5 @@ model_description: 
model_id_or_path: Intel/neural-chat-7b-v3-3 tokenizer_name_or_path: Intel/neural-chat-7b-v3-3 chat_template: "llm_on_ray/inference/models/templates/template_neuralchat.jinja" + config: + use_auth_token: '' \ No newline at end of file From c1b2a2d6512e0a2586b84c350ca9065f5f7088ba Mon Sep 17 00:00:00 2001 From: Deegue Date: Fri, 17 May 2024 03:45:59 +0000 Subject: [PATCH 07/26] remove prompt and add gpt2 --- .github/workflows/workflow_inference_gaudi2.yml | 3 ++- .../inference/models/hpu/CodeLlama-7b-hf-hpu.yaml | 5 ----- .../inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml | 5 ----- llm_on_ray/inference/models/hpu/bloom-560m-hpu.yaml | 5 ----- llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml | 5 ----- llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml | 5 ----- llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml | 5 ----- llm_on_ray/inference/models/hpu/gemma-2b-hpu.yaml | 7 ------- llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml | 12 ------------ llm_on_ray/inference/models/hpu/gpt2-hpu.yaml | 5 ----- .../models/hpu/llama-3-70b-chat-hf-hpu.yaml | 13 ------------- .../models/hpu/llama-3-8b-instruct-hpu.yaml | 12 ------------ .../models/hpu/meta-llama-3-70b-instruct-hpu.yaml | 7 ------- .../inference/models/hpu/mistral-7b-v0.1-hpu.yaml | 7 ------- llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml | 12 ------------ 15 files changed, 2 insertions(+), 106 deletions(-) delete mode 100644 llm_on_ray/inference/models/hpu/llama-3-70b-chat-hf-hpu.yaml delete mode 100644 llm_on_ray/inference/models/hpu/llama-3-8b-instruct-hpu.yaml diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml index 0f94960f0..3e2c2fa51 100644 --- a/.github/workflows/workflow_inference_gaudi2.yml +++ b/.github/workflows/workflow_inference_gaudi2.yml @@ -28,7 +28,7 @@ jobs: name: inference strategy: matrix: - model: [ bloom-7b1, bloom-560m, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, MindChat-Qwen2-4B, mistral-7b-v0.1, mpt-7b ] + model: [ bloom-7b1, bloom-560m, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, MindChat-Qwen2-4B, mistral-7b-v0.1, mpt-7b ] isPR: - ${{inputs.ci_type == 'pr'}} @@ -43,6 +43,7 @@ jobs: - { model: "falcon-40b"} - { model: "gemma-2b"} - { model: "gpt-j-6b"} + - { model: "gpt2"} - { model: "llama-2-7b-chat-hf"} - { model: "llama-2-70b-chat-hf"} - { model: "meta-llama-3-8b-instruct"} diff --git a/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml b/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml index 6d5780a9d..3f93b2c92 100644 --- a/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml @@ -9,10 +9,5 @@ model_description: model_id_or_path: codellama/CodeLlama-7b-hf tokenizer_name_or_path: codellama/CodeLlama-7b-hf chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] config: use_auth_token: '' \ No newline at end of file diff --git a/llm_on_ray/inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml b/llm_on_ray/inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml index e84f7126f..2f26846cf 100644 --- a/llm_on_ray/inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml @@ -9,10 +9,5 @@ model_description: model_id_or_path: X-D-Lab/MindChat-Qwen2-4B 
tokenizer_name_or_path: X-D-Lab/MindChat-Qwen2-4B chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] config: use_auth_token: '' \ No newline at end of file diff --git a/llm_on_ray/inference/models/hpu/bloom-560m-hpu.yaml b/llm_on_ray/inference/models/hpu/bloom-560m-hpu.yaml index 9fcd0f193..a72fe4f72 100644 --- a/llm_on_ray/inference/models/hpu/bloom-560m-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/bloom-560m-hpu.yaml @@ -11,10 +11,5 @@ model_description: model_id_or_path: bigscience/bloom-560m tokenizer_name_or_path: bigscience/bloom-560m chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml b/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml index fc7f4217b..464540aae 100644 --- a/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml @@ -9,10 +9,5 @@ model_description: model_id_or_path: bigscience/bloom-7b1 tokenizer_name_or_path: bigscience/bloom-7b1 chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml b/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml index 50f9e3ec3..c900af72a 100644 --- a/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml @@ -11,10 +11,5 @@ model_description: model_id_or_path: tiiuae/falcon-40b tokenizer_name_or_path: tiiuae/falcon-40b chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml b/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml index e450a066a..17c6cf85c 100644 --- a/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml @@ -9,10 +9,5 @@ model_description: model_id_or_path: tiiuae/falcon-7b tokenizer_name_or_path: tiiuae/falcon-7b chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/gemma-2b-hpu.yaml b/llm_on_ray/inference/models/hpu/gemma-2b-hpu.yaml index 5e84eb2e4..15f13c890 100644 --- a/llm_on_ray/inference/models/hpu/gemma-2b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/gemma-2b-hpu.yaml @@ -9,12 +9,5 @@ model_description: model_id_or_path: google/gemma-2b tokenizer_name_or_path: google/gemma-2b chat_processor: ChatModelGemma - prompt: - intro: '' - human_id: 'user - {msg}' - bot_id: 'model - {msg}' - stop_words: [] config: use_auth_token: ' ' diff --git a/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml b/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml index cfde9a781..3177ad69a 100644 --- a/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml @@ -10,17 +10,5 @@ model_description: tokenizer_name_or_path: EleutherAI/gpt-j-6b chat_processor: ChatModelGptJ gpt_base_model: true - prompt: - intro: 'Below is an instruction that describes a task. Write a response that appropriately - completes the request. 
- - ' - human_id: ' - - ### Instruction' - bot_id: ' - - ### Response' - stop_words: [] config: use_auth_token: '' \ No newline at end of file diff --git a/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml b/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml index 3c1cf1913..4d03d004f 100644 --- a/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml @@ -10,10 +10,5 @@ model_description: tokenizer_name_or_path: gpt2 chat_processor: ChatModelGptJ gpt_base_model: true - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] config: use_auth_token: '' \ No newline at end of file diff --git a/llm_on_ray/inference/models/hpu/llama-3-70b-chat-hf-hpu.yaml b/llm_on_ray/inference/models/hpu/llama-3-70b-chat-hf-hpu.yaml deleted file mode 100644 index 32cf9bb4e..000000000 --- a/llm_on_ray/inference/models/hpu/llama-3-70b-chat-hf-hpu.yaml +++ /dev/null @@ -1,13 +0,0 @@ -port: 8000 -name: meta-llama-3-70b-instruct -route_prefix: /meta-llama-3-70b-instruct -cpus_per_worker: 8 -hpus_per_worker: 1 -deepspeed: true -workers_per_group: 8 -device: hpu -model_description: - model_id_or_path: meta-llama/Meta-Llama-3-70b-Instruct - tokenizer_name_or_path: meta-llama/Meta-Llama-3-70b-Instruct - config: - use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/llama-3-8b-instruct-hpu.yaml b/llm_on_ray/inference/models/hpu/llama-3-8b-instruct-hpu.yaml deleted file mode 100644 index d57ffcc22..000000000 --- a/llm_on_ray/inference/models/hpu/llama-3-8b-instruct-hpu.yaml +++ /dev/null @@ -1,12 +0,0 @@ -port: 8000 -name: meta-llama-3-8b-instruct -route_prefix: /meta-llama-3-8b-instruct -num_replicas: 1 -cpus_per_worker: 8 -hpus_per_worker: 1 -device: hpu -model_description: - model_id_or_path: meta-llama/Meta-Llama-3-8b-Instruct - tokenizer_name_or_path: meta-llama/Meta-Llama-3-8b-Instruct - config: - use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml b/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml index 6976eafb3..d8bdb1d7d 100644 --- a/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml @@ -10,12 +10,5 @@ model_description: model_id_or_path: meta-llama/Meta-Llama-3-70b-Instruct tokenizer_name_or_path: meta-llama/Meta-Llama-3-70b-Instruct chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/mistral-7b-v0.1-hpu.yaml b/llm_on_ray/inference/models/hpu/mistral-7b-v0.1-hpu.yaml index e91f08e8a..ad3bcbaa7 100644 --- a/llm_on_ray/inference/models/hpu/mistral-7b-v0.1-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/mistral-7b-v0.1-hpu.yaml @@ -9,12 +9,5 @@ model_description: model_id_or_path: mistralai/Mistral-7B-v0.1 tokenizer_name_or_path: mistralai/Mistral-7B-v0.1 chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml b/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml index e45a64ea6..bcc2225b4 100644 --- a/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml @@ -9,18 +9,6 @@ model_description: model_id_or_path: EleutherAI/gpt-neox-20b tokenizer_name_or_path: EleutherAI/gpt-neox-20b chat_processor: ChatModelLLama - prompt: - intro: 'Below is an instruction that describes a task, 
paired with an input that - provides further context. Write a response that appropriately completes the request. - - ' - human_id: ' - - ### Instruction' - bot_id: ' - - ### Response' - stop_words: [] config: use_auth_token: '' trust_remote_code: true From 28acd73642fc8bc389d3b750c8b165bbf2e55a2e Mon Sep 17 00:00:00 2001 From: Deegue Date: Mon, 20 May 2024 07:41:21 +0000 Subject: [PATCH 08/26] check and add all template, remove bloom-560m, add mixtral, change Qwen to version 1.5 --- .github/workflows/workflow_inference_gaudi2.yml | 4 ++-- .../inference/models/hpu/CodeLlama-7b-hf-hpu.yaml | 2 +- .../models/hpu/MindChat-Qwen2-4B-hpu.yaml | 13 ------------- .../hpu/Mixtral-8x7B-Instruct-v0.1-hpu.yaml | 12 ++++++++++++ .../inference/models/hpu/Qwen1.5-110B-hpu.yaml | 12 ++++++++++++ .../inference/models/hpu/bloom-560m-hpu.yaml | 15 --------------- .../inference/models/hpu/bloom-7b1-hpu.yaml | 2 +- .../inference/models/hpu/falcon-40b-hpu.yaml | 2 +- .../inference/models/hpu/falcon-7b-hpu.yaml | 2 +- llm_on_ray/inference/models/hpu/gemma-2b-hpu.yaml | 2 +- llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml | 4 ++-- llm_on_ray/inference/models/hpu/gpt2-hpu.yaml | 2 +- .../models/hpu/llama-2-70b-chat-hf-hpu.yaml | 1 + .../models/hpu/llama-2-7b-chat-hf-hpu.yaml | 1 + .../models/hpu/meta-llama-3-70b-instruct-hpu.yaml | 2 +- .../models/hpu/meta-llama-3-8b-instruct-hpu.yaml | 1 + .../inference/models/hpu/mistral-7b-v0.1-hpu.yaml | 2 +- llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml | 2 +- 18 files changed, 40 insertions(+), 41 deletions(-) delete mode 100644 llm_on_ray/inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml create mode 100644 llm_on_ray/inference/models/hpu/Mixtral-8x7B-Instruct-v0.1-hpu.yaml create mode 100644 llm_on_ray/inference/models/hpu/Qwen1.5-110B-hpu.yaml delete mode 100644 llm_on_ray/inference/models/hpu/bloom-560m-hpu.yaml diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml index 3e2c2fa51..35aaf0a7b 100644 --- a/.github/workflows/workflow_inference_gaudi2.yml +++ b/.github/workflows/workflow_inference_gaudi2.yml @@ -28,7 +28,7 @@ jobs: name: inference strategy: matrix: - model: [ bloom-7b1, bloom-560m, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, MindChat-Qwen2-4B, mistral-7b-v0.1, mpt-7b ] + model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, MindChat-Qwen2-4B, mistral-7b-v0.1, Mixtral-8x7B-Instruct-v0.1, mpt-7b ] isPR: - ${{inputs.ci_type == 'pr'}} @@ -37,7 +37,6 @@ jobs: include: - { model: "bloom-7b1"} - - { model: "bloom-560m"} - { model: "CodeLlama-7b-hf"} - { model: "falcon-7b"} - { model: "falcon-40b"} @@ -50,6 +49,7 @@ jobs: - { model: "meta-llama-3-70b-instruct"} - { model: "MindChat-Qwen2-4B"} - { model: "mistral-7b-v0.1"} + - { model: "Mixtral-8x7B-Instruct-v0.1"} - { model: "mpt-7b"} runs-on: gaudi2 diff --git a/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml b/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml index 3f93b2c92..1f9d40c2b 100644 --- a/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml @@ -8,6 +8,6 @@ device: hpu model_description: model_id_or_path: codellama/CodeLlama-7b-hf tokenizer_name_or_path: codellama/CodeLlama-7b-hf - chat_processor: ChatModelGptJ + chat_template: 
"llm_on_ray/inference/models/templates/template_codellama.jinja" config: use_auth_token: '' \ No newline at end of file diff --git a/llm_on_ray/inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml b/llm_on_ray/inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml deleted file mode 100644 index 2f26846cf..000000000 --- a/llm_on_ray/inference/models/hpu/MindChat-Qwen2-4B-hpu.yaml +++ /dev/null @@ -1,13 +0,0 @@ -port: 8000 -name: MindChat-Qwen2-4B -route_prefix: /MindChat-Qwen2-4B -num_replicas: 1 -cpus_per_worker: 8 -hpus_per_worker: 1 -device: hpu -model_description: - model_id_or_path: X-D-Lab/MindChat-Qwen2-4B - tokenizer_name_or_path: X-D-Lab/MindChat-Qwen2-4B - chat_processor: ChatModelGptJ - config: - use_auth_token: '' \ No newline at end of file diff --git a/llm_on_ray/inference/models/hpu/Mixtral-8x7B-Instruct-v0.1-hpu.yaml b/llm_on_ray/inference/models/hpu/Mixtral-8x7B-Instruct-v0.1-hpu.yaml new file mode 100644 index 000000000..da331511e --- /dev/null +++ b/llm_on_ray/inference/models/hpu/Mixtral-8x7B-Instruct-v0.1-hpu.yaml @@ -0,0 +1,12 @@ +port: 8000 +name: Mixtral-8x7B-Instruct-v0.1 +route_prefix: /Mixtral-8x7B-Instruct-v0.1 +num_replicas: 1 +cpus_per_worker: 8 +hpus_per_worker: 1 +device: hpu +model_description: + model_id_or_path: mistralai/Mixtral-8x7B-Instruct-v0.1 + tokenizer_name_or_path: bmistralai/Mixtral-8x7B-Instruct-v0.1 + config: + use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/Qwen1.5-110B-hpu.yaml b/llm_on_ray/inference/models/hpu/Qwen1.5-110B-hpu.yaml new file mode 100644 index 000000000..f8836e4b6 --- /dev/null +++ b/llm_on_ray/inference/models/hpu/Qwen1.5-110B-hpu.yaml @@ -0,0 +1,12 @@ +port: 8000 +name: Qwen1.5-110B +route_prefix: /Qwen1.5-110B +num_replicas: 1 +cpus_per_worker: 8 +hpus_per_worker: 1 +device: hpu +model_description: + model_id_or_path: Qwen/Qwen1.5-110B + tokenizer_name_or_path: Qwen/Qwen1.5-110B + config: + use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/bloom-560m-hpu.yaml b/llm_on_ray/inference/models/hpu/bloom-560m-hpu.yaml deleted file mode 100644 index a72fe4f72..000000000 --- a/llm_on_ray/inference/models/hpu/bloom-560m-hpu.yaml +++ /dev/null @@ -1,15 +0,0 @@ -port: 8000 -name: bloom-560m -route_prefix: /bloom-560m -num_replicas: 1 -cpus_per_worker: 8 -hpus_per_worker: 1 -deepspeed: true -workers_per_group: 8 -device: hpu -model_description: - model_id_or_path: bigscience/bloom-560m - tokenizer_name_or_path: bigscience/bloom-560m - chat_processor: ChatModelGptJ - config: - use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml b/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml index 464540aae..9b2b69861 100644 --- a/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml @@ -8,6 +8,6 @@ device: hpu model_description: model_id_or_path: bigscience/bloom-7b1 tokenizer_name_or_path: bigscience/bloom-7b1 - chat_processor: ChatModelGptJ + chat_template: "llm_on_ray/inference/models/templates/default_codellama.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml b/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml index c900af72a..a017ca465 100644 --- a/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml @@ -10,6 +10,6 @@ device: hpu model_description: model_id_or_path: tiiuae/falcon-40b tokenizer_name_or_path: tiiuae/falcon-40b - chat_processor: ChatModelGptJ + chat_template: 
"llm_on_ray/inference/models/templates/default_codellama.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml b/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml index 17c6cf85c..9e0a8931f 100644 --- a/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml @@ -8,6 +8,6 @@ device: hpu model_description: model_id_or_path: tiiuae/falcon-7b tokenizer_name_or_path: tiiuae/falcon-7b - chat_processor: ChatModelGptJ + chat_template: "llm_on_ray/inference/models/templates/default_codellama.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/gemma-2b-hpu.yaml b/llm_on_ray/inference/models/hpu/gemma-2b-hpu.yaml index 15f13c890..3b7e6d582 100644 --- a/llm_on_ray/inference/models/hpu/gemma-2b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/gemma-2b-hpu.yaml @@ -8,6 +8,6 @@ device: hpu model_description: model_id_or_path: google/gemma-2b tokenizer_name_or_path: google/gemma-2b - chat_processor: ChatModelGemma + chat_template: "llm_on_ray/inference/models/templates/template_gemma.jinja" config: use_auth_token: ' ' diff --git a/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml b/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml index 3177ad69a..47abb8829 100644 --- a/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml @@ -8,7 +8,7 @@ device: hpu model_description: model_id_or_path: EleutherAI/gpt-j-6b tokenizer_name_or_path: EleutherAI/gpt-j-6b - chat_processor: ChatModelGptJ + chat_template: "llm_on_ray/inference/models/templates/default_codellama.jinja" gpt_base_model: true config: - use_auth_token: '' \ No newline at end of file + use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml b/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml index 4d03d004f..34803ad2d 100644 --- a/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml @@ -8,7 +8,7 @@ device: hpu model_description: model_id_or_path: gpt2 tokenizer_name_or_path: gpt2 - chat_processor: ChatModelGptJ + chat_template: "llm_on_ray/inference/models/templates/template_gpt2.jinja" gpt_base_model: true config: use_auth_token: '' \ No newline at end of file diff --git a/llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml b/llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml index ab411ff0e..776a1b76a 100644 --- a/llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml @@ -10,5 +10,6 @@ device: hpu model_description: model_id_or_path: meta-llama/Llama-2-70b-chat-hf tokenizer_name_or_path: meta-llama/Llama-2-70b-chat-hf + chat_template: "llm_on_ray/inference/models/templates/template_llama2.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml b/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml index b7b19f02a..fb2027d12 100644 --- a/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml @@ -8,5 +8,6 @@ device: hpu model_description: model_id_or_path: meta-llama/Llama-2-7b-chat-hf tokenizer_name_or_path: meta-llama/Llama-2-7b-chat-hf + chat_template: "llm_on_ray/inference/models/templates/template_llama2.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml b/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml index 
d8bdb1d7d..f1b6fac9c 100644 --- a/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml @@ -9,6 +9,6 @@ device: hpu model_description: model_id_or_path: meta-llama/Meta-Llama-3-70b-Instruct tokenizer_name_or_path: meta-llama/Meta-Llama-3-70b-Instruct - chat_processor: ChatModelLLama + chat_template: "llm_on_ray/inference/models/templates/default_codellama.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml b/llm_on_ray/inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml index d57ffcc22..cb3d3ada6 100644 --- a/llm_on_ray/inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml @@ -8,5 +8,6 @@ device: hpu model_description: model_id_or_path: meta-llama/Meta-Llama-3-8b-Instruct tokenizer_name_or_path: meta-llama/Meta-Llama-3-8b-Instruct + chat_template: "llm_on_ray/inference/models/templates/default_codellama.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/mistral-7b-v0.1-hpu.yaml b/llm_on_ray/inference/models/hpu/mistral-7b-v0.1-hpu.yaml index ad3bcbaa7..738ad49b2 100644 --- a/llm_on_ray/inference/models/hpu/mistral-7b-v0.1-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/mistral-7b-v0.1-hpu.yaml @@ -8,6 +8,6 @@ device: hpu model_description: model_id_or_path: mistralai/Mistral-7B-v0.1 tokenizer_name_or_path: mistralai/Mistral-7B-v0.1 - chat_processor: ChatModelLLama + chat_template: "llm_on_ray/inference/models/templates/template_mistral.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml b/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml index bcc2225b4..f9021b85f 100644 --- a/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml @@ -8,7 +8,7 @@ device: hpu model_description: model_id_or_path: EleutherAI/gpt-neox-20b tokenizer_name_or_path: EleutherAI/gpt-neox-20b - chat_processor: ChatModelLLama + chat_template: "llm_on_ray/inference/models/templates/default_codellama.jinja" config: use_auth_token: '' trust_remote_code: true From ecf40e68fae314ed2b6b2f84d0f414f583b9c711 Mon Sep 17 00:00:00 2001 From: Deegue Date: Tue, 21 May 2024 05:03:26 +0000 Subject: [PATCH 09/26] nit --- llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml | 2 +- llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml | 2 +- llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml | 2 +- llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml | 2 +- .../inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml | 2 +- .../inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml | 2 +- llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml b/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml index 9b2b69861..b6f4d949e 100644 --- a/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml @@ -8,6 +8,6 @@ device: hpu model_description: model_id_or_path: bigscience/bloom-7b1 tokenizer_name_or_path: bigscience/bloom-7b1 - chat_template: "llm_on_ray/inference/models/templates/default_codellama.jinja" + chat_template: "llm_on_ray/inference/models/templates/default_template.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml b/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml index a017ca465..ecf0cd303 
100644 --- a/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml @@ -10,6 +10,6 @@ device: hpu model_description: model_id_or_path: tiiuae/falcon-40b tokenizer_name_or_path: tiiuae/falcon-40b - chat_template: "llm_on_ray/inference/models/templates/default_codellama.jinja" + chat_template: "llm_on_ray/inference/models/templates/default_template.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml b/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml index 9e0a8931f..86b7efaae 100644 --- a/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml @@ -8,6 +8,6 @@ device: hpu model_description: model_id_or_path: tiiuae/falcon-7b tokenizer_name_or_path: tiiuae/falcon-7b - chat_template: "llm_on_ray/inference/models/templates/default_codellama.jinja" + chat_template: "llm_on_ray/inference/models/templates/default_template.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml b/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml index 47abb8829..6f675f05e 100644 --- a/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml @@ -8,7 +8,7 @@ device: hpu model_description: model_id_or_path: EleutherAI/gpt-j-6b tokenizer_name_or_path: EleutherAI/gpt-j-6b - chat_template: "llm_on_ray/inference/models/templates/default_codellama.jinja" + chat_template: "llm_on_ray/inference/models/templates/default_template.jinja" gpt_base_model: true config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml b/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml index f1b6fac9c..58f96b110 100644 --- a/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml @@ -9,6 +9,6 @@ device: hpu model_description: model_id_or_path: meta-llama/Meta-Llama-3-70b-Instruct tokenizer_name_or_path: meta-llama/Meta-Llama-3-70b-Instruct - chat_template: "llm_on_ray/inference/models/templates/default_codellama.jinja" + chat_template: "llm_on_ray/inference/models/templates/default_template.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml b/llm_on_ray/inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml index cb3d3ada6..b2b31221c 100644 --- a/llm_on_ray/inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml @@ -8,6 +8,6 @@ device: hpu model_description: model_id_or_path: meta-llama/Meta-Llama-3-8b-Instruct tokenizer_name_or_path: meta-llama/Meta-Llama-3-8b-Instruct - chat_template: "llm_on_ray/inference/models/templates/default_codellama.jinja" + chat_template: "llm_on_ray/inference/models/templates/default_template.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml b/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml index f9021b85f..6f6701b97 100644 --- a/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml @@ -8,7 +8,7 @@ device: hpu model_description: model_id_or_path: EleutherAI/gpt-neox-20b tokenizer_name_or_path: EleutherAI/gpt-neox-20b - chat_template: "llm_on_ray/inference/models/templates/default_codellama.jinja" + chat_template: "llm_on_ray/inference/models/templates/default_template.jinja" config: use_auth_token: '' 
trust_remote_code: true From f4e02ff40600e71ccf4de3593d456f84418c6009 Mon Sep 17 00:00:00 2001 From: Deegue Date: Tue, 21 May 2024 07:53:37 +0000 Subject: [PATCH 10/26] fix --- .github/workflows/workflow_inference_gaudi2.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml index 35aaf0a7b..700ab65e4 100644 --- a/.github/workflows/workflow_inference_gaudi2.yml +++ b/.github/workflows/workflow_inference_gaudi2.yml @@ -28,7 +28,7 @@ jobs: name: inference strategy: matrix: - model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, MindChat-Qwen2-4B, mistral-7b-v0.1, Mixtral-8x7B-Instruct-v0.1, mpt-7b ] + model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, Qwen1.5-110B, mistral-7b-v0.1, Mixtral-8x7B-Instruct-v0.1, mpt-7b ] isPR: - ${{inputs.ci_type == 'pr'}} @@ -47,7 +47,7 @@ jobs: - { model: "llama-2-70b-chat-hf"} - { model: "meta-llama-3-8b-instruct"} - { model: "meta-llama-3-70b-instruct"} - - { model: "MindChat-Qwen2-4B"} + - { model: "Qwen1.5-110B"} - { model: "mistral-7b-v0.1"} - { model: "Mixtral-8x7B-Instruct-v0.1"} - { model: "mpt-7b"} From 8218531270cd513207533177c5e67f14d2463780 Mon Sep 17 00:00:00 2001 From: Deegue Date: Tue, 21 May 2024 08:25:09 +0000 Subject: [PATCH 11/26] fix --- .../inference/models/hpu/Mixtral-8x7B-Instruct-v0.1-hpu.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_on_ray/inference/models/hpu/Mixtral-8x7B-Instruct-v0.1-hpu.yaml b/llm_on_ray/inference/models/hpu/Mixtral-8x7B-Instruct-v0.1-hpu.yaml index da331511e..e8424b60e 100644 --- a/llm_on_ray/inference/models/hpu/Mixtral-8x7B-Instruct-v0.1-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/Mixtral-8x7B-Instruct-v0.1-hpu.yaml @@ -7,6 +7,6 @@ hpus_per_worker: 1 device: hpu model_description: model_id_or_path: mistralai/Mixtral-8x7B-Instruct-v0.1 - tokenizer_name_or_path: bmistralai/Mixtral-8x7B-Instruct-v0.1 + tokenizer_name_or_path: mistralai/Mixtral-8x7B-Instruct-v0.1 config: use_auth_token: '' From 0baa303fb386a4942109e85eb67c3b09618a5f23 Mon Sep 17 00:00:00 2001 From: Deegue Date: Wed, 22 May 2024 02:18:30 +0000 Subject: [PATCH 12/26] fix --- .github/workflows/workflow_inference_gaudi2.yml | 4 ++-- ...ixtral-8x7B-Instruct-v0.1-hpu.yaml => Mixtral-7B-hpu.yaml} | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) rename llm_on_ray/inference/models/hpu/{Mixtral-8x7B-Instruct-v0.1-hpu.yaml => Mixtral-7B-hpu.yaml} (76%) diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml index 700ab65e4..b4d8f691f 100644 --- a/.github/workflows/workflow_inference_gaudi2.yml +++ b/.github/workflows/workflow_inference_gaudi2.yml @@ -28,7 +28,7 @@ jobs: name: inference strategy: matrix: - model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, Qwen1.5-110B, mistral-7b-v0.1, Mixtral-8x7B-Instruct-v0.1, mpt-7b ] + model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, Qwen1.5-110B, mistral-7b-v0.1, Mixtral-7B, mpt-7b ] isPR: - 
${{inputs.ci_type == 'pr'}} @@ -49,7 +49,7 @@ jobs: - { model: "meta-llama-3-70b-instruct"} - { model: "Qwen1.5-110B"} - { model: "mistral-7b-v0.1"} - - { model: "Mixtral-8x7B-Instruct-v0.1"} + - { model: "Mixtral-7B"} - { model: "mpt-7b"} runs-on: gaudi2 diff --git a/llm_on_ray/inference/models/hpu/Mixtral-8x7B-Instruct-v0.1-hpu.yaml b/llm_on_ray/inference/models/hpu/Mixtral-7B-hpu.yaml similarity index 76% rename from llm_on_ray/inference/models/hpu/Mixtral-8x7B-Instruct-v0.1-hpu.yaml rename to llm_on_ray/inference/models/hpu/Mixtral-7B-hpu.yaml index e8424b60e..fbd6ccaeb 100644 --- a/llm_on_ray/inference/models/hpu/Mixtral-8x7B-Instruct-v0.1-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/Mixtral-7B-hpu.yaml @@ -1,6 +1,6 @@ port: 8000 -name: Mixtral-8x7B-Instruct-v0.1 -route_prefix: /Mixtral-8x7B-Instruct-v0.1 +name: Mixtral-7B +route_prefix: /Mixtral-7B num_replicas: 1 cpus_per_worker: 8 hpus_per_worker: 1 From 96b36bd3139e73d4dbe0ae8392d667866bafc936 Mon Sep 17 00:00:00 2001 From: Deegue Date: Wed, 22 May 2024 02:40:00 +0000 Subject: [PATCH 13/26] remove default template --- llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml | 1 - llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml | 1 - llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml | 1 - llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml | 1 - .../inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml | 1 - .../inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml | 1 - llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml | 1 - 7 files changed, 7 deletions(-) diff --git a/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml b/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml index b6f4d949e..5de2a3e30 100644 --- a/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml @@ -8,6 +8,5 @@ device: hpu model_description: model_id_or_path: bigscience/bloom-7b1 tokenizer_name_or_path: bigscience/bloom-7b1 - chat_template: "llm_on_ray/inference/models/templates/default_template.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml b/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml index ecf0cd303..dd6bd2acf 100644 --- a/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml @@ -10,6 +10,5 @@ device: hpu model_description: model_id_or_path: tiiuae/falcon-40b tokenizer_name_or_path: tiiuae/falcon-40b - chat_template: "llm_on_ray/inference/models/templates/default_template.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml b/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml index 86b7efaae..e21110c08 100644 --- a/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml @@ -8,6 +8,5 @@ device: hpu model_description: model_id_or_path: tiiuae/falcon-7b tokenizer_name_or_path: tiiuae/falcon-7b - chat_template: "llm_on_ray/inference/models/templates/default_template.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml b/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml index 6f675f05e..8260a6445 100644 --- a/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml @@ -8,7 +8,6 @@ device: hpu model_description: model_id_or_path: EleutherAI/gpt-j-6b tokenizer_name_or_path: EleutherAI/gpt-j-6b - chat_template: "llm_on_ray/inference/models/templates/default_template.jinja" gpt_base_model: true config: 
use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml b/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml index 58f96b110..32cf9bb4e 100644 --- a/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml @@ -9,6 +9,5 @@ device: hpu model_description: model_id_or_path: meta-llama/Meta-Llama-3-70b-Instruct tokenizer_name_or_path: meta-llama/Meta-Llama-3-70b-Instruct - chat_template: "llm_on_ray/inference/models/templates/default_template.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml b/llm_on_ray/inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml index b2b31221c..d57ffcc22 100644 --- a/llm_on_ray/inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml @@ -8,6 +8,5 @@ device: hpu model_description: model_id_or_path: meta-llama/Meta-Llama-3-8b-Instruct tokenizer_name_or_path: meta-llama/Meta-Llama-3-8b-Instruct - chat_template: "llm_on_ray/inference/models/templates/default_template.jinja" config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml b/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml index 6f6701b97..df4908376 100644 --- a/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml @@ -8,7 +8,6 @@ device: hpu model_description: model_id_or_path: EleutherAI/gpt-neox-20b tokenizer_name_or_path: EleutherAI/gpt-neox-20b - chat_template: "llm_on_ray/inference/models/templates/default_template.jinja" config: use_auth_token: '' trust_remote_code: true From 1de2ebb6c047d06e2afec13bbd711ca26b497ca6 Mon Sep 17 00:00:00 2001 From: Deegue Date: Thu, 23 May 2024 02:31:26 +0000 Subject: [PATCH 14/26] fix when list length is 1 --- llm_on_ray/inference/predictor_deployment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index f5ac35d80..319eb5e09 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -335,7 +335,7 @@ def preprocess_prompts( HTTPException: If the input prompt format is invalid or not supported. """ - if isinstance(input, str): + if isinstance(input, str) or (isinstance(input, List) and len(input) == 1): return input elif isinstance(input, List): prompts = [] From 9b8e57d7fa5ea944619e7cd27e2679233e7a7964 Mon Sep 17 00:00:00 2001 From: Deegue Date: Thu, 23 May 2024 02:50:53 +0000 Subject: [PATCH 15/26] fix --- llm_on_ray/inference/predictor_deployment.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index 319eb5e09..3bb6a4ffc 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -335,8 +335,10 @@ def preprocess_prompts( HTTPException: If the input prompt format is invalid or not supported. 
""" - if isinstance(input, str) or (isinstance(input, List) and len(input) == 1): + if isinstance(input, str): return input + elif isinstance(input, List) and len(input) == 1: + return input[0] elif isinstance(input, List): prompts = [] images = [] From 8940d0de5af44e0d05f8b0958470327f6a471799 Mon Sep 17 00:00:00 2001 From: Deegue Date: Mon, 27 May 2024 01:43:47 +0000 Subject: [PATCH 16/26] fix target --- .github/workflows/workflow_inference_gaudi2.yml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml index cc678f361..fb22630cd 100644 --- a/.github/workflows/workflow_inference_gaudi2.yml +++ b/.github/workflows/workflow_inference_gaudi2.yml @@ -72,13 +72,11 @@ jobs: - name: Determine Target id: "target" run: | - target="${target}_gaudi2" - if [[ ${{ matrix.model }} == "llama-2-7b-chat-hf" ]]; then - target="${target}_gaudi2" - elif [[ ${{ matrix.model }} == "llama-2-70b-chat-hf" ]]; then - target="${target}_gaudi2" - elif [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then + target="inference" + if [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then target="${target}_vllm_gaudi2" + else + target="${target}_gaudi2" fi echo "target is ${target}" echo "target=$target" >> $GITHUB_OUTPUT From 50c4988fc007c83764a6a3594dd4fcd40cf454e1 Mon Sep 17 00:00:00 2001 From: Deegue Date: Mon, 27 May 2024 12:20:20 +0000 Subject: [PATCH 17/26] change cache dir --- .github/workflows/workflow_inference_gaudi2.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml index fb22630cd..47878a205 100644 --- a/.github/workflows/workflow_inference_gaudi2.yml +++ b/.github/workflows/workflow_inference_gaudi2.yml @@ -17,7 +17,7 @@ on: default: '/home/ci/actions-runner/_work/llm-on-ray/llm-on-ray' model_cache_path: type: string - default: '/mnt/DP_disk1/huggingface/cache' + default: '/scratch-2/huggingface/cache' concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-inf-gaudi2 From 762e84c1a099b381e405bf8a3fd4ebb68c7b4252 Mon Sep 17 00:00:00 2001 From: Deegue Date: Wed, 29 May 2024 01:09:19 +0000 Subject: [PATCH 18/26] remove Mixtral --- .github/workflows/workflow_inference_gaudi2.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml index 47878a205..bbdd9ac8a 100644 --- a/.github/workflows/workflow_inference_gaudi2.yml +++ b/.github/workflows/workflow_inference_gaudi2.yml @@ -28,7 +28,7 @@ jobs: name: inference strategy: matrix: - model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, Qwen1.5-110B, mistral-7b-v0.1, Mixtral-7B, mpt-7b, llama-2-7b-chat-hf-vllm ] + model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, Qwen1.5-110B, mistral-7b-v0.1, mpt-7b, llama-2-7b-chat-hf-vllm ] isPR: - ${{inputs.ci_type == 'pr'}} @@ -49,7 +49,6 @@ jobs: - { model: "meta-llama-3-70b-instruct"} - { model: "Qwen1.5-110B"} - { model: "mistral-7b-v0.1"} - - { model: "Mixtral-7B"} - { model: "mpt-7b"} - { model: "llama-2-7b-chat-hf-vllm"} From 012bac2c2950fc781b21ba4d62dc8f751ab6f47e Mon 
From 012bac2c2950fc781b21ba4d62dc8f751ab6f47e Mon Sep 17 00:00:00 2001
From: Deegue
Date: Wed, 29 May 2024 03:49:20 +0000
Subject: [PATCH 19/26] change to 8 cards

---
 llm_on_ray/inference/models/hpu/Qwen1.5-110B-hpu.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llm_on_ray/inference/models/hpu/Qwen1.5-110B-hpu.yaml b/llm_on_ray/inference/models/hpu/Qwen1.5-110B-hpu.yaml
index f8836e4b6..5086c3d2e 100644
--- a/llm_on_ray/inference/models/hpu/Qwen1.5-110B-hpu.yaml
+++ b/llm_on_ray/inference/models/hpu/Qwen1.5-110B-hpu.yaml
@@ -4,6 +4,8 @@ route_prefix: /Qwen1.5-110B
 num_replicas: 1
 cpus_per_worker: 8
 hpus_per_worker: 1
+deepspeed: true
+workers_per_group: 8
 device: hpu
 model_description:
   model_id_or_path: Qwen/Qwen1.5-110B

From 4496e73d1a38ae254fc68eabbf549e028a57bb8b Mon Sep 17 00:00:00 2001
From: Deegue
Date: Thu, 30 May 2024 02:00:59 +0000
Subject: [PATCH 20/26] remove Qwen and fix

---
 .github/workflows/workflow_inference_gaudi2.yml              | 3 +--
 llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml     | 2 +-
 llm_on_ray/inference/models/hpu/gpt2-hpu.yaml                | 2 +-
 llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3-hpu.yaml | 2 +-
 llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml     | 2 +-
 5 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml
index bbdd9ac8a..eea90fb8d 100644
--- a/.github/workflows/workflow_inference_gaudi2.yml
+++ b/.github/workflows/workflow_inference_gaudi2.yml
@@ -28,7 +28,7 @@ jobs:
     name: inference
     strategy:
       matrix:
-        model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, Qwen1.5-110B, mistral-7b-v0.1, mpt-7b, llama-2-7b-chat-hf-vllm ]
+        model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, Qwen1.5-110B, mistral-7b-v0.1, Mixtral-7B, mpt-7b, llama-2-7b-chat-hf-vllm ]
         isPR:
           - ${{inputs.ci_type == 'pr'}}
@@ -47,7 +47,6 @@ jobs:
           - { model: "llama-2-70b-chat-hf"}
           - { model: "meta-llama-3-8b-instruct"}
           - { model: "meta-llama-3-70b-instruct"}
-          - { model: "Qwen1.5-110B"}
           - { model: "mistral-7b-v0.1"}
           - { model: "mpt-7b"}
           - { model: "llama-2-7b-chat-hf-vllm"}
diff --git a/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml b/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml
index 1f9d40c2b..717627563 100644
--- a/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml
+++ b/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml
@@ -10,4 +10,4 @@ model_description:
   tokenizer_name_or_path: codellama/CodeLlama-7b-hf
   chat_template: "llm_on_ray/inference/models/templates/template_codellama.jinja"
   config:
-    use_auth_token: ''
\ No newline at end of file
+    use_auth_token: ''
diff --git a/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml b/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml
index 34803ad2d..b25903cf7 100644
--- a/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml
+++ b/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml
@@ -11,4 +11,4 @@ model_description:
   chat_template: "llm_on_ray/inference/models/templates/template_gpt2.jinja"
   gpt_base_model: true
   config:
-    use_auth_token: ''
\ No newline at end of file
+    use_auth_token: ''
diff --git a/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3-hpu.yaml b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3-hpu.yaml
index bc147f105..1973ae1a2 100644
--- a/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3-hpu.yaml
+++ b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3-hpu.yaml
@@ -16,4 +16,4 @@ model_description:
   tokenizer_name_or_path: Intel/neural-chat-7b-v3-3
   chat_template: "llm_on_ray/inference/models/templates/template_neuralchat.jinja"
   config:
-    use_auth_token: ''
\ No newline at end of file
+    use_auth_token: ''
diff --git a/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml
index bc147f105..1973ae1a2 100644
--- a/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml
+++ b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml
@@ -16,4 +16,4 @@ model_description:
   tokenizer_name_or_path: Intel/neural-chat-7b-v3-3
   chat_template: "llm_on_ray/inference/models/templates/template_neuralchat.jinja"
   config:
-    use_auth_token: ''
\ No newline at end of file
+    use_auth_token: ''

From 33a1478479237f78ef4b194515a4187b4e1c3c72 Mon Sep 17 00:00:00 2001
From: Deegue
Date: Tue, 4 Jun 2024 07:11:18 +0000
Subject: [PATCH 21/26] revert and add Qwen&Mixtral back

---
 .github/workflows/workflow_inference_gaudi2.yml     | 2 ++
 llm_on_ray/inference/models/hpu/Mixtral-7B-hpu.yaml | 2 ++
 llm_on_ray/inference/predictor_deployment.py        | 2 --
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml
index eea90fb8d..63fc34b1e 100644
--- a/.github/workflows/workflow_inference_gaudi2.yml
+++ b/.github/workflows/workflow_inference_gaudi2.yml
@@ -48,6 +48,8 @@ jobs:
           - { model: "meta-llama-3-8b-instruct"}
           - { model: "meta-llama-3-70b-instruct"}
           - { model: "mistral-7b-v0.1"}
+          - { model: "Qwen1.5-110B"}
+          - { model: "Mixtral-7B"}
           - { model: "mpt-7b"}
           - { model: "llama-2-7b-chat-hf-vllm"}
diff --git a/llm_on_ray/inference/models/hpu/Mixtral-7B-hpu.yaml b/llm_on_ray/inference/models/hpu/Mixtral-7B-hpu.yaml
index fbd6ccaeb..2d52e217b 100644
--- a/llm_on_ray/inference/models/hpu/Mixtral-7B-hpu.yaml
+++ b/llm_on_ray/inference/models/hpu/Mixtral-7B-hpu.yaml
@@ -4,6 +4,8 @@ route_prefix: /Mixtral-7B
 num_replicas: 1
 cpus_per_worker: 8
 hpus_per_worker: 1
+deepspeed: true
+workers_per_group: 8
 device: hpu
 model_description:
   model_id_or_path: mistralai/Mixtral-8x7B-Instruct-v0.1
diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py
index 44b11f20b..a1055915d 100644
--- a/llm_on_ray/inference/predictor_deployment.py
+++ b/llm_on_ray/inference/predictor_deployment.py
@@ -337,8 +337,6 @@ def preprocess_prompts(
         if isinstance(input, str):
             return input
-        elif isinstance(input, List) and len(input) == 1:
-            return input[0]
         elif isinstance(input, List):
             prompts = []
             images = []

From 43a75bcaffbadc5d86f110e6d314ea643926b122 Mon Sep 17 00:00:00 2001
From: Deegue
Date: Wed, 5 Jun 2024 07:13:41 +0000
Subject: [PATCH 22/26] nit

---
 .github/workflows/workflow_inference_gaudi2.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml
index 63fc34b1e..eea90fb8d 100644
--- a/.github/workflows/workflow_inference_gaudi2.yml
+++ b/.github/workflows/workflow_inference_gaudi2.yml
@@ -48,8 +48,6 @@ jobs:
           - { model: "meta-llama-3-8b-instruct"}
           - { model: "meta-llama-3-70b-instruct"}
           - { model: "mistral-7b-v0.1"}
-          - { model: "Qwen1.5-110B"}
-          - { model: "Mixtral-7B"}
           - { model: "mpt-7b"}
           - { model: "llama-2-7b-chat-hf-vllm"}
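PATCH 19 and PATCH 21 apply the same serving pattern to the large checkpoints: deepspeed: true plus workers_per_group: 8 with hpus_per_worker: 1, so one Ray worker is pinned per Gaudi card and DeepSpeed inference shards the model across the group. A rough sanity-check sketch (the helper is illustrative and assumes one replica occupies workers_per_group times hpus_per_worker cards when DeepSpeed is enabled):

    import yaml

    def hpus_per_replica(conf_path: str) -> int:
        # Cards one replica will occupy under the deepspeed/workers_per_group pattern.
        with open(conf_path, encoding="utf-8") as reader:
            conf = yaml.load(reader, Loader=yaml.FullLoader)
        workers = conf.get("workers_per_group", 1) if conf.get("deepspeed") else 1
        return workers * conf.get("hpus_per_worker", 1)

    # Qwen1.5-110B-hpu.yaml and Mixtral-7B-hpu.yaml above -> 8 HPUs per replica.
    print(hpus_per_replica("llm_on_ray/inference/models/hpu/Qwen1.5-110B-hpu.yaml"))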
From 2b868ca7c20402aa188becaa74aabbf9c395aaa5 Mon Sep 17 00:00:00 2001
From: Deegue
Date: Wed, 12 Jun 2024 05:49:02 +0000
Subject: [PATCH 23/26] add Qwen1.5-7B-Chat

---
 .github/workflows/workflow_inference_gaudi2.yml    |  3 ++-
 .../inference/models/hpu/Qwen1.5-7B-Chat-hpu.yaml  | 12 ++++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)
 create mode 100644 llm_on_ray/inference/models/hpu/Qwen1.5-7B-Chat-hpu.yaml

diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml
index eea90fb8d..2bb6ee9b3 100644
--- a/.github/workflows/workflow_inference_gaudi2.yml
+++ b/.github/workflows/workflow_inference_gaudi2.yml
@@ -28,7 +28,7 @@ jobs:
     name: inference
     strategy:
       matrix:
-        model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, Qwen1.5-110B, mistral-7b-v0.1, Mixtral-7B, mpt-7b, llama-2-7b-chat-hf-vllm ]
+        model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, Qwen1.5-110B, mistral-7b-v0.1, Mixtral-7B, mpt-7b, Qwen1.5-7B-Chat, llama-2-7b-chat-hf-vllm ]
         isPR:
           - ${{inputs.ci_type == 'pr'}}
@@ -49,6 +49,7 @@ jobs:
           - { model: "meta-llama-3-70b-instruct"}
           - { model: "mistral-7b-v0.1"}
           - { model: "mpt-7b"}
+          - { model: "Qwen1.5-7B-Chat"}
           - { model: "llama-2-7b-chat-hf-vllm"}

     runs-on: gaudi2
diff --git a/llm_on_ray/inference/models/hpu/Qwen1.5-7B-Chat-hpu.yaml b/llm_on_ray/inference/models/hpu/Qwen1.5-7B-Chat-hpu.yaml
new file mode 100644
index 000000000..15680b4fb
--- /dev/null
+++ b/llm_on_ray/inference/models/hpu/Qwen1.5-7B-Chat-hpu.yaml
@@ -0,0 +1,12 @@
+port: 8000
+name: Qwen1.5-7B-Chat
+route_prefix: /Qwen1.5-7B-Chat
+num_replicas: 1
+cpus_per_worker: 8
+hpus_per_worker: 1
+device: hpu
+model_description:
+  model_id_or_path: Qwen/Qwen1.5-7B-Chat
+  tokenizer_name_or_path: Qwen/Qwen1.5-7B-Chat
+  config:
+    use_auth_token: ''

From 755593501be92f01f67774ab7c6a8d14b258a1ae Mon Sep 17 00:00:00 2001
From: Deegue
Date: Wed, 12 Jun 2024 05:56:46 +0000
Subject: [PATCH 24/26] add Qwen2-7B-Instruct

---
 .github/workflows/workflow_inference_gaudi2.yml      |  3 ++-
 .../inference/models/hpu/Qwen2-7B-Instruct-hpu.yaml  | 12 ++++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)
 create mode 100644 llm_on_ray/inference/models/hpu/Qwen2-7B-Instruct-hpu.yaml

diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml
index 2bb6ee9b3..0847eeefb 100644
--- a/.github/workflows/workflow_inference_gaudi2.yml
+++ b/.github/workflows/workflow_inference_gaudi2.yml
@@ -28,7 +28,7 @@ jobs:
     name: inference
     strategy:
       matrix:
-        model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, Qwen1.5-110B, mistral-7b-v0.1, Mixtral-7B, mpt-7b, Qwen1.5-7B-Chat, llama-2-7b-chat-hf-vllm ]
+        model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, Qwen1.5-110B, mistral-7b-v0.1, Mixtral-7B, mpt-7b, Qwen1.5-7B-Chat, Qwen2-7B-Instruct, llama-2-7b-chat-hf-vllm ]
         isPR:
           - ${{inputs.ci_type == 'pr'}}
@@ -50,6 +50,7 @@ jobs:
           - { model: "meta-llama-3-70b-instruct"}
           - { model: "mistral-7b-v0.1"}
           - { model: "mpt-7b"}
           - { model: "Qwen1.5-7B-Chat"}
+          - { model: "Qwen2-7B-Instruct"}
           - { model: "llama-2-7b-chat-hf-vllm"}

     runs-on: gaudi2
diff --git a/llm_on_ray/inference/models/hpu/Qwen2-7B-Instruct-hpu.yaml b/llm_on_ray/inference/models/hpu/Qwen2-7B-Instruct-hpu.yaml
new file mode 100644
index 000000000..09f705f12
--- /dev/null
+++ b/llm_on_ray/inference/models/hpu/Qwen2-7B-Instruct-hpu.yaml
@@ -0,0 +1,12 @@
+port: 8000
+name: Qwen2-7B-Instruct
+route_prefix: /Qwen2-7B-Instruct
+num_replicas: 1
+cpus_per_worker: 8
+hpus_per_worker: 1
+device: hpu
+model_description:
+  model_id_or_path: Qwen/Qwen2-7B-Instruct
+  tokenizer_name_or_path: Qwen/Qwen2-7B-Instruct
+  config:
+    use_auth_token: ''

From 53187b598ba931ae77f6d87caf7fd5c2509d835f Mon Sep 17 00:00:00 2001
From: Deegue
Date: Thu, 13 Jun 2024 06:02:07 +0000
Subject: [PATCH 25/26] remove several models

---
 .github/workflows/workflow_inference_gaudi2.yml    |  3 +--
 .../inference/models/hpu/Mixtral-7B-hpu.yaml       | 14 --------------
 .../inference/models/hpu/Qwen1.5-110B-hpu.yaml     | 14 --------------
 .../inference/models/hpu/Qwen1.5-7B-Chat-hpu.yaml  | 12 ------------
 4 files changed, 1 insertion(+), 42 deletions(-)
 delete mode 100644 llm_on_ray/inference/models/hpu/Mixtral-7B-hpu.yaml
 delete mode 100644 llm_on_ray/inference/models/hpu/Qwen1.5-110B-hpu.yaml
 delete mode 100644 llm_on_ray/inference/models/hpu/Qwen1.5-7B-Chat-hpu.yaml

diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml
index 0847eeefb..055d59026 100644
--- a/.github/workflows/workflow_inference_gaudi2.yml
+++ b/.github/workflows/workflow_inference_gaudi2.yml
@@ -28,7 +28,7 @@ jobs:
     name: inference
     strategy:
       matrix:
-        model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, Qwen1.5-110B, mistral-7b-v0.1, Mixtral-7B, mpt-7b, Qwen1.5-7B-Chat, Qwen2-7B-Instruct, llama-2-7b-chat-hf-vllm ]
+        model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, mistral-7b-v0.1, mpt-7b, Qwen2-7B-Instruct, llama-2-7b-chat-hf-vllm ]
         isPR:
           - ${{inputs.ci_type == 'pr'}}
@@ -49,7 +49,6 @@ jobs:
           - { model: "meta-llama-3-70b-instruct"}
           - { model: "mistral-7b-v0.1"}
           - { model: "mpt-7b"}
-          - { model: "Qwen1.5-7B-Chat"}
           - { model: "Qwen2-7B-Instruct"}
           - { model: "llama-2-7b-chat-hf-vllm"}
diff --git a/llm_on_ray/inference/models/hpu/Mixtral-7B-hpu.yaml b/llm_on_ray/inference/models/hpu/Mixtral-7B-hpu.yaml
deleted file mode 100644
index 2d52e217b..000000000
--- a/llm_on_ray/inference/models/hpu/Mixtral-7B-hpu.yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-port: 8000
-name: Mixtral-7B
-route_prefix: /Mixtral-7B
-num_replicas: 1
-cpus_per_worker: 8
-hpus_per_worker: 1
-deepspeed: true
-workers_per_group: 8
-device: hpu
-model_description:
-  model_id_or_path: mistralai/Mixtral-8x7B-Instruct-v0.1
-  tokenizer_name_or_path: mistralai/Mixtral-8x7B-Instruct-v0.1
-  config:
-    use_auth_token: ''
diff --git a/llm_on_ray/inference/models/hpu/Qwen1.5-110B-hpu.yaml b/llm_on_ray/inference/models/hpu/Qwen1.5-110B-hpu.yaml
deleted file mode 100644
index 5086c3d2e..000000000
--- a/llm_on_ray/inference/models/hpu/Qwen1.5-110B-hpu.yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-port: 8000
-name: Qwen1.5-110B
-route_prefix: /Qwen1.5-110B
-num_replicas: 1
-cpus_per_worker: 8
-hpus_per_worker: 1
-deepspeed: true
-workers_per_group: 8
-device: hpu
-model_description:
-  model_id_or_path: Qwen/Qwen1.5-110B
-  tokenizer_name_or_path: Qwen/Qwen1.5-110B
-  config:
-    use_auth_token: ''
diff --git a/llm_on_ray/inference/models/hpu/Qwen1.5-7B-Chat-hpu.yaml b/llm_on_ray/inference/models/hpu/Qwen1.5-7B-Chat-hpu.yaml
deleted file mode 100644
index 15680b4fb..000000000
--- a/llm_on_ray/inference/models/hpu/Qwen1.5-7B-Chat-hpu.yaml
+++ /dev/null
@@ -1,12 +0,0 @@
-port: 8000
-name: Qwen1.5-7B-Chat
-route_prefix: /Qwen1.5-7B-Chat
-num_replicas: 1
-cpus_per_worker: 8
-hpus_per_worker: 1
-device: hpu
-model_description:
-  model_id_or_path: Qwen/Qwen1.5-7B-Chat
-  tokenizer_name_or_path: Qwen/Qwen1.5-7B-Chat
-  config:
-    use_auth_token: ''

From 6d16dd4fe91390f6c63c4b8d2160403cc0a0b19f Mon Sep 17 00:00:00 2001
From: Deegue
Date: Tue, 18 Jun 2024 05:19:28 +0000
Subject: [PATCH 26/26] add falcon qwen linear all reduce to hpu_predictor

---
 llm_on_ray/inference/predictors/hpu_predictor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llm_on_ray/inference/predictors/hpu_predictor.py b/llm_on_ray/inference/predictors/hpu_predictor.py
index 4710e0bf9..5292a2ca8 100644
--- a/llm_on_ray/inference/predictors/hpu_predictor.py
+++ b/llm_on_ray/inference/predictors/hpu_predictor.py
@@ -337,7 +337,7 @@ def load_model(self):
             engine = deepspeed.init_inference(model, **ds_inference_kwargs)
             self.model = engine.module
-            if self.model.config.model_type == "llama":
+            if self.model.config.model_type in ["llama", "falcon", "qwen2"]:

                 def patch_scoped_linear_all_reduce(model):
                     from deepspeed.module_inject.layers import LinearAllreduce
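The final hunk is cut off after the LinearAllreduce import, so the body of patch_scoped_linear_all_reduce is not shown above. As a hedged sketch of the usual pattern in Gaudi DeepSpeed inference code (the ScopedLinearAllReduce import path follows optimum-habana and is an assumption, not taken from this patch), the helper walks the module tree and swaps each LinearAllreduce for a scoped variant; PATCH 26 only extends the set of model types that take this path from llama to falcon and qwen2 as well:

    # Sketch only: modelled on the optimum-habana helper, not copied from this PR.
    def patch_scoped_linear_all_reduce(model):
        from deepspeed.module_inject.layers import LinearAllreduce
        from optimum.habana.transformers.models.modeling_all_models import (
            ScopedLinearAllReduce,  # assumed import location
        )

        for name, module in model.named_children():
            if type(module) is LinearAllreduce:
                # Replace the plain all-reduce linear with a scoped variant so the
                # collective can be deferred and issued once per decoder block.
                setattr(model, name, ScopedLinearAllReduce(mod=module))
            patch_scoped_linear_all_reduce(module)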