From 94df92c6fafefc7104c55f718b61904056e3a5ed Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Fri, 19 Apr 2024 14:36:48 +0000 Subject: [PATCH 01/47] integrate inference chat template Signed-off-by: minmingzhu --- llm_on_ray/inference/inference_config.py | 19 +++ .../inference/models/CodeLlama-7b-hf.yaml | 8 +- llm_on_ray/inference/models/bloom-560m.yaml | 10 +- .../models/deepseek-coder-33b-instruct.yaml | 12 +- llm_on_ray/inference/models/deplot.yaml | 15 +- llm_on_ray/inference/models/falcon-7b.yaml | 8 +- llm_on_ray/inference/models/fuyu8b.yaml | 15 +- llm_on_ray/inference/models/gemma-2b.yaml | 10 +- llm_on_ray/inference/models/gpt-j-6b.yaml | 17 +-- llm_on_ray/inference/models/gpt2.yaml | 8 +- .../inference/models/llama-2-7b-chat-hf.yaml | 10 +- .../models/mistral-7b-Instruct-v0.2.yaml | 10 +- .../inference/models/mistral-7b-v0.1.yaml | 11 +- llm_on_ray/inference/models/mpt-7b.yaml | 15 +- .../inference/models/neural-chat-7b-v3-1.yaml | 13 +- llm_on_ray/inference/models/opt-125m.yaml | 8 +- .../inference/models/sqlcoder-7b-2.yaml | 8 +- llm_on_ray/inference/models/starcoder.yaml | 8 +- llm_on_ray/inference/predictor_deployment.py | 144 ++++++++++++------ 19 files changed, 145 insertions(+), 204 deletions(-) diff --git a/llm_on_ray/inference/inference_config.py b/llm_on_ray/inference/inference_config.py index 96833c24b..6842fe63e 100644 --- a/llm_on_ray/inference/inference_config.py +++ b/llm_on_ray/inference/inference_config.py @@ -112,6 +112,25 @@ class ModelDescription(BaseModel): input_processor: str = "AutoProcessor" model_loader: str = "AutoModel" + chat_model_with_image: bool = False + chat_template: Union[str, None] = None + default_chat_template: str = ( + "{{ bos_token }}" + "{% if messages[0]['role'] == 'system' %}" + "{{ raise_exception('System role not supported') }}" + "{% endif %}" + "{% for message in messages %}" + "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" + "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}" + "{% endif %}" + "{% if message['role'] == 'user' %}" + "{{ '### Instruction: ' + message['content'] + eos_token }}" + "{% elif message['role'] == 'assistant' %}" + "{{ '### Response:' + message['content'] + eos_token }}" + "{% endif %}{% endfor %}" + "{{'### End \n'}}" + ) + @validator("quantization_type") def _check_quant_type(cls, v: str): if v: diff --git a/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml b/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml index 9ea2d77db..3c93da6b6 100644 --- a/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml +++ b/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml @@ -6,16 +6,10 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: "cpu" ipex: enabled: false precision: bf16 model_description: model_id_or_path: codellama/CodeLlama-7b-hf tokenizer_name_or_path: codellama/CodeLlama-7b-hf - chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] diff --git a/llm_on_ray/inference/models/bloom-560m.yaml b/llm_on_ray/inference/models/bloom-560m.yaml index ba2a6d962..92dbb8809 100644 --- a/llm_on_ray/inference/models/bloom-560m.yaml +++ b/llm_on_ray/inference/models/bloom-560m.yaml @@ -6,16 +6,10 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: CPU ipex: - enabled: true + enabled: false precision: bf16 model_description: model_id_or_path: bigscience/bloom-560m tokenizer_name_or_path: bigscience/bloom-560m - chat_processor: 
ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] diff --git a/llm_on_ray/inference/models/deepseek-coder-33b-instruct.yaml b/llm_on_ray/inference/models/deepseek-coder-33b-instruct.yaml index 75e646a44..84f6d2a43 100644 --- a/llm_on_ray/inference/models/deepseek-coder-33b-instruct.yaml +++ b/llm_on_ray/inference/models/deepseek-coder-33b-instruct.yaml @@ -10,16 +10,6 @@ device: cpu ipex: enabled: false precision: bf16 -model_description: +model_description: model_id_or_path: deepseek-ai/deepseek-coder-33b-instruct tokenizer_name_or_path: deepseek-ai/deepseek-coder-33b-instruct - chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: ['<|EOT|>', ""] - - - - diff --git a/llm_on_ray/inference/models/deplot.yaml b/llm_on_ray/inference/models/deplot.yaml index 4e732a4fe..6b518def9 100644 --- a/llm_on_ray/inference/models/deplot.yaml +++ b/llm_on_ray/inference/models/deplot.yaml @@ -6,22 +6,11 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: "cpu" ipex: enabled: false precision: bf16 model_description: model_id_or_path: google/deplot tokenizer_name_or_path: google/deplot - chat_processor: ChatModelwithImage - input_processor: 'AutoProcessor' - model_loader: 'Pix2StructForConditionalGeneration' - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] - config: - use_auth_token: '' + chat_model_with_image: true diff --git a/llm_on_ray/inference/models/falcon-7b.yaml b/llm_on_ray/inference/models/falcon-7b.yaml index 8176a2689..5a59e6e47 100644 --- a/llm_on_ray/inference/models/falcon-7b.yaml +++ b/llm_on_ray/inference/models/falcon-7b.yaml @@ -6,16 +6,10 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: "cpu" ipex: enabled: false precision: bf16 model_description: model_id_or_path: tiiuae/falcon-7b tokenizer_name_or_path: tiiuae/falcon-7b - chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] diff --git a/llm_on_ray/inference/models/fuyu8b.yaml b/llm_on_ray/inference/models/fuyu8b.yaml index 551a85789..c45affbae 100644 --- a/llm_on_ray/inference/models/fuyu8b.yaml +++ b/llm_on_ray/inference/models/fuyu8b.yaml @@ -6,22 +6,11 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: "cpu" ipex: enabled: false precision: bf16 model_description: model_id_or_path: adept/fuyu-8b tokenizer_name_or_path: adept/fuyu-8b - chat_processor: ChatModelwithImage - input_processor: FuyuProcessor - model_loader: FuyuForCausalLM - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] - config: - use_auth_token: '' + chat_model_with_image: true diff --git a/llm_on_ray/inference/models/gemma-2b.yaml b/llm_on_ray/inference/models/gemma-2b.yaml index 8335857ca..c08bd571b 100644 --- a/llm_on_ray/inference/models/gemma-2b.yaml +++ b/llm_on_ray/inference/models/gemma-2b.yaml @@ -6,20 +6,12 @@ cpus_per_worker: 2 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: CPU ipex: enabled: true precision: bf16 model_description: model_id_or_path: google/gemma-2b tokenizer_name_or_path: google/gemma-2b - chat_processor: ChatModelGemma - prompt: - intro: '' - human_id: 'user - {msg}' - bot_id: 'model - {msg}' - stop_words: [] config: use_auth_token: ' ' diff --git a/llm_on_ray/inference/models/gpt-j-6b.yaml b/llm_on_ray/inference/models/gpt-j-6b.yaml index c7778c12e..b2a7a04df 
100644 --- a/llm_on_ray/inference/models/gpt-j-6b.yaml +++ b/llm_on_ray/inference/models/gpt-j-6b.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: "cpu" ipex: # false here for ci coverage enabled: false @@ -14,17 +14,4 @@ ipex: model_description: model_id_or_path: EleutherAI/gpt-j-6b tokenizer_name_or_path: EleutherAI/gpt-j-6b - chat_processor: ChatModelGptJ - gpt_base_model: true - prompt: - intro: 'Below is an instruction that describes a task. Write a response that appropriately - completes the request. - - ' - human_id: ' - - ### Instruction' - bot_id: ' - - ### Response' - stop_words: [] + gpt_base_model: true \ No newline at end of file diff --git a/llm_on_ray/inference/models/gpt2.yaml b/llm_on_ray/inference/models/gpt2.yaml index 48287670a..2d44b9882 100644 --- a/llm_on_ray/inference/models/gpt2.yaml +++ b/llm_on_ray/inference/models/gpt2.yaml @@ -6,17 +6,11 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: CPU ipex: enabled: true precision: bf16 model_description: model_id_or_path: gpt2 tokenizer_name_or_path: gpt2 - chat_processor: ChatModelGptJ gpt_base_model: true - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] diff --git a/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml b/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml index 4b3e11e98..4f2ed0194 100644 --- a/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml +++ b/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml @@ -6,20 +6,12 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: "cpu" ipex: enabled: false precision: bf16 model_description: model_id_or_path: meta-llama/Llama-2-7b-chat-hf tokenizer_name_or_path: meta-llama/Llama-2-7b-chat-hf - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml b/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml index 1af9aad1b..ab901eb95 100644 --- a/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml +++ b/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml @@ -5,19 +5,13 @@ cpus_per_worker: 48 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: CPU ipex: enabled: false precision: bf16 model_description: model_id_or_path: mistralai/Mistral-7B-Instruct-v0.2 - ipexllm: false + bigdl: false tokenizer_name_or_path: mistralai/Mistral-7B-Instruct-v0.2 - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST]' - bot_id: '' - stop_words: [] config: trust_remote_code: true diff --git a/llm_on_ray/inference/models/mistral-7b-v0.1.yaml b/llm_on_ray/inference/models/mistral-7b-v0.1.yaml index c8a0ff385..d5dbec146 100644 --- a/llm_on_ray/inference/models/mistral-7b-v0.1.yaml +++ b/llm_on_ray/inference/models/mistral-7b-v0.1.yaml @@ -6,19 +6,14 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: CPU ipex: enabled: false precision: bf16 model_description: model_id_or_path: mistralai/Mistral-7B-v0.1 - ipexllm: false + bigdl: false tokenizer_name_or_path: mistralai/Mistral-7B-v0.1 - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST]' - bot_id: '' - stop_words: [] + config: trust_remote_code: true diff --git a/llm_on_ray/inference/models/mpt-7b.yaml 
b/llm_on_ray/inference/models/mpt-7b.yaml index 4ea12adb3..80f062a82 100644 --- a/llm_on_ray/inference/models/mpt-7b.yaml +++ b/llm_on_ray/inference/models/mpt-7b.yaml @@ -6,25 +6,12 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: CPU ipex: enabled: false precision: bf16 model_description: model_id_or_path: mosaicml/mpt-7b tokenizer_name_or_path: EleutherAI/gpt-neox-20b - chat_processor: ChatModelGptJ - prompt: - intro: 'Below is an instruction that describes a task, paired with an input that - provides further context. Write a response that appropriately completes the request. - - ' - human_id: ' - - ### Instruction' - bot_id: ' - - ### Response' - stop_words: [] config: trust_remote_code: true diff --git a/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml b/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml index 13a29676c..670654715 100644 --- a/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml +++ b/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml @@ -6,20 +6,11 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: CPU ipex: enabled: false precision: bf16 model_description: model_id_or_path: Intel/neural-chat-7b-v3-1 tokenizer_name_or_path: Intel/neural-chat-7b-v3-1 - chat_processor: ChatModelGptJ - prompt: - intro: '### System: - You are a chatbot developed by Intel. Please answer all questions to the best of your ability.' - human_id: ' - - ### User' - bot_id: ' - - ### Assistant' + chat_template: "{{ bos_token }}'### System:You are a chatbot developed by Intel. Please answer all questions to the best of your ability.\n'{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '### User: ' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ '### Assistant:' + message['content'] + eos_token }}{% endif %}{% endfor %}" diff --git a/llm_on_ray/inference/models/opt-125m.yaml b/llm_on_ray/inference/models/opt-125m.yaml index 545cd2145..e5f431b42 100644 --- a/llm_on_ray/inference/models/opt-125m.yaml +++ b/llm_on_ray/inference/models/opt-125m.yaml @@ -6,16 +6,10 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: CPU ipex: enabled: false precision: bf16 model_description: model_id_or_path: facebook/opt-125m tokenizer_name_or_path: facebook/opt-125m - chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] diff --git a/llm_on_ray/inference/models/sqlcoder-7b-2.yaml b/llm_on_ray/inference/models/sqlcoder-7b-2.yaml index 7130148a3..be85f8463 100644 --- a/llm_on_ray/inference/models/sqlcoder-7b-2.yaml +++ b/llm_on_ray/inference/models/sqlcoder-7b-2.yaml @@ -5,18 +5,12 @@ cpus_per_worker: 22 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: "cpu" ipex: enabled: false precision: bf16 model_description: model_id_or_path: defog/sqlcoder-7b-2 tokenizer_name_or_path: defog/sqlcoder-7b-2 - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: ["```"] config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/starcoder.yaml b/llm_on_ray/inference/models/starcoder.yaml index 0da59ac02..2044cf109 100644 --- 
a/llm_on_ray/inference/models/starcoder.yaml +++ b/llm_on_ray/inference/models/starcoder.yaml @@ -9,15 +9,9 @@ workers_per_group: 2 ipex: enabled: false precision: bf16 -device: cpu +device: "cpu" model_description: model_id_or_path: bigcode/starcoder tokenizer_name_or_path: bigcode/starcoder - chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index 18b23d86b..bf5cc35af 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -51,31 +51,11 @@ def __init__( max_batch_size=_DEFAULT_MAX_BATCH_SIZE, ): self.device = torch.device(infer_conf.device) - self.process_tool = None - chat_processor_name = infer_conf.model_description.chat_processor - prompt = infer_conf.model_description.prompt self.handle_dynamic_batch.set_max_batch_size(max_batch_size) - - if chat_processor_name: - try: - module = __import__("chat_process") - except Exception: - sys.path.append(os.path.dirname(__file__)) - module = __import__("chat_process") - chat_processor = getattr(module, chat_processor_name, None) - if chat_processor is None: - raise ValueError( - infer_conf.name - + " deployment failed. chat_processor(" - + chat_processor_name - + ") does not exist." - ) - self.process_tool = chat_processor(**prompt.dict()) - self.use_deepspeed = infer_conf.deepspeed self.use_vllm = infer_conf.vllm.enabled - self.is_mllm = True if chat_processor_name in ["ChatModelwithImage"] else False + self.is_mllm = infer_conf.model_description.chat_model_with_image # Used to determine if openai backend is used self.use_openai = False @@ -305,12 +285,12 @@ async def handle_static_batch(self, prompts: List[str], **config: Dict[str, Any] preprocessing_time=0, ) - def preprocess_prompts(self, input: Union[str, List], tools=None, tool_choice=None): + def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=None): """ Preprocesses the input prompts. Args: - input (Union[str, List[str]]): The input prompt(s) to be preprocessed. + input (Union[str, List[dict]]): The input prompt(s) to be preprocessed. tools (List[str]): The list of tools to be used. tool_choice: The choice of tool to be used. @@ -327,12 +307,14 @@ def preprocess_prompts(self, input: Union[str, List], tools=None, tool_choice=No Raises: HTTPException: If the input prompt format is invalid or not supported. 
""" + + logger.info("preprocess_prompts") + logger.info(type(input)) + logger.info(input) + if isinstance(input, str): return input - elif isinstance(input, List): - prompts = [] - images = [] - + elif isinstance(input, list): prompt_format = get_prompt_format(input) if prompt_format == PromptFormat.CHAT_FORMAT: # Process the input prompts with tools @@ -349,27 +331,55 @@ def preprocess_prompts(self, input: Union[str, List], tools=None, tool_choice=No m.content = self.openai_tools_prompter.content_from_assistant(m) # type: ignore elif m.tool_call_id is not None: # type: ignore m.content = self.openai_tools_prompter.content_from_tool(m) # type: ignore - # Process the input prompts with MLLM tool - if self.process_tool is not None: - if self.is_mllm: - input, image = self.process_tool.get_prompt(input) - prompts.append(input) - images.extend(image) - return prompts, images - else: - prompt = self.process_tool.get_prompt(input) - return prompt + + if self.predictor.infer_conf.model_description.chat_template is not None: + self.predictor.tokenizer.chat_template = self.predictor.infer_conf.model_description.chat_template + elif self.predictor.tokenizer.chat_template is None: + self.predictor.tokenizer.chat_template = self.predictor.infer_conf.model_description.default_chat_template + + if self.is_mllm: + if isinstance(input, list): + if isinstance(input, list) and input and isinstance(input[0], ChatMessage): + messages = [] + for chat_message in input: + message = {"role": chat_message.role, "content": chat_message.content} + messages.append(message) + texts, images = self._extract_messages(messages) + elif isinstance(input, list) and input and isinstance(input[0], dict): + texts, images = self._extract_messages(input) + elif isinstance(input, list) and input and isinstance(input[0], list): + texts, images = [self._extract_messages(p) for p in input] + + image = self._prepare_image(images) + prompt = self.tokenize_inputs(texts) + return prompt, image else: - prompts.extend(input) + if isinstance(input, list) and input and isinstance(input[0], dict): + prompt = self.predictor.tokenizer.apply_chat_template(input, tokenize=False) + elif isinstance(input, list) and input and isinstance(input[0], list): + prompt = [self.predictor.tokenizer.apply_chat_template(t, tokenize=False) for t in input] + elif isinstance(input, list) and input and isinstance(input[0], ChatMessage): + messages = [] + for chat_message in input: + message = {"role": chat_message.role, "content": chat_message.content} + messages.append(message) + prompt = self.predictor.tokenizer.apply_chat_template(messages, tokenize=False) + elif isinstance(input, list) and input and isinstance(input[0], str): + prompt = input + elif isinstance(input, str): + prompt = input + else: + raise TypeError(f"Unsupported type {type(input)} for text. 
Expected dict or list of dicts.") + logger.info(prompt) + return prompt elif prompt_format == PromptFormat.PROMPTS_FORMAT: - prompts.extend(input) - else: raise HTTPException(400, "Invalid prompt format.") - return prompts + return input else: raise HTTPException(400, "Invalid prompt format.") async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSONResponse, str]: + logger.info("PredictorDeployment call") self.use_openai = False try: @@ -379,7 +389,6 @@ async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSON status_code=400, content="Invalid JSON format from http request.", ) - streaming_response = json_request["stream"] if "stream" in json_request else False input = json_request["text"] if "text" in json_request else "" if input == "": @@ -388,7 +397,7 @@ async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSON content="Empty prompt is not supported.", ) config = json_request["config"] if "config" in json_request else {} - + logger.info(input) # return prompt or list of prompts preprocessed prompts = self.preprocess_prompts(input) @@ -418,3 +427,52 @@ async def openai_call( yield result else: yield await self.handle_non_streaming(prompts, config) + + + def _extract_messages(messages): + texts, images = [], [] + for message in messages: + if message['role'] == 'user' and isinstance(message['content'], list): + texts.append({"role": "user", "content": message['content'][0]['text']}) + images.append({"role": "user", "content": message['content'][1]['image_url']['url']}) + else: + texts.append(message) + return texts, images + + def _prepare_image(self, messages: Union[List[dict], List[List[dict]]]): + """Prepare image from history messages.""" + from PIL import Image + import requests + from io import BytesIO + import base64 + import re + + # prepare images + images = [] + if isinstance(messages[0], list): + for i in len(messages): + for msg in messages[i]: + msg = dict(msg) + role, content = msg["role"], msg["content"] + if "url" not in content: + continue + is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0 + if is_data: + encoded_str = re.sub("^data:image/.+;base64,", "", content["url"]) + images[i].append(Image.open(BytesIO(base64.b64decode(encoded_str)))) + else: + images[i].append(Image.open(requests.get(content["url"], stream=True).raw)) + else: + for msg in messages: + msg = dict(msg) + role, content = msg["role"], msg["content"] + if "url" not in content: + continue + is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0 + if is_data: + encoded_str = re.sub("^data:image/.+;base64,", "", content["url"]) + images.append(Image.open(BytesIO(base64.b64decode(encoded_str)))) + else: + images.append(Image.open(requests.get(content["url"], stream=True).raw)) + + return images \ No newline at end of file From f84756980cd5ef8596aba1d3918b79cc834d88e8 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 22 Apr 2024 01:00:41 +0000 Subject: [PATCH 02/47] update Signed-off-by: minmingzhu --- .github/workflows/config/mpt_deltatuner.yaml | 15 --------------- .../models/hpu/llama-2-70b-chat-hf-hpu.yaml | 8 -------- .../models/hpu/llama-2-7b-chat-hf-hpu.yaml | 8 -------- .../inference/models/hpu/neural-chat-7b-v3-3.yaml | 10 ---------- .../models/ipex-llm/mistral-7b-v0.1-ipex-llm.yaml | 6 ------ .../models/ipex-llm/mpt-7b-ipex-llm.yaml | 13 ------------- llm_on_ray/inference/models/mistral-7b-v0.1.yaml | 1 - .../models/vllm/llama-2-7b-chat-hf-vllm.yaml | 8 -------- 
llm_on_ray/inference/utils.py | 6 +++--- 9 files changed, 3 insertions(+), 72 deletions(-) diff --git a/.github/workflows/config/mpt_deltatuner.yaml b/.github/workflows/config/mpt_deltatuner.yaml index 250004dc2..7399d587b 100644 --- a/.github/workflows/config/mpt_deltatuner.yaml +++ b/.github/workflows/config/mpt_deltatuner.yaml @@ -13,20 +13,5 @@ ipex: model_description: model_id_or_path: mosaicml/mpt-7b tokenizer_name_or_path: EleutherAI/gpt-neox-20b - chat_processor: ChatModelGptJ - peft_model_id_or_path: nathan0/mpt-7b-deltatuner-model - peft_type: deltatuner - prompt: - intro: 'Below is an instruction that describes a task, paired with an input that - provides further context. Write a response that appropriately completes the request. - - ' - human_id: ' - - ### Instruction' - bot_id: ' - - ### Response' - stop_words: [] config: trust_remote_code: true diff --git a/llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml b/llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml index d68da8428..ab411ff0e 100644 --- a/llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml @@ -10,13 +10,5 @@ device: hpu model_description: model_id_or_path: meta-llama/Llama-2-70b-chat-hf tokenizer_name_or_path: meta-llama/Llama-2-70b-chat-hf - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml b/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml index 374a98f77..b7b19f02a 100644 --- a/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml @@ -8,13 +8,5 @@ device: hpu model_description: model_id_or_path: meta-llama/Llama-2-7b-chat-hf tokenizer_name_or_path: meta-llama/Llama-2-7b-chat-hf - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml index 848358bec..00ff121c5 100644 --- a/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml +++ b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml @@ -14,13 +14,3 @@ ipex: model_description: model_id_or_path: Intel/neural-chat-7b-v3-3 tokenizer_name_or_path: Intel/neural-chat-7b-v3-3 - chat_processor: ChatModelGptJ - prompt: - intro: '### System: - You are a chatbot developed by Intel. Please answer all questions to the best of your ability.' 
- human_id: ' - - ### User' - bot_id: ' - - ### Assistant' diff --git a/llm_on_ray/inference/models/ipex-llm/mistral-7b-v0.1-ipex-llm.yaml b/llm_on_ray/inference/models/ipex-llm/mistral-7b-v0.1-ipex-llm.yaml index 6a8523467..5ec652bba 100644 --- a/llm_on_ray/inference/models/ipex-llm/mistral-7b-v0.1-ipex-llm.yaml +++ b/llm_on_ray/inference/models/ipex-llm/mistral-7b-v0.1-ipex-llm.yaml @@ -14,12 +14,6 @@ model_description: model_id_or_path: mistralai/Mistral-7B-v0.1 ipexllm: true tokenizer_name_or_path: mistralai/Mistral-7B-v0.1 - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST]' - bot_id: '' - stop_words: [] config: trust_remote_code: true load_in_4bit: true diff --git a/llm_on_ray/inference/models/ipex-llm/mpt-7b-ipex-llm.yaml b/llm_on_ray/inference/models/ipex-llm/mpt-7b-ipex-llm.yaml index d352a6517..ecb129973 100644 --- a/llm_on_ray/inference/models/ipex-llm/mpt-7b-ipex-llm.yaml +++ b/llm_on_ray/inference/models/ipex-llm/mpt-7b-ipex-llm.yaml @@ -14,19 +14,6 @@ model_description: model_id_or_path: mosaicml/mpt-7b-chat ipexllm: true tokenizer_name_or_path: EleutherAI/gpt-neox-20b - chat_processor: ChatModelGptJ - prompt: - intro: 'Below is an instruction that describes a task, paired with an input that - provides further context. Write a response that appropriately completes the request. - - ' - human_id: ' - - ### Instruction' - bot_id: ' - - ### Response' - stop_words: [] config: trust_remote_code: true load_in_4bit: true diff --git a/llm_on_ray/inference/models/mistral-7b-v0.1.yaml b/llm_on_ray/inference/models/mistral-7b-v0.1.yaml index d5dbec146..12de7c3bb 100644 --- a/llm_on_ray/inference/models/mistral-7b-v0.1.yaml +++ b/llm_on_ray/inference/models/mistral-7b-v0.1.yaml @@ -14,6 +14,5 @@ model_description: model_id_or_path: mistralai/Mistral-7B-v0.1 bigdl: false tokenizer_name_or_path: mistralai/Mistral-7B-v0.1 - config: trust_remote_code: true diff --git a/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml b/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml index acbf58455..9302b9be2 100644 --- a/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml +++ b/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml @@ -16,13 +16,5 @@ ipex: model_description: model_id_or_path: meta-llama/Llama-2-7b-chat-hf tokenizer_name_or_path: meta-llama/Llama-2-7b-chat-hf - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/llm_on_ray/inference/utils.py b/llm_on_ray/inference/utils.py index 91e311088..5a8db0401 100644 --- a/llm_on_ray/inference/utils.py +++ b/llm_on_ray/inference/utils.py @@ -162,13 +162,13 @@ def is_cpu_without_ipex(infer_conf: InferenceConfig) -> bool: return (not infer_conf.ipex.enabled) and infer_conf.device == DEVICE_CPU -def get_prompt_format(input: Union[List[str], List[dict], List[ChatMessage]]): +def get_prompt_format(input: Union[List[str], List[dict], List[List[dict]], List[ChatMessage]]): chat_format = True prompts_format = True for item in input: - if isinstance(item, str) or isinstance(item, list): + if isinstance(item, str): chat_format = False - elif isinstance(item, dict) or isinstance(item, ChatMessage): + elif isinstance(item, dict) or isinstance(item, ChatMessage) or isinstance(item, list): prompts_format = False else: chat_format = False From 0df70f1fa0c1f676398971eea06517ea22b9a8af Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 22 Apr 2024 02:31:09 +0000 Subject: [PATCH 
03/47] update Signed-off-by: minmingzhu --- llm_on_ray/inference/predictor_deployment.py | 56 +++++++++++++------- 1 file changed, 36 insertions(+), 20 deletions(-) diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index bf5cc35af..1d693f272 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -285,6 +285,7 @@ async def handle_static_batch(self, prompts: List[str], **config: Dict[str, Any] preprocessing_time=0, ) + # TODO:Abstract the preprocess_prompts function into a class for handling chat templates def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=None): """ Preprocesses the input prompts. @@ -333,16 +334,23 @@ def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=No m.content = self.openai_tools_prompter.content_from_tool(m) # type: ignore if self.predictor.infer_conf.model_description.chat_template is not None: - self.predictor.tokenizer.chat_template = self.predictor.infer_conf.model_description.chat_template + self.predictor.tokenizer.chat_template = ( + self.predictor.infer_conf.model_description.chat_template + ) elif self.predictor.tokenizer.chat_template is None: - self.predictor.tokenizer.chat_template = self.predictor.infer_conf.model_description.default_chat_template + self.predictor.tokenizer.chat_template = ( + self.predictor.infer_conf.model_description.default_chat_template + ) if self.is_mllm: if isinstance(input, list): if isinstance(input, list) and input and isinstance(input[0], ChatMessage): messages = [] for chat_message in input: - message = {"role": chat_message.role, "content": chat_message.content} + message = { + "role": chat_message.role, + "content": chat_message.content, + } messages.append(message) texts, images = self._extract_messages(messages) elif isinstance(input, list) and input and isinstance(input[0], dict): @@ -351,25 +359,32 @@ def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=No texts, images = [self._extract_messages(p) for p in input] image = self._prepare_image(images) - prompt = self.tokenize_inputs(texts) + prompt = self.predictor.tokenizer.apply_chat_template(texts, tokenize=False) return prompt, image else: if isinstance(input, list) and input and isinstance(input[0], dict): prompt = self.predictor.tokenizer.apply_chat_template(input, tokenize=False) elif isinstance(input, list) and input and isinstance(input[0], list): - prompt = [self.predictor.tokenizer.apply_chat_template(t, tokenize=False) for t in input] + prompt = [ + self.predictor.tokenizer.apply_chat_template(t, tokenize=False) + for t in input + ] elif isinstance(input, list) and input and isinstance(input[0], ChatMessage): messages = [] for chat_message in input: message = {"role": chat_message.role, "content": chat_message.content} messages.append(message) - prompt = self.predictor.tokenizer.apply_chat_template(messages, tokenize=False) + prompt = self.predictor.tokenizer.apply_chat_template( + messages, tokenize=False + ) elif isinstance(input, list) and input and isinstance(input[0], str): prompt = input elif isinstance(input, str): prompt = input else: - raise TypeError(f"Unsupported type {type(input)} for text. Expected dict or list of dicts.") + raise TypeError( + f"Unsupported type {type(input)} for text. Expected dict or list of dicts." 
+ ) logger.info(prompt) return prompt elif prompt_format == PromptFormat.PROMPTS_FORMAT: @@ -428,18 +443,19 @@ async def openai_call( else: yield await self.handle_non_streaming(prompts, config) - - def _extract_messages(messages): + def _extract_messages(self, messages): texts, images = [], [] for message in messages: - if message['role'] == 'user' and isinstance(message['content'], list): - texts.append({"role": "user", "content": message['content'][0]['text']}) - images.append({"role": "user", "content": message['content'][1]['image_url']['url']}) + if message["role"] == "user" and isinstance(message["content"], list): + texts.append({"role": "user", "content": message["content"][0]["text"]}) + images.append( + {"role": "user", "content": message["content"][1]["image_url"]["url"]} + ) else: texts.append(message) return texts, images - def _prepare_image(self, messages: Union[List[dict], List[List[dict]]]): + def _prepare_image(self, messages: list): """Prepare image from history messages.""" from PIL import Image import requests @@ -448,12 +464,12 @@ def _prepare_image(self, messages: Union[List[dict], List[List[dict]]]): import re # prepare images - images = [] - if isinstance(messages[0], list): - for i in len(messages): + images: List = [] + if isinstance(messages[0], List): + for i in range(len(messages)): for msg in messages[i]: msg = dict(msg) - role, content = msg["role"], msg["content"] + content = msg["content"] if "url" not in content: continue is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0 @@ -462,10 +478,10 @@ def _prepare_image(self, messages: Union[List[dict], List[List[dict]]]): images[i].append(Image.open(BytesIO(base64.b64decode(encoded_str)))) else: images[i].append(Image.open(requests.get(content["url"], stream=True).raw)) - else: + elif isinstance(messages[0], dict): for msg in messages: msg = dict(msg) - role, content = msg["role"], msg["content"] + content = msg["content"] if "url" not in content: continue is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0 @@ -475,4 +491,4 @@ def _prepare_image(self, messages: Union[List[dict], List[List[dict]]]): else: images.append(Image.open(requests.get(content["url"], stream=True).raw)) - return images \ No newline at end of file + return images From 6534808bf85df964a853be4cf0377f5db5f4bca7 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 22 Apr 2024 03:24:45 +0000 Subject: [PATCH 04/47] update Signed-off-by: minmingzhu --- .github/workflows/config/bloom-560m-ci.yaml | 6 ------ .github/workflows/config/gpt2-ci.yaml | 6 ------ .../config/llama-2-7b-chat-hf-vllm-fp32.yaml | 8 -------- .../workflows/config/mpt_deltatuner_deepspeed.yaml | 13 ------------- .github/workflows/config/opt-125m-ci.yaml | 6 ------ llm_on_ray/inference/models/opt-125m.yaml | 2 +- llm_on_ray/inference/predictor_deployment.py | 6 ------ 7 files changed, 1 insertion(+), 46 deletions(-) diff --git a/.github/workflows/config/bloom-560m-ci.yaml b/.github/workflows/config/bloom-560m-ci.yaml index 16a97d896..674644798 100644 --- a/.github/workflows/config/bloom-560m-ci.yaml +++ b/.github/workflows/config/bloom-560m-ci.yaml @@ -13,9 +13,3 @@ ipex: model_description: model_id_or_path: bigscience/bloom-560m tokenizer_name_or_path: bigscience/bloom-560m - chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] diff --git a/.github/workflows/config/gpt2-ci.yaml b/.github/workflows/config/gpt2-ci.yaml index 1e6df57cb..e9bed1366 100644 --- a/.github/workflows/config/gpt2-ci.yaml 
+++ b/.github/workflows/config/gpt2-ci.yaml @@ -12,10 +12,4 @@ ipex: model_description: model_id_or_path: gpt2 tokenizer_name_or_path: gpt2 - chat_processor: ChatModelGptJ gpt_base_model: true - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] diff --git a/.github/workflows/config/llama-2-7b-chat-hf-vllm-fp32.yaml b/.github/workflows/config/llama-2-7b-chat-hf-vllm-fp32.yaml index 46be6eb57..d3d96a0e1 100644 --- a/.github/workflows/config/llama-2-7b-chat-hf-vllm-fp32.yaml +++ b/.github/workflows/config/llama-2-7b-chat-hf-vllm-fp32.yaml @@ -16,13 +16,5 @@ ipex: model_description: model_id_or_path: meta-llama/Llama-2-7b-chat-hf tokenizer_name_or_path: meta-llama/Llama-2-7b-chat-hf - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/.github/workflows/config/mpt_deltatuner_deepspeed.yaml b/.github/workflows/config/mpt_deltatuner_deepspeed.yaml index 40051e0fa..a4fdd0709 100644 --- a/.github/workflows/config/mpt_deltatuner_deepspeed.yaml +++ b/.github/workflows/config/mpt_deltatuner_deepspeed.yaml @@ -13,20 +13,7 @@ ipex: model_description: model_id_or_path: mosaicml/mpt-7b tokenizer_name_or_path: EleutherAI/gpt-neox-20b - chat_processor: ChatModelGptJ peft_model_id_or_path: nathan0/mpt-7b-deltatuner-model peft_type: deltatuner - prompt: - intro: 'Below is an instruction that describes a task, paired with an input that - provides further context. Write a response that appropriately completes the request. - - ' - human_id: ' - - ### Instruction' - bot_id: ' - - ### Response' - stop_words: [] config: trust_remote_code: true diff --git a/.github/workflows/config/opt-125m-ci.yaml b/.github/workflows/config/opt-125m-ci.yaml index 047d0008c..e5ab095a6 100644 --- a/.github/workflows/config/opt-125m-ci.yaml +++ b/.github/workflows/config/opt-125m-ci.yaml @@ -13,9 +13,3 @@ ipex: model_description: model_id_or_path: facebook/opt-125m tokenizer_name_or_path: facebook/opt-125m - chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] diff --git a/llm_on_ray/inference/models/opt-125m.yaml b/llm_on_ray/inference/models/opt-125m.yaml index e5f431b42..65a6d6bf7 100644 --- a/llm_on_ray/inference/models/opt-125m.yaml +++ b/llm_on_ray/inference/models/opt-125m.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: CPU +device: cpu ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index 1d693f272..92d8435a0 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -308,11 +308,6 @@ def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=No Raises: HTTPException: If the input prompt format is invalid or not supported. """ - - logger.info("preprocess_prompts") - logger.info(type(input)) - logger.info(input) - if isinstance(input, str): return input elif isinstance(input, list): @@ -385,7 +380,6 @@ def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=No raise TypeError( f"Unsupported type {type(input)} for text. Expected dict or list of dicts." 
) - logger.info(prompt) return prompt elif prompt_format == PromptFormat.PROMPTS_FORMAT: raise HTTPException(400, "Invalid prompt format.") From 5a864dc73bed276a0f5125aab78c4670cd2b0d23 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 22 Apr 2024 03:27:15 +0000 Subject: [PATCH 05/47] update Signed-off-by: minmingzhu --- llm_on_ray/inference/models/mistral-7b-v0.1.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llm_on_ray/inference/models/mistral-7b-v0.1.yaml b/llm_on_ray/inference/models/mistral-7b-v0.1.yaml index 12de7c3bb..0ed664efd 100644 --- a/llm_on_ray/inference/models/mistral-7b-v0.1.yaml +++ b/llm_on_ray/inference/models/mistral-7b-v0.1.yaml @@ -6,13 +6,13 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: CPU +device: cpu ipex: enabled: false precision: bf16 model_description: model_id_or_path: mistralai/Mistral-7B-v0.1 - bigdl: false + ipexllm: false tokenizer_name_or_path: mistralai/Mistral-7B-v0.1 config: trust_remote_code: true From e06105ee534f8c49dfbf4c5520bd866a19cbc4f6 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 22 Apr 2024 05:33:28 +0000 Subject: [PATCH 06/47] update Signed-off-by: minmingzhu --- tests/test_getting_started.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_getting_started.sh b/tests/test_getting_started.sh index 6a900a553..052ac51bb 100755 --- a/tests/test_getting_started.sh +++ b/tests/test_getting_started.sh @@ -33,7 +33,7 @@ curl $ENDPOINT_URL/chat/completions \ -H "Content-Type: application/json" \ -d '{ "model": "gpt2", - "messages": [{"role": "assistant", "content": "You are a helpful assistant."}, {"role": "user", "content": "Hello!"}], + "messages": [{"role": "user", "content": "Hello!"}], "temperature": 0.7 }' From 9a11e52c4329904204b064c0e17f70815f8018e6 Mon Sep 17 00:00:00 2001 From: minmingzhu <45281494+minmingzhu@users.noreply.github.com> Date: Mon, 22 Apr 2024 14:21:41 +0800 Subject: [PATCH 07/47] Update query_http_requests.py --- examples/inference/api_server_openai/query_http_requests.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/inference/api_server_openai/query_http_requests.py b/examples/inference/api_server_openai/query_http_requests.py index 536deb30e..a2be3873f 100644 --- a/examples/inference/api_server_openai/query_http_requests.py +++ b/examples/inference/api_server_openai/query_http_requests.py @@ -58,7 +58,6 @@ body = { "model": args.model_name, "messages": [ - {"role": "assistant", "content": "You are a helpful assistant."}, {"role": "user", "content": args.input_text}, ], "stream": args.streaming_response, From 02ee02d724097537db292ccc38edd816324c2fc7 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Fri, 26 Apr 2024 03:52:24 +0000 Subject: [PATCH 08/47] update Signed-off-by: minmingzhu --- llm_on_ray/inference/chat_process.py | 222 ------------------ llm_on_ray/inference/chat_template_process.py | 77 ++++++ .../inference/models/CodeLlama-7b-hf.yaml | 2 +- llm_on_ray/inference/models/bloom-560m.yaml | 4 +- llm_on_ray/inference/models/deplot.yaml | 2 +- llm_on_ray/inference/models/falcon-7b.yaml | 2 +- llm_on_ray/inference/models/fuyu8b.yaml | 2 +- llm_on_ray/inference/models/gemma-2b.yaml | 2 +- llm_on_ray/inference/models/gpt-j-6b.yaml | 2 +- llm_on_ray/inference/models/gpt2.yaml | 2 +- .../inference/models/llama-2-7b-chat-hf.yaml | 2 +- .../models/mistral-7b-Instruct-v0.2.yaml | 4 +- llm_on_ray/inference/models/mpt-7b.yaml | 2 +- .../inference/models/neural-chat-7b-v3-1.yaml | 2 +- 
.../inference/models/sqlcoder-7b-2.yaml | 2 +- llm_on_ray/inference/models/starcoder.yaml | 2 +- llm_on_ray/inference/predictor_deployment.py | 48 ---- llm_on_ray/ui/start_ui.py | 2 +- 18 files changed, 94 insertions(+), 287 deletions(-) delete mode 100644 llm_on_ray/inference/chat_process.py create mode 100644 llm_on_ray/inference/chat_template_process.py diff --git a/llm_on_ray/inference/chat_process.py b/llm_on_ray/inference/chat_process.py deleted file mode 100644 index 3ee238fb7..000000000 --- a/llm_on_ray/inference/chat_process.py +++ /dev/null @@ -1,222 +0,0 @@ -# -# Copyright 2023 The LLM-on-Ray Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - - -class ChatModel: - human_id = "" - bot_id = "" - unknown_id = "" - MEANINGLESS_WORDS = ["", "", "<|endoftext|>", "
"] - stop_words = [""] - - def __init__(self, intro, human_id, bot_id, stop_words) -> None: - self.intro = intro - self.human_id = human_id - self.bot_id = bot_id - self.stop_words = stop_words - self.MEANINGLESS_WORDS.extend(self.stop_words) - - def prepare_prompt(self, messages: list): - """Prepare prompt from history messages.""" - prompt = "" - for msg in messages: - role, content = msg.role, msg.content - if role == "user": - prompt += f"{self.human_id}: {content}\n" - elif role == "assistant": - prompt += f"{self.bot_id}: {content}\n" - else: - prompt += f"{self.unknown_id}: {content}\n" - prompt += f"{self.bot_id}:" - return prompt - - def convert_output(self, output: str): - """Convert the model output to final answer.""" - human_id = self.human_id.strip() - bot_id = self.bot_id.strip() - if human_id != "": - output = output.split(human_id)[0] - if bot_id != "": - output = output.split(bot_id)[0] - for word in self.MEANINGLESS_WORDS: - output = output.replace(word, "") - text = output - # remove partial human_id or bot id - if "\n" in text and ( - human_id.startswith(text[text.rfind("\n") + 1 :]) - or bot_id.startswith(text[text.rfind("\n") + 1]) - ): - text = text[: text.rfind("\n")] - return text - - def get_prompt(self, messages): - """Generate response based on messages.""" - prompt = self.prepare_prompt(messages) - return prompt - - -class ChatModelGptJ(ChatModel): - def __init__(self, intro, human_id, bot_id, stop_words): - super().__init__(intro, human_id, bot_id, stop_words) - - def prepare_prompt(self, messages: list): - """Prepare prompt from history messages.""" - prompt = self.intro - for msg in messages: - msg = dict(msg) - role, content = msg["role"], msg["content"] - if role == "user": - if self.human_id != "": - prompt += f"{self.human_id}:\n{content}\n" - else: - prompt += f"{content}\n" - elif role == "assistant": - if self.bot_id != "": - prompt += f"{self.bot_id}:\n{content}\n" - else: - prompt += f"{content}\n" - else: - prompt += f"### Unknown:\n{content}\n" - if self.bot_id != "": - prompt += f"{self.bot_id}:\n" - return prompt - - -class ChatModelLLama(ChatModel): - def __init__(self, intro, human_id, bot_id, stop_words): - super().__init__(intro, human_id, bot_id, stop_words) - - def prepare_prompt(self, messages: list): - """Prepare prompt from history messages.""" - prompt = self.intro - for msg in messages: - msg = dict(msg) - role, content = msg["role"], msg["content"] - if role == "user": - if self.human_id != "": - prompt += self.human_id.format(msg=content) - else: - prompt += f"{content}\n" - elif role == "assistant": - prompt += f"{content}\n" - elif role == "tool": - prompt += f"{content}\n" - elif role == "system": - prompt += f"### system:\n{content}\n" - else: - prompt += f"### Unknown:\n{content}\n" - if self.bot_id != "": - prompt += f"{self.bot_id}:\n" - return prompt - - -class ChatModelwithImage(ChatModel): - def __init__(self, intro, human_id, bot_id, stop_words): - super().__init__(intro, human_id, bot_id, stop_words) - - def prepare_prompt(self, messages: list): - """Prepare prompt from history messages.""" - from PIL import Image - import requests - from io import BytesIO - import base64 - import re - - prompt = self.intro - for msg in messages: - msg = dict(msg) - role, content = msg["role"], msg["content"] - text_prompt = [] - image_prompt = [] - for item in content: - if item["type"] == "text": - text_prompt.append(item["text"]) - elif item["type"] == "image_url": - image_prompt.append(item["image_url"]) - else: - raise 
ValueError(f"Unknown content type {item['type']}") - - content = "\n".join(text_prompt) - # prepare images - images = [] - for img in image_prompt: - if "url" not in img: - continue - is_data = len(re.findall("^data:image/.+;base64,", img["url"])) > 0 - if is_data: - encoded_str = re.sub("^data:image/.+;base64,", "", img["url"]) - images.append(Image.open(BytesIO(base64.b64decode(encoded_str)))) - else: - images.append(Image.open(requests.get(img["url"], stream=True).raw)) - - if role == "user": - if self.human_id != "": - prompt += self.human_id.format(msg=content) - else: - prompt += f"{content}\n" - elif role == "assistant": - prompt += f"{content}\n" - else: - prompt += f"### Unknown:\n{content}\n" - if self.bot_id != "": - prompt += f"{self.bot_id}:\n" - return prompt, images - - -class ChatModelGemma(ChatModel): - def __init__(self, intro, human_id, bot_id, stop_words): - super().__init__(intro, human_id, bot_id, stop_words) - - def prepare_prompt(self, messages: list): - """Prepare prompt from history messages.""" - prompt = self.intro - for msg in messages: - msg = dict(msg) - role, content = msg["role"], msg["content"] - if role == "user": - if self.human_id != "": - prompt += f"{self.human_id} {content}\n" - else: - prompt += f"{content}\n" - elif role == "assistant": - if self.bot_id != "": - prompt += f"{self.bot_id} {content}\n" - else: - prompt += f"{content}\n" - else: - prompt += f"### Unknown:\n{content}\n" - if self.bot_id != "": - prompt += f"{self.bot_id}:\n" - return prompt - - -class ChatModelNoFormat(ChatModel): - def __init__(self, intro, human_id, bot_id, stop_words): - super().__init__(intro, human_id, bot_id, stop_words) - - def prepare_prompt(self, messages: list): - """Prepare prompt from history messages.""" - prompt = "" - for msg in messages: - msg = dict(msg) - prompt += msg["content"] - return prompt - - -if __name__ == "__main__": - process_tool = ChatModelGptJ( - "", "### Instruction", "### Response", stop_words=["##", "### Instruction"] - ) diff --git a/llm_on_ray/inference/chat_template_process.py b/llm_on_ray/inference/chat_template_process.py new file mode 100644 index 000000000..25ff1056d --- /dev/null +++ b/llm_on_ray/inference/chat_template_process.py @@ -0,0 +1,77 @@ +# +# Copyright 2023 The LLM-on-Ray Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + + +class ChatModel: + + def __init__(self, tokenizer) -> None: + self.tokenizer = tokenizer + + def get_prompt(self, messages): + """Generate response based on messages.""" + prompt = self.prepare_prompt(messages) + return prompt + + + def _extract_messages(self, messages): + texts, images = [], [] + for message in messages: + if message["role"] == "user" and isinstance(message["content"], list): + texts.append({"role": "user", "content": message["content"][0]["text"]}) + images.append( + {"role": "user", "content": message["content"][1]["image_url"]["url"]} + ) + else: + texts.append(message) + return texts, images + + def _prepare_image(self, messages: list): + """Prepare image from history messages.""" + from PIL import Image + import requests + from io import BytesIO + import base64 + import re + + # prepare images + images: List = [] + if isinstance(messages[0], List): + for i in range(len(messages)): + for msg in messages[i]: + msg = dict(msg) + content = msg["content"] + if "url" not in content: + continue + is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0 + if is_data: + encoded_str = re.sub("^data:image/.+;base64,", "", content["url"]) + images[i].append(Image.open(BytesIO(base64.b64decode(encoded_str)))) + else: + images[i].append(Image.open(requests.get(content["url"], stream=True).raw)) + elif isinstance(messages[0], dict): + for msg in messages: + msg = dict(msg) + content = msg["content"] + if "url" not in content: + continue + is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0 + if is_data: + encoded_str = re.sub("^data:image/.+;base64,", "", content["url"]) + images.append(Image.open(BytesIO(base64.b64decode(encoded_str)))) + else: + images.append(Image.open(requests.get(content["url"], stream=True).raw)) + + return images \ No newline at end of file diff --git a/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml b/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml index 3c93da6b6..eff253e46 100644 --- a/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml +++ b/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: "cpu" +device: cpu ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/models/bloom-560m.yaml b/llm_on_ray/inference/models/bloom-560m.yaml index 92dbb8809..be5e9414e 100644 --- a/llm_on_ray/inference/models/bloom-560m.yaml +++ b/llm_on_ray/inference/models/bloom-560m.yaml @@ -6,9 +6,9 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: CPU +device: cpu ipex: - enabled: false + enabled: true precision: bf16 model_description: model_id_or_path: bigscience/bloom-560m diff --git a/llm_on_ray/inference/models/deplot.yaml b/llm_on_ray/inference/models/deplot.yaml index 6b518def9..8f8edd47c 100644 --- a/llm_on_ray/inference/models/deplot.yaml +++ b/llm_on_ray/inference/models/deplot.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: "cpu" +device: cpu ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/models/falcon-7b.yaml b/llm_on_ray/inference/models/falcon-7b.yaml index 5a59e6e47..5801f12be 100644 --- a/llm_on_ray/inference/models/falcon-7b.yaml +++ b/llm_on_ray/inference/models/falcon-7b.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: "cpu" +device: cpu ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/models/fuyu8b.yaml 
b/llm_on_ray/inference/models/fuyu8b.yaml index c45affbae..e62303d83 100644 --- a/llm_on_ray/inference/models/fuyu8b.yaml +++ b/llm_on_ray/inference/models/fuyu8b.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: "cpu" +device: cpu ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/models/gemma-2b.yaml b/llm_on_ray/inference/models/gemma-2b.yaml index c08bd571b..a27b6bc0f 100644 --- a/llm_on_ray/inference/models/gemma-2b.yaml +++ b/llm_on_ray/inference/models/gemma-2b.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 2 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: CPU +device: cpu ipex: enabled: true precision: bf16 diff --git a/llm_on_ray/inference/models/gpt-j-6b.yaml b/llm_on_ray/inference/models/gpt-j-6b.yaml index b2a7a04df..6560f1623 100644 --- a/llm_on_ray/inference/models/gpt-j-6b.yaml +++ b/llm_on_ray/inference/models/gpt-j-6b.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: "cpu" +device: cpu ipex: # false here for ci coverage enabled: false diff --git a/llm_on_ray/inference/models/gpt2.yaml b/llm_on_ray/inference/models/gpt2.yaml index 2d44b9882..96737288f 100644 --- a/llm_on_ray/inference/models/gpt2.yaml +++ b/llm_on_ray/inference/models/gpt2.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: CPU +device: cpu ipex: enabled: true precision: bf16 diff --git a/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml b/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml index 4f2ed0194..7fdae3933 100644 --- a/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml +++ b/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: "cpu" +device: cpu ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml b/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml index ab901eb95..ea50f6af7 100644 --- a/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml +++ b/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml @@ -5,13 +5,13 @@ cpus_per_worker: 48 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: CPU +device: cpu ipex: enabled: false precision: bf16 model_description: model_id_or_path: mistralai/Mistral-7B-Instruct-v0.2 - bigdl: false + ipexllm: false tokenizer_name_or_path: mistralai/Mistral-7B-Instruct-v0.2 config: trust_remote_code: true diff --git a/llm_on_ray/inference/models/mpt-7b.yaml b/llm_on_ray/inference/models/mpt-7b.yaml index 80f062a82..89ce086ed 100644 --- a/llm_on_ray/inference/models/mpt-7b.yaml +++ b/llm_on_ray/inference/models/mpt-7b.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: CPU +device: cpu ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml b/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml index 670654715..bd49ce189 100644 --- a/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml +++ b/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: CPU +device: cpu ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/models/sqlcoder-7b-2.yaml b/llm_on_ray/inference/models/sqlcoder-7b-2.yaml index be85f8463..6d12b35df 100644 --- a/llm_on_ray/inference/models/sqlcoder-7b-2.yaml 
+++ b/llm_on_ray/inference/models/sqlcoder-7b-2.yaml @@ -5,7 +5,7 @@ cpus_per_worker: 22 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: "cpu" +device: cpu ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/models/starcoder.yaml b/llm_on_ray/inference/models/starcoder.yaml index 2044cf109..adbc91fc0 100644 --- a/llm_on_ray/inference/models/starcoder.yaml +++ b/llm_on_ray/inference/models/starcoder.yaml @@ -9,7 +9,7 @@ workers_per_group: 2 ipex: enabled: false precision: bf16 -device: "cpu" +device: cpu model_description: model_id_or_path: bigcode/starcoder tokenizer_name_or_path: bigcode/starcoder diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index 92d8435a0..f32946b26 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -437,52 +437,4 @@ async def openai_call( else: yield await self.handle_non_streaming(prompts, config) - def _extract_messages(self, messages): - texts, images = [], [] - for message in messages: - if message["role"] == "user" and isinstance(message["content"], list): - texts.append({"role": "user", "content": message["content"][0]["text"]}) - images.append( - {"role": "user", "content": message["content"][1]["image_url"]["url"]} - ) - else: - texts.append(message) - return texts, images - - def _prepare_image(self, messages: list): - """Prepare image from history messages.""" - from PIL import Image - import requests - from io import BytesIO - import base64 - import re - - # prepare images - images: List = [] - if isinstance(messages[0], List): - for i in range(len(messages)): - for msg in messages[i]: - msg = dict(msg) - content = msg["content"] - if "url" not in content: - continue - is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0 - if is_data: - encoded_str = re.sub("^data:image/.+;base64,", "", content["url"]) - images[i].append(Image.open(BytesIO(base64.b64decode(encoded_str)))) - else: - images[i].append(Image.open(requests.get(content["url"], stream=True).raw)) - elif isinstance(messages[0], dict): - for msg in messages: - msg = dict(msg) - content = msg["content"] - if "url" not in content: - continue - is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0 - if is_data: - encoded_str = re.sub("^data:image/.+;base64,", "", content["url"]) - images.append(Image.open(BytesIO(base64.b64decode(encoded_str)))) - else: - images.append(Image.open(requests.get(content["url"], stream=True).raw)) - return images diff --git a/llm_on_ray/ui/start_ui.py b/llm_on_ray/ui/start_ui.py index c30851a8e..9f76a2696 100644 --- a/llm_on_ray/ui/start_ui.py +++ b/llm_on_ray/ui/start_ui.py @@ -31,7 +31,7 @@ from ray.util import queue from llm_on_ray.inference.inference_config import all_models, ModelDescription, Prompt from llm_on_ray.inference.inference_config import InferenceConfig as FinetunedConfig -from llm_on_ray.inference.chat_process import ( +from llm_on_ray.inference.chat_template_process import ( ChatModelGptJ, ChatModelLLama, ChatModelwithImage, From 5d11e45ee48793fcd772605b0c0d8cbf4799e174 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Fri, 26 Apr 2024 07:43:10 +0000 Subject: [PATCH 09/47] update Signed-off-by: minmingzhu --- llm_on_ray/inference/chat_template_process.py | 63 ++++++++++++-- llm_on_ray/inference/predictor_deployment.py | 82 ++++++------------- 2 files changed, 82 insertions(+), 63 deletions(-) diff --git a/llm_on_ray/inference/chat_template_process.py 
b/llm_on_ray/inference/chat_template_process.py index 25ff1056d..c39a10b84 100644 --- a/llm_on_ray/inference/chat_template_process.py +++ b/llm_on_ray/inference/chat_template_process.py @@ -13,18 +13,71 @@ # See the License for the specific language governing permissions and # limitations under the License. # +from typing import List, Union +from llm_on_ray.inference.api_openai_backend.openai_protocol import ChatMessage -class ChatModel: + +class ChatTemplatePreprocess: def __init__(self, tokenizer) -> None: self.tokenizer = tokenizer - def get_prompt(self, messages): - """Generate response based on messages.""" - prompt = self.prepare_prompt(messages) - return prompt + def get_prompt(self, input: List, is_mllm=False): + """Generate response based on input.""" + if self.predictor.infer_conf.model_description.chat_template is not None: + self.predictor.tokenizer.chat_template = ( + self.predictor.infer_conf.model_description.chat_template + ) + elif self.predictor.tokenizer.chat_template is None: + self.predictor.tokenizer.chat_template = ( + self.predictor.infer_conf.model_description.default_chat_template + ) + if is_mllm: + if isinstance(input, List): + if isinstance(input, list) and input and isinstance(input[0], ChatMessage): + messages = [] + for chat_message in input: + message = { + "role": chat_message.role, + "content": chat_message.content, + } + messages.append(message) + texts, images = self._extract_messages(messages) + elif isinstance(input, list) and input and isinstance(input[0], dict): + texts, images = self._extract_messages(input) + elif isinstance(input, list) and input and isinstance(input[0], list): + texts, images = [self._extract_messages(p) for p in input] + + image = self._prepare_image(images) + prompt = self.predictor.tokenizer.apply_chat_template(texts, tokenize=False) + return prompt, image + else: + if isinstance(input, list) and input and isinstance(input[0], dict): + prompt = self.predictor.tokenizer.apply_chat_template(input, tokenize=False) + elif isinstance(input, list) and input and isinstance(input[0], list): + prompt = [ + self.predictor.tokenizer.apply_chat_template(t, tokenize=False) + for t in input + ] + elif isinstance(input, list) and input and isinstance(input[0], ChatMessage): + messages = [] + for chat_message in input: + message = {"role": chat_message.role, "content": chat_message.content} + messages.append(message) + prompt = self.predictor.tokenizer.apply_chat_template( + messages, tokenize=False + ) + elif isinstance(input, list) and input and isinstance(input[0], str): + prompt = input + elif isinstance(input, str): + prompt = input + else: + raise TypeError( + f"Unsupported type {type(input)} for text. Expected dict or list of dicts." 
+ ) + return prompt def _extract_messages(self, messages): texts, images = [], [] diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index f32946b26..d5e211c37 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -26,6 +26,8 @@ from starlette.requests import Request from starlette.responses import StreamingResponse, JSONResponse from fastapi import HTTPException + +from llm_on_ray.inference.chat_template_process import ChatTemplatePreprocess from llm_on_ray.inference.inference_config import InferenceConfig from llm_on_ray.inference.api_openai_backend.openai_protocol import ( ChatMessage, @@ -82,6 +84,8 @@ def __init__( self.predictor = TransformerPredictor(infer_conf) self.loop = asyncio.get_running_loop() + self.process_tool = ChatTemplatePreprocess(self.predictor.tokenizer) + def consume_streamer(self, streamer): for text in streamer: @@ -285,13 +289,12 @@ async def handle_static_batch(self, prompts: List[str], **config: Dict[str, Any] preprocessing_time=0, ) - # TODO:Abstract the preprocess_prompts function into a class for handling chat templates - def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=None): + def preprocess_prompts(self, input: Union[str, List], tools=None, tool_choice=None): """ Preprocesses the input prompts. Args: - input (Union[str, List[dict]]): The input prompt(s) to be preprocessed. + input (Union[str, List[str]]): The input prompt(s) to be preprocessed. tools (List[str]): The list of tools to be used. tool_choice: The choice of tool to be used. @@ -310,7 +313,10 @@ def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=No """ if isinstance(input, str): return input - elif isinstance(input, list): + elif isinstance(input, List): + prompts = [] + images = [] + prompt_format = get_prompt_format(input) if prompt_format == PromptFormat.CHAT_FORMAT: # Process the input prompts with tools @@ -327,63 +333,23 @@ def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=No m.content = self.openai_tools_prompter.content_from_assistant(m) # type: ignore elif m.tool_call_id is not None: # type: ignore m.content = self.openai_tools_prompter.content_from_tool(m) # type: ignore - - if self.predictor.infer_conf.model_description.chat_template is not None: - self.predictor.tokenizer.chat_template = ( - self.predictor.infer_conf.model_description.chat_template - ) - elif self.predictor.tokenizer.chat_template is None: - self.predictor.tokenizer.chat_template = ( - self.predictor.infer_conf.model_description.default_chat_template - ) - - if self.is_mllm: - if isinstance(input, list): - if isinstance(input, list) and input and isinstance(input[0], ChatMessage): - messages = [] - for chat_message in input: - message = { - "role": chat_message.role, - "content": chat_message.content, - } - messages.append(message) - texts, images = self._extract_messages(messages) - elif isinstance(input, list) and input and isinstance(input[0], dict): - texts, images = self._extract_messages(input) - elif isinstance(input, list) and input and isinstance(input[0], list): - texts, images = [self._extract_messages(p) for p in input] - - image = self._prepare_image(images) - prompt = self.predictor.tokenizer.apply_chat_template(texts, tokenize=False) - return prompt, image - else: - if isinstance(input, list) and input and isinstance(input[0], dict): - prompt = self.predictor.tokenizer.apply_chat_template(input, 
tokenize=False) - elif isinstance(input, list) and input and isinstance(input[0], list): - prompt = [ - self.predictor.tokenizer.apply_chat_template(t, tokenize=False) - for t in input - ] - elif isinstance(input, list) and input and isinstance(input[0], ChatMessage): - messages = [] - for chat_message in input: - message = {"role": chat_message.role, "content": chat_message.content} - messages.append(message) - prompt = self.predictor.tokenizer.apply_chat_template( - messages, tokenize=False - ) - elif isinstance(input, list) and input and isinstance(input[0], str): - prompt = input - elif isinstance(input, str): - prompt = input + # Process the input prompts with MLLM tool + if self.process_tool is not None: + if self.is_mllm: + input, image = self.process_tool.get_prompt(input, self.is_mllm) + prompts.append(input) + images.extend(image) + return prompts, images else: - raise TypeError( - f"Unsupported type {type(input)} for text. Expected dict or list of dicts." - ) - return prompt + prompt = self.process_tool.get_prompt(input) + return prompt + else: + prompts.extend(input) elif prompt_format == PromptFormat.PROMPTS_FORMAT: + prompts.extend(input) + else: raise HTTPException(400, "Invalid prompt format.") - return input + return prompts else: raise HTTPException(400, "Invalid prompt format.") From 62ab1bfbf810221ecc76e4b602bb10da652a008c Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Sat, 27 Apr 2024 22:09:17 +0800 Subject: [PATCH 10/47] update --- llm_on_ray/inference/chat_template_process.py | 14 +++++--------- llm_on_ray/inference/predictor_deployment.py | 10 ++++++---- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/llm_on_ray/inference/chat_template_process.py b/llm_on_ray/inference/chat_template_process.py index c39a10b84..388c850f5 100644 --- a/llm_on_ray/inference/chat_template_process.py +++ b/llm_on_ray/inference/chat_template_process.py @@ -19,9 +19,8 @@ class ChatTemplatePreprocess: - - def __init__(self, tokenizer) -> None: - self.tokenizer = tokenizer + def __init__(self, predictor) -> None: + self.predictor = predictor def get_prompt(self, input: List, is_mllm=False): """Generate response based on input.""" @@ -58,17 +57,14 @@ def get_prompt(self, input: List, is_mllm=False): prompt = self.predictor.tokenizer.apply_chat_template(input, tokenize=False) elif isinstance(input, list) and input and isinstance(input[0], list): prompt = [ - self.predictor.tokenizer.apply_chat_template(t, tokenize=False) - for t in input + self.predictor.tokenizer.apply_chat_template(t, tokenize=False) for t in input ] elif isinstance(input, list) and input and isinstance(input[0], ChatMessage): messages = [] for chat_message in input: message = {"role": chat_message.role, "content": chat_message.content} messages.append(message) - prompt = self.predictor.tokenizer.apply_chat_template( - messages, tokenize=False - ) + prompt = self.predictor.tokenizer.apply_chat_template(messages, tokenize=False) elif isinstance(input, list) and input and isinstance(input[0], str): prompt = input elif isinstance(input, str): @@ -127,4 +123,4 @@ def _prepare_image(self, messages: list): else: images.append(Image.open(requests.get(content["url"], stream=True).raw)) - return images \ No newline at end of file + return images diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index d5e211c37..6f371681a 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -84,8 +84,7 @@ def __init__( 
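For illustration, here is a minimal stand-alone driver for the reworked ChatTemplatePreprocess, which after this change holds the whole predictor so that get_prompt() can reach both predictor.tokenizer and predictor.infer_conf. This is a sketch under stated assumptions, not code from the patch series: the SimpleNamespace stubs, the gpt2 tokenizer, and the toy fallback template are invented for the example, and a transformers version with chat-template support is assumed.

from types import SimpleNamespace
from transformers import AutoTokenizer
from llm_on_ray.inference.chat_template_process import ChatTemplatePreprocess

# Stub objects that only carry the attributes get_prompt() reads; a real deployment
# passes a TransformerPredictor built from an InferenceConfig instead.
tokenizer = AutoTokenizer.from_pretrained("gpt2")  # assumed example model
model_description = SimpleNamespace(
    chat_template=None,  # no per-model override, so the fallback chain is exercised
    default_chat_template=(
        "{% for message in messages %}"
        "{{ '### ' + message['role'] + ': ' + message['content'] }}"
        "{% endfor %}"
    ),
)
predictor = SimpleNamespace(
    tokenizer=tokenizer,
    infer_conf=SimpleNamespace(model_description=model_description),
)

process_tool = ChatTemplatePreprocess(predictor)
prompt = process_tool.get_prompt(
    [{"role": "user", "content": "What's the weather like in Boston today?"}]
)
print(prompt)  # "### user: What's the weather like in Boston today?"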
self.predictor = TransformerPredictor(infer_conf) self.loop = asyncio.get_running_loop() - self.process_tool = ChatTemplatePreprocess(self.predictor.tokenizer) - + self.process_tool = ChatTemplatePreprocess(self.predictor) def consume_streamer(self, streamer): for text in streamer: @@ -311,6 +310,10 @@ def preprocess_prompts(self, input: Union[str, List], tools=None, tool_choice=No Raises: HTTPException: If the input prompt format is invalid or not supported. """ + logger.info("preprocess_prompts") + logger.info(input) + logger.info(type(input)) + if isinstance(input, str): return input elif isinstance(input, List): @@ -366,6 +369,7 @@ async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSON ) streaming_response = json_request["stream"] if "stream" in json_request else False input = json_request["text"] if "text" in json_request else "" + if input == "": return JSONResponse( status_code=400, @@ -402,5 +406,3 @@ async def openai_call( yield result else: yield await self.handle_non_streaming(prompts, config) - - From cc356f6fa68c14e52302c8511290f40458233616 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Sun, 28 Apr 2024 11:54:12 +0800 Subject: [PATCH 11/47] update --- .../inference/api_server_openai/query_http_requests_tool.py | 3 +-- llm_on_ray/inference/models/gpt-j-6b.yaml | 2 +- llm_on_ray/inference/predictor_deployment.py | 5 ----- llm_on_ray/ui/start_ui.py | 6 +----- 4 files changed, 3 insertions(+), 13 deletions(-) diff --git a/examples/inference/api_server_openai/query_http_requests_tool.py b/examples/inference/api_server_openai/query_http_requests_tool.py index 217f2b792..c9efd222d 100644 --- a/examples/inference/api_server_openai/query_http_requests_tool.py +++ b/examples/inference/api_server_openai/query_http_requests_tool.py @@ -73,7 +73,6 @@ messages = [ [ - {"role": "user", "content": "You are a helpful assistant"}, {"role": "user", "content": "What's the weather like in Boston today?"}, ], ] @@ -81,7 +80,7 @@ proxies = {"http": None, "https": None} for message in messages: - print(f"User: {message[1]['content']}") + print(f"User: {message[0]['content']}") print("Assistant:", end=" ", flush=True) body = { diff --git a/llm_on_ray/inference/models/gpt-j-6b.yaml b/llm_on_ray/inference/models/gpt-j-6b.yaml index 6560f1623..9719b2f7e 100644 --- a/llm_on_ray/inference/models/gpt-j-6b.yaml +++ b/llm_on_ray/inference/models/gpt-j-6b.yaml @@ -14,4 +14,4 @@ ipex: model_description: model_id_or_path: EleutherAI/gpt-j-6b tokenizer_name_or_path: EleutherAI/gpt-j-6b - gpt_base_model: true \ No newline at end of file + gpt_base_model: true diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index 6f371681a..2e642bfff 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -310,9 +310,6 @@ def preprocess_prompts(self, input: Union[str, List], tools=None, tool_choice=No Raises: HTTPException: If the input prompt format is invalid or not supported. 
""" - logger.info("preprocess_prompts") - logger.info(input) - logger.info(type(input)) if isinstance(input, str): return input @@ -357,7 +354,6 @@ def preprocess_prompts(self, input: Union[str, List], tools=None, tool_choice=No raise HTTPException(400, "Invalid prompt format.") async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSONResponse, str]: - logger.info("PredictorDeployment call") self.use_openai = False try: @@ -376,7 +372,6 @@ async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSON content="Empty prompt is not supported.", ) config = json_request["config"] if "config" in json_request else {} - logger.info(input) # return prompt or list of prompts preprocessed prompts = self.preprocess_prompts(input) diff --git a/llm_on_ray/ui/start_ui.py b/llm_on_ray/ui/start_ui.py index 9f76a2696..5cdece259 100644 --- a/llm_on_ray/ui/start_ui.py +++ b/llm_on_ray/ui/start_ui.py @@ -31,11 +31,7 @@ from ray.util import queue from llm_on_ray.inference.inference_config import all_models, ModelDescription, Prompt from llm_on_ray.inference.inference_config import InferenceConfig as FinetunedConfig -from llm_on_ray.inference.chat_template_process import ( - ChatModelGptJ, - ChatModelLLama, - ChatModelwithImage, -) + from llm_on_ray.inference.predictor_deployment import PredictorDeployment from llm_on_ray.ui.html_format import cpu_memory_html, ray_status_html, custom_css from langchain.vectorstores import FAISS From 11718e80f7f8b355bcfe3f6401b82b72571244ee Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 29 Apr 2024 17:22:59 +0800 Subject: [PATCH 12/47] update --- llm_on_ray/inference/chat_template_process.py | 105 ++++++------------ llm_on_ray/inference/inference_config.py | 16 +-- llm_on_ray/inference/models/gemma-2b.yaml | 1 + .../inference/models/neural-chat-7b-v3-1.yaml | 3 +- llm_on_ray/inference/utils.py | 4 +- 5 files changed, 47 insertions(+), 82 deletions(-) diff --git a/llm_on_ray/inference/chat_template_process.py b/llm_on_ray/inference/chat_template_process.py index 388c850f5..2f7a64d27 100644 --- a/llm_on_ray/inference/chat_template_process.py +++ b/llm_on_ray/inference/chat_template_process.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -from typing import List, Union +from typing import List from llm_on_ray.inference.api_openai_backend.openai_protocol import ChatMessage @@ -24,56 +24,31 @@ def __init__(self, predictor) -> None: def get_prompt(self, input: List, is_mllm=False): """Generate response based on input.""" - if self.predictor.infer_conf.model_description.chat_template is not None: - self.predictor.tokenizer.chat_template = ( - self.predictor.infer_conf.model_description.chat_template + self.predictor.tokenizer.chat_template = ( + self.predictor.infer_conf.model_description.chat_template + or self.predictor.tokenizer.chat_template + or self.predictor.infer_conf.model_description.default_chat_template + ) + + if isinstance(input, list) and input and isinstance(input[0], (ChatMessage, dict)): + messages = ( + [dict(chat_message) for chat_message in input] + if isinstance(input[0], ChatMessage) + else input ) - elif self.predictor.tokenizer.chat_template is None: - self.predictor.tokenizer.chat_template = ( - self.predictor.infer_conf.model_description.default_chat_template + prompt = self.predictor.tokenizer.apply_chat_template( + messages, add_generation_prompt=True, tokenize=False ) - - if is_mllm: - if isinstance(input, List): - if isinstance(input, list) and input and isinstance(input[0], ChatMessage): - messages = [] - for chat_message in input: - message = { - "role": chat_message.role, - "content": chat_message.content, - } - messages.append(message) - texts, images = self._extract_messages(messages) - elif isinstance(input, list) and input and isinstance(input[0], dict): - texts, images = self._extract_messages(input) - elif isinstance(input, list) and input and isinstance(input[0], list): - texts, images = [self._extract_messages(p) for p in input] - + if is_mllm: + texts, images = self._extract_messages(messages) image = self._prepare_image(images) - prompt = self.predictor.tokenizer.apply_chat_template(texts, tokenize=False) - return prompt, image - else: - if isinstance(input, list) and input and isinstance(input[0], dict): - prompt = self.predictor.tokenizer.apply_chat_template(input, tokenize=False) - elif isinstance(input, list) and input and isinstance(input[0], list): - prompt = [ - self.predictor.tokenizer.apply_chat_template(t, tokenize=False) for t in input - ] - elif isinstance(input, list) and input and isinstance(input[0], ChatMessage): - messages = [] - for chat_message in input: - message = {"role": chat_message.role, "content": chat_message.content} - messages.append(message) - prompt = self.predictor.tokenizer.apply_chat_template(messages, tokenize=False) - elif isinstance(input, list) and input and isinstance(input[0], str): - prompt = input - elif isinstance(input, str): - prompt = input - else: - raise TypeError( - f"Unsupported type {type(input)} for text. Expected dict or list of dicts." + prompt = self.predictor.tokenizer.apply_chat_template( + texts, add_generation_prompt=True, tokenize=False ) - return prompt + return prompt, image + return prompt + + raise TypeError(f"Unsupported type {type(input)} for text. 
Expected dict or list of dicts.") def _extract_messages(self, messages): texts, images = [], [] @@ -97,30 +72,16 @@ def _prepare_image(self, messages: list): # prepare images images: List = [] - if isinstance(messages[0], List): - for i in range(len(messages)): - for msg in messages[i]: - msg = dict(msg) - content = msg["content"] - if "url" not in content: - continue - is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0 - if is_data: - encoded_str = re.sub("^data:image/.+;base64,", "", content["url"]) - images[i].append(Image.open(BytesIO(base64.b64decode(encoded_str)))) - else: - images[i].append(Image.open(requests.get(content["url"], stream=True).raw)) - elif isinstance(messages[0], dict): - for msg in messages: - msg = dict(msg) - content = msg["content"] - if "url" not in content: - continue - is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0 - if is_data: - encoded_str = re.sub("^data:image/.+;base64,", "", content["url"]) - images.append(Image.open(BytesIO(base64.b64decode(encoded_str)))) - else: - images.append(Image.open(requests.get(content["url"], stream=True).raw)) + for msg in messages: + msg = dict(msg) + content = msg["content"] + if "url" not in content: + continue + is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0 + if is_data: + encoded_str = re.sub("^data:image/.+;base64,", "", content["url"]) + images.append(Image.open(BytesIO(base64.b64decode(encoded_str)))) + else: + images.append(Image.open(requests.get(content["url"], stream=True).raw)) return images diff --git a/llm_on_ray/inference/inference_config.py b/llm_on_ray/inference/inference_config.py index 6842fe63e..e1ea2e2c3 100644 --- a/llm_on_ray/inference/inference_config.py +++ b/llm_on_ray/inference/inference_config.py @@ -115,20 +115,22 @@ class ModelDescription(BaseModel): chat_model_with_image: bool = False chat_template: Union[str, None] = None default_chat_template: str = ( - "{{ bos_token }}" + "Below is an instruction that describes a task. Write a response that appropriately completes the request." 
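To make the revised default template concrete, a hand-worked rendering for a single user turn is sketched below. It assumes the template continues as in the context lines that follow and that add_generation_prompt is passed; the expected output is worked out by hand, not captured from a running deployment.

messages = [{"role": "user", "content": "What's the weather like in Boston today?"}]
# tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
# is expected to produce (wrapped here for readability; the template inserts no
# separator of its own between the preamble and the turns):
# "Below is an instruction that describes a task. Write a response that appropriately
#  completes the request.### Instruction: What's the weather like in Boston today?### Response:\n"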
"{% if messages[0]['role'] == 'system' %}" - "{{ raise_exception('System role not supported') }}" - "{% endif %}" - "{% for message in messages %}" + "{% set loop_messages = messages[1:] %}" + "{% set system_message = messages[0]['content'] %}" + "{% else %}{% set loop_messages = messages %}" + "{% set system_message = false %}{% endif %}" + "{% for message in loop_messages %}" "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}" "{% endif %}" "{% if message['role'] == 'user' %}" - "{{ '### Instruction: ' + message['content'] + eos_token }}" + "{{ '### Instruction: ' + message['content'].strip() }}" "{% elif message['role'] == 'assistant' %}" - "{{ '### Response:' + message['content'] + eos_token }}" + "{{ '### Response:' + message['content'].strip() }}" "{% endif %}{% endfor %}" - "{{'### End \n'}}" + "{% if add_generation_prompt %}{{'### Response:\n'}}{% endif %}" ) @validator("quantization_type") diff --git a/llm_on_ray/inference/models/gemma-2b.yaml b/llm_on_ray/inference/models/gemma-2b.yaml index a27b6bc0f..09e971081 100644 --- a/llm_on_ray/inference/models/gemma-2b.yaml +++ b/llm_on_ray/inference/models/gemma-2b.yaml @@ -15,3 +15,4 @@ model_description: tokenizer_name_or_path: google/gemma-2b config: use_auth_token: ' ' + chat_template: "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}" diff --git a/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml b/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml index bd49ce189..08945e865 100644 --- a/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml +++ b/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml @@ -13,4 +13,5 @@ ipex: model_description: model_id_or_path: Intel/neural-chat-7b-v3-1 tokenizer_name_or_path: Intel/neural-chat-7b-v3-1 - chat_template: "{{ bos_token }}'### System:You are a chatbot developed by Intel. Please answer all questions to the best of your ability.\n'{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '### User: ' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ '### Assistant:' + message['content'] + eos_token }}{% endif %}{% endfor %}" + chat_template: "'### System:You are a chatbot developed by Intel. 
Please answer all questions to the best of your ability.\n'{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:]}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '### User: ' + message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ '### Assistant:' + message['content'].strip() }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{'### Assistant:\n'}}{% endif %}" + diff --git a/llm_on_ray/inference/utils.py b/llm_on_ray/inference/utils.py index 5a8db0401..56b9146e5 100644 --- a/llm_on_ray/inference/utils.py +++ b/llm_on_ray/inference/utils.py @@ -162,13 +162,13 @@ def is_cpu_without_ipex(infer_conf: InferenceConfig) -> bool: return (not infer_conf.ipex.enabled) and infer_conf.device == DEVICE_CPU -def get_prompt_format(input: Union[List[str], List[dict], List[List[dict]], List[ChatMessage]]): +def get_prompt_format(input: Union[List[str], List[dict], List[ChatMessage]]): chat_format = True prompts_format = True for item in input: if isinstance(item, str): chat_format = False - elif isinstance(item, dict) or isinstance(item, ChatMessage) or isinstance(item, list): + elif isinstance(item, dict) or isinstance(item, ChatMessage): prompts_format = False else: chat_format = False From d254f266ccd06c9cceec4b04c95ad93eafe1c9b8 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 29 Apr 2024 17:43:58 +0800 Subject: [PATCH 13/47] update yaml file --- llm_on_ray/inference/models/gpt2.yaml | 1 + llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml | 1 + .../inference/models/ipex-llm/mistral-7b-v0.1-ipex-llm.yaml | 1 + llm_on_ray/inference/models/mistral-7b-v0.1.yaml | 1 + llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml | 1 - llm_on_ray/inference/models/opt-125m.yaml | 1 + llm_on_ray/inference/models/sqlcoder-7b-2.yaml | 1 + llm_on_ray/inference/models/starcoder.yaml | 1 + 8 files changed, 7 insertions(+), 1 deletion(-) diff --git a/llm_on_ray/inference/models/gpt2.yaml b/llm_on_ray/inference/models/gpt2.yaml index 96737288f..354f1c348 100644 --- a/llm_on_ray/inference/models/gpt2.yaml +++ b/llm_on_ray/inference/models/gpt2.yaml @@ -14,3 +14,4 @@ model_description: model_id_or_path: gpt2 tokenizer_name_or_path: gpt2 gpt_base_model: true + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" \ No newline at end of file diff --git a/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml index 00ff121c5..35fadb820 100644 --- a/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml +++ b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml @@ -14,3 +14,4 @@ ipex: model_description: model_id_or_path: Intel/neural-chat-7b-v3-3 tokenizer_name_or_path: 
Intel/neural-chat-7b-v3-3 + chat_template: "'### System:You are a chatbot developed by Intel. Please answer all questions to the best of your ability.\n'{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:]}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '### User: ' + message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ '### Assistant:' + message['content'].strip() }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{'### Assistant:\n'}}{% endif %}" diff --git a/llm_on_ray/inference/models/ipex-llm/mistral-7b-v0.1-ipex-llm.yaml b/llm_on_ray/inference/models/ipex-llm/mistral-7b-v0.1-ipex-llm.yaml index 5ec652bba..2ad30d0b8 100644 --- a/llm_on_ray/inference/models/ipex-llm/mistral-7b-v0.1-ipex-llm.yaml +++ b/llm_on_ray/inference/models/ipex-llm/mistral-7b-v0.1-ipex-llm.yaml @@ -17,3 +17,4 @@ model_description: config: trust_remote_code: true load_in_4bit: true + chat_template: "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"" diff --git a/llm_on_ray/inference/models/mistral-7b-v0.1.yaml b/llm_on_ray/inference/models/mistral-7b-v0.1.yaml index 0ed664efd..db2eec1e4 100644 --- a/llm_on_ray/inference/models/mistral-7b-v0.1.yaml +++ b/llm_on_ray/inference/models/mistral-7b-v0.1.yaml @@ -16,3 +16,4 @@ model_description: tokenizer_name_or_path: mistralai/Mistral-7B-v0.1 config: trust_remote_code: true + chat_template: "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" diff --git a/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml b/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml index 08945e865..973aa46f7 100644 --- a/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml +++ b/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml @@ -14,4 +14,3 @@ model_description: model_id_or_path: Intel/neural-chat-7b-v3-1 tokenizer_name_or_path: Intel/neural-chat-7b-v3-1 chat_template: "'### System:You are a chatbot developed by Intel. 
Please answer all questions to the best of your ability.\n'{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:]}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '### User: ' + message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ '### Assistant:' + message['content'].strip() }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{'### Assistant:\n'}}{% endif %}" - diff --git a/llm_on_ray/inference/models/opt-125m.yaml b/llm_on_ray/inference/models/opt-125m.yaml index 65a6d6bf7..171b6fcd3 100644 --- a/llm_on_ray/inference/models/opt-125m.yaml +++ b/llm_on_ray/inference/models/opt-125m.yaml @@ -13,3 +13,4 @@ ipex: model_description: model_id_or_path: facebook/opt-125m tokenizer_name_or_path: facebook/opt-125m + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" \ No newline at end of file diff --git a/llm_on_ray/inference/models/sqlcoder-7b-2.yaml b/llm_on_ray/inference/models/sqlcoder-7b-2.yaml index 6d12b35df..d723a185a 100644 --- a/llm_on_ray/inference/models/sqlcoder-7b-2.yaml +++ b/llm_on_ray/inference/models/sqlcoder-7b-2.yaml @@ -14,3 +14,4 @@ model_description: tokenizer_name_or_path: defog/sqlcoder-7b-2 config: use_auth_token: '' + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" \ No newline at end of file diff --git a/llm_on_ray/inference/models/starcoder.yaml b/llm_on_ray/inference/models/starcoder.yaml index adbc91fc0..a42967c7f 100644 --- a/llm_on_ray/inference/models/starcoder.yaml +++ b/llm_on_ray/inference/models/starcoder.yaml @@ -15,3 +15,4 @@ model_description: tokenizer_name_or_path: bigcode/starcoder config: use_auth_token: '' + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 
'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" \ No newline at end of file From 94f061a2c44c500d31634d738af1ca782f43f093 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 29 Apr 2024 22:07:05 +0800 Subject: [PATCH 14/47] update yaml --- llm_on_ray/inference/models/CodeLlama-7b-hf.yaml | 1 + llm_on_ray/inference/models/deplot.yaml | 1 + llm_on_ray/inference/models/falcon-7b.yaml | 1 + llm_on_ray/inference/models/fuyu8b.yaml | 1 + 4 files changed, 4 insertions(+) diff --git a/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml b/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml index eff253e46..a5828f6ec 100644 --- a/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml +++ b/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml @@ -13,3 +13,4 @@ ipex: model_description: model_id_or_path: codellama/CodeLlama-7b-hf tokenizer_name_or_path: codellama/CodeLlama-7b-hf + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" \ No newline at end of file diff --git a/llm_on_ray/inference/models/deplot.yaml b/llm_on_ray/inference/models/deplot.yaml index 8f8edd47c..52e0f1a91 100644 --- a/llm_on_ray/inference/models/deplot.yaml +++ b/llm_on_ray/inference/models/deplot.yaml @@ -14,3 +14,4 @@ model_description: model_id_or_path: google/deplot tokenizer_name_or_path: google/deplot chat_model_with_image: true + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" \ No newline at end of file diff --git a/llm_on_ray/inference/models/falcon-7b.yaml b/llm_on_ray/inference/models/falcon-7b.yaml index 5801f12be..3f879bfe1 100644 --- a/llm_on_ray/inference/models/falcon-7b.yaml +++ b/llm_on_ray/inference/models/falcon-7b.yaml @@ -13,3 +13,4 @@ ipex: model_description: model_id_or_path: tiiuae/falcon-7b tokenizer_name_or_path: tiiuae/falcon-7b + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ 
message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" \ No newline at end of file diff --git a/llm_on_ray/inference/models/fuyu8b.yaml b/llm_on_ray/inference/models/fuyu8b.yaml index e62303d83..2b5504d76 100644 --- a/llm_on_ray/inference/models/fuyu8b.yaml +++ b/llm_on_ray/inference/models/fuyu8b.yaml @@ -14,3 +14,4 @@ model_description: model_id_or_path: adept/fuyu-8b tokenizer_name_or_path: adept/fuyu-8b chat_model_with_image: true + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" \ No newline at end of file From 06c65791062df79e922c9c8ceb764177af49cdcc Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 29 Apr 2024 22:12:05 +0800 Subject: [PATCH 15/47] format yaml --- llm_on_ray/inference/models/CodeLlama-7b-hf.yaml | 3 ++- llm_on_ray/inference/models/deplot.yaml | 2 +- llm_on_ray/inference/models/falcon-7b.yaml | 2 +- llm_on_ray/inference/models/fuyu8b.yaml | 2 +- llm_on_ray/inference/models/gpt2.yaml | 2 +- llm_on_ray/inference/models/opt-125m.yaml | 2 +- llm_on_ray/inference/models/sqlcoder-7b-2.yaml | 2 +- llm_on_ray/inference/models/starcoder.yaml | 2 +- llm_on_ray/inference/predictor_deployment.py | 7 ++++++- 9 files changed, 15 insertions(+), 9 deletions(-) diff --git a/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml b/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml index a5828f6ec..9d243226f 100644 --- a/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml +++ b/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml @@ -13,4 +13,5 @@ ipex: model_description: model_id_or_path: codellama/CodeLlama-7b-hf tokenizer_name_or_path: codellama/CodeLlama-7b-hf - chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" \ No newline at end of file + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% 
elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" + \ No newline at end of file diff --git a/llm_on_ray/inference/models/deplot.yaml b/llm_on_ray/inference/models/deplot.yaml index 52e0f1a91..bb4ea7ec5 100644 --- a/llm_on_ray/inference/models/deplot.yaml +++ b/llm_on_ray/inference/models/deplot.yaml @@ -14,4 +14,4 @@ model_description: model_id_or_path: google/deplot tokenizer_name_or_path: google/deplot chat_model_with_image: true - chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" \ No newline at end of file + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" diff --git a/llm_on_ray/inference/models/falcon-7b.yaml b/llm_on_ray/inference/models/falcon-7b.yaml index 3f879bfe1..88a088350 100644 --- a/llm_on_ray/inference/models/falcon-7b.yaml +++ b/llm_on_ray/inference/models/falcon-7b.yaml @@ -13,4 +13,4 @@ ipex: model_description: model_id_or_path: tiiuae/falcon-7b tokenizer_name_or_path: tiiuae/falcon-7b - chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" \ No newline at end of file + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate 
user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" diff --git a/llm_on_ray/inference/models/fuyu8b.yaml b/llm_on_ray/inference/models/fuyu8b.yaml index 2b5504d76..7c4977adc 100644 --- a/llm_on_ray/inference/models/fuyu8b.yaml +++ b/llm_on_ray/inference/models/fuyu8b.yaml @@ -14,4 +14,4 @@ model_description: model_id_or_path: adept/fuyu-8b tokenizer_name_or_path: adept/fuyu-8b chat_model_with_image: true - chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" \ No newline at end of file + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" diff --git a/llm_on_ray/inference/models/gpt2.yaml b/llm_on_ray/inference/models/gpt2.yaml index 354f1c348..ca008cba2 100644 --- a/llm_on_ray/inference/models/gpt2.yaml +++ b/llm_on_ray/inference/models/gpt2.yaml @@ -14,4 +14,4 @@ model_description: model_id_or_path: gpt2 tokenizer_name_or_path: gpt2 gpt_base_model: true - chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" \ No newline at end of file + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ 
raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" diff --git a/llm_on_ray/inference/models/opt-125m.yaml b/llm_on_ray/inference/models/opt-125m.yaml index 171b6fcd3..92fd30260 100644 --- a/llm_on_ray/inference/models/opt-125m.yaml +++ b/llm_on_ray/inference/models/opt-125m.yaml @@ -13,4 +13,4 @@ ipex: model_description: model_id_or_path: facebook/opt-125m tokenizer_name_or_path: facebook/opt-125m - chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" \ No newline at end of file + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" diff --git a/llm_on_ray/inference/models/sqlcoder-7b-2.yaml b/llm_on_ray/inference/models/sqlcoder-7b-2.yaml index d723a185a..0a1b43766 100644 --- a/llm_on_ray/inference/models/sqlcoder-7b-2.yaml +++ b/llm_on_ray/inference/models/sqlcoder-7b-2.yaml @@ -14,4 +14,4 @@ model_description: tokenizer_name_or_path: defog/sqlcoder-7b-2 config: use_auth_token: '' - chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" \ No newline at end of file + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" diff --git a/llm_on_ray/inference/models/starcoder.yaml b/llm_on_ray/inference/models/starcoder.yaml index a42967c7f..660b10ba8 100644 --- 
a/llm_on_ray/inference/models/starcoder.yaml +++ b/llm_on_ray/inference/models/starcoder.yaml @@ -15,4 +15,4 @@ model_description: tokenizer_name_or_path: bigcode/starcoder config: use_auth_token: '' - chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" \ No newline at end of file + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index 2e642bfff..f92baca85 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -391,9 +391,14 @@ async def openai_call( tool_choice=None, ): self.use_openai = True - + print("openai_call") + print(input) + print(type(input)) # return prompt or list of prompts preprocessed prompts = self.preprocess_prompts(input, tools, tool_choice) + print("preprocess_prompts") + print(prompts) + print(type(prompts)) # Handle streaming response if streaming_response: From c5766a10dd763444438bcfd5a8b50719f539e6fa Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 29 Apr 2024 22:16:05 +0800 Subject: [PATCH 16/47] update --- llm_on_ray/inference/models/CodeLlama-7b-hf.yaml | 1 - .../models/template/inference_config_template.yaml | 7 +------ llm_on_ray/inference/utils.py | 2 +- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml b/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml index 9d243226f..8a2ef79fd 100644 --- a/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml +++ b/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml @@ -14,4 +14,3 @@ model_description: model_id_or_path: codellama/CodeLlama-7b-hf tokenizer_name_or_path: codellama/CodeLlama-7b-hf chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" - \ No newline at end of file diff --git a/llm_on_ray/inference/models/template/inference_config_template.yaml 
b/llm_on_ray/inference/models/template/inference_config_template.yaml index 137ddb2dc..1e6726a12 100644 --- a/llm_on_ray/inference/models/template/inference_config_template.yaml +++ b/llm_on_ray/inference/models/template/inference_config_template.yaml @@ -13,7 +13,7 @@ ipex: precision: bf16 model_description: model_id_or_path: null - ipexllm:: false + ipexllm: false tokenizer_name_or_path: null chat_processor: null gpt_base_model: false @@ -22,11 +22,6 @@ model_description: peft_model_id_or_path: null peft_type: null use_hpu_graphs: true - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] config: trust_remote_code: false use_auth_token: null diff --git a/llm_on_ray/inference/utils.py b/llm_on_ray/inference/utils.py index 56b9146e5..91e311088 100644 --- a/llm_on_ray/inference/utils.py +++ b/llm_on_ray/inference/utils.py @@ -166,7 +166,7 @@ def get_prompt_format(input: Union[List[str], List[dict], List[ChatMessage]]): chat_format = True prompts_format = True for item in input: - if isinstance(item, str): + if isinstance(item, str) or isinstance(item, list): chat_format = False elif isinstance(item, dict) or isinstance(item, ChatMessage): prompts_format = False From dad4224f77ba7c978aef9f5ab2f3303876c41a60 Mon Sep 17 00:00:00 2001 From: minmingzhu <45281494+minmingzhu@users.noreply.github.com> Date: Mon, 29 Apr 2024 22:28:06 +0800 Subject: [PATCH 17/47] Update mpt_deltatuner.yaml --- .github/workflows/config/mpt_deltatuner.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/config/mpt_deltatuner.yaml b/.github/workflows/config/mpt_deltatuner.yaml index 7399d587b..e0c0d6946 100644 --- a/.github/workflows/config/mpt_deltatuner.yaml +++ b/.github/workflows/config/mpt_deltatuner.yaml @@ -13,5 +13,7 @@ ipex: model_description: model_id_or_path: mosaicml/mpt-7b tokenizer_name_or_path: EleutherAI/gpt-neox-20b + peft_model_id_or_path: nathan0/mpt-7b-deltatuner-model + peft_type: deltatuner config: trust_remote_code: true From f28f4cdb175e1d54dd9e280b6ae8bbfbdafb1119 Mon Sep 17 00:00:00 2001 From: minmingzhu <45281494+minmingzhu@users.noreply.github.com> Date: Mon, 29 Apr 2024 23:44:19 +0800 Subject: [PATCH 18/47] Update neural-chat-7b-v3-1.yaml --- llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml b/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml index 973aa46f7..be7b8d611 100644 --- a/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml +++ b/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml @@ -13,4 +13,4 @@ ipex: model_description: model_id_or_path: Intel/neural-chat-7b-v3-1 tokenizer_name_or_path: Intel/neural-chat-7b-v3-1 - chat_template: "'### System:You are a chatbot developed by Intel. 
Please answer all questions to the best of your ability.\n'{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:]}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '### User: ' + message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ '### Assistant:' + message['content'].strip() }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{'### Assistant:\n'}}{% endif %}" + chat_template: "'### System:You are a chatbot developed by Intel. Please answer all questions to the best of your ability.\n'{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:]%}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '### User: ' + message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ '### Assistant:' + message['content'].strip() }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{'### Assistant:\n'}}{% endif %}" From eec2124c2f26e96e5acd0f5e2e00d3a70f48b471 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Tue, 30 Apr 2024 14:08:54 +0800 Subject: [PATCH 19/47] update --- .github/workflows/config/gpt2-ci.yaml | 1 + .github/workflows/config/opt-125m-ci.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/config/gpt2-ci.yaml b/.github/workflows/config/gpt2-ci.yaml index e9bed1366..7ed3f6972 100644 --- a/.github/workflows/config/gpt2-ci.yaml +++ b/.github/workflows/config/gpt2-ci.yaml @@ -13,3 +13,4 @@ model_description: model_id_or_path: gpt2 tokenizer_name_or_path: gpt2 gpt_base_model: true + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" diff --git a/.github/workflows/config/opt-125m-ci.yaml b/.github/workflows/config/opt-125m-ci.yaml index e5ab095a6..96c9c345b 100644 --- a/.github/workflows/config/opt-125m-ci.yaml +++ b/.github/workflows/config/opt-125m-ci.yaml @@ -13,3 +13,4 @@ ipex: model_description: model_id_or_path: facebook/opt-125m tokenizer_name_or_path: facebook/opt-125m + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ 
message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" From 419aea3ced2aff42998d7013960874cb7731b1c5 Mon Sep 17 00:00:00 2001 From: minmingzhu <45281494+minmingzhu@users.noreply.github.com> Date: Mon, 6 May 2024 09:56:35 +0800 Subject: [PATCH 20/47] Update predictor_deployment.py --- llm_on_ray/inference/predictor_deployment.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index f92baca85..2e642bfff 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -391,14 +391,9 @@ async def openai_call( tool_choice=None, ): self.use_openai = True - print("openai_call") - print(input) - print(type(input)) + # return prompt or list of prompts preprocessed prompts = self.preprocess_prompts(input, tools, tool_choice) - print("preprocess_prompts") - print(prompts) - print(type(prompts)) # Handle streaming response if streaming_response: From dc6bb3bbfbbcbea1e26593839d5725466eb18e98 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 1 Apr 2024 02:29:30 +0000 Subject: [PATCH 21/47] implement fine-tuning chat template function Signed-off-by: minmingzhu --- .../common/dataprocesser/general_processer.py | 91 +++++-------------- llm_on_ray/finetune/finetune.py | 1 + llm_on_ray/finetune/finetune.yaml | 2 + llm_on_ray/finetune/finetune_config.py | 9 +- 4 files changed, 32 insertions(+), 71 deletions(-) diff --git a/llm_on_ray/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py index b963611e7..8eea6f2c0 100644 --- a/llm_on_ray/common/dataprocesser/general_processer.py +++ b/llm_on_ray/common/dataprocesser/general_processer.py @@ -23,53 +23,9 @@ from llm_on_ray.common.dataprocesser import DataProcesser -INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request." 
-INSTRUCTION_KEY = "### Instruction:" -INPUT_KEY = "Input:" RESPONSE_KEY = "### Response:" -END_KEY = "### End" RESPONSE_KEY_NL = f"{RESPONSE_KEY}\n" -PROMPT_NO_INPUT_FORMAT = """{intro} - -{instruction_key} -{instruction} - -{response_key} -{response} - -{end_key}""".format( - intro=INTRO_BLURB, - instruction_key=INSTRUCTION_KEY, - instruction="{instruction}", - response_key=RESPONSE_KEY, - response="{response}", - end_key=END_KEY, -) - -PROMPT_WITH_INPUT_FORMAT = """{intro} - -{instruction_key} -{instruction} - -{input_key} -{input} - -{response_key} -{response} - -{end_key}""".format( - intro=INTRO_BLURB, - instruction_key=INSTRUCTION_KEY, - instruction="{instruction}", - input_key=INPUT_KEY, - input="{input}", - response_key=RESPONSE_KEY, - response="{response}", - end_key=END_KEY, -) -TEXT_COLUMN_NAME = "text" - class DataCollatorForCompletionOnlyLM(transformers.DataCollatorForLanguageModeling): def torch_call(self, examples): @@ -103,6 +59,7 @@ def prepare(self, tokenizer, dataset): per_device_train_batch_size = self.config.get("per_device_train_batch_size") per_device_eval_batch_size = self.config.get("per_device_eval_batch_size") max_length = self.config.get("max_length") + custom_chat_template = self.config.get("custom_chat_template") group = self.config.get("group") block_size = self.config.get("block_size") shuffle = self.config.get("shuffle") @@ -114,35 +71,29 @@ def prepare(self, tokenizer, dataset): if isinstance(dataset, datasets.DatasetDict): column_names = dataset["train"].column_names - if column_names and TEXT_COLUMN_NAME not in column_names: - - def prompt(rec): - instruction = rec["instruction"] - response = rec["response"] - context = rec.get("context") - if not instruction: - raise ValueError(f"Expected an instruction in: {rec}") - if not response: - raise ValueError(f"Expected a response in: {rec}") - if context: - rec["text"] = PROMPT_WITH_INPUT_FORMAT.format( - instruction=instruction, response=response, input=context + def tokenize_function(examples): + if self.config.get("is_base_model"): + if custom_chat_template: + new_tokenizer = tokenizer.apply_chat_template( + examples, + chat_template=custom_chat_template, + tokenize=True, + max_length=max_length, ) else: - rec["text"] = PROMPT_NO_INPUT_FORMAT.format( - instruction=instruction, response=response + new_tokenizer = tokenizer.apply_chat_template( + examples, + chat_template=self.config.get("default_chat_template"), + tokenize=True, + max_length=max_length, ) - return rec - - dataset = dataset.map( - prompt, - load_from_cache_file=False, - desc="Prompt", - ) - column_names += [TEXT_COLUMN_NAME] - - def tokenize_function(examples): - return tokenizer(examples[TEXT_COLUMN_NAME], max_length=max_length) + else: + new_tokenizer = tokenizer.apply_chat_template( + examples, tokenize=False, max_length=max_length + ) + print(new_tokenizer) + print(new_tokenizer.default_chat_template) + return new_tokenizer tokenized_datasets = dataset.map( tokenize_function, diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index 37c0481d6..febb97231 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -177,6 +177,7 @@ def train_func(config: Dict[str, Any]): config={ "name": tokenizer_name, "config": config["General"]["config"], + "custom_chat_template": config["General"]["custom_chat_template"], } ) diff --git a/llm_on_ray/finetune/finetune.yaml b/llm_on_ray/finetune/finetune.yaml index 1f1cc46ca..d8e46331a 100644 --- a/llm_on_ray/finetune/finetune.yaml +++ 
b/llm_on_ray/finetune/finetune.yaml @@ -1,5 +1,6 @@ General: base_model: EleutherAI/gpt-j-6b + is_base_model: false gpt_base_model: true output_dir: /tmp/llm-ray/output checkpoint_dir: null @@ -12,6 +13,7 @@ General: lora_alpha: 32 lora_dropout: 0.1 enable_gradient_checkpointing: false + custom_chat_template: null Dataset: train_file: examples/data/sample_finetune_data_small.jsonl group: true diff --git a/llm_on_ray/finetune/finetune_config.py b/llm_on_ray/finetune/finetune_config.py index a01095c16..3a638aeab 100644 --- a/llm_on_ray/finetune/finetune_config.py +++ b/llm_on_ray/finetune/finetune_config.py @@ -17,7 +17,6 @@ from pydantic import BaseModel, validator from typing import Optional, List - PRECISION_BF16 = "bf16" PRECISION_FP16 = "fp16" PRECISION_NO = "no" @@ -60,6 +59,14 @@ class General(BaseModel): lora_config: Optional[LoraConfig] = None deltatuner_config: Optional[DeltatunerConfig] = None enable_gradient_checkpointing: bool = False + custom_chat_template: Optional[str] = None + default_chat_template: str = ( + "{{'### Below is an instruction that describes a task. " + "Write a response that appropriately completes the request. \n'}}" + "{% for message in messages %}{{'### Instruction: ' + message['instruction'] " + "+ ' Input:' + message['context'] + ' ### Response:' + message['response'] " + "+ '### End \n'}}{% endfor %}" + ) class Dataset(BaseModel): From 22b0ae5ae2169fd227415f0a78008e270d9e8d01 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Tue, 2 Apr 2024 08:25:34 +0000 Subject: [PATCH 22/47] update Signed-off-by: minmingzhu --- llm_on_ray/common/trainer/default_trainer.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/llm_on_ray/common/trainer/default_trainer.py b/llm_on_ray/common/trainer/default_trainer.py index 366d6f28b..e3800333c 100644 --- a/llm_on_ray/common/trainer/default_trainer.py +++ b/llm_on_ray/common/trainer/default_trainer.py @@ -33,6 +33,7 @@ class DefaultTrainer(Trainer): def __init__(self, config): self.model = None + self.tokenizer = None self.config = config dataprocesser_config = config.get("dataprocesser") dataprocesser_type = dataprocesser_config.get("type") @@ -121,7 +122,7 @@ def _get_lr_scheduler( def prepare(self, model, tokenizer, dataset, optimizer, accelerator): self._coordinate(accelerator) - + self.tokenizer = tokenizer embedding_size = model.get_input_embeddings().weight.shape[0] logger.info(f"model embedding size: {embedding_size}") if len(tokenizer) > embedding_size: @@ -288,6 +289,11 @@ def train(self): is_main_process=self.accelerator.is_main_process, save_function=self.accelerator.save, ) + self.tokenizer.save_pretrained( + output, + is_main_process=self.accelerator.is_main_process, + save_function=self.accelerator.save, + ) logger.info(f"finish save model to {output}") self.accelerator.wait_for_everyone() From 1768e2afc793e06573b050316e5dbe4aa7d5950a Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 8 Apr 2024 05:41:44 +0000 Subject: [PATCH 23/47] update Signed-off-by: minmingzhu --- .../common/dataprocesser/general_processer.py | 41 ++++++++++++++----- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/llm_on_ray/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py index 8eea6f2c0..5fafa3694 100644 --- a/llm_on_ray/common/dataprocesser/general_processer.py +++ b/llm_on_ray/common/dataprocesser/general_processer.py @@ -60,6 +60,8 @@ def prepare(self, tokenizer, dataset): per_device_eval_batch_size = self.config.get("per_device_eval_batch_size") 
max_length = self.config.get("max_length") custom_chat_template = self.config.get("custom_chat_template") + model_default_chat_template = self.config.get("model_default_chat_template") + group = self.config.get("group") block_size = self.config.get("block_size") shuffle = self.config.get("shuffle") @@ -74,25 +76,44 @@ def prepare(self, tokenizer, dataset): def tokenize_function(examples): if self.config.get("is_base_model"): if custom_chat_template: - new_tokenizer = tokenizer.apply_chat_template( + tokenizer.chat_template = custom_chat_template + new_tokenizer = tokenizer.apply_chat_template( examples, - chat_template=custom_chat_template, - tokenize=True, + tokenize=False, max_length=max_length, ) else: + tokenizer.chat_template = self.config.get("default_chat_template") new_tokenizer = tokenizer.apply_chat_template( examples, - chat_template=self.config.get("default_chat_template"), - tokenize=True, + tokenize=False, max_length=max_length, ) else: - new_tokenizer = tokenizer.apply_chat_template( - examples, tokenize=False, max_length=max_length - ) - print(new_tokenizer) - print(new_tokenizer.default_chat_template) + if model_default_chat_template: + tokenizer.chat_template = model_default_chat_template + new_tokenizer = tokenizer.apply_chat_template( + examples, + tokenize=False, + max_length=max_length, + ) + else: + new_messages = [ + { + "role": "user", + "content": "instruction: " + + examples["instruction"] + + " context: " + + examples["context"], + }, + {"role": "assistant", "content": "response: " + examples["response"]}, + ] + + new_tokenizer = tokenizer.apply_chat_template( + new_messages, + tokenize=False, + max_length=max_length, + ) return new_tokenizer tokenized_datasets = dataset.map( From 2f256e5bb53e91e1c4eb864c22539a0d682223fd Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 8 Apr 2024 05:59:44 +0000 Subject: [PATCH 24/47] update Signed-off-by: minmingzhu --- llm_on_ray/finetune/finetune.py | 5 ++++- llm_on_ray/finetune/finetune_config.py | 11 ++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index febb97231..502a985f6 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -177,7 +177,6 @@ def train_func(config: Dict[str, Any]): config={ "name": tokenizer_name, "config": config["General"]["config"], - "custom_chat_template": config["General"]["custom_chat_template"], } ) @@ -249,6 +248,10 @@ def train_func(config: Dict[str, Any]): "group": config["Dataset"].get("group", True), "block_size": config["Dataset"].get("block_size", 512), "shuffle": config["Dataset"].get("shuffle", False), + "is_base_model": config["General"]["is_base_model"], + "custom_chat_template": config["General"]["custom_chat_template"], + "default_chat_template": config["General"]["default_chat_template"], + "model_default_chat_template": config["General"]["model_default_chat_template"], }, "lr_scheduler": { "enable": True, diff --git a/llm_on_ray/finetune/finetune_config.py b/llm_on_ray/finetune/finetune_config.py index 3a638aeab..fe6f2b34b 100644 --- a/llm_on_ray/finetune/finetune_config.py +++ b/llm_on_ray/finetune/finetune_config.py @@ -61,12 +61,13 @@ class General(BaseModel): enable_gradient_checkpointing: bool = False custom_chat_template: Optional[str] = None default_chat_template: str = ( - "{{'### Below is an instruction that describes a task. " - "Write a response that appropriately completes the request. 
\n'}}" - "{% for message in messages %}{{'### Instruction: ' + message['instruction'] " - "+ ' Input:' + message['context'] + ' ### Response:' + message['response'] " - "+ '### End \n'}}{% endfor %}" + "{{'### Below is an instruction that describes a task." + "Write a response that appropriately completes the request. '}}" + "{{'### Instruction: ' + messages['instruction'] " + "+ ' Input:' + messages['context'] + ' ### Response:' + messages['response'] " + "+ '### End \n'}}" ) + model_default_chat_template: Optional[str] = None class Dataset(BaseModel): From 0e5aca8cca3602038867b067ab540fcd5d26140b Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 8 Apr 2024 06:18:40 +0000 Subject: [PATCH 25/47] integrate gbt for transformer 4.26.0 Signed-off-by: minmingzhu --- .../common/dataprocesser/general_processer.py | 61 +++++++++++++++++++ llm_on_ray/finetune/finetune.py | 1 + 2 files changed, 62 insertions(+) diff --git a/llm_on_ray/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py index 5fafa3694..3aef3b7b7 100644 --- a/llm_on_ray/common/dataprocesser/general_processer.py +++ b/llm_on_ray/common/dataprocesser/general_processer.py @@ -23,9 +23,53 @@ from llm_on_ray.common.dataprocesser import DataProcesser +INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request." +INSTRUCTION_KEY = "### Instruction:" +INPUT_KEY = "Input:" RESPONSE_KEY = "### Response:" +END_KEY = "### End" RESPONSE_KEY_NL = f"{RESPONSE_KEY}\n" +PROMPT_NO_INPUT_FORMAT = """{intro} + +{instruction_key} +{instruction} + +{response_key} +{response} + +{end_key}""".format( + intro=INTRO_BLURB, + instruction_key=INSTRUCTION_KEY, + instruction="{instruction}", + response_key=RESPONSE_KEY, + response="{response}", + end_key=END_KEY, +) + +PROMPT_WITH_INPUT_FORMAT = """{intro} + +{instruction_key} +{instruction} + +{input_key} +{input} + +{response_key} +{response} + +{end_key}""".format( + intro=INTRO_BLURB, + instruction_key=INSTRUCTION_KEY, + instruction="{instruction}", + input_key=INPUT_KEY, + input="{input}", + response_key=RESPONSE_KEY, + response="{response}", + end_key=END_KEY, +) +TEXT_COLUMN_NAME = "text" + class DataCollatorForCompletionOnlyLM(transformers.DataCollatorForLanguageModeling): def torch_call(self, examples): @@ -74,6 +118,23 @@ def prepare(self, tokenizer, dataset): column_names = dataset["train"].column_names def tokenize_function(examples): + if self.config.get("gpt_base_model"): + instruction = examples["instruction"] + response = examples["response"] + context = examples.get("context") + if not instruction: + raise ValueError(f"Expected an instruction in: {examples}") + if not response: + raise ValueError(f"Expected a response in: {examples}") + if context: + examples["text"] = PROMPT_WITH_INPUT_FORMAT.format( + instruction=instruction, response=response, input=context + ) + else: + examples["text"] = PROMPT_NO_INPUT_FORMAT.format( + instruction=instruction, response=response + ) + return tokenizer(examples["text"], max_length=max_length, truncation=True) if self.config.get("is_base_model"): if custom_chat_template: tokenizer.chat_template = custom_chat_template diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index 502a985f6..ac679d0d7 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -248,6 +248,7 @@ def train_func(config: Dict[str, Any]): "group": config["Dataset"].get("group", True), "block_size": 
config["Dataset"].get("block_size", 512), "shuffle": config["Dataset"].get("shuffle", False), + "gpt_base_model": config["General"].get("gpt_base_model", False), "is_base_model": config["General"]["is_base_model"], "custom_chat_template": config["General"]["custom_chat_template"], "default_chat_template": config["General"]["default_chat_template"], From df9e84e5339b2ff078108e1a1558c5b37de22a79 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 8 Apr 2024 06:40:23 +0000 Subject: [PATCH 26/47] update Signed-off-by: minmingzhu --- .../common/dataprocesser/general_processer.py | 85 ++++++++++--------- 1 file changed, 43 insertions(+), 42 deletions(-) diff --git a/llm_on_ray/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py index 3aef3b7b7..9163b19df 100644 --- a/llm_on_ray/common/dataprocesser/general_processer.py +++ b/llm_on_ray/common/dataprocesser/general_processer.py @@ -127,55 +127,56 @@ def tokenize_function(examples): if not response: raise ValueError(f"Expected a response in: {examples}") if context: - examples["text"] = PROMPT_WITH_INPUT_FORMAT.format( + new_message = PROMPT_WITH_INPUT_FORMAT.format( instruction=instruction, response=response, input=context ) else: - examples["text"] = PROMPT_NO_INPUT_FORMAT.format( + new_message = PROMPT_NO_INPUT_FORMAT.format( instruction=instruction, response=response ) - return tokenizer(examples["text"], max_length=max_length, truncation=True) - if self.config.get("is_base_model"): - if custom_chat_template: - tokenizer.chat_template = custom_chat_template - new_tokenizer = tokenizer.apply_chat_template( - examples, - tokenize=False, - max_length=max_length, - ) - else: - tokenizer.chat_template = self.config.get("default_chat_template") - new_tokenizer = tokenizer.apply_chat_template( - examples, - tokenize=False, - max_length=max_length, - ) + return tokenizer.tokenize(new_message, max_length=max_length) else: - if model_default_chat_template: - tokenizer.chat_template = model_default_chat_template - new_tokenizer = tokenizer.apply_chat_template( - examples, - tokenize=False, - max_length=max_length, - ) + if self.config.get("is_base_model"): + if custom_chat_template: + tokenizer.chat_template = custom_chat_template + new_tokenizer = tokenizer.apply_chat_template( + examples, + tokenize=False, + max_length=max_length, + ) + else: + tokenizer.chat_template = self.config.get("default_chat_template") + new_tokenizer = tokenizer.apply_chat_template( + examples, + tokenize=False, + max_length=max_length, + ) else: - new_messages = [ - { - "role": "user", - "content": "instruction: " - + examples["instruction"] - + " context: " - + examples["context"], - }, - {"role": "assistant", "content": "response: " + examples["response"]}, - ] - - new_tokenizer = tokenizer.apply_chat_template( - new_messages, - tokenize=False, - max_length=max_length, - ) - return new_tokenizer + if model_default_chat_template: + tokenizer.chat_template = model_default_chat_template + new_tokenizer = tokenizer.apply_chat_template( + examples, + tokenize=False, + max_length=max_length, + ) + else: + new_messages = [ + { + "role": "user", + "content": "instruction: " + + examples["instruction"] + + " context: " + + examples["context"], + }, + {"role": "assistant", "content": "response: " + examples["response"]}, + ] + + new_tokenizer = tokenizer.apply_chat_template( + new_messages, + tokenize=False, + max_length=max_length, + ) + return new_tokenizer tokenized_datasets = dataset.map( tokenize_function, From 
0a603790f8097935b304a838bea4b5776595c0aa Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 8 Apr 2024 08:56:23 +0000 Subject: [PATCH 27/47] update Signed-off-by: minmingzhu --- llm_on_ray/common/dataprocesser/general_processer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llm_on_ray/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py index 9163b19df..84af7973c 100644 --- a/llm_on_ray/common/dataprocesser/general_processer.py +++ b/llm_on_ray/common/dataprocesser/general_processer.py @@ -134,7 +134,7 @@ def tokenize_function(examples): new_message = PROMPT_NO_INPUT_FORMAT.format( instruction=instruction, response=response ) - return tokenizer.tokenize(new_message, max_length=max_length) + return tokenizer(new_message, max_length=max_length) else: if self.config.get("is_base_model"): if custom_chat_template: @@ -176,7 +176,7 @@ def tokenize_function(examples): tokenize=False, max_length=max_length, ) - return new_tokenizer + return tokenizer(new_tokenizer, max_length=max_length) tokenized_datasets = dataset.map( tokenize_function, From b2429930d7896975ca202f8110cae8e5436a08cc Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Tue, 9 Apr 2024 07:10:41 +0000 Subject: [PATCH 28/47] 1. remove is_base_model tag 2. modify chat template Signed-off-by: minmingzhu --- .../common/dataprocesser/general_processer.py | 72 ++++++++----------- llm_on_ray/finetune/finetune.py | 4 +- llm_on_ray/finetune/finetune.yaml | 1 - llm_on_ray/finetune/finetune_config.py | 22 ++++-- llm_on_ray/finetune/models/mpt-7b.yaml | 1 + 5 files changed, 48 insertions(+), 52 deletions(-) diff --git a/llm_on_ray/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py index 84af7973c..b7282ca7e 100644 --- a/llm_on_ray/common/dataprocesser/general_processer.py +++ b/llm_on_ray/common/dataprocesser/general_processer.py @@ -104,7 +104,6 @@ def prepare(self, tokenizer, dataset): per_device_eval_batch_size = self.config.get("per_device_eval_batch_size") max_length = self.config.get("max_length") custom_chat_template = self.config.get("custom_chat_template") - model_default_chat_template = self.config.get("model_default_chat_template") group = self.config.get("group") block_size = self.config.get("block_size") @@ -136,46 +135,37 @@ def tokenize_function(examples): ) return tokenizer(new_message, max_length=max_length) else: - if self.config.get("is_base_model"): - if custom_chat_template: - tokenizer.chat_template = custom_chat_template - new_tokenizer = tokenizer.apply_chat_template( - examples, - tokenize=False, - max_length=max_length, - ) - else: - tokenizer.chat_template = self.config.get("default_chat_template") - new_tokenizer = tokenizer.apply_chat_template( - examples, - tokenize=False, - max_length=max_length, - ) + new_messages = [ + { + "role": "user", + "content": INTRO_BLURB + "\n\n" + + "###Instruction:\n" + + examples["instruction"] + "\n\n" + + "###context:\n" + + examples["context"] + "\n\n", + }, + {"role": "assistant", "content": examples["response"]}, + ] + if custom_chat_template: + tokenizer.chat_template = custom_chat_template + new_tokenizer = tokenizer.apply_chat_template( + new_messages, + tokenize=False, + max_length=max_length, + ) + elif tokenizer.chat_template is not None: + new_tokenizer = tokenizer.apply_chat_template( + new_messages, + tokenize=False, + max_length=max_length, + ) else: - if model_default_chat_template: - tokenizer.chat_template = model_default_chat_template - new_tokenizer = 
tokenizer.apply_chat_template( - examples, - tokenize=False, - max_length=max_length, - ) - else: - new_messages = [ - { - "role": "user", - "content": "instruction: " - + examples["instruction"] - + " context: " - + examples["context"], - }, - {"role": "assistant", "content": "response: " + examples["response"]}, - ] - - new_tokenizer = tokenizer.apply_chat_template( - new_messages, - tokenize=False, - max_length=max_length, - ) + tokenizer.chat_template = self.config.get("chat_template") + new_tokenizer = tokenizer.apply_chat_template( + new_messages, + tokenize=False, + max_length=max_length, + ) return tokenizer(new_tokenizer, max_length=max_length) tokenized_datasets = dataset.map( @@ -197,7 +187,7 @@ def group_texts(examples): total_length = (total_length // block_size) * block_size # Split by chunks of max_len. result = { - k: [t[i : i + block_size] for i in range(0, total_length, block_size)] + k: [t[i: i + block_size] for i in range(0, total_length, block_size)] for k, t in concatenated_examples.items() } result["labels"] = result["input_ids"].copy() diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index ac679d0d7..02a6e189c 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -249,10 +249,8 @@ def train_func(config: Dict[str, Any]): "block_size": config["Dataset"].get("block_size", 512), "shuffle": config["Dataset"].get("shuffle", False), "gpt_base_model": config["General"].get("gpt_base_model", False), - "is_base_model": config["General"]["is_base_model"], "custom_chat_template": config["General"]["custom_chat_template"], - "default_chat_template": config["General"]["default_chat_template"], - "model_default_chat_template": config["General"]["model_default_chat_template"], + "chat_template": config["General"]["chat_template"], }, "lr_scheduler": { "enable": True, diff --git a/llm_on_ray/finetune/finetune.yaml b/llm_on_ray/finetune/finetune.yaml index d8e46331a..1dee1ebf8 100644 --- a/llm_on_ray/finetune/finetune.yaml +++ b/llm_on_ray/finetune/finetune.yaml @@ -1,6 +1,5 @@ General: base_model: EleutherAI/gpt-j-6b - is_base_model: false gpt_base_model: true output_dir: /tmp/llm-ray/output checkpoint_dir: null diff --git a/llm_on_ray/finetune/finetune_config.py b/llm_on_ray/finetune/finetune_config.py index fe6f2b34b..4817f8c30 100644 --- a/llm_on_ray/finetune/finetune_config.py +++ b/llm_on_ray/finetune/finetune_config.py @@ -60,14 +60,22 @@ class General(BaseModel): deltatuner_config: Optional[DeltatunerConfig] = None enable_gradient_checkpointing: bool = False custom_chat_template: Optional[str] = None - default_chat_template: str = ( - "{{'### Below is an instruction that describes a task." - "Write a response that appropriately completes the request. 
'}}" - "{{'### Instruction: ' + messages['instruction'] " - "+ ' Input:' + messages['context'] + ' ### Response:' + messages['response'] " - "+ '### End \n'}}" + chat_template: Optional[str] = ( + "{{ bos_token }}" + "{% if messages[0]['role'] == 'system' %}" + "{{ raise_exception('System role not supported') }}" + "{% endif %}" + "{% for message in messages %}" + "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" + "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}" + "{% endif %}" + "{% if message['role'] == 'user' %}" + "{{ '### Instruction: ' + message['content'] + eos_token }}" + "{% elif message['role'] == 'assistant' %}" + "{{ '### Response:' + message['content'] + eos_token }}" + "{% endif %}{% endfor %}" + "{{'### End \n'}}" ) - model_default_chat_template: Optional[str] = None class Dataset(BaseModel): diff --git a/llm_on_ray/finetune/models/mpt-7b.yaml b/llm_on_ray/finetune/models/mpt-7b.yaml index 5bceeee4d..eb2f8f119 100644 --- a/llm_on_ray/finetune/models/mpt-7b.yaml +++ b/llm_on_ray/finetune/models/mpt-7b.yaml @@ -1,6 +1,7 @@ General: base_model: mosaicml/mpt-7b tokenizer_name: EleutherAI/gpt-neox-20b + is_base_model: false gpt_base_model: false output_dir: /tmp/llm-ray/output checkpoint_dir: /tmp/llm-ray/checkpoint From 5afd1586b9bbe268819e1a865b7952a3f599864c Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Wed, 10 Apr 2024 01:25:29 +0000 Subject: [PATCH 29/47] update Signed-off-by: minmingzhu --- .../common/dataprocesser/general_processer.py | 114 +++++++++--------- pyproject.toml | 3 +- 2 files changed, 61 insertions(+), 56 deletions(-) diff --git a/llm_on_ray/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py index b7282ca7e..7bf0d2804 100644 --- a/llm_on_ray/common/dataprocesser/general_processer.py +++ b/llm_on_ray/common/dataprocesser/general_processer.py @@ -99,11 +99,67 @@ def torch_call(self, examples): class GeneralProcesser(DataProcesser): + def tokenize_function(self, examples, tokenizer): + print(examples) + if self.config.get("gpt_base_model"): + instruction = examples["instruction"] + response = examples["response"] + context = examples.get("context") + if not instruction: + raise ValueError(f"Expected an instruction in: {examples}") + if not response: + raise ValueError(f"Expected a response in: {examples}") + if context: + new_message = PROMPT_WITH_INPUT_FORMAT.format( + instruction=instruction, response=response, input=context + ) + else: + new_message = PROMPT_NO_INPUT_FORMAT.format( + instruction=instruction, response=response + ) + return tokenizer(new_message, max_length=self.config.get("max_length")) + else: + new_messages = [ + { + "role": "user", + "content": "###Instruction:\n" + + examples["instruction"] + "\n\n" + + "###context:\n" + + examples["context"] + "\n\n", + }, + {"role": "assistant", "content": examples["response"] + "\n\n"}, + ] + print(new_messages) + if self.config.get("custom_chat_template") is not None: + print("custom_chat_template") + tokenizer.chat_template = self.config.get("custom_chat_template") + new_tokenizer = tokenizer.apply_chat_template( + new_messages, + tokenize=False, + max_length=self.config.get("max_length"), + ) + elif tokenizer.chat_template is not None: + print("tokenizer.chat_template") + new_tokenizer = tokenizer.apply_chat_template( + new_messages, + tokenize=False, + max_length=self.config.get("max_length"), + ) + else: + print("chat_template") + tokenizer.chat_template = self.config.get("chat_template") + 
new_tokenizer = tokenizer.apply_chat_template( + new_messages, + tokenize=False, + max_length=self.config.get("max_length"), + ) + tokenizer = tokenizer(new_tokenizer, max_length=self.config.get("max_length")) + print(tokenizer) + return tokenizer + def prepare(self, tokenizer, dataset): per_device_train_batch_size = self.config.get("per_device_train_batch_size") per_device_eval_batch_size = self.config.get("per_device_eval_batch_size") - max_length = self.config.get("max_length") - custom_chat_template = self.config.get("custom_chat_template") group = self.config.get("group") block_size = self.config.get("block_size") @@ -116,60 +172,8 @@ def prepare(self, tokenizer, dataset): if isinstance(dataset, datasets.DatasetDict): column_names = dataset["train"].column_names - def tokenize_function(examples): - if self.config.get("gpt_base_model"): - instruction = examples["instruction"] - response = examples["response"] - context = examples.get("context") - if not instruction: - raise ValueError(f"Expected an instruction in: {examples}") - if not response: - raise ValueError(f"Expected a response in: {examples}") - if context: - new_message = PROMPT_WITH_INPUT_FORMAT.format( - instruction=instruction, response=response, input=context - ) - else: - new_message = PROMPT_NO_INPUT_FORMAT.format( - instruction=instruction, response=response - ) - return tokenizer(new_message, max_length=max_length) - else: - new_messages = [ - { - "role": "user", - "content": INTRO_BLURB + "\n\n" - + "###Instruction:\n" - + examples["instruction"] + "\n\n" - + "###context:\n" - + examples["context"] + "\n\n", - }, - {"role": "assistant", "content": examples["response"]}, - ] - if custom_chat_template: - tokenizer.chat_template = custom_chat_template - new_tokenizer = tokenizer.apply_chat_template( - new_messages, - tokenize=False, - max_length=max_length, - ) - elif tokenizer.chat_template is not None: - new_tokenizer = tokenizer.apply_chat_template( - new_messages, - tokenize=False, - max_length=max_length, - ) - else: - tokenizer.chat_template = self.config.get("chat_template") - new_tokenizer = tokenizer.apply_chat_template( - new_messages, - tokenize=False, - max_length=max_length, - ) - return tokenizer(new_tokenizer, max_length=max_length) - tokenized_datasets = dataset.map( - tokenize_function, + lambda examples: self.tokenize_function(examples, tokenizer), remove_columns=column_names, load_from_cache_file=False, desc="Tokenize dataset", diff --git a/pyproject.toml b/pyproject.toml index b319045cc..a18574675 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,8 @@ dependencies = [ "py-cpuinfo", "pydantic-yaml", "async_timeout", - "typer" + "typer", + "jinja2>=3.0.0" ] [project.optional-dependencies] From bbf79251ea367ece3602b0a82c73471074f8c477 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Wed, 10 Apr 2024 08:32:22 +0000 Subject: [PATCH 30/47] 1. update doc/finetune_parameters.md 2. add unit test Signed-off-by: minmingzhu --- docs/finetune_parameters.md | 2 + .../common/dataprocesser/general_processer.py | 15 +- tests/finetune/test_chat_template.py | 139 ++++++++++++++++++ 3 files changed, 145 insertions(+), 11 deletions(-) create mode 100644 tests/finetune/test_chat_template.py diff --git a/docs/finetune_parameters.md b/docs/finetune_parameters.md index 5d24f42e6..f9432b11b 100644 --- a/docs/finetune_parameters.md +++ b/docs/finetune_parameters.md @@ -15,6 +15,8 @@ The following are the parameters supported in the finetuning workflow. |lora_config|task_type: CAUSAL_LM
r: 8<br>lora_alpha: 32<br>lora_dropout: 0.1|Will be passed to the LoraConfig `__init__()` method, then it'll be used as config to build Peft model object.| |deltatuner_config|"algo": "lora"<br>"denas": True
"best_model_structure": "/path/to/best_structure_of_deltatuner_model"|Will be passed to the DeltaTunerArguments `__init__()` method, then it'll be used as config to build [Deltatuner model](https://github.com/intel/e2eAIOK/tree/main/e2eAIOK/deltatuner) object.| |enable_gradient_checkpointing|False|enable gradient checkpointing to save GPU memory, but will cost more compute runtime| +|chat_template|"{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '### Instruction: ' + message['content'] + eos_token }}{{ '### Response:' + message['content'] + eos_token }}{% endif %}{% endfor %}{{'### End \n'}}"|LLM-on-Ray default chat default.| +|custom_chat_template|None|User-defined chat template.| ## Dataset Parameters diff --git a/llm_on_ray/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py index 7bf0d2804..12f172bd7 100644 --- a/llm_on_ray/common/dataprocesser/general_processer.py +++ b/llm_on_ray/common/dataprocesser/general_processer.py @@ -100,7 +100,6 @@ def torch_call(self, examples): class GeneralProcesser(DataProcesser): def tokenize_function(self, examples, tokenizer): - print(examples) if self.config.get("gpt_base_model"): instruction = examples["instruction"] response = examples["response"] @@ -117,7 +116,7 @@ def tokenize_function(self, examples, tokenizer): new_message = PROMPT_NO_INPUT_FORMAT.format( instruction=instruction, response=response ) - return tokenizer(new_message, max_length=self.config.get("max_length")) + return tokenizer(new_message, add_special_tokens=False, max_length=self.config.get("max_length")) else: new_messages = [ { @@ -129,32 +128,26 @@ def tokenize_function(self, examples, tokenizer): }, {"role": "assistant", "content": examples["response"] + "\n\n"}, ] - print(new_messages) if self.config.get("custom_chat_template") is not None: - print("custom_chat_template") tokenizer.chat_template = self.config.get("custom_chat_template") new_tokenizer = tokenizer.apply_chat_template( new_messages, tokenize=False, - max_length=self.config.get("max_length"), ) elif tokenizer.chat_template is not None: - print("tokenizer.chat_template") new_tokenizer = tokenizer.apply_chat_template( new_messages, tokenize=False, - max_length=self.config.get("max_length"), ) else: - print("chat_template") tokenizer.chat_template = self.config.get("chat_template") new_tokenizer = tokenizer.apply_chat_template( new_messages, tokenize=False, - max_length=self.config.get("max_length"), ) - tokenizer = tokenizer(new_tokenizer, max_length=self.config.get("max_length")) - print(tokenizer) + tokenizer = tokenizer(new_tokenizer, + add_special_tokens=False, + max_length=self.config.get("max_length")) return tokenizer def prepare(self, tokenizer, dataset): diff --git a/tests/finetune/test_chat_template.py b/tests/finetune/test_chat_template.py new file mode 100644 index 000000000..7cdda115c --- /dev/null +++ b/tests/finetune/test_chat_template.py @@ -0,0 +1,139 @@ +import unittest + +import transformers +from transformers import AutoTokenizer +from llm_on_ray.common.dataprocesser.general_processer import GeneralProcesser + + +class TestTokenizeFunction(unittest.TestCase): + def setUp(self): + self.tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-hf') + 
self.config = { + 'gpt_base_model': True, + 'max_length': 512, + 'trust_remote_code': False, + 'chat_template': "Below is an instruction that describes a task. Write a response that appropriately " + "completes the request\n {% if messages[0]['role'] == 'system' %}{{ raise_exception(" + "'System role not supported') }}{% endif %}{% for message in messages %}{% if (message[" + "'role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles " + "must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] " + "== 'user' %}{{ '### Instruction: ' + message['content'] }}{% elif message['role'] == " + "'assistant' %}{{ '### Response: ' + message['content'] }}{% endif %}{% endfor %}{{'### " + "End \n'}}", + } + self.processer = GeneralProcesser(self.config) + + def test_tokenize_function_with_gpt_model(self): + self.tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-j-6b') + + examples = \ + { + "instruction": "Test instruction", + "response": "Test response", + "context": "Test context", + } + + # Verify the format of the result + expected_result = 'Below is an instruction that describes a task. Write a response that '\ + 'appropriately completes the request.\n'\ + '\n'\ + '### Instruction:\n'\ + 'Test instruction\n'\ + '\n'\ + 'Input:\n'\ + 'Test context\n'\ + '\n'\ + '### Response:\n'\ + 'Test response\n'\ + '\n'\ + '### End' + + result = self.processer.tokenize_function(examples, self.tokenizer) + self.assertEqual(self.tokenizer.decode(result['input_ids']), expected_result) + + def test_tokenize_function_with_custom_chat_template(self): + examples = \ + { + "instruction": "Test instruction", + "response": "Test response", + "context": "Test context", + } + + # Verify the format of the result + expected_result = '<|im_start|>user\n' \ + '###Instruction:\n' \ + 'Test instruction\n' \ + '\n' \ + '###context:\n' \ + 'Test context\n' \ + '\n' \ + '<|im_end|><|im_start|>assistant\n' \ + 'Test response\n' \ + '\n' \ + '<|im_end|>' + # Set custom chat template + self.config['custom_chat_template'] = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n'"\ + "+ message['content'] + '<|im_end|>'}}{% endfor %}" + + self.config['gpt_base_model'] = False + result = self.processer.tokenize_function(examples, self.tokenizer) + self.assertEqual(self.tokenizer.decode(result['input_ids']), expected_result) + + def test_tokenize_function_with_chat_template(self): + examples = \ + { + "instruction": "Test instruction", + "response": "Test response", + "context": "Test context", + } + + # Verify the format of the result + expected_result = 'Below is an instruction that describes a task. 
Write a response that '\ + 'appropriately completes the request\n'\ + '### Instruction: ###Instruction:\n'\ + 'Test instruction\n'\ + '\n'\ + '###context:\n'\ + 'Test context\n'\ + '\n'\ + '### Response: Test response\n'\ + '\n'\ + '### End \n'\ + + self.config['gpt_base_model'] = False + result = self.processer.tokenize_function(examples, self.tokenizer) + self.assertEqual(self.tokenizer.decode(result['input_ids']), expected_result) + + def test_tokenize_function_with_default_chat_template(self): + self.tokenizer = AutoTokenizer.from_pretrained('google/gemma-2b-it') + examples = \ + { + "instruction": "Test instruction", + "response": "Test response", + "context": "Test context", + } + + chat_example = [ + { + "role": "user", + "content": "###Instruction:\nTest instruction\n\n###context:\nTest context\n\n", + + }, + { + "role": "assistant", + "content": "Test response\n\n", + } + ] + + # Verify the format of the result + expected_result = self.tokenizer.apply_chat_template(chat_example, + tokenize=False, + max_length=self.config.get("max_length")) + + self.config['gpt_base_model'] = False + result = self.processer.tokenize_function(examples, self.tokenizer) + self.assertEqual(self.tokenizer.decode(result['input_ids']), expected_result) + + +if __name__ == '__main__': + unittest.main() From c026adfcb10382a998081b104b8db9eeaef9afb9 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Wed, 10 Apr 2024 08:44:21 +0000 Subject: [PATCH 31/47] update Signed-off-by: minmingzhu --- docs/finetune_parameters.md | 3 +-- llm_on_ray/common/dataprocesser/general_processer.py | 6 +++--- llm_on_ray/finetune/finetune.py | 6 +++--- llm_on_ray/finetune/finetune_config.py | 4 ++-- 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/docs/finetune_parameters.md b/docs/finetune_parameters.md index f9432b11b..ee3615d5e 100644 --- a/docs/finetune_parameters.md +++ b/docs/finetune_parameters.md @@ -15,8 +15,7 @@ The following are the parameters supported in the finetuning workflow. |lora_config|task_type: CAUSAL_LM
r: 8<br>lora_alpha: 32<br>lora_dropout: 0.1|Will be passed to the LoraConfig `__init__()` method, then it'll be used as config to build Peft model object.| |deltatuner_config|"algo": "lora"<br>"denas": True
"best_model_structure": "/path/to/best_structure_of_deltatuner_model"|Will be passed to the DeltaTunerArguments `__init__()` method, then it'll be used as config to build [Deltatuner model](https://github.com/intel/e2eAIOK/tree/main/e2eAIOK/deltatuner) object.| |enable_gradient_checkpointing|False|enable gradient checkpointing to save GPU memory, but will cost more compute runtime| -|chat_template|"{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '### Instruction: ' + message['content'] + eos_token }}{{ '### Response:' + message['content'] + eos_token }}{% endif %}{% endfor %}{{'### End \n'}}"|LLM-on-Ray default chat default.| -|custom_chat_template|None|User-defined chat template.| +|chat_template|None|User-defined chat template.| ## Dataset Parameters diff --git a/llm_on_ray/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py index 12f172bd7..92c94338c 100644 --- a/llm_on_ray/common/dataprocesser/general_processer.py +++ b/llm_on_ray/common/dataprocesser/general_processer.py @@ -128,8 +128,8 @@ def tokenize_function(self, examples, tokenizer): }, {"role": "assistant", "content": examples["response"] + "\n\n"}, ] - if self.config.get("custom_chat_template") is not None: - tokenizer.chat_template = self.config.get("custom_chat_template") + if self.config.get("chat_template") is not None: + tokenizer.chat_template = self.config.get("chat_template") new_tokenizer = tokenizer.apply_chat_template( new_messages, tokenize=False, @@ -140,7 +140,7 @@ def tokenize_function(self, examples, tokenizer): tokenize=False, ) else: - tokenizer.chat_template = self.config.get("chat_template") + tokenizer.chat_template = self.config.get("default_chat_template") new_tokenizer = tokenizer.apply_chat_template( new_messages, tokenize=False, diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index 02a6e189c..9f25d9583 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -14,7 +14,7 @@ # limitations under the License. 
# -#!/usr/bin/env python +# !/usr/bin/env python import os import argparse @@ -249,8 +249,8 @@ def train_func(config: Dict[str, Any]): "block_size": config["Dataset"].get("block_size", 512), "shuffle": config["Dataset"].get("shuffle", False), "gpt_base_model": config["General"].get("gpt_base_model", False), - "custom_chat_template": config["General"]["custom_chat_template"], "chat_template": config["General"]["chat_template"], + "default_chat_template": config["General"]["default_chat_template"], }, "lr_scheduler": { "enable": True, @@ -358,7 +358,7 @@ def main(external_config=None): if "xpu" in ipex.__version__: num_cpus = ( - resources_per_worker["CPU"] * num_training_workers + 1 + resources_per_worker["CPU"] * num_training_workers + 1 ) # additional 1 for head worker ray.init(num_cpus=num_cpus, runtime_env=runtime_env) else: diff --git a/llm_on_ray/finetune/finetune_config.py b/llm_on_ray/finetune/finetune_config.py index 4817f8c30..98597901d 100644 --- a/llm_on_ray/finetune/finetune_config.py +++ b/llm_on_ray/finetune/finetune_config.py @@ -59,8 +59,8 @@ class General(BaseModel): lora_config: Optional[LoraConfig] = None deltatuner_config: Optional[DeltatunerConfig] = None enable_gradient_checkpointing: bool = False - custom_chat_template: Optional[str] = None - chat_template: Optional[str] = ( + chat_template: Optional[str] = None + default_chat_template: str = ( "{{ bos_token }}" "{% if messages[0]['role'] == 'system' %}" "{{ raise_exception('System role not supported') }}" From d51880bc24b89a2b8f86cd604422871fe76166ff Mon Sep 17 00:00:00 2001 From: Xiaochang Wu Date: Tue, 9 Apr 2024 09:19:24 +0800 Subject: [PATCH 32/47] Support latest Ray 2.10 release (#158) * update * fix blocking * update Signed-off-by: Wu, Xiaochang * update Signed-off-by: Wu, Xiaochang * fix setup and getting started Signed-off-by: Wu, Xiaochang * update Signed-off-by: Wu, Xiaochang * update Signed-off-by: Wu, Xiaochang * nit Signed-off-by: Wu, Xiaochang * Add dependencies for tests and update pyproject.toml Signed-off-by: Wu, Xiaochang * Update dependencies and test workflow Signed-off-by: Wu, Xiaochang * Update dependencies and fix torch_dist.py Signed-off-by: Wu, Xiaochang * Update OpenAI SDK installation and start ray cluster Signed-off-by: Wu, Xiaochang --------- Signed-off-by: Wu, Xiaochang --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a18574675..451d2649d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,9 +34,9 @@ dependencies = [ "deltatuner==1.1.9", "py-cpuinfo", "pydantic-yaml", - "async_timeout", - "typer", - "jinja2>=3.0.0" + "async-timeout", + "jinja2>=3.0.0", + "typer" ] [project.optional-dependencies] From 63d2ef82e2080f373bf3c1fb16f1a078881d6d69 Mon Sep 17 00:00:00 2001 From: yutianchen Date: Tue, 9 Apr 2024 15:38:35 +0800 Subject: [PATCH 33/47] [Tests] Add query single test (#156) * single test * single test * single test * single test * fix hang error --- tests/inference/test_query_single.py | 107 +++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 tests/inference/test_query_single.py diff --git a/tests/inference/test_query_single.py b/tests/inference/test_query_single.py new file mode 100644 index 000000000..1c32f6b73 --- /dev/null +++ b/tests/inference/test_query_single.py @@ -0,0 +1,107 @@ +import subprocess +import pytest +import os + +os.environ["no_proxy"] = "localhost,127.0.0.1" + + +def start_serve(model_name): + current_path = os.path.dirname(os.path.abspath(__file__)) 
+ + config_path = os.path.join( + current_path, "../../.github/workflows/config/" + model_name + "-ci.yaml" + ) + + cmd_serve = ["llm_on_ray-serve", "--config_file", config_path, "--simple"] + + result_serve = subprocess.run(cmd_serve, capture_output=True, text=True) + + # Ensure there are no errors in the serve script execution + assert result_serve.returncode == 0, print( + "\n" + "Serve error stderr message: " + "\n", result_serve.stderr + ) + + # Print the output of subprocess.run for checking if output is expected + print("\n" + "Serve message: " + "\n", result_serve.stdout) + + # Ensure there are no errors in the serve script execution + assert "Error" not in result_serve.stderr + + +def script_with_args( + base_url, model_name, streaming_response, max_new_tokens, temperature, top_p, top_k +): + current_path = os.path.dirname(os.path.abspath(__file__)) + + os.path.join(current_path, "../../.github/workflows/config/" + model_name + "-ci.yaml") + + example_query_single_path = os.path.join( + current_path, "../../examples/inference/api_server_simple/query_single.py" + ) + + cmd_single = [ + "python", + example_query_single_path, + "--model_endpoint", + base_url + model_name, + ] + + if streaming_response: + cmd_single.append("--streaming_response") + + if max_new_tokens is not None: + cmd_single.extend(["--max_new_tokens", str(max_new_tokens)]) + + if temperature is not None: + cmd_single.extend(["--temperature", str(temperature)]) + + if top_p is not None: + cmd_single.extend(["--top_p", str(top_p)]) + + if top_k is not None: + cmd_single.extend(["--top_k", str(top_k)]) + + result_query_single = subprocess.run(cmd_single, capture_output=True, text=True) + + # Print the output of subprocess.run for checking if output is expected + print(result_query_single) + + # Ensure there are no errors in the OpenAI API query script execution + assert "Error" not in result_query_single.stderr + + # Returncode should be 0 when there is no exception + assert result_query_single.returncode == 0 + + +executed_models = {} + + +# Parametrize the test function with different combinations of parameters +# TODO: more models and combinations will be added and tested. 
+@pytest.mark.parametrize( + "base_url,model_name,streaming_response,max_new_tokens,temperature,top_p, top_k", + [ + (base_url, model_name, streaming_response, max_new_tokens, temperature, top_p, top_k) + for base_url in ["http://localhost:8000/"] + for model_name in ["gpt2"] + for streaming_response in [None] + for max_new_tokens in [None] + for temperature in [None] + for top_p in [None] + for top_k in [None] + ], +) +def test_script( + base_url, model_name, streaming_response, max_new_tokens, temperature, top_p, top_k +): + global executed_models + + # Check if this modelname has already executed start_serve + if model_name not in executed_models: + start_serve(model_name) + # Mark this modelname has already executed start_serve + executed_models[model_name] = True + + script_with_args( + base_url, model_name, streaming_response, max_new_tokens, temperature, top_p, top_k + ) From 05d63ef49a3c4570060245a3f104acc3a86eee16 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Wed, 10 Apr 2024 08:50:32 +0000 Subject: [PATCH 34/47] format Signed-off-by: minmingzhu --- .../common/dataprocesser/general_processer.py | 20 +- llm_on_ray/finetune/finetune.py | 2 +- tests/finetune/test_chat_template.py | 180 +++++++++--------- 3 files changed, 104 insertions(+), 98 deletions(-) diff --git a/llm_on_ray/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py index 92c94338c..b2727e97b 100644 --- a/llm_on_ray/common/dataprocesser/general_processer.py +++ b/llm_on_ray/common/dataprocesser/general_processer.py @@ -116,15 +116,19 @@ def tokenize_function(self, examples, tokenizer): new_message = PROMPT_NO_INPUT_FORMAT.format( instruction=instruction, response=response ) - return tokenizer(new_message, add_special_tokens=False, max_length=self.config.get("max_length")) + return tokenizer( + new_message, add_special_tokens=False, max_length=self.config.get("max_length") + ) else: new_messages = [ { "role": "user", "content": "###Instruction:\n" - + examples["instruction"] + "\n\n" - + "###context:\n" - + examples["context"] + "\n\n", + + examples["instruction"] + + "\n\n" + + "###context:\n" + + examples["context"] + + "\n\n", }, {"role": "assistant", "content": examples["response"] + "\n\n"}, ] @@ -145,9 +149,9 @@ def tokenize_function(self, examples, tokenizer): new_messages, tokenize=False, ) - tokenizer = tokenizer(new_tokenizer, - add_special_tokens=False, - max_length=self.config.get("max_length")) + tokenizer = tokenizer( + new_tokenizer, add_special_tokens=False, max_length=self.config.get("max_length") + ) return tokenizer def prepare(self, tokenizer, dataset): @@ -184,7 +188,7 @@ def group_texts(examples): total_length = (total_length // block_size) * block_size # Split by chunks of max_len. 
result = { - k: [t[i: i + block_size] for i in range(0, total_length, block_size)] + k: [t[i : i + block_size] for i in range(0, total_length, block_size)] for k, t in concatenated_examples.items() } result["labels"] = result["input_ids"].copy() diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index 9f25d9583..14422967b 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -358,7 +358,7 @@ def main(external_config=None): if "xpu" in ipex.__version__: num_cpus = ( - resources_per_worker["CPU"] * num_training_workers + 1 + resources_per_worker["CPU"] * num_training_workers + 1 ) # additional 1 for head worker ray.init(num_cpus=num_cpus, runtime_env=runtime_env) else: diff --git a/tests/finetune/test_chat_template.py b/tests/finetune/test_chat_template.py index 7cdda115c..2270a5781 100644 --- a/tests/finetune/test_chat_template.py +++ b/tests/finetune/test_chat_template.py @@ -7,133 +7,135 @@ class TestTokenizeFunction(unittest.TestCase): def setUp(self): - self.tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-hf') + self.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") self.config = { - 'gpt_base_model': True, - 'max_length': 512, - 'trust_remote_code': False, - 'chat_template': "Below is an instruction that describes a task. Write a response that appropriately " - "completes the request\n {% if messages[0]['role'] == 'system' %}{{ raise_exception(" - "'System role not supported') }}{% endif %}{% for message in messages %}{% if (message[" - "'role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles " - "must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] " - "== 'user' %}{{ '### Instruction: ' + message['content'] }}{% elif message['role'] == " - "'assistant' %}{{ '### Response: ' + message['content'] }}{% endif %}{% endfor %}{{'### " - "End \n'}}", + "gpt_base_model": True, + "max_length": 512, + "trust_remote_code": False, + "chat_template": "Below is an instruction that describes a task. Write a response that appropriately " + "completes the request\n {% if messages[0]['role'] == 'system' %}{{ raise_exception(" + "'System role not supported') }}{% endif %}{% for message in messages %}{% if (message[" + "'role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles " + "must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] " + "== 'user' %}{{ '### Instruction: ' + message['content'] }}{% elif message['role'] == " + "'assistant' %}{{ '### Response: ' + message['content'] }}{% endif %}{% endfor %}{{'### " + "End \n'}}", } self.processer = GeneralProcesser(self.config) def test_tokenize_function_with_gpt_model(self): - self.tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-j-6b') + self.tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6b") - examples = \ - { - "instruction": "Test instruction", - "response": "Test response", - "context": "Test context", - } + examples = { + "instruction": "Test instruction", + "response": "Test response", + "context": "Test context", + } # Verify the format of the result - expected_result = 'Below is an instruction that describes a task. 
Write a response that '\ - 'appropriately completes the request.\n'\ - '\n'\ - '### Instruction:\n'\ - 'Test instruction\n'\ - '\n'\ - 'Input:\n'\ - 'Test context\n'\ - '\n'\ - '### Response:\n'\ - 'Test response\n'\ - '\n'\ - '### End' + expected_result = ( + "Below is an instruction that describes a task. Write a response that " + "appropriately completes the request.\n" + "\n" + "### Instruction:\n" + "Test instruction\n" + "\n" + "Input:\n" + "Test context\n" + "\n" + "### Response:\n" + "Test response\n" + "\n" + "### End" + ) result = self.processer.tokenize_function(examples, self.tokenizer) - self.assertEqual(self.tokenizer.decode(result['input_ids']), expected_result) + self.assertEqual(self.tokenizer.decode(result["input_ids"]), expected_result) def test_tokenize_function_with_custom_chat_template(self): - examples = \ - { - "instruction": "Test instruction", - "response": "Test response", - "context": "Test context", - } + examples = { + "instruction": "Test instruction", + "response": "Test response", + "context": "Test context", + } # Verify the format of the result - expected_result = '<|im_start|>user\n' \ - '###Instruction:\n' \ - 'Test instruction\n' \ - '\n' \ - '###context:\n' \ - 'Test context\n' \ - '\n' \ - '<|im_end|><|im_start|>assistant\n' \ - 'Test response\n' \ - '\n' \ - '<|im_end|>' + expected_result = ( + "<|im_start|>user\n" + "###Instruction:\n" + "Test instruction\n" + "\n" + "###context:\n" + "Test context\n" + "\n" + "<|im_end|><|im_start|>assistant\n" + "Test response\n" + "\n" + "<|im_end|>" + ) # Set custom chat template - self.config['custom_chat_template'] = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n'"\ - "+ message['content'] + '<|im_end|>'}}{% endfor %}" + self.config["custom_chat_template"] = ( + "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n'" + "+ message['content'] + '<|im_end|>'}}{% endfor %}" + ) - self.config['gpt_base_model'] = False + self.config["gpt_base_model"] = False result = self.processer.tokenize_function(examples, self.tokenizer) - self.assertEqual(self.tokenizer.decode(result['input_ids']), expected_result) + self.assertEqual(self.tokenizer.decode(result["input_ids"]), expected_result) def test_tokenize_function_with_chat_template(self): - examples = \ - { - "instruction": "Test instruction", - "response": "Test response", - "context": "Test context", - } + examples = { + "instruction": "Test instruction", + "response": "Test response", + "context": "Test context", + } # Verify the format of the result - expected_result = 'Below is an instruction that describes a task. Write a response that '\ - 'appropriately completes the request\n'\ - '### Instruction: ###Instruction:\n'\ - 'Test instruction\n'\ - '\n'\ - '###context:\n'\ - 'Test context\n'\ - '\n'\ - '### Response: Test response\n'\ - '\n'\ - '### End \n'\ - - self.config['gpt_base_model'] = False + expected_result = ( + "Below is an instruction that describes a task. 
Write a response that " + "appropriately completes the request\n" + "### Instruction: ###Instruction:\n" + "Test instruction\n" + "\n" + "###context:\n" + "Test context\n" + "\n" + "### Response: Test response\n" + "\n" + "### End \n" + ) + self.config["gpt_base_model"] = False result = self.processer.tokenize_function(examples, self.tokenizer) - self.assertEqual(self.tokenizer.decode(result['input_ids']), expected_result) + self.assertEqual(self.tokenizer.decode(result["input_ids"]), expected_result) def test_tokenize_function_with_default_chat_template(self): - self.tokenizer = AutoTokenizer.from_pretrained('google/gemma-2b-it') - examples = \ - { - "instruction": "Test instruction", - "response": "Test response", - "context": "Test context", - } + self.tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it") + examples = { + "instruction": "Test instruction", + "response": "Test response", + "context": "Test context", + } chat_example = [ { "role": "user", "content": "###Instruction:\nTest instruction\n\n###context:\nTest context\n\n", - }, { "role": "assistant", "content": "Test response\n\n", - } + }, ] # Verify the format of the result - expected_result = self.tokenizer.apply_chat_template(chat_example, - tokenize=False, - max_length=self.config.get("max_length")) + expected_result = self.tokenizer.apply_chat_template( + chat_example, tokenize=False, max_length=self.config.get("max_length") + ) - self.config['gpt_base_model'] = False + self.config["gpt_base_model"] = False result = self.processer.tokenize_function(examples, self.tokenizer) - self.assertEqual(self.tokenizer.decode(result['input_ids']), expected_result) + self.assertEqual(self.tokenizer.decode(result["input_ids"]), expected_result) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() From 3f0b7bcbdfcf5f9f25b80d1ee0680ce29aa59331 Mon Sep 17 00:00:00 2001 From: minmingzhu <45281494+minmingzhu@users.noreply.github.com> Date: Wed, 10 Apr 2024 11:22:03 +0000 Subject: [PATCH 35/47] [Finetune] use base model mpt-7b instead of mpt-7b-chat (#181) * use base model mpt-7b instead of mpt-7b-chat Signed-off-by: minmingzhu * manual setting specify tokenizer Signed-off-by: minmingzhu * update Signed-off-by: minmingzhu * update doc/finetune_parameters.md Signed-off-by: minmingzhu --------- Signed-off-by: minmingzhu --- llm_on_ray/finetune/models/mpt-7b.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/llm_on_ray/finetune/models/mpt-7b.yaml b/llm_on_ray/finetune/models/mpt-7b.yaml index eb2f8f119..5bceeee4d 100644 --- a/llm_on_ray/finetune/models/mpt-7b.yaml +++ b/llm_on_ray/finetune/models/mpt-7b.yaml @@ -1,7 +1,6 @@ General: base_model: mosaicml/mpt-7b tokenizer_name: EleutherAI/gpt-neox-20b - is_base_model: false gpt_base_model: false output_dir: /tmp/llm-ray/output checkpoint_dir: /tmp/llm-ray/checkpoint From 42ecf6375dabbbd6de736db0b77cd22e158acf54 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 22 Apr 2024 06:27:14 +0000 Subject: [PATCH 36/47] fix license issues Signed-off-by: minmingzhu --- tests/finetune/test_chat_template.py | 15 +++++++++++++++ tests/inference/test_query_single.py | 16 ++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/tests/finetune/test_chat_template.py b/tests/finetune/test_chat_template.py index 2270a5781..a416d8f7b 100644 --- a/tests/finetune/test_chat_template.py +++ b/tests/finetune/test_chat_template.py @@ -1,3 +1,18 @@ +# +# Copyright 2023 The LLM-on-Ray Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# import unittest import transformers diff --git a/tests/inference/test_query_single.py b/tests/inference/test_query_single.py index 1c32f6b73..d48727a30 100644 --- a/tests/inference/test_query_single.py +++ b/tests/inference/test_query_single.py @@ -1,3 +1,19 @@ +# +# Copyright 2023 The LLM-on-Ray Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + import subprocess import pytest import os From 85520e90df55c34e239234c30a0a38f49005dd28 Mon Sep 17 00:00:00 2001 From: minmingzhu <45281494+minmingzhu@users.noreply.github.com> Date: Mon, 22 Apr 2024 14:18:31 +0800 Subject: [PATCH 37/47] Update finetune.yaml --- llm_on_ray/finetune/finetune.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/llm_on_ray/finetune/finetune.yaml b/llm_on_ray/finetune/finetune.yaml index 1dee1ebf8..1f1cc46ca 100644 --- a/llm_on_ray/finetune/finetune.yaml +++ b/llm_on_ray/finetune/finetune.yaml @@ -12,7 +12,6 @@ General: lora_alpha: 32 lora_dropout: 0.1 enable_gradient_checkpointing: false - custom_chat_template: null Dataset: train_file: examples/data/sample_finetune_data_small.jsonl group: true From 968e61627edbad2069b88c6f82d0e58427016c36 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Fri, 19 Apr 2024 14:36:48 +0000 Subject: [PATCH 38/47] integrate inference chat template Signed-off-by: minmingzhu --- .../inference/models/CodeLlama-7b-hf.yaml | 2 +- llm_on_ray/inference/models/bloom-560m.yaml | 4 +- llm_on_ray/inference/models/deplot.yaml | 2 +- llm_on_ray/inference/models/falcon-7b.yaml | 2 +- llm_on_ray/inference/models/fuyu8b.yaml | 2 +- llm_on_ray/inference/models/gemma-2b.yaml | 2 +- llm_on_ray/inference/models/gpt-j-6b.yaml | 2 +- llm_on_ray/inference/models/gpt2.yaml | 3 +- .../inference/models/llama-2-7b-chat-hf.yaml | 2 +- .../models/mistral-7b-Instruct-v0.2.yaml | 4 +- .../inference/models/mistral-7b-v0.1.yaml | 4 +- llm_on_ray/inference/models/mpt-7b.yaml | 2 +- .../inference/models/neural-chat-7b-v3-1.yaml | 2 +- llm_on_ray/inference/models/opt-125m.yaml | 2 +- .../inference/models/sqlcoder-7b-2.yaml | 2 +- llm_on_ray/inference/models/starcoder.yaml | 2 +- llm_on_ray/inference/predictor_deployment.py | 83 ++++++++++++++++--- 17 files changed, 93 insertions(+), 29 deletions(-) diff --git a/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml b/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml index 8a2ef79fd..5cad7e6aa 100644 --- a/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml +++ b/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml @@ -6,7 +6,7 @@ 
cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: "cpu" ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/models/bloom-560m.yaml b/llm_on_ray/inference/models/bloom-560m.yaml index be5e9414e..92dbb8809 100644 --- a/llm_on_ray/inference/models/bloom-560m.yaml +++ b/llm_on_ray/inference/models/bloom-560m.yaml @@ -6,9 +6,9 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: CPU ipex: - enabled: true + enabled: false precision: bf16 model_description: model_id_or_path: bigscience/bloom-560m diff --git a/llm_on_ray/inference/models/deplot.yaml b/llm_on_ray/inference/models/deplot.yaml index bb4ea7ec5..acfbe3e87 100644 --- a/llm_on_ray/inference/models/deplot.yaml +++ b/llm_on_ray/inference/models/deplot.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: "cpu" ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/models/falcon-7b.yaml b/llm_on_ray/inference/models/falcon-7b.yaml index 88a088350..fbbbdda08 100644 --- a/llm_on_ray/inference/models/falcon-7b.yaml +++ b/llm_on_ray/inference/models/falcon-7b.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: "cpu" ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/models/fuyu8b.yaml b/llm_on_ray/inference/models/fuyu8b.yaml index 7c4977adc..3f5fa7ab7 100644 --- a/llm_on_ray/inference/models/fuyu8b.yaml +++ b/llm_on_ray/inference/models/fuyu8b.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: "cpu" ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/models/gemma-2b.yaml b/llm_on_ray/inference/models/gemma-2b.yaml index 09e971081..b6d16b18c 100644 --- a/llm_on_ray/inference/models/gemma-2b.yaml +++ b/llm_on_ray/inference/models/gemma-2b.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 2 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: CPU ipex: enabled: true precision: bf16 diff --git a/llm_on_ray/inference/models/gpt-j-6b.yaml b/llm_on_ray/inference/models/gpt-j-6b.yaml index 9719b2f7e..3bdb9997f 100644 --- a/llm_on_ray/inference/models/gpt-j-6b.yaml +++ b/llm_on_ray/inference/models/gpt-j-6b.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: "cpu" ipex: # false here for ci coverage enabled: false diff --git a/llm_on_ray/inference/models/gpt2.yaml b/llm_on_ray/inference/models/gpt2.yaml index ca008cba2..cddc45cd8 100644 --- a/llm_on_ray/inference/models/gpt2.yaml +++ b/llm_on_ray/inference/models/gpt2.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: CPU ipex: enabled: true precision: bf16 @@ -15,3 +15,4 @@ model_description: tokenizer_name_or_path: gpt2 gpt_base_model: true chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ 
message['content'].strip() }}{% endif %}{% endfor %}" + diff --git a/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml b/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml index 7fdae3933..4f2ed0194 100644 --- a/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml +++ b/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: "cpu" ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml b/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml index ea50f6af7..ab901eb95 100644 --- a/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml +++ b/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml @@ -5,13 +5,13 @@ cpus_per_worker: 48 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: CPU ipex: enabled: false precision: bf16 model_description: model_id_or_path: mistralai/Mistral-7B-Instruct-v0.2 - ipexllm: false + bigdl: false tokenizer_name_or_path: mistralai/Mistral-7B-Instruct-v0.2 config: trust_remote_code: true diff --git a/llm_on_ray/inference/models/mistral-7b-v0.1.yaml b/llm_on_ray/inference/models/mistral-7b-v0.1.yaml index db2eec1e4..7b5427669 100644 --- a/llm_on_ray/inference/models/mistral-7b-v0.1.yaml +++ b/llm_on_ray/inference/models/mistral-7b-v0.1.yaml @@ -6,13 +6,13 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: CPU ipex: enabled: false precision: bf16 model_description: model_id_or_path: mistralai/Mistral-7B-v0.1 - ipexllm: false + bigdl: false tokenizer_name_or_path: mistralai/Mistral-7B-v0.1 config: trust_remote_code: true diff --git a/llm_on_ray/inference/models/mpt-7b.yaml b/llm_on_ray/inference/models/mpt-7b.yaml index 89ce086ed..80f062a82 100644 --- a/llm_on_ray/inference/models/mpt-7b.yaml +++ b/llm_on_ray/inference/models/mpt-7b.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: CPU ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml b/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml index be7b8d611..2d6ac4d29 100644 --- a/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml +++ b/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: CPU ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/models/opt-125m.yaml b/llm_on_ray/inference/models/opt-125m.yaml index 92fd30260..81aa7093e 100644 --- a/llm_on_ray/inference/models/opt-125m.yaml +++ b/llm_on_ray/inference/models/opt-125m.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: CPU ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/models/sqlcoder-7b-2.yaml b/llm_on_ray/inference/models/sqlcoder-7b-2.yaml index 0a1b43766..e4e629599 100644 --- a/llm_on_ray/inference/models/sqlcoder-7b-2.yaml +++ b/llm_on_ray/inference/models/sqlcoder-7b-2.yaml @@ -5,7 +5,7 @@ cpus_per_worker: 22 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: "cpu" ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/models/starcoder.yaml b/llm_on_ray/inference/models/starcoder.yaml index 660b10ba8..a57ae351d 100644 --- a/llm_on_ray/inference/models/starcoder.yaml +++ b/llm_on_ray/inference/models/starcoder.yaml @@ 
-9,7 +9,7 @@ workers_per_group: 2 ipex: enabled: false precision: bf16 -device: cpu +device: "cpu" model_description: model_id_or_path: bigcode/starcoder tokenizer_name_or_path: bigcode/starcoder diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index 2e642bfff..3cc22c870 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -288,12 +288,12 @@ async def handle_static_batch(self, prompts: List[str], **config: Dict[str, Any] preprocessing_time=0, ) - def preprocess_prompts(self, input: Union[str, List], tools=None, tool_choice=None): + def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=None): """ Preprocesses the input prompts. Args: - input (Union[str, List[str]]): The input prompt(s) to be preprocessed. + input (Union[str, List[dict]]): The input prompt(s) to be preprocessed. tools (List[str]): The list of tools to be used. tool_choice: The choice of tool to be used. @@ -313,10 +313,7 @@ def preprocess_prompts(self, input: Union[str, List], tools=None, tool_choice=No if isinstance(input, str): return input - elif isinstance(input, List): - prompts = [] - images = [] - + elif isinstance(input, list): prompt_format = get_prompt_format(input) if prompt_format == PromptFormat.CHAT_FORMAT: # Process the input prompts with tools @@ -344,16 +341,32 @@ def preprocess_prompts(self, input: Union[str, List], tools=None, tool_choice=No prompt = self.process_tool.get_prompt(input) return prompt else: - prompts.extend(input) + if isinstance(input, list) and input and isinstance(input[0], dict): + prompt = self.predictor.tokenizer.apply_chat_template(input, tokenize=False) + elif isinstance(input, list) and input and isinstance(input[0], list): + prompt = [self.predictor.tokenizer.apply_chat_template(t, tokenize=False) for t in input] + elif isinstance(input, list) and input and isinstance(input[0], ChatMessage): + messages = [] + for chat_message in input: + message = {"role": chat_message.role, "content": chat_message.content} + messages.append(message) + prompt = self.predictor.tokenizer.apply_chat_template(messages, tokenize=False) + elif isinstance(input, list) and input and isinstance(input[0], str): + prompt = input + elif isinstance(input, str): + prompt = input + else: + raise TypeError(f"Unsupported type {type(input)} for text. 
Expected dict or list of dicts.") + logger.info(prompt) + return prompt elif prompt_format == PromptFormat.PROMPTS_FORMAT: - prompts.extend(input) - else: raise HTTPException(400, "Invalid prompt format.") - return prompts + return input else: raise HTTPException(400, "Invalid prompt format.") async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSONResponse, str]: + logger.info("PredictorDeployment call") self.use_openai = False try: @@ -372,6 +385,7 @@ async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSON content="Empty prompt is not supported.", ) config = json_request["config"] if "config" in json_request else {} + # return prompt or list of prompts preprocessed prompts = self.preprocess_prompts(input) @@ -401,3 +415,52 @@ async def openai_call( yield result else: yield await self.handle_non_streaming(prompts, config) + + + def _extract_messages(messages): + texts, images = [], [] + for message in messages: + if message['role'] == 'user' and isinstance(message['content'], list): + texts.append({"role": "user", "content": message['content'][0]['text']}) + images.append({"role": "user", "content": message['content'][1]['image_url']['url']}) + else: + texts.append(message) + return texts, images + + def _prepare_image(self, messages: Union[List[dict], List[List[dict]]]): + """Prepare image from history messages.""" + from PIL import Image + import requests + from io import BytesIO + import base64 + import re + + # prepare images + images = [] + if isinstance(messages[0], list): + for i in len(messages): + for msg in messages[i]: + msg = dict(msg) + role, content = msg["role"], msg["content"] + if "url" not in content: + continue + is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0 + if is_data: + encoded_str = re.sub("^data:image/.+;base64,", "", content["url"]) + images[i].append(Image.open(BytesIO(base64.b64decode(encoded_str)))) + else: + images[i].append(Image.open(requests.get(content["url"], stream=True).raw)) + else: + for msg in messages: + msg = dict(msg) + role, content = msg["role"], msg["content"] + if "url" not in content: + continue + is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0 + if is_data: + encoded_str = re.sub("^data:image/.+;base64,", "", content["url"]) + images.append(Image.open(BytesIO(base64.b64decode(encoded_str)))) + else: + images.append(Image.open(requests.get(content["url"], stream=True).raw)) + + return images \ No newline at end of file From 43c333fad0d89b7c8ae9eb67e3c77f3700795194 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 22 Apr 2024 01:00:41 +0000 Subject: [PATCH 39/47] update Signed-off-by: minmingzhu --- llm_on_ray/inference/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llm_on_ray/inference/utils.py b/llm_on_ray/inference/utils.py index 91e311088..5a8db0401 100644 --- a/llm_on_ray/inference/utils.py +++ b/llm_on_ray/inference/utils.py @@ -162,13 +162,13 @@ def is_cpu_without_ipex(infer_conf: InferenceConfig) -> bool: return (not infer_conf.ipex.enabled) and infer_conf.device == DEVICE_CPU -def get_prompt_format(input: Union[List[str], List[dict], List[ChatMessage]]): +def get_prompt_format(input: Union[List[str], List[dict], List[List[dict]], List[ChatMessage]]): chat_format = True prompts_format = True for item in input: - if isinstance(item, str) or isinstance(item, list): + if isinstance(item, str): chat_format = False - elif isinstance(item, dict) or isinstance(item, ChatMessage): + elif isinstance(item, 
dict) or isinstance(item, ChatMessage) or isinstance(item, list): prompts_format = False else: chat_format = False From b5b7f28de6be40810a58e8bd72a232f67ad87203 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 22 Apr 2024 02:31:09 +0000 Subject: [PATCH 40/47] update Signed-off-by: minmingzhu --- llm_on_ray/inference/predictor_deployment.py | 41 ++++++++++++-------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index 3cc22c870..4937ed9ce 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -288,6 +288,7 @@ async def handle_static_batch(self, prompts: List[str], **config: Dict[str, Any] preprocessing_time=0, ) + # TODO:Abstract the preprocess_prompts function into a class for handling chat templates def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=None): """ Preprocesses the input prompts. @@ -344,19 +345,26 @@ def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=No if isinstance(input, list) and input and isinstance(input[0], dict): prompt = self.predictor.tokenizer.apply_chat_template(input, tokenize=False) elif isinstance(input, list) and input and isinstance(input[0], list): - prompt = [self.predictor.tokenizer.apply_chat_template(t, tokenize=False) for t in input] + prompt = [ + self.predictor.tokenizer.apply_chat_template(t, tokenize=False) + for t in input + ] elif isinstance(input, list) and input and isinstance(input[0], ChatMessage): messages = [] for chat_message in input: message = {"role": chat_message.role, "content": chat_message.content} messages.append(message) - prompt = self.predictor.tokenizer.apply_chat_template(messages, tokenize=False) + prompt = self.predictor.tokenizer.apply_chat_template( + messages, tokenize=False + ) elif isinstance(input, list) and input and isinstance(input[0], str): prompt = input elif isinstance(input, str): prompt = input else: - raise TypeError(f"Unsupported type {type(input)} for text. Expected dict or list of dicts.") + raise TypeError( + f"Unsupported type {type(input)} for text. Expected dict or list of dicts." 
+ ) logger.info(prompt) return prompt elif prompt_format == PromptFormat.PROMPTS_FORMAT: @@ -416,18 +424,19 @@ async def openai_call( else: yield await self.handle_non_streaming(prompts, config) - - def _extract_messages(messages): + def _extract_messages(self, messages): texts, images = [], [] for message in messages: - if message['role'] == 'user' and isinstance(message['content'], list): - texts.append({"role": "user", "content": message['content'][0]['text']}) - images.append({"role": "user", "content": message['content'][1]['image_url']['url']}) + if message["role"] == "user" and isinstance(message["content"], list): + texts.append({"role": "user", "content": message["content"][0]["text"]}) + images.append( + {"role": "user", "content": message["content"][1]["image_url"]["url"]} + ) else: texts.append(message) return texts, images - def _prepare_image(self, messages: Union[List[dict], List[List[dict]]]): + def _prepare_image(self, messages: list): """Prepare image from history messages.""" from PIL import Image import requests @@ -436,12 +445,12 @@ def _prepare_image(self, messages: Union[List[dict], List[List[dict]]]): import re # prepare images - images = [] - if isinstance(messages[0], list): - for i in len(messages): + images: List = [] + if isinstance(messages[0], List): + for i in range(len(messages)): for msg in messages[i]: msg = dict(msg) - role, content = msg["role"], msg["content"] + content = msg["content"] if "url" not in content: continue is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0 @@ -450,10 +459,10 @@ def _prepare_image(self, messages: Union[List[dict], List[List[dict]]]): images[i].append(Image.open(BytesIO(base64.b64decode(encoded_str)))) else: images[i].append(Image.open(requests.get(content["url"], stream=True).raw)) - else: + elif isinstance(messages[0], dict): for msg in messages: msg = dict(msg) - role, content = msg["role"], msg["content"] + content = msg["content"] if "url" not in content: continue is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0 @@ -463,4 +472,4 @@ def _prepare_image(self, messages: Union[List[dict], List[List[dict]]]): else: images.append(Image.open(requests.get(content["url"], stream=True).raw)) - return images \ No newline at end of file + return images From 9500d96bf9177e7a7fbfb4850b6aa84887255607 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 22 Apr 2024 03:24:45 +0000 Subject: [PATCH 41/47] update Signed-off-by: minmingzhu --- llm_on_ray/inference/models/opt-125m.yaml | 2 +- llm_on_ray/inference/predictor_deployment.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/llm_on_ray/inference/models/opt-125m.yaml b/llm_on_ray/inference/models/opt-125m.yaml index 81aa7093e..92fd30260 100644 --- a/llm_on_ray/inference/models/opt-125m.yaml +++ b/llm_on_ray/inference/models/opt-125m.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: CPU +device: cpu ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index 4937ed9ce..aab110727 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -311,7 +311,6 @@ def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=No Raises: HTTPException: If the input prompt format is invalid or not supported. 
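        Example:
            A chat-format request such as [{"role": "user", "content": "What is Ray?"}]
            is rendered into a single prompt string via
            self.predictor.tokenizer.apply_chat_template(input, tokenize=False);
            the exact rendered text depends on the served model's chat template
            (illustrative sketch only, not an exhaustive description of all input types).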
""" - if isinstance(input, str): return input elif isinstance(input, list): @@ -365,7 +364,6 @@ def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=No raise TypeError( f"Unsupported type {type(input)} for text. Expected dict or list of dicts." ) - logger.info(prompt) return prompt elif prompt_format == PromptFormat.PROMPTS_FORMAT: raise HTTPException(400, "Invalid prompt format.") From 0c41b8b891cb7a54d43a4d36ff09c51c276260a8 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 22 Apr 2024 03:27:15 +0000 Subject: [PATCH 42/47] update Signed-off-by: minmingzhu --- llm_on_ray/inference/models/mistral-7b-v0.1.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llm_on_ray/inference/models/mistral-7b-v0.1.yaml b/llm_on_ray/inference/models/mistral-7b-v0.1.yaml index 7b5427669..db2eec1e4 100644 --- a/llm_on_ray/inference/models/mistral-7b-v0.1.yaml +++ b/llm_on_ray/inference/models/mistral-7b-v0.1.yaml @@ -6,13 +6,13 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: CPU +device: cpu ipex: enabled: false precision: bf16 model_description: model_id_or_path: mistralai/Mistral-7B-v0.1 - bigdl: false + ipexllm: false tokenizer_name_or_path: mistralai/Mistral-7B-v0.1 config: trust_remote_code: true From 0ff3d0bd5d83c149286e4711f74fe732a46960cb Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Wed, 24 Apr 2024 08:32:11 +0000 Subject: [PATCH 43/47] Integrate Web UI Signed-off-by: minmingzhu --- llm_on_ray/inference/inference_config.py | 1 - llm_on_ray/ui/start_ui.py | 341 +++++++++++------------ 2 files changed, 161 insertions(+), 181 deletions(-) diff --git a/llm_on_ray/inference/inference_config.py b/llm_on_ray/inference/inference_config.py index e1ea2e2c3..b7b58598a 100644 --- a/llm_on_ray/inference/inference_config.py +++ b/llm_on_ray/inference/inference_config.py @@ -191,7 +191,6 @@ def _check_workers_per_group(cls, v: int): all_models: Dict[str, InferenceConfig] = {} -base_models: Dict[str, InferenceConfig] = {} _models: Dict[str, InferenceConfig] = {} _cur = os.path.dirname(os.path.abspath(__file__)) diff --git a/llm_on_ray/ui/start_ui.py b/llm_on_ray/ui/start_ui.py index 5cdece259..7b011cb9c 100644 --- a/llm_on_ray/ui/start_ui.py +++ b/llm_on_ray/ui/start_ui.py @@ -29,8 +29,10 @@ from ray.train.base_trainer import TrainingFailedError from ray.tune.logger import LoggerCallback from ray.util import queue -from llm_on_ray.inference.inference_config import all_models, ModelDescription, Prompt -from llm_on_ray.inference.inference_config import InferenceConfig as FinetunedConfig + +from llm_on_ray.finetune.finetune_config import base_models, FinetuneConfig +from llm_on_ray.inference.inference_config import ModelDescription, all_models +from llm_on_ray.inference.inference_config import InferenceConfig from llm_on_ray.inference.predictor_deployment import PredictorDeployment from llm_on_ray.ui.html_format import cpu_memory_html, ray_status_html, custom_css @@ -108,20 +110,20 @@ def get_result(self): class ChatBotUI: def __init__( - self, - all_models: Dict[str, FinetunedConfig], - base_models: Dict[str, FinetunedConfig], - finetune_model_path: str, - finetuned_checkpoint_path: str, - repo_code_path: str, - default_data_path: str, - default_rag_path: str, - config: dict, - head_node_ip: str, - node_port: str, - node_user_name: str, - conda_env_name: str, - master_ip_port: str, + self, + all_models: Dict[str, InferenceConfig], + base_models: Dict[str, FinetuneConfig], + finetune_model_path: str, + 
finetuned_checkpoint_path: str, + repo_code_path: str, + default_data_path: str, + default_rag_path: str, + config: dict, + head_node_ip: str, + node_port: str, + node_user_name: str, + conda_env_name: str, + master_ip_port: str, ): self._all_models = all_models self._base_models = base_models @@ -147,7 +149,6 @@ def __init__( "What is Ray?", "What is chatbot?", ] - self.process_tool = None self.finetune_actor = None self.finetune_status = False self.default_rag_path = default_rag_path @@ -214,8 +215,10 @@ def user(self, user_message, history): return "", history + [[user_message, None]] def model_generate(self, prompt, request_url, model_name, config, simple_api=True): + print("model_generate") + print("prompt: ", prompt) + if simple_api: - prompt = self.process_tool.get_prompt(prompt) sample_input = {"text": prompt, "config": config, "stream": True} else: sample_input = { @@ -227,42 +230,42 @@ def model_generate(self, prompt, request_url, model_name, config, simple_api=Tru "top_p": config["top_p"], "top_k": config["top_k"], } + proxies = {"http": None, "https": None} outputs = requests.post(request_url, proxies=proxies, json=sample_input, stream=True) + outputs.raise_for_status() for output in outputs.iter_content(chunk_size=None, decode_unicode=True): - # remove context - if simple_api: - if prompt in output: - output = output[len(prompt) :] - else: - if output is None or output == "": - continue + if not simple_api: import json import re - chunk_data = re.sub("^data: ", "", output) - if chunk_data != "[DONE]": - decoded_output = json.loads(chunk_data) - if "choices" in decoded_output: - choices = decoded_output["choices"] + if output is not None and output != "": + # Get data from reponse chunk + chunk_data = re.sub("^data: ", "", output) + if chunk_data.strip() != "[DONE]": + # Get message choices from data + choices = json.loads(chunk_data)["choices"] + + # Pick content from first choice output = choices[0]["delta"].get("content", "") - else: - output = "" + + else: + output = "" yield output def bot( - self, - history, - deploy_model_endpoint, - model_endpoint, - Max_new_tokens, - Temperature, - Top_p, - Top_k, - model_name=None, - image=None, - enhance_knowledge=None, + self, + history, + deploy_model_endpoint, + model_endpoint, + Max_new_tokens, + Temperature, + Top_p, + Top_k, + model_name=None, + image=None, + enhance_knowledge=None, ): request_url = model_endpoint if model_endpoint != "" else deploy_model_endpoint simple_api = is_simple_api(request_url, model_name) @@ -295,11 +298,7 @@ def bot( for output in outputs: if len(output) != 0: time_end = time.time() - if simple_api: - history[-1][1] += output - history[-1][1] = self.process_tool.convert_output(history[-1][1]) - else: - history[-1][1] += output + history[-1][1] += output time_spend = round(time_end - time_start, 3) token_num += 1 new_token_latency = f""" @@ -310,16 +309,16 @@ def bot( yield [history, new_token_latency] def bot_test( - self, - bot_queue, - queue_id, - history, - model_endpoint, - Max_new_tokens, - Temperature, - Top_p, - Top_k, - model_name=None, + self, + bot_queue, + queue_id, + history, + model_endpoint, + Max_new_tokens, + Temperature, + Top_p, + Top_k, + model_name=None, ): request_url = model_endpoint simple_api = is_simple_api(request_url, model_name) @@ -355,19 +354,19 @@ def bot_test( bot_queue.put([queue_id, "", ""]) def bot_rag( - self, - history, - deploy_model_endpoint, - model_endpoint, - Max_new_tokens, - Temperature, - Top_p, - Top_k, - rag_selector, - rag_path, - returned_k, - 
model_name=None, - image=None, + self, + history, + deploy_model_endpoint, + model_endpoint, + Max_new_tokens, + Temperature, + Top_p, + Top_k, + rag_selector, + rag_path, + returned_k, + model_name=None, + image=None, ): enhance_knowledge = None if os.path.isabs(rag_path): @@ -413,16 +412,16 @@ def bot_rag( yield output def regenerate( - self, - db_dir, - upload_type, - input_type, - input_texts, - depth, - upload_files, - embedding_model, - splitter_chunk_size, - cpus_per_worker, + self, + db_dir, + upload_type, + input_type, + input_texts, + depth, + upload_files, + embedding_model, + splitter_chunk_size, + cpus_per_worker, ): if upload_type == "Youtube": input_texts = input_texts.split(";") @@ -501,16 +500,16 @@ def regenerate( return db_dir def send_all_bot( - self, - id, - history, - deployed_model_endpoint, - model_endpoint, - Max_new_tokens, - Temperature, - Top_p, - Top_k, - model_name, + self, + id, + history, + deployed_model_endpoint, + model_endpoint, + Max_new_tokens, + Temperature, + Top_p, + Top_k, + model_name, ): id = int(id) self.bot_queue[id] = Queue() @@ -536,21 +535,20 @@ def send_all_bot( yield res[1] def finetune( - self, - model_name, - custom_model_name, - custom_tokenizer_name, - dataset, - new_model_name, - batch_size, - num_epochs, - max_train_step, - lr, - worker_num, - cpus_per_worker_ftn, + self, + model_name, + custom_model_name, + custom_tokenizer_name, + dataset, + new_model_name, + batch_size, + num_epochs, + max_train_step, + lr, + worker_num, + cpus_per_worker_ftn, ): if model_name == "specify other models": - model_desc = None origin_model_path = custom_model_name tokenizer_path = custom_tokenizer_name if "gpt" in model_name.lower() or "pythia" in model_name.lower(): @@ -558,35 +556,33 @@ def finetune( else: gpt_base_model = False else: - model_desc = self._base_models[model_name].model_description - origin_model_path = model_desc.model_id_or_path - tokenizer_path = model_desc.tokenizer_name_or_path - gpt_base_model = model_desc.gpt_base_model + finetune_config = self._base_models[model_name] + gpt_base_model = finetune_config.General.gpt_base_model + + print(type(finetune_config)) + print(f"Finetune config: {finetune_config}") + finetune_config = finetune_config.dict() + print(type(finetune_config)) + print(f"Finetune config: {finetune_config}") last_gpt_base_model = False finetuned_model_path = os.path.join(self.finetuned_model_path, model_name, new_model_name) - finetuned_checkpoint_path = ( - os.path.join(self.finetuned_checkpoint_path, model_name, new_model_name) - if self.finetuned_checkpoint_path != "" - else None - ) - finetune_config = self.config.copy() - training_config = finetune_config.get("Training") - exist_worker = int(training_config["num_training_workers"]) - exist_cpus_per_worker_ftn = int(training_config["resources_per_worker"]["CPU"]) + exist_worker = int(finetune_config["Training"].get("num_training_workers")) + + exist_cpus_per_worker_ftn = int(finetune_config["Training"].get("resources_per_worker")["CPU"]) ray_resources = ray.available_resources() if "CPU" not in ray_resources or cpus_per_worker_ftn * worker_num + 1 > int( - ray.available_resources()["CPU"] + ray.available_resources()["CPU"] ): num_req = cpus_per_worker_ftn * worker_num + 1 num_act = int(ray.available_resources()["CPU"]) error_msg = f"Resources are not meeting the demand, required num_cpu is {num_req}, actual num_cpu is {num_act}" raise gr.Error(error_msg) if ( - worker_num != exist_worker - or cpus_per_worker_ftn != exist_cpus_per_worker_ftn - or not 
(gpt_base_model and last_gpt_base_model) + worker_num != exist_worker + or cpus_per_worker_ftn != exist_cpus_per_worker_ftn + or not (gpt_base_model and last_gpt_base_model) ): ray.shutdown() new_ray_init_config = { @@ -606,22 +602,17 @@ def finetune( if gpt_base_model: new_ray_init_config["runtime_env"]["pip"] = ["transformers==4.26.0"] else: - new_ray_init_config["runtime_env"]["pip"] = ["transformers==4.31.0"] - last_gpt_base_model = gpt_base_model - finetune_config["Training"]["num_training_workers"] = int(worker_num) - finetune_config["Training"]["resources_per_worker"]["CPU"] = int(cpus_per_worker_ftn) + new_ray_init_config["runtime_env"]["pip"] = ["transformers==4.38.1"] ray.init(**new_ray_init_config) - exist_worker = worker_num - exist_cpus_per_worker_ftn = cpus_per_worker_ftn finetune_config["Dataset"]["train_file"] = dataset - finetune_config["General"]["base_model"] = origin_model_path + # finetune_config["General"]["base_model"] = origin_model_path finetune_config["Training"]["epochs"] = num_epochs finetune_config["General"]["output_dir"] = finetuned_model_path - finetune_config["General"]["config"]["trust_remote_code"] = True - if finetuned_checkpoint_path: - finetune_config["General"]["checkpoint_dir"] = finetuned_checkpoint_path + + # if finetuned_checkpoint_path: + # finetune_config["General"]["checkpoint_dir"] = finetuned_checkpoint_path finetune_config["Training"]["batch_size"] = batch_size finetune_config["Training"]["learning_rate"] = lr if max_train_step != 0: @@ -653,6 +644,9 @@ def finetune( self.finetune_status = False # todo: a more reasonable solution is needed try: + print("Start fine-tuning") + print(finetune_config) + results = main(finetune_config) if results.metrics["done"]: self.finetune_status = True @@ -668,21 +662,18 @@ def finetune( ray.kill(self.finetune_actor) self.finetune_actor = None - new_prompt = Prompt() - new_prompt.intro = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n" - new_prompt.human_id = "\n### Instruction" - new_prompt.bot_id = "\n### Response" - new_prompt.stop_words.extend( - ["### Instruction", "# Instruction", "### Question", "##", " ="] - ) - new_model_desc = ModelDescription( - model_id_or_path=finetuned_model_path, - tokenizer_name_or_path=tokenizer_path, - prompt=new_prompt, - chat_processor=model_desc.chat_processor if model_desc is not None else "ChatModelGptJ", - ) + if finetune_config["General"].get("lora_config", None) is not None: + new_model_desc = ModelDescription( + model_id_or_path=finetune_config["General"].get("base_model"), + tokenizer_name_or_path=finetuned_model_path, + ) + else: + new_model_desc = ModelDescription( + model_id_or_path=finetuned_model_path, + tokenizer_name_or_path=finetuned_model_path, + ) new_model_desc.config.trust_remote_code = True - new_finetuned = FinetunedConfig( + new_finetuned = InferenceConfig( name=new_model_name, route_prefix="/" + new_model_name, model_description=new_model_desc, @@ -711,14 +702,14 @@ def finetune_progress(self, progress=gr.Progress()): progress( float(int(value_step) / int(total_steps)), desc="Start Training: epoch " - + str(value_epoch) - + " / " - + str(total_epochs) - + " " - + "step " - + str(value_step) - + " / " - + str(total_steps), + + str(value_epoch) + + " / " + + str(total_epochs) + + " " + + "step " + + str(value_step) + + " / " + + str(total_steps), ) except Exception: pass @@ -726,15 +717,15 @@ def finetune_progress(self, progress=gr.Progress()): return "

Completed the fine-tuning process.
" def deploy_func( - self, - model_name: str, - replica_num: int, - cpus_per_worker_deploy: int, - hpus_per_worker_deploy: int, + self, + model_name: str, + replica_num: int, + cpus_per_worker_deploy: int, + hpus_per_worker_deploy: int, ): self.shutdown_deploy() if cpus_per_worker_deploy * replica_num > int( - ray.available_resources()["CPU"] + ray.available_resources()["CPU"] ) or hpus_per_worker_deploy * replica_num > int( ray.available_resources()["HPU"] if "HPU" in ray.available_resources() else 0 ): @@ -744,20 +735,8 @@ def deploy_func( finetuned = self._all_models[model_name] model_desc = finetuned.model_description - prompt = model_desc.prompt print("model path: ", model_desc.model_id_or_path) - if model_desc.chat_processor is not None: - chat_model = getattr(sys.modules[__name__], model_desc.chat_processor, None) - if chat_model is None: - return ( - model_name - + " deployment failed. " - + model_desc.chat_processor - + " does not exist." - ) - self.process_tool = chat_model(**prompt.dict()) - finetuned_deploy = finetuned.copy(deep=True) if hpus_per_worker_deploy > 0: finetuned_deploy.device = "hpu" @@ -776,13 +755,14 @@ def deploy_func( elif "fuyu-8b" in model_name: pip_env = "transformers==4.37.2" else: - pip_env = "transformers==4.31.0" + pip_env = "transformers==4.38.1" if finetuned_deploy.device == "cpu": ray_actor_options["runtime_env"] = {"pip": [pip_env]} deployment = PredictorDeployment.options( # type: ignore num_replicas=replica_num, ray_actor_options=ray_actor_options, ).bind(finetuned_deploy) + print("deployment: ", deployment) serve.start(http_options={"host": finetuned_deploy.host, "port": finetuned_deploy.port}) serve.run( deployment, @@ -810,7 +790,7 @@ def exec_command(self, index, command, ray=False): return stdout.read().decode("utf-8"), stderr.read().decode("utf-8") def get_ray_cluster(self): - command = "ray status" + command = "/home/damon/anaconda3/bin/ray status" out, err = self.exec_command(-1, command, ray=True) try: out_words = [word for word in out.split("\n") if "CPU" in word][0] @@ -854,9 +834,9 @@ def kill_node(self, btn_txt, index): elif btn_txt == "Start": index = int(index) command = ( - "RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address=" - + self.master_ip_port - + r""" --resources='{"special_hardware": 2}'""" + "RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address=" + + self.master_ip_port + + r""" --resources='{"special_hardware": 2}'""" ) self.exec_command(index, command, ray=True) self.ray_nodes[index]["Alive"] = "True" @@ -1321,8 +1301,8 @@ def _init_ui(self): rag_input_text = gr.Textbox( label="Local file path", placeholder="Support types: " - + " ".join(recdp_support_suffix) - + ". Support multiple absolute paths, separated by ';'", + + " ".join(recdp_support_suffix) + + ". 
Support multiple absolute paths, separated by ';'", visible=True, scale=2, ) @@ -1777,7 +1757,7 @@ def _init_ui(self): ray_init_config: Dict[str, Any] = { "runtime_env": { "env_vars": { - "OMP_NUM_THREADS": "24", + "OMP_NUM_THREADS": "32", "ACCELERATE_USE_CPU": "True", "ACCELERATE_MIXED_PRECISION": "no", "CCL_WORKER_COUNT": "1", @@ -1799,9 +1779,10 @@ def _init_ui(self): default_rag_path = args.default_rag_path initial_model_list = {k: all_models[k] for k in sorted(all_models.keys())} + initial_base_model_list = {k: base_models[k] for k in sorted(base_models.keys())} ui = ChatBotUI( initial_model_list, - initial_model_list, + initial_base_model_list, finetune_model_path, finetune_checkpoint_path, repo_path, From 0ec92054ec7543287135455db0addc752c6c93bb Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Fri, 26 Apr 2024 01:33:00 +0000 Subject: [PATCH 44/47] update Signed-off-by: minmingzhu --- llm_on_ray/finetune/finetune_config.py | 20 ++- llm_on_ray/ui/start_ui.py | 171 ++++++++++++------------- 2 files changed, 102 insertions(+), 89 deletions(-) diff --git a/llm_on_ray/finetune/finetune_config.py b/llm_on_ray/finetune/finetune_config.py index 98597901d..1f7cb191e 100644 --- a/llm_on_ray/finetune/finetune_config.py +++ b/llm_on_ray/finetune/finetune_config.py @@ -13,9 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import os from pydantic import BaseModel, validator -from typing import Optional, List +from typing import Optional, List, Dict + +from pydantic_yaml import parse_yaml_raw_as PRECISION_BF16 = "bf16" PRECISION_FP16 = "fp16" @@ -162,3 +165,18 @@ class FinetuneConfig(BaseModel): General: General Dataset: Dataset Training: Training + +base_models: Dict[str, FinetuneConfig] = {} +_models: Dict[str, FinetuneConfig] = {} + +_cur = os.path.dirname(os.path.abspath(__file__)) +_models_folder = _cur + "/models" +for model_file in os.listdir(_models_folder): + file_path = _models_folder + "/" + model_file + if os.path.isdir(file_path): + continue + with open(file_path, "r") as f: + m: FinetuneConfig = parse_yaml_raw_as(FinetuneConfig, f) + _models[m.name] = m + +all_models = _models.copy() diff --git a/llm_on_ray/ui/start_ui.py b/llm_on_ray/ui/start_ui.py index 7b011cb9c..58965627b 100644 --- a/llm_on_ray/ui/start_ui.py +++ b/llm_on_ray/ui/start_ui.py @@ -215,9 +215,6 @@ def user(self, user_message, history): return "", history + [[user_message, None]] def model_generate(self, prompt, request_url, model_name, config, simple_api=True): - print("model_generate") - print("prompt: ", prompt) - if simple_api: sample_input = {"text": prompt, "config": config, "stream": True} else: @@ -255,17 +252,17 @@ def model_generate(self, prompt, request_url, model_name, config, simple_api=Tru yield output def bot( - self, - history, - deploy_model_endpoint, - model_endpoint, - Max_new_tokens, - Temperature, - Top_p, - Top_k, - model_name=None, - image=None, - enhance_knowledge=None, + self, + history, + deploy_model_endpoint, + model_endpoint, + Max_new_tokens, + Temperature, + Top_p, + Top_k, + model_name=None, + image=None, + enhance_knowledge=None, ): request_url = model_endpoint if model_endpoint != "" else deploy_model_endpoint simple_api = is_simple_api(request_url, model_name) @@ -309,16 +306,16 @@ def bot( yield [history, new_token_latency] def bot_test( - self, - bot_queue, - queue_id, - history, - model_endpoint, - Max_new_tokens, - Temperature, - Top_p, - Top_k, - model_name=None, + self, + bot_queue, + queue_id, + history, + 
model_endpoint, + Max_new_tokens, + Temperature, + Top_p, + Top_k, + model_name=None, ): request_url = model_endpoint simple_api = is_simple_api(request_url, model_name) @@ -354,19 +351,19 @@ def bot_test( bot_queue.put([queue_id, "", ""]) def bot_rag( - self, - history, - deploy_model_endpoint, - model_endpoint, - Max_new_tokens, - Temperature, - Top_p, - Top_k, - rag_selector, - rag_path, - returned_k, - model_name=None, - image=None, + self, + history, + deploy_model_endpoint, + model_endpoint, + Max_new_tokens, + Temperature, + Top_p, + Top_k, + rag_selector, + rag_path, + returned_k, + model_name=None, + image=None, ): enhance_knowledge = None if os.path.isabs(rag_path): @@ -412,16 +409,16 @@ def bot_rag( yield output def regenerate( - self, - db_dir, - upload_type, - input_type, - input_texts, - depth, - upload_files, - embedding_model, - splitter_chunk_size, - cpus_per_worker, + self, + db_dir, + upload_type, + input_type, + input_texts, + depth, + upload_files, + embedding_model, + splitter_chunk_size, + cpus_per_worker, ): if upload_type == "Youtube": input_texts = input_texts.split(";") @@ -500,16 +497,16 @@ def regenerate( return db_dir def send_all_bot( - self, - id, - history, - deployed_model_endpoint, - model_endpoint, - Max_new_tokens, - Temperature, - Top_p, - Top_k, - model_name, + self, + id, + history, + deployed_model_endpoint, + model_endpoint, + Max_new_tokens, + Temperature, + Top_p, + Top_k, + model_name, ): id = int(id) self.bot_queue[id] = Queue() @@ -535,18 +532,18 @@ def send_all_bot( yield res[1] def finetune( - self, - model_name, - custom_model_name, - custom_tokenizer_name, - dataset, - new_model_name, - batch_size, - num_epochs, - max_train_step, - lr, - worker_num, - cpus_per_worker_ftn, + self, + model_name, + custom_model_name, + custom_tokenizer_name, + dataset, + new_model_name, + batch_size, + num_epochs, + max_train_step, + lr, + worker_num, + cpus_per_worker_ftn, ): if model_name == "specify other models": origin_model_path = custom_model_name @@ -559,11 +556,8 @@ def finetune( finetune_config = self._base_models[model_name] gpt_base_model = finetune_config.General.gpt_base_model - print(type(finetune_config)) - print(f"Finetune config: {finetune_config}") + finetune_config = finetune_config.dict() - print(type(finetune_config)) - print(f"Finetune config: {finetune_config}") last_gpt_base_model = False finetuned_model_path = os.path.join(self.finetuned_model_path, model_name, new_model_name) @@ -573,16 +567,16 @@ def finetune( ray_resources = ray.available_resources() if "CPU" not in ray_resources or cpus_per_worker_ftn * worker_num + 1 > int( - ray.available_resources()["CPU"] + ray.available_resources()["CPU"] ): num_req = cpus_per_worker_ftn * worker_num + 1 num_act = int(ray.available_resources()["CPU"]) error_msg = f"Resources are not meeting the demand, required num_cpu is {num_req}, actual num_cpu is {num_act}" raise gr.Error(error_msg) if ( - worker_num != exist_worker - or cpus_per_worker_ftn != exist_cpus_per_worker_ftn - or not (gpt_base_model and last_gpt_base_model) + worker_num != exist_worker + or cpus_per_worker_ftn != exist_cpus_per_worker_ftn + or not (gpt_base_model and last_gpt_base_model) ): ray.shutdown() new_ray_init_config = { @@ -607,12 +601,13 @@ def finetune( ray.init(**new_ray_init_config) finetune_config["Dataset"]["train_file"] = dataset - # finetune_config["General"]["base_model"] = origin_model_path + if origin_model_path is not None: + finetune_config["General"]["base_model"] = origin_model_path + if tokenizer_path 
is not None: + finetune_config["General"]["tokenizer_name"] = tokenizer_path finetune_config["Training"]["epochs"] = num_epochs finetune_config["General"]["output_dir"] = finetuned_model_path - # if finetuned_checkpoint_path: - # finetune_config["General"]["checkpoint_dir"] = finetuned_checkpoint_path finetune_config["Training"]["batch_size"] = batch_size finetune_config["Training"]["learning_rate"] = lr if max_train_step != 0: @@ -666,6 +661,7 @@ def finetune( new_model_desc = ModelDescription( model_id_or_path=finetune_config["General"].get("base_model"), tokenizer_name_or_path=finetuned_model_path, + peft_model_id_or_path=finetuned_model_path, ) else: new_model_desc = ModelDescription( @@ -762,7 +758,6 @@ def deploy_func( num_replicas=replica_num, ray_actor_options=ray_actor_options, ).bind(finetuned_deploy) - print("deployment: ", deployment) serve.start(http_options={"host": finetuned_deploy.host, "port": finetuned_deploy.port}) serve.run( deployment, @@ -790,7 +785,7 @@ def exec_command(self, index, command, ray=False): return stdout.read().decode("utf-8"), stderr.read().decode("utf-8") def get_ray_cluster(self): - command = "/home/damon/anaconda3/bin/ray status" + command = "ray status" out, err = self.exec_command(-1, command, ray=True) try: out_words = [word for word in out.split("\n") if "CPU" in word][0] @@ -834,9 +829,9 @@ def kill_node(self, btn_txt, index): elif btn_txt == "Start": index = int(index) command = ( - "RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address=" - + self.master_ip_port - + r""" --resources='{"special_hardware": 2}'""" + "RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address=" + + self.master_ip_port + + r""" --resources='{"special_hardware": 2}'""" ) self.exec_command(index, command, ray=True) self.ray_nodes[index]["Alive"] = "True" @@ -1301,8 +1296,8 @@ def _init_ui(self): rag_input_text = gr.Textbox( label="Local file path", placeholder="Support types: " - + " ".join(recdp_support_suffix) - + ". Support multiple absolute paths, separated by ';'", + + " ".join(recdp_support_suffix) + + ". 
Support multiple absolute paths, separated by ';'", visible=True, scale=2, ) @@ -1757,7 +1752,7 @@ def _init_ui(self): ray_init_config: Dict[str, Any] = { "runtime_env": { "env_vars": { - "OMP_NUM_THREADS": "32", + "OMP_NUM_THREADS": "24", "ACCELERATE_USE_CPU": "True", "ACCELERATE_MIXED_PRECISION": "no", "CCL_WORKER_COUNT": "1", From a3284947e06fe50537d96e7a0929b91e3278c2ed Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Fri, 26 Apr 2024 02:02:13 +0000 Subject: [PATCH 45/47] update Signed-off-by: minmingzhu --- llm_on_ray/finetune/finetune_config.py | 3 +- llm_on_ray/ui/start_ui.py | 65 +++++++++++++------------- 2 files changed, 35 insertions(+), 33 deletions(-) diff --git a/llm_on_ray/finetune/finetune_config.py b/llm_on_ray/finetune/finetune_config.py index 1f7cb191e..2cb9d0ce8 100644 --- a/llm_on_ray/finetune/finetune_config.py +++ b/llm_on_ray/finetune/finetune_config.py @@ -166,6 +166,7 @@ class FinetuneConfig(BaseModel): Dataset: Dataset Training: Training + base_models: Dict[str, FinetuneConfig] = {} _models: Dict[str, FinetuneConfig] = {} @@ -177,6 +178,6 @@ class FinetuneConfig(BaseModel): continue with open(file_path, "r") as f: m: FinetuneConfig = parse_yaml_raw_as(FinetuneConfig, f) - _models[m.name] = m + _models[m.General.base_model] = m all_models = _models.copy() diff --git a/llm_on_ray/ui/start_ui.py b/llm_on_ray/ui/start_ui.py index 58965627b..e7188b283 100644 --- a/llm_on_ray/ui/start_ui.py +++ b/llm_on_ray/ui/start_ui.py @@ -110,20 +110,20 @@ def get_result(self): class ChatBotUI: def __init__( - self, - all_models: Dict[str, InferenceConfig], - base_models: Dict[str, FinetuneConfig], - finetune_model_path: str, - finetuned_checkpoint_path: str, - repo_code_path: str, - default_data_path: str, - default_rag_path: str, - config: dict, - head_node_ip: str, - node_port: str, - node_user_name: str, - conda_env_name: str, - master_ip_port: str, + self, + all_models: Dict[str, InferenceConfig], + base_models: Dict[str, FinetuneConfig], + finetune_model_path: str, + finetuned_checkpoint_path: str, + repo_code_path: str, + default_data_path: str, + default_rag_path: str, + config: dict, + head_node_ip: str, + node_port: str, + node_user_name: str, + conda_env_name: str, + master_ip_port: str, ): self._all_models = all_models self._base_models = base_models @@ -556,14 +556,15 @@ def finetune( finetune_config = self._base_models[model_name] gpt_base_model = finetune_config.General.gpt_base_model - finetune_config = finetune_config.dict() last_gpt_base_model = False finetuned_model_path = os.path.join(self.finetuned_model_path, model_name, new_model_name) exist_worker = int(finetune_config["Training"].get("num_training_workers")) - exist_cpus_per_worker_ftn = int(finetune_config["Training"].get("resources_per_worker")["CPU"]) + exist_cpus_per_worker_ftn = int( + finetune_config["Training"].get("resources_per_worker")["CPU"] + ) ray_resources = ray.available_resources() if "CPU" not in ray_resources or cpus_per_worker_ftn * worker_num + 1 > int( @@ -602,9 +603,9 @@ def finetune( finetune_config["Dataset"]["train_file"] = dataset if origin_model_path is not None: - finetune_config["General"]["base_model"] = origin_model_path + finetune_config["General"]["base_model"] = origin_model_path if tokenizer_path is not None: - finetune_config["General"]["tokenizer_name"] = tokenizer_path + finetune_config["General"]["tokenizer_name"] = tokenizer_path finetune_config["Training"]["epochs"] = num_epochs finetune_config["General"]["output_dir"] = finetuned_model_path @@ -698,14 +699,14 @@ def 
finetune_progress(self, progress=gr.Progress()): progress( float(int(value_step) / int(total_steps)), desc="Start Training: epoch " - + str(value_epoch) - + " / " - + str(total_epochs) - + " " - + "step " - + str(value_step) - + " / " - + str(total_steps), + + str(value_epoch) + + " / " + + str(total_epochs) + + " " + + "step " + + str(value_step) + + " / " + + str(total_steps), ) except Exception: pass @@ -713,15 +714,15 @@ def finetune_progress(self, progress=gr.Progress()): return "
Completed the fine-tuning process.
" def deploy_func( - self, - model_name: str, - replica_num: int, - cpus_per_worker_deploy: int, - hpus_per_worker_deploy: int, + self, + model_name: str, + replica_num: int, + cpus_per_worker_deploy: int, + hpus_per_worker_deploy: int, ): self.shutdown_deploy() if cpus_per_worker_deploy * replica_num > int( - ray.available_resources()["CPU"] + ray.available_resources()["CPU"] ) or hpus_per_worker_deploy * replica_num > int( ray.available_resources()["HPU"] if "HPU" in ray.available_resources() else 0 ): From a8e7b384e6215d622a96c3a8ef13a5638a4abc42 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 29 Apr 2024 16:09:46 +0800 Subject: [PATCH 46/47] update --- .../api_server_simple/query_single.py | 7 +- llm_on_ray/finetune/finetune_config.py | 2 +- llm_on_ray/inference/chat_template_process.py | 3 - llm_on_ray/inference/models/gpt2.yaml | 1 - llm_on_ray/inference/predictor_deployment.py | 80 ++----------------- llm_on_ray/inference/utils.py | 4 +- 6 files changed, 14 insertions(+), 83 deletions(-) diff --git a/examples/inference/api_server_simple/query_single.py b/examples/inference/api_server_simple/query_single.py index 62bb4dc45..b6d935c9a 100644 --- a/examples/inference/api_server_simple/query_single.py +++ b/examples/inference/api_server_simple/query_single.py @@ -55,7 +55,12 @@ ) args = parser.parse_args() -prompt = "Once upon a time," +# prompt = "Once upon a time," +prompt = [ + {"role": "user", "content": "Which is bigger, the moon or the sun?"}, +] + + config: Dict[str, Union[int, float]] = {} if args.max_new_tokens: config["max_new_tokens"] = int(args.max_new_tokens) diff --git a/llm_on_ray/finetune/finetune_config.py b/llm_on_ray/finetune/finetune_config.py index 2cb9d0ce8..136b698eb 100644 --- a/llm_on_ray/finetune/finetune_config.py +++ b/llm_on_ray/finetune/finetune_config.py @@ -64,7 +64,7 @@ class General(BaseModel): enable_gradient_checkpointing: bool = False chat_template: Optional[str] = None default_chat_template: str = ( - "{{ bos_token }}" + "Below is an instruction that describes a task. Write a response that appropriately completes the request." "{% if messages[0]['role'] == 'system' %}" "{{ raise_exception('System role not supported') }}" "{% endif %}" diff --git a/llm_on_ray/inference/chat_template_process.py b/llm_on_ray/inference/chat_template_process.py index 2f7a64d27..851004b01 100644 --- a/llm_on_ray/inference/chat_template_process.py +++ b/llm_on_ray/inference/chat_template_process.py @@ -14,7 +14,6 @@ # limitations under the License. 
# from typing import List - from llm_on_ray.inference.api_openai_backend.openai_protocol import ChatMessage @@ -63,14 +62,12 @@ def _extract_messages(self, messages): return texts, images def _prepare_image(self, messages: list): - """Prepare image from history messages.""" from PIL import Image import requests from io import BytesIO import base64 import re - # prepare images images: List = [] for msg in messages: msg = dict(msg) diff --git a/llm_on_ray/inference/models/gpt2.yaml b/llm_on_ray/inference/models/gpt2.yaml index cddc45cd8..9ad098c24 100644 --- a/llm_on_ray/inference/models/gpt2.yaml +++ b/llm_on_ray/inference/models/gpt2.yaml @@ -15,4 +15,3 @@ model_description: tokenizer_name_or_path: gpt2 gpt_base_model: true chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" - diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index aab110727..502ad150f 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -340,31 +340,6 @@ def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=No else: prompt = self.process_tool.get_prompt(input) return prompt - else: - if isinstance(input, list) and input and isinstance(input[0], dict): - prompt = self.predictor.tokenizer.apply_chat_template(input, tokenize=False) - elif isinstance(input, list) and input and isinstance(input[0], list): - prompt = [ - self.predictor.tokenizer.apply_chat_template(t, tokenize=False) - for t in input - ] - elif isinstance(input, list) and input and isinstance(input[0], ChatMessage): - messages = [] - for chat_message in input: - message = {"role": chat_message.role, "content": chat_message.content} - messages.append(message) - prompt = self.predictor.tokenizer.apply_chat_template( - messages, tokenize=False - ) - elif isinstance(input, list) and input and isinstance(input[0], str): - prompt = input - elif isinstance(input, str): - prompt = input - else: - raise TypeError( - f"Unsupported type {type(input)} for text. Expected dict or list of dicts." 
- ) - return prompt elif prompt_format == PromptFormat.PROMPTS_FORMAT: raise HTTPException(400, "Invalid prompt format.") return input @@ -411,9 +386,14 @@ async def openai_call( tool_choice=None, ): self.use_openai = True + print("openai_call") + print(input) + print(type(input)) # return prompt or list of prompts preprocessed prompts = self.preprocess_prompts(input, tools, tool_choice) + print(prompts) + print(type(prompts)) # Handle streaming response if streaming_response: @@ -421,53 +401,3 @@ async def openai_call( yield result else: yield await self.handle_non_streaming(prompts, config) - - def _extract_messages(self, messages): - texts, images = [], [] - for message in messages: - if message["role"] == "user" and isinstance(message["content"], list): - texts.append({"role": "user", "content": message["content"][0]["text"]}) - images.append( - {"role": "user", "content": message["content"][1]["image_url"]["url"]} - ) - else: - texts.append(message) - return texts, images - - def _prepare_image(self, messages: list): - """Prepare image from history messages.""" - from PIL import Image - import requests - from io import BytesIO - import base64 - import re - - # prepare images - images: List = [] - if isinstance(messages[0], List): - for i in range(len(messages)): - for msg in messages[i]: - msg = dict(msg) - content = msg["content"] - if "url" not in content: - continue - is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0 - if is_data: - encoded_str = re.sub("^data:image/.+;base64,", "", content["url"]) - images[i].append(Image.open(BytesIO(base64.b64decode(encoded_str)))) - else: - images[i].append(Image.open(requests.get(content["url"], stream=True).raw)) - elif isinstance(messages[0], dict): - for msg in messages: - msg = dict(msg) - content = msg["content"] - if "url" not in content: - continue - is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0 - if is_data: - encoded_str = re.sub("^data:image/.+;base64,", "", content["url"]) - images.append(Image.open(BytesIO(base64.b64decode(encoded_str)))) - else: - images.append(Image.open(requests.get(content["url"], stream=True).raw)) - - return images diff --git a/llm_on_ray/inference/utils.py b/llm_on_ray/inference/utils.py index 5a8db0401..56b9146e5 100644 --- a/llm_on_ray/inference/utils.py +++ b/llm_on_ray/inference/utils.py @@ -162,13 +162,13 @@ def is_cpu_without_ipex(infer_conf: InferenceConfig) -> bool: return (not infer_conf.ipex.enabled) and infer_conf.device == DEVICE_CPU -def get_prompt_format(input: Union[List[str], List[dict], List[List[dict]], List[ChatMessage]]): +def get_prompt_format(input: Union[List[str], List[dict], List[ChatMessage]]): chat_format = True prompts_format = True for item in input: if isinstance(item, str): chat_format = False - elif isinstance(item, dict) or isinstance(item, ChatMessage) or isinstance(item, list): + elif isinstance(item, dict) or isinstance(item, ChatMessage): prompts_format = False else: chat_format = False From cbae21340d16133265fd75cece7f18d1ac2935d6 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 6 May 2024 11:12:07 +0800 Subject: [PATCH 47/47] update --- llm_on_ray/inference/predictor_deployment.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index 502ad150f..74b9430e8 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -311,9 +311,13 @@ def 
preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=No Raises: HTTPException: If the input prompt format is invalid or not supported. """ + if isinstance(input, str): return input - elif isinstance(input, list): + elif isinstance(input, List): + prompts = [] + images = [] + prompt_format = get_prompt_format(input) if prompt_format == PromptFormat.CHAT_FORMAT: # Process the input prompts with tools @@ -340,14 +344,17 @@ def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=No else: prompt = self.process_tool.get_prompt(input) return prompt + else: + prompts.extend(input) elif prompt_format == PromptFormat.PROMPTS_FORMAT: + prompts.extend(input) + else: raise HTTPException(400, "Invalid prompt format.") - return input + return prompts else: raise HTTPException(400, "Invalid prompt format.") async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSONResponse, str]: - logger.info("PredictorDeployment call") self.use_openai = False try: @@ -366,7 +373,6 @@ async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSON content="Empty prompt is not supported.", ) config = json_request["config"] if "config" in json_request else {} - # return prompt or list of prompts preprocessed prompts = self.preprocess_prompts(input)