From 94df92c6fafefc7104c55f718b61904056e3a5ed Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Fri, 19 Apr 2024 14:36:48 +0000 Subject: [PATCH 01/47] integrate inference chat template Signed-off-by: minmingzhu --- llm_on_ray/inference/inference_config.py | 19 +++ .../inference/models/CodeLlama-7b-hf.yaml | 8 +- llm_on_ray/inference/models/bloom-560m.yaml | 10 +- .../models/deepseek-coder-33b-instruct.yaml | 12 +- llm_on_ray/inference/models/deplot.yaml | 15 +- llm_on_ray/inference/models/falcon-7b.yaml | 8 +- llm_on_ray/inference/models/fuyu8b.yaml | 15 +- llm_on_ray/inference/models/gemma-2b.yaml | 10 +- llm_on_ray/inference/models/gpt-j-6b.yaml | 17 +-- llm_on_ray/inference/models/gpt2.yaml | 8 +- .../inference/models/llama-2-7b-chat-hf.yaml | 10 +- .../models/mistral-7b-Instruct-v0.2.yaml | 10 +- .../inference/models/mistral-7b-v0.1.yaml | 11 +- llm_on_ray/inference/models/mpt-7b.yaml | 15 +- .../inference/models/neural-chat-7b-v3-1.yaml | 13 +- llm_on_ray/inference/models/opt-125m.yaml | 8 +- .../inference/models/sqlcoder-7b-2.yaml | 8 +- llm_on_ray/inference/models/starcoder.yaml | 8 +- llm_on_ray/inference/predictor_deployment.py | 144 ++++++++++++------ 19 files changed, 145 insertions(+), 204 deletions(-) diff --git a/llm_on_ray/inference/inference_config.py b/llm_on_ray/inference/inference_config.py index 96833c24b..6842fe63e 100644 --- a/llm_on_ray/inference/inference_config.py +++ b/llm_on_ray/inference/inference_config.py @@ -112,6 +112,25 @@ class ModelDescription(BaseModel): input_processor: str = "AutoProcessor" model_loader: str = "AutoModel" + chat_model_with_image: bool = False + chat_template: Union[str, None] = None + default_chat_template: str = ( + "{{ bos_token }}" + "{% if messages[0]['role'] == 'system' %}" + "{{ raise_exception('System role not supported') }}" + "{% endif %}" + "{% for message in messages %}" + "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" + "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}" + "{% endif %}" + "{% if message['role'] == 'user' %}" + "{{ '### Instruction: ' + message['content'] + eos_token }}" + "{% elif message['role'] == 'assistant' %}" + "{{ '### Response:' + message['content'] + eos_token }}" + "{% endif %}{% endfor %}" + "{{'### End \n'}}" + ) + @validator("quantization_type") def _check_quant_type(cls, v: str): if v: diff --git a/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml b/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml index 9ea2d77db..3c93da6b6 100644 --- a/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml +++ b/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml @@ -6,16 +6,10 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: "cpu" ipex: enabled: false precision: bf16 model_description: model_id_or_path: codellama/CodeLlama-7b-hf tokenizer_name_or_path: codellama/CodeLlama-7b-hf - chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] diff --git a/llm_on_ray/inference/models/bloom-560m.yaml b/llm_on_ray/inference/models/bloom-560m.yaml index ba2a6d962..92dbb8809 100644 --- a/llm_on_ray/inference/models/bloom-560m.yaml +++ b/llm_on_ray/inference/models/bloom-560m.yaml @@ -6,16 +6,10 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: CPU ipex: - enabled: true + enabled: false precision: bf16 model_description: model_id_or_path: bigscience/bloom-560m tokenizer_name_or_path: bigscience/bloom-560m - chat_processor: 
ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] diff --git a/llm_on_ray/inference/models/deepseek-coder-33b-instruct.yaml b/llm_on_ray/inference/models/deepseek-coder-33b-instruct.yaml index 75e646a44..84f6d2a43 100644 --- a/llm_on_ray/inference/models/deepseek-coder-33b-instruct.yaml +++ b/llm_on_ray/inference/models/deepseek-coder-33b-instruct.yaml @@ -10,16 +10,6 @@ device: cpu ipex: enabled: false precision: bf16 -model_description: +model_description: model_id_or_path: deepseek-ai/deepseek-coder-33b-instruct tokenizer_name_or_path: deepseek-ai/deepseek-coder-33b-instruct - chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: ['<|EOT|>', ""] - - - - diff --git a/llm_on_ray/inference/models/deplot.yaml b/llm_on_ray/inference/models/deplot.yaml index 4e732a4fe..6b518def9 100644 --- a/llm_on_ray/inference/models/deplot.yaml +++ b/llm_on_ray/inference/models/deplot.yaml @@ -6,22 +6,11 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: "cpu" ipex: enabled: false precision: bf16 model_description: model_id_or_path: google/deplot tokenizer_name_or_path: google/deplot - chat_processor: ChatModelwithImage - input_processor: 'AutoProcessor' - model_loader: 'Pix2StructForConditionalGeneration' - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] - config: - use_auth_token: '' + chat_model_with_image: true diff --git a/llm_on_ray/inference/models/falcon-7b.yaml b/llm_on_ray/inference/models/falcon-7b.yaml index 8176a2689..5a59e6e47 100644 --- a/llm_on_ray/inference/models/falcon-7b.yaml +++ b/llm_on_ray/inference/models/falcon-7b.yaml @@ -6,16 +6,10 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: "cpu" ipex: enabled: false precision: bf16 model_description: model_id_or_path: tiiuae/falcon-7b tokenizer_name_or_path: tiiuae/falcon-7b - chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] diff --git a/llm_on_ray/inference/models/fuyu8b.yaml b/llm_on_ray/inference/models/fuyu8b.yaml index 551a85789..c45affbae 100644 --- a/llm_on_ray/inference/models/fuyu8b.yaml +++ b/llm_on_ray/inference/models/fuyu8b.yaml @@ -6,22 +6,11 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: "cpu" ipex: enabled: false precision: bf16 model_description: model_id_or_path: adept/fuyu-8b tokenizer_name_or_path: adept/fuyu-8b - chat_processor: ChatModelwithImage - input_processor: FuyuProcessor - model_loader: FuyuForCausalLM - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] - config: - use_auth_token: '' + chat_model_with_image: true diff --git a/llm_on_ray/inference/models/gemma-2b.yaml b/llm_on_ray/inference/models/gemma-2b.yaml index 8335857ca..c08bd571b 100644 --- a/llm_on_ray/inference/models/gemma-2b.yaml +++ b/llm_on_ray/inference/models/gemma-2b.yaml @@ -6,20 +6,12 @@ cpus_per_worker: 2 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: CPU ipex: enabled: true precision: bf16 model_description: model_id_or_path: google/gemma-2b tokenizer_name_or_path: google/gemma-2b - chat_processor: ChatModelGemma - prompt: - intro: '' - human_id: 'user - {msg}' - bot_id: 'model - {msg}' - stop_words: [] config: use_auth_token: ' ' diff --git a/llm_on_ray/inference/models/gpt-j-6b.yaml b/llm_on_ray/inference/models/gpt-j-6b.yaml index c7778c12e..b2a7a04df 
100644 --- a/llm_on_ray/inference/models/gpt-j-6b.yaml +++ b/llm_on_ray/inference/models/gpt-j-6b.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: "cpu" ipex: # false here for ci coverage enabled: false @@ -14,17 +14,4 @@ ipex: model_description: model_id_or_path: EleutherAI/gpt-j-6b tokenizer_name_or_path: EleutherAI/gpt-j-6b - chat_processor: ChatModelGptJ - gpt_base_model: true - prompt: - intro: 'Below is an instruction that describes a task. Write a response that appropriately - completes the request. - - ' - human_id: ' - - ### Instruction' - bot_id: ' - - ### Response' - stop_words: [] + gpt_base_model: true \ No newline at end of file diff --git a/llm_on_ray/inference/models/gpt2.yaml b/llm_on_ray/inference/models/gpt2.yaml index 48287670a..2d44b9882 100644 --- a/llm_on_ray/inference/models/gpt2.yaml +++ b/llm_on_ray/inference/models/gpt2.yaml @@ -6,17 +6,11 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: CPU ipex: enabled: true precision: bf16 model_description: model_id_or_path: gpt2 tokenizer_name_or_path: gpt2 - chat_processor: ChatModelGptJ gpt_base_model: true - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] diff --git a/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml b/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml index 4b3e11e98..4f2ed0194 100644 --- a/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml +++ b/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml @@ -6,20 +6,12 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: "cpu" ipex: enabled: false precision: bf16 model_description: model_id_or_path: meta-llama/Llama-2-7b-chat-hf tokenizer_name_or_path: meta-llama/Llama-2-7b-chat-hf - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml b/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml index 1af9aad1b..ab901eb95 100644 --- a/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml +++ b/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml @@ -5,19 +5,13 @@ cpus_per_worker: 48 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: CPU ipex: enabled: false precision: bf16 model_description: model_id_or_path: mistralai/Mistral-7B-Instruct-v0.2 - ipexllm: false + bigdl: false tokenizer_name_or_path: mistralai/Mistral-7B-Instruct-v0.2 - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST]' - bot_id: '' - stop_words: [] config: trust_remote_code: true diff --git a/llm_on_ray/inference/models/mistral-7b-v0.1.yaml b/llm_on_ray/inference/models/mistral-7b-v0.1.yaml index c8a0ff385..d5dbec146 100644 --- a/llm_on_ray/inference/models/mistral-7b-v0.1.yaml +++ b/llm_on_ray/inference/models/mistral-7b-v0.1.yaml @@ -6,19 +6,14 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: CPU ipex: enabled: false precision: bf16 model_description: model_id_or_path: mistralai/Mistral-7B-v0.1 - ipexllm: false + bigdl: false tokenizer_name_or_path: mistralai/Mistral-7B-v0.1 - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST]' - bot_id: '' - stop_words: [] + config: trust_remote_code: true diff --git a/llm_on_ray/inference/models/mpt-7b.yaml 
b/llm_on_ray/inference/models/mpt-7b.yaml index 4ea12adb3..80f062a82 100644 --- a/llm_on_ray/inference/models/mpt-7b.yaml +++ b/llm_on_ray/inference/models/mpt-7b.yaml @@ -6,25 +6,12 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: CPU ipex: enabled: false precision: bf16 model_description: model_id_or_path: mosaicml/mpt-7b tokenizer_name_or_path: EleutherAI/gpt-neox-20b - chat_processor: ChatModelGptJ - prompt: - intro: 'Below is an instruction that describes a task, paired with an input that - provides further context. Write a response that appropriately completes the request. - - ' - human_id: ' - - ### Instruction' - bot_id: ' - - ### Response' - stop_words: [] config: trust_remote_code: true diff --git a/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml b/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml index 13a29676c..670654715 100644 --- a/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml +++ b/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml @@ -6,20 +6,11 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: CPU ipex: enabled: false precision: bf16 model_description: model_id_or_path: Intel/neural-chat-7b-v3-1 tokenizer_name_or_path: Intel/neural-chat-7b-v3-1 - chat_processor: ChatModelGptJ - prompt: - intro: '### System: - You are a chatbot developed by Intel. Please answer all questions to the best of your ability.' - human_id: ' - - ### User' - bot_id: ' - - ### Assistant' + chat_template: "{{ bos_token }}'### System:You are a chatbot developed by Intel. Please answer all questions to the best of your ability.\n'{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '### User: ' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ '### Assistant:' + message['content'] + eos_token }}{% endif %}{% endfor %}" diff --git a/llm_on_ray/inference/models/opt-125m.yaml b/llm_on_ray/inference/models/opt-125m.yaml index 545cd2145..e5f431b42 100644 --- a/llm_on_ray/inference/models/opt-125m.yaml +++ b/llm_on_ray/inference/models/opt-125m.yaml @@ -6,16 +6,10 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: CPU ipex: enabled: false precision: bf16 model_description: model_id_or_path: facebook/opt-125m tokenizer_name_or_path: facebook/opt-125m - chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] diff --git a/llm_on_ray/inference/models/sqlcoder-7b-2.yaml b/llm_on_ray/inference/models/sqlcoder-7b-2.yaml index 7130148a3..be85f8463 100644 --- a/llm_on_ray/inference/models/sqlcoder-7b-2.yaml +++ b/llm_on_ray/inference/models/sqlcoder-7b-2.yaml @@ -5,18 +5,12 @@ cpus_per_worker: 22 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: "cpu" ipex: enabled: false precision: bf16 model_description: model_id_or_path: defog/sqlcoder-7b-2 tokenizer_name_or_path: defog/sqlcoder-7b-2 - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: ["```"] config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/starcoder.yaml b/llm_on_ray/inference/models/starcoder.yaml index 0da59ac02..2044cf109 100644 --- 
a/llm_on_ray/inference/models/starcoder.yaml +++ b/llm_on_ray/inference/models/starcoder.yaml @@ -9,15 +9,9 @@ workers_per_group: 2 ipex: enabled: false precision: bf16 -device: cpu +device: "cpu" model_description: model_id_or_path: bigcode/starcoder tokenizer_name_or_path: bigcode/starcoder - chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index 18b23d86b..bf5cc35af 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -51,31 +51,11 @@ def __init__( max_batch_size=_DEFAULT_MAX_BATCH_SIZE, ): self.device = torch.device(infer_conf.device) - self.process_tool = None - chat_processor_name = infer_conf.model_description.chat_processor - prompt = infer_conf.model_description.prompt self.handle_dynamic_batch.set_max_batch_size(max_batch_size) - - if chat_processor_name: - try: - module = __import__("chat_process") - except Exception: - sys.path.append(os.path.dirname(__file__)) - module = __import__("chat_process") - chat_processor = getattr(module, chat_processor_name, None) - if chat_processor is None: - raise ValueError( - infer_conf.name - + " deployment failed. chat_processor(" - + chat_processor_name - + ") does not exist." - ) - self.process_tool = chat_processor(**prompt.dict()) - self.use_deepspeed = infer_conf.deepspeed self.use_vllm = infer_conf.vllm.enabled - self.is_mllm = True if chat_processor_name in ["ChatModelwithImage"] else False + self.is_mllm = infer_conf.model_description.chat_model_with_image # Used to determine if openai backend is used self.use_openai = False @@ -305,12 +285,12 @@ async def handle_static_batch(self, prompts: List[str], **config: Dict[str, Any] preprocessing_time=0, ) - def preprocess_prompts(self, input: Union[str, List], tools=None, tool_choice=None): + def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=None): """ Preprocesses the input prompts. Args: - input (Union[str, List[str]]): The input prompt(s) to be preprocessed. + input (Union[str, List[dict]]): The input prompt(s) to be preprocessed. tools (List[str]): The list of tools to be used. tool_choice: The choice of tool to be used. @@ -327,12 +307,14 @@ def preprocess_prompts(self, input: Union[str, List], tools=None, tool_choice=No Raises: HTTPException: If the input prompt format is invalid or not supported. 
""" + + logger.info("preprocess_prompts") + logger.info(type(input)) + logger.info(input) + if isinstance(input, str): return input - elif isinstance(input, List): - prompts = [] - images = [] - + elif isinstance(input, list): prompt_format = get_prompt_format(input) if prompt_format == PromptFormat.CHAT_FORMAT: # Process the input prompts with tools @@ -349,27 +331,55 @@ def preprocess_prompts(self, input: Union[str, List], tools=None, tool_choice=No m.content = self.openai_tools_prompter.content_from_assistant(m) # type: ignore elif m.tool_call_id is not None: # type: ignore m.content = self.openai_tools_prompter.content_from_tool(m) # type: ignore - # Process the input prompts with MLLM tool - if self.process_tool is not None: - if self.is_mllm: - input, image = self.process_tool.get_prompt(input) - prompts.append(input) - images.extend(image) - return prompts, images - else: - prompt = self.process_tool.get_prompt(input) - return prompt + + if self.predictor.infer_conf.model_description.chat_template is not None: + self.predictor.tokenizer.chat_template = self.predictor.infer_conf.model_description.chat_template + elif self.predictor.tokenizer.chat_template is None: + self.predictor.tokenizer.chat_template = self.predictor.infer_conf.model_description.default_chat_template + + if self.is_mllm: + if isinstance(input, list): + if isinstance(input, list) and input and isinstance(input[0], ChatMessage): + messages = [] + for chat_message in input: + message = {"role": chat_message.role, "content": chat_message.content} + messages.append(message) + texts, images = self._extract_messages(messages) + elif isinstance(input, list) and input and isinstance(input[0], dict): + texts, images = self._extract_messages(input) + elif isinstance(input, list) and input and isinstance(input[0], list): + texts, images = [self._extract_messages(p) for p in input] + + image = self._prepare_image(images) + prompt = self.tokenize_inputs(texts) + return prompt, image else: - prompts.extend(input) + if isinstance(input, list) and input and isinstance(input[0], dict): + prompt = self.predictor.tokenizer.apply_chat_template(input, tokenize=False) + elif isinstance(input, list) and input and isinstance(input[0], list): + prompt = [self.predictor.tokenizer.apply_chat_template(t, tokenize=False) for t in input] + elif isinstance(input, list) and input and isinstance(input[0], ChatMessage): + messages = [] + for chat_message in input: + message = {"role": chat_message.role, "content": chat_message.content} + messages.append(message) + prompt = self.predictor.tokenizer.apply_chat_template(messages, tokenize=False) + elif isinstance(input, list) and input and isinstance(input[0], str): + prompt = input + elif isinstance(input, str): + prompt = input + else: + raise TypeError(f"Unsupported type {type(input)} for text. 
Expected dict or list of dicts.") + logger.info(prompt) + return prompt elif prompt_format == PromptFormat.PROMPTS_FORMAT: - prompts.extend(input) - else: raise HTTPException(400, "Invalid prompt format.") - return prompts + return input else: raise HTTPException(400, "Invalid prompt format.") async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSONResponse, str]: + logger.info("PredictorDeployment call") self.use_openai = False try: @@ -379,7 +389,6 @@ async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSON status_code=400, content="Invalid JSON format from http request.", ) - streaming_response = json_request["stream"] if "stream" in json_request else False input = json_request["text"] if "text" in json_request else "" if input == "": @@ -388,7 +397,7 @@ async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSON content="Empty prompt is not supported.", ) config = json_request["config"] if "config" in json_request else {} - + logger.info(input) # return prompt or list of prompts preprocessed prompts = self.preprocess_prompts(input) @@ -418,3 +427,52 @@ async def openai_call( yield result else: yield await self.handle_non_streaming(prompts, config) + + + def _extract_messages(messages): + texts, images = [], [] + for message in messages: + if message['role'] == 'user' and isinstance(message['content'], list): + texts.append({"role": "user", "content": message['content'][0]['text']}) + images.append({"role": "user", "content": message['content'][1]['image_url']['url']}) + else: + texts.append(message) + return texts, images + + def _prepare_image(self, messages: Union[List[dict], List[List[dict]]]): + """Prepare image from history messages.""" + from PIL import Image + import requests + from io import BytesIO + import base64 + import re + + # prepare images + images = [] + if isinstance(messages[0], list): + for i in len(messages): + for msg in messages[i]: + msg = dict(msg) + role, content = msg["role"], msg["content"] + if "url" not in content: + continue + is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0 + if is_data: + encoded_str = re.sub("^data:image/.+;base64,", "", content["url"]) + images[i].append(Image.open(BytesIO(base64.b64decode(encoded_str)))) + else: + images[i].append(Image.open(requests.get(content["url"], stream=True).raw)) + else: + for msg in messages: + msg = dict(msg) + role, content = msg["role"], msg["content"] + if "url" not in content: + continue + is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0 + if is_data: + encoded_str = re.sub("^data:image/.+;base64,", "", content["url"]) + images.append(Image.open(BytesIO(base64.b64decode(encoded_str)))) + else: + images.append(Image.open(requests.get(content["url"], stream=True).raw)) + + return images \ No newline at end of file From f84756980cd5ef8596aba1d3918b79cc834d88e8 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 22 Apr 2024 01:00:41 +0000 Subject: [PATCH 02/47] update Signed-off-by: minmingzhu --- .github/workflows/config/mpt_deltatuner.yaml | 15 --------------- .../models/hpu/llama-2-70b-chat-hf-hpu.yaml | 8 -------- .../models/hpu/llama-2-7b-chat-hf-hpu.yaml | 8 -------- .../inference/models/hpu/neural-chat-7b-v3-3.yaml | 10 ---------- .../models/ipex-llm/mistral-7b-v0.1-ipex-llm.yaml | 6 ------ .../models/ipex-llm/mpt-7b-ipex-llm.yaml | 13 ------------- llm_on_ray/inference/models/mistral-7b-v0.1.yaml | 1 - .../models/vllm/llama-2-7b-chat-hf-vllm.yaml | 8 -------- 
llm_on_ray/inference/utils.py | 6 +++--- 9 files changed, 3 insertions(+), 72 deletions(-) diff --git a/.github/workflows/config/mpt_deltatuner.yaml b/.github/workflows/config/mpt_deltatuner.yaml index 250004dc2..7399d587b 100644 --- a/.github/workflows/config/mpt_deltatuner.yaml +++ b/.github/workflows/config/mpt_deltatuner.yaml @@ -13,20 +13,5 @@ ipex: model_description: model_id_or_path: mosaicml/mpt-7b tokenizer_name_or_path: EleutherAI/gpt-neox-20b - chat_processor: ChatModelGptJ - peft_model_id_or_path: nathan0/mpt-7b-deltatuner-model - peft_type: deltatuner - prompt: - intro: 'Below is an instruction that describes a task, paired with an input that - provides further context. Write a response that appropriately completes the request. - - ' - human_id: ' - - ### Instruction' - bot_id: ' - - ### Response' - stop_words: [] config: trust_remote_code: true diff --git a/llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml b/llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml index d68da8428..ab411ff0e 100644 --- a/llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml @@ -10,13 +10,5 @@ device: hpu model_description: model_id_or_path: meta-llama/Llama-2-70b-chat-hf tokenizer_name_or_path: meta-llama/Llama-2-70b-chat-hf - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml b/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml index 374a98f77..b7b19f02a 100644 --- a/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml +++ b/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml @@ -8,13 +8,5 @@ device: hpu model_description: model_id_or_path: meta-llama/Llama-2-7b-chat-hf tokenizer_name_or_path: meta-llama/Llama-2-7b-chat-hf - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml index 848358bec..00ff121c5 100644 --- a/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml +++ b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml @@ -14,13 +14,3 @@ ipex: model_description: model_id_or_path: Intel/neural-chat-7b-v3-3 tokenizer_name_or_path: Intel/neural-chat-7b-v3-3 - chat_processor: ChatModelGptJ - prompt: - intro: '### System: - You are a chatbot developed by Intel. Please answer all questions to the best of your ability.' 
- human_id: ' - - ### User' - bot_id: ' - - ### Assistant' diff --git a/llm_on_ray/inference/models/ipex-llm/mistral-7b-v0.1-ipex-llm.yaml b/llm_on_ray/inference/models/ipex-llm/mistral-7b-v0.1-ipex-llm.yaml index 6a8523467..5ec652bba 100644 --- a/llm_on_ray/inference/models/ipex-llm/mistral-7b-v0.1-ipex-llm.yaml +++ b/llm_on_ray/inference/models/ipex-llm/mistral-7b-v0.1-ipex-llm.yaml @@ -14,12 +14,6 @@ model_description: model_id_or_path: mistralai/Mistral-7B-v0.1 ipexllm: true tokenizer_name_or_path: mistralai/Mistral-7B-v0.1 - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST]' - bot_id: '' - stop_words: [] config: trust_remote_code: true load_in_4bit: true diff --git a/llm_on_ray/inference/models/ipex-llm/mpt-7b-ipex-llm.yaml b/llm_on_ray/inference/models/ipex-llm/mpt-7b-ipex-llm.yaml index d352a6517..ecb129973 100644 --- a/llm_on_ray/inference/models/ipex-llm/mpt-7b-ipex-llm.yaml +++ b/llm_on_ray/inference/models/ipex-llm/mpt-7b-ipex-llm.yaml @@ -14,19 +14,6 @@ model_description: model_id_or_path: mosaicml/mpt-7b-chat ipexllm: true tokenizer_name_or_path: EleutherAI/gpt-neox-20b - chat_processor: ChatModelGptJ - prompt: - intro: 'Below is an instruction that describes a task, paired with an input that - provides further context. Write a response that appropriately completes the request. - - ' - human_id: ' - - ### Instruction' - bot_id: ' - - ### Response' - stop_words: [] config: trust_remote_code: true load_in_4bit: true diff --git a/llm_on_ray/inference/models/mistral-7b-v0.1.yaml b/llm_on_ray/inference/models/mistral-7b-v0.1.yaml index d5dbec146..12de7c3bb 100644 --- a/llm_on_ray/inference/models/mistral-7b-v0.1.yaml +++ b/llm_on_ray/inference/models/mistral-7b-v0.1.yaml @@ -14,6 +14,5 @@ model_description: model_id_or_path: mistralai/Mistral-7B-v0.1 bigdl: false tokenizer_name_or_path: mistralai/Mistral-7B-v0.1 - config: trust_remote_code: true diff --git a/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml b/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml index acbf58455..9302b9be2 100644 --- a/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml +++ b/llm_on_ray/inference/models/vllm/llama-2-7b-chat-hf-vllm.yaml @@ -16,13 +16,5 @@ ipex: model_description: model_id_or_path: meta-llama/Llama-2-7b-chat-hf tokenizer_name_or_path: meta-llama/Llama-2-7b-chat-hf - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/llm_on_ray/inference/utils.py b/llm_on_ray/inference/utils.py index 91e311088..5a8db0401 100644 --- a/llm_on_ray/inference/utils.py +++ b/llm_on_ray/inference/utils.py @@ -162,13 +162,13 @@ def is_cpu_without_ipex(infer_conf: InferenceConfig) -> bool: return (not infer_conf.ipex.enabled) and infer_conf.device == DEVICE_CPU -def get_prompt_format(input: Union[List[str], List[dict], List[ChatMessage]]): +def get_prompt_format(input: Union[List[str], List[dict], List[List[dict]], List[ChatMessage]]): chat_format = True prompts_format = True for item in input: - if isinstance(item, str) or isinstance(item, list): + if isinstance(item, str): chat_format = False - elif isinstance(item, dict) or isinstance(item, ChatMessage): + elif isinstance(item, dict) or isinstance(item, ChatMessage) or isinstance(item, list): prompts_format = False else: chat_format = False From 0df70f1fa0c1f676398971eea06517ea22b9a8af Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 22 Apr 2024 02:31:09 +0000 Subject: [PATCH 
03/47] update Signed-off-by: minmingzhu --- llm_on_ray/inference/predictor_deployment.py | 56 +++++++++++++------- 1 file changed, 36 insertions(+), 20 deletions(-) diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index bf5cc35af..1d693f272 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -285,6 +285,7 @@ async def handle_static_batch(self, prompts: List[str], **config: Dict[str, Any] preprocessing_time=0, ) + # TODO:Abstract the preprocess_prompts function into a class for handling chat templates def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=None): """ Preprocesses the input prompts. @@ -333,16 +334,23 @@ def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=No m.content = self.openai_tools_prompter.content_from_tool(m) # type: ignore if self.predictor.infer_conf.model_description.chat_template is not None: - self.predictor.tokenizer.chat_template = self.predictor.infer_conf.model_description.chat_template + self.predictor.tokenizer.chat_template = ( + self.predictor.infer_conf.model_description.chat_template + ) elif self.predictor.tokenizer.chat_template is None: - self.predictor.tokenizer.chat_template = self.predictor.infer_conf.model_description.default_chat_template + self.predictor.tokenizer.chat_template = ( + self.predictor.infer_conf.model_description.default_chat_template + ) if self.is_mllm: if isinstance(input, list): if isinstance(input, list) and input and isinstance(input[0], ChatMessage): messages = [] for chat_message in input: - message = {"role": chat_message.role, "content": chat_message.content} + message = { + "role": chat_message.role, + "content": chat_message.content, + } messages.append(message) texts, images = self._extract_messages(messages) elif isinstance(input, list) and input and isinstance(input[0], dict): @@ -351,25 +359,32 @@ def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=No texts, images = [self._extract_messages(p) for p in input] image = self._prepare_image(images) - prompt = self.tokenize_inputs(texts) + prompt = self.predictor.tokenizer.apply_chat_template(texts, tokenize=False) return prompt, image else: if isinstance(input, list) and input and isinstance(input[0], dict): prompt = self.predictor.tokenizer.apply_chat_template(input, tokenize=False) elif isinstance(input, list) and input and isinstance(input[0], list): - prompt = [self.predictor.tokenizer.apply_chat_template(t, tokenize=False) for t in input] + prompt = [ + self.predictor.tokenizer.apply_chat_template(t, tokenize=False) + for t in input + ] elif isinstance(input, list) and input and isinstance(input[0], ChatMessage): messages = [] for chat_message in input: message = {"role": chat_message.role, "content": chat_message.content} messages.append(message) - prompt = self.predictor.tokenizer.apply_chat_template(messages, tokenize=False) + prompt = self.predictor.tokenizer.apply_chat_template( + messages, tokenize=False + ) elif isinstance(input, list) and input and isinstance(input[0], str): prompt = input elif isinstance(input, str): prompt = input else: - raise TypeError(f"Unsupported type {type(input)} for text. Expected dict or list of dicts.") + raise TypeError( + f"Unsupported type {type(input)} for text. Expected dict or list of dicts." 
+ ) logger.info(prompt) return prompt elif prompt_format == PromptFormat.PROMPTS_FORMAT: @@ -428,18 +443,19 @@ async def openai_call( else: yield await self.handle_non_streaming(prompts, config) - - def _extract_messages(messages): + def _extract_messages(self, messages): texts, images = [], [] for message in messages: - if message['role'] == 'user' and isinstance(message['content'], list): - texts.append({"role": "user", "content": message['content'][0]['text']}) - images.append({"role": "user", "content": message['content'][1]['image_url']['url']}) + if message["role"] == "user" and isinstance(message["content"], list): + texts.append({"role": "user", "content": message["content"][0]["text"]}) + images.append( + {"role": "user", "content": message["content"][1]["image_url"]["url"]} + ) else: texts.append(message) return texts, images - def _prepare_image(self, messages: Union[List[dict], List[List[dict]]]): + def _prepare_image(self, messages: list): """Prepare image from history messages.""" from PIL import Image import requests @@ -448,12 +464,12 @@ def _prepare_image(self, messages: Union[List[dict], List[List[dict]]]): import re # prepare images - images = [] - if isinstance(messages[0], list): - for i in len(messages): + images: List = [] + if isinstance(messages[0], List): + for i in range(len(messages)): for msg in messages[i]: msg = dict(msg) - role, content = msg["role"], msg["content"] + content = msg["content"] if "url" not in content: continue is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0 @@ -462,10 +478,10 @@ def _prepare_image(self, messages: Union[List[dict], List[List[dict]]]): images[i].append(Image.open(BytesIO(base64.b64decode(encoded_str)))) else: images[i].append(Image.open(requests.get(content["url"], stream=True).raw)) - else: + elif isinstance(messages[0], dict): for msg in messages: msg = dict(msg) - role, content = msg["role"], msg["content"] + content = msg["content"] if "url" not in content: continue is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0 @@ -475,4 +491,4 @@ def _prepare_image(self, messages: Union[List[dict], List[List[dict]]]): else: images.append(Image.open(requests.get(content["url"], stream=True).raw)) - return images \ No newline at end of file + return images From 6534808bf85df964a853be4cf0377f5db5f4bca7 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 22 Apr 2024 03:24:45 +0000 Subject: [PATCH 04/47] update Signed-off-by: minmingzhu --- .github/workflows/config/bloom-560m-ci.yaml | 6 ------ .github/workflows/config/gpt2-ci.yaml | 6 ------ .../config/llama-2-7b-chat-hf-vllm-fp32.yaml | 8 -------- .../workflows/config/mpt_deltatuner_deepspeed.yaml | 13 ------------- .github/workflows/config/opt-125m-ci.yaml | 6 ------ llm_on_ray/inference/models/opt-125m.yaml | 2 +- llm_on_ray/inference/predictor_deployment.py | 6 ------ 7 files changed, 1 insertion(+), 46 deletions(-) diff --git a/.github/workflows/config/bloom-560m-ci.yaml b/.github/workflows/config/bloom-560m-ci.yaml index 16a97d896..674644798 100644 --- a/.github/workflows/config/bloom-560m-ci.yaml +++ b/.github/workflows/config/bloom-560m-ci.yaml @@ -13,9 +13,3 @@ ipex: model_description: model_id_or_path: bigscience/bloom-560m tokenizer_name_or_path: bigscience/bloom-560m - chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] diff --git a/.github/workflows/config/gpt2-ci.yaml b/.github/workflows/config/gpt2-ci.yaml index 1e6df57cb..e9bed1366 100644 --- a/.github/workflows/config/gpt2-ci.yaml 
+++ b/.github/workflows/config/gpt2-ci.yaml @@ -12,10 +12,4 @@ ipex: model_description: model_id_or_path: gpt2 tokenizer_name_or_path: gpt2 - chat_processor: ChatModelGptJ gpt_base_model: true - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] diff --git a/.github/workflows/config/llama-2-7b-chat-hf-vllm-fp32.yaml b/.github/workflows/config/llama-2-7b-chat-hf-vllm-fp32.yaml index 46be6eb57..d3d96a0e1 100644 --- a/.github/workflows/config/llama-2-7b-chat-hf-vllm-fp32.yaml +++ b/.github/workflows/config/llama-2-7b-chat-hf-vllm-fp32.yaml @@ -16,13 +16,5 @@ ipex: model_description: model_id_or_path: meta-llama/Llama-2-7b-chat-hf tokenizer_name_or_path: meta-llama/Llama-2-7b-chat-hf - chat_processor: ChatModelLLama - prompt: - intro: '' - human_id: '[INST] {msg} [/INST] - - ' - bot_id: '' - stop_words: [] config: use_auth_token: '' diff --git a/.github/workflows/config/mpt_deltatuner_deepspeed.yaml b/.github/workflows/config/mpt_deltatuner_deepspeed.yaml index 40051e0fa..a4fdd0709 100644 --- a/.github/workflows/config/mpt_deltatuner_deepspeed.yaml +++ b/.github/workflows/config/mpt_deltatuner_deepspeed.yaml @@ -13,20 +13,7 @@ ipex: model_description: model_id_or_path: mosaicml/mpt-7b tokenizer_name_or_path: EleutherAI/gpt-neox-20b - chat_processor: ChatModelGptJ peft_model_id_or_path: nathan0/mpt-7b-deltatuner-model peft_type: deltatuner - prompt: - intro: 'Below is an instruction that describes a task, paired with an input that - provides further context. Write a response that appropriately completes the request. - - ' - human_id: ' - - ### Instruction' - bot_id: ' - - ### Response' - stop_words: [] config: trust_remote_code: true diff --git a/.github/workflows/config/opt-125m-ci.yaml b/.github/workflows/config/opt-125m-ci.yaml index 047d0008c..e5ab095a6 100644 --- a/.github/workflows/config/opt-125m-ci.yaml +++ b/.github/workflows/config/opt-125m-ci.yaml @@ -13,9 +13,3 @@ ipex: model_description: model_id_or_path: facebook/opt-125m tokenizer_name_or_path: facebook/opt-125m - chat_processor: ChatModelGptJ - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] diff --git a/llm_on_ray/inference/models/opt-125m.yaml b/llm_on_ray/inference/models/opt-125m.yaml index e5f431b42..65a6d6bf7 100644 --- a/llm_on_ray/inference/models/opt-125m.yaml +++ b/llm_on_ray/inference/models/opt-125m.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: CPU +device: cpu ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index 1d693f272..92d8435a0 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -308,11 +308,6 @@ def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=No Raises: HTTPException: If the input prompt format is invalid or not supported. """ - - logger.info("preprocess_prompts") - logger.info(type(input)) - logger.info(input) - if isinstance(input, str): return input elif isinstance(input, list): @@ -385,7 +380,6 @@ def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=No raise TypeError( f"Unsupported type {type(input)} for text. Expected dict or list of dicts." 
) - logger.info(prompt) return prompt elif prompt_format == PromptFormat.PROMPTS_FORMAT: raise HTTPException(400, "Invalid prompt format.") From 5a864dc73bed276a0f5125aab78c4670cd2b0d23 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 22 Apr 2024 03:27:15 +0000 Subject: [PATCH 05/47] update Signed-off-by: minmingzhu --- llm_on_ray/inference/models/mistral-7b-v0.1.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llm_on_ray/inference/models/mistral-7b-v0.1.yaml b/llm_on_ray/inference/models/mistral-7b-v0.1.yaml index 12de7c3bb..0ed664efd 100644 --- a/llm_on_ray/inference/models/mistral-7b-v0.1.yaml +++ b/llm_on_ray/inference/models/mistral-7b-v0.1.yaml @@ -6,13 +6,13 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: CPU +device: cpu ipex: enabled: false precision: bf16 model_description: model_id_or_path: mistralai/Mistral-7B-v0.1 - bigdl: false + ipexllm: false tokenizer_name_or_path: mistralai/Mistral-7B-v0.1 config: trust_remote_code: true From e06105ee534f8c49dfbf4c5520bd866a19cbc4f6 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 22 Apr 2024 05:33:28 +0000 Subject: [PATCH 06/47] update Signed-off-by: minmingzhu --- tests/test_getting_started.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_getting_started.sh b/tests/test_getting_started.sh index 6a900a553..052ac51bb 100755 --- a/tests/test_getting_started.sh +++ b/tests/test_getting_started.sh @@ -33,7 +33,7 @@ curl $ENDPOINT_URL/chat/completions \ -H "Content-Type: application/json" \ -d '{ "model": "gpt2", - "messages": [{"role": "assistant", "content": "You are a helpful assistant."}, {"role": "user", "content": "Hello!"}], + "messages": [{"role": "user", "content": "Hello!"}], "temperature": 0.7 }' From 9a11e52c4329904204b064c0e17f70815f8018e6 Mon Sep 17 00:00:00 2001 From: minmingzhu <45281494+minmingzhu@users.noreply.github.com> Date: Mon, 22 Apr 2024 14:21:41 +0800 Subject: [PATCH 07/47] Update query_http_requests.py --- examples/inference/api_server_openai/query_http_requests.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/inference/api_server_openai/query_http_requests.py b/examples/inference/api_server_openai/query_http_requests.py index 536deb30e..a2be3873f 100644 --- a/examples/inference/api_server_openai/query_http_requests.py +++ b/examples/inference/api_server_openai/query_http_requests.py @@ -58,7 +58,6 @@ body = { "model": args.model_name, "messages": [ - {"role": "assistant", "content": "You are a helpful assistant."}, {"role": "user", "content": args.input_text}, ], "stream": args.streaming_response, From 02ee02d724097537db292ccc38edd816324c2fc7 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Fri, 26 Apr 2024 03:52:24 +0000 Subject: [PATCH 08/47] update Signed-off-by: minmingzhu --- llm_on_ray/inference/chat_process.py | 222 ------------------ llm_on_ray/inference/chat_template_process.py | 77 ++++++ .../inference/models/CodeLlama-7b-hf.yaml | 2 +- llm_on_ray/inference/models/bloom-560m.yaml | 4 +- llm_on_ray/inference/models/deplot.yaml | 2 +- llm_on_ray/inference/models/falcon-7b.yaml | 2 +- llm_on_ray/inference/models/fuyu8b.yaml | 2 +- llm_on_ray/inference/models/gemma-2b.yaml | 2 +- llm_on_ray/inference/models/gpt-j-6b.yaml | 2 +- llm_on_ray/inference/models/gpt2.yaml | 2 +- .../inference/models/llama-2-7b-chat-hf.yaml | 2 +- .../models/mistral-7b-Instruct-v0.2.yaml | 4 +- llm_on_ray/inference/models/mpt-7b.yaml | 2 +- .../inference/models/neural-chat-7b-v3-1.yaml | 2 +- 
.../inference/models/sqlcoder-7b-2.yaml | 2 +- llm_on_ray/inference/models/starcoder.yaml | 2 +- llm_on_ray/inference/predictor_deployment.py | 48 ---- llm_on_ray/ui/start_ui.py | 2 +- 18 files changed, 94 insertions(+), 287 deletions(-) delete mode 100644 llm_on_ray/inference/chat_process.py create mode 100644 llm_on_ray/inference/chat_template_process.py diff --git a/llm_on_ray/inference/chat_process.py b/llm_on_ray/inference/chat_process.py deleted file mode 100644 index 3ee238fb7..000000000 --- a/llm_on_ray/inference/chat_process.py +++ /dev/null @@ -1,222 +0,0 @@ -# -# Copyright 2023 The LLM-on-Ray Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - - -class ChatModel: - human_id = "" - bot_id = "" - unknown_id = "" - MEANINGLESS_WORDS = ["", "", "<|endoftext|>", "
"] - stop_words = [""] - - def __init__(self, intro, human_id, bot_id, stop_words) -> None: - self.intro = intro - self.human_id = human_id - self.bot_id = bot_id - self.stop_words = stop_words - self.MEANINGLESS_WORDS.extend(self.stop_words) - - def prepare_prompt(self, messages: list): - """Prepare prompt from history messages.""" - prompt = "" - for msg in messages: - role, content = msg.role, msg.content - if role == "user": - prompt += f"{self.human_id}: {content}\n" - elif role == "assistant": - prompt += f"{self.bot_id}: {content}\n" - else: - prompt += f"{self.unknown_id}: {content}\n" - prompt += f"{self.bot_id}:" - return prompt - - def convert_output(self, output: str): - """Convert the model output to final answer.""" - human_id = self.human_id.strip() - bot_id = self.bot_id.strip() - if human_id != "": - output = output.split(human_id)[0] - if bot_id != "": - output = output.split(bot_id)[0] - for word in self.MEANINGLESS_WORDS: - output = output.replace(word, "") - text = output - # remove partial human_id or bot id - if "\n" in text and ( - human_id.startswith(text[text.rfind("\n") + 1 :]) - or bot_id.startswith(text[text.rfind("\n") + 1]) - ): - text = text[: text.rfind("\n")] - return text - - def get_prompt(self, messages): - """Generate response based on messages.""" - prompt = self.prepare_prompt(messages) - return prompt - - -class ChatModelGptJ(ChatModel): - def __init__(self, intro, human_id, bot_id, stop_words): - super().__init__(intro, human_id, bot_id, stop_words) - - def prepare_prompt(self, messages: list): - """Prepare prompt from history messages.""" - prompt = self.intro - for msg in messages: - msg = dict(msg) - role, content = msg["role"], msg["content"] - if role == "user": - if self.human_id != "": - prompt += f"{self.human_id}:\n{content}\n" - else: - prompt += f"{content}\n" - elif role == "assistant": - if self.bot_id != "": - prompt += f"{self.bot_id}:\n{content}\n" - else: - prompt += f"{content}\n" - else: - prompt += f"### Unknown:\n{content}\n" - if self.bot_id != "": - prompt += f"{self.bot_id}:\n" - return prompt - - -class ChatModelLLama(ChatModel): - def __init__(self, intro, human_id, bot_id, stop_words): - super().__init__(intro, human_id, bot_id, stop_words) - - def prepare_prompt(self, messages: list): - """Prepare prompt from history messages.""" - prompt = self.intro - for msg in messages: - msg = dict(msg) - role, content = msg["role"], msg["content"] - if role == "user": - if self.human_id != "": - prompt += self.human_id.format(msg=content) - else: - prompt += f"{content}\n" - elif role == "assistant": - prompt += f"{content}\n" - elif role == "tool": - prompt += f"{content}\n" - elif role == "system": - prompt += f"### system:\n{content}\n" - else: - prompt += f"### Unknown:\n{content}\n" - if self.bot_id != "": - prompt += f"{self.bot_id}:\n" - return prompt - - -class ChatModelwithImage(ChatModel): - def __init__(self, intro, human_id, bot_id, stop_words): - super().__init__(intro, human_id, bot_id, stop_words) - - def prepare_prompt(self, messages: list): - """Prepare prompt from history messages.""" - from PIL import Image - import requests - from io import BytesIO - import base64 - import re - - prompt = self.intro - for msg in messages: - msg = dict(msg) - role, content = msg["role"], msg["content"] - text_prompt = [] - image_prompt = [] - for item in content: - if item["type"] == "text": - text_prompt.append(item["text"]) - elif item["type"] == "image_url": - image_prompt.append(item["image_url"]) - else: - raise 
ValueError(f"Unknown content type {item['type']}") - - content = "\n".join(text_prompt) - # prepare images - images = [] - for img in image_prompt: - if "url" not in img: - continue - is_data = len(re.findall("^data:image/.+;base64,", img["url"])) > 0 - if is_data: - encoded_str = re.sub("^data:image/.+;base64,", "", img["url"]) - images.append(Image.open(BytesIO(base64.b64decode(encoded_str)))) - else: - images.append(Image.open(requests.get(img["url"], stream=True).raw)) - - if role == "user": - if self.human_id != "": - prompt += self.human_id.format(msg=content) - else: - prompt += f"{content}\n" - elif role == "assistant": - prompt += f"{content}\n" - else: - prompt += f"### Unknown:\n{content}\n" - if self.bot_id != "": - prompt += f"{self.bot_id}:\n" - return prompt, images - - -class ChatModelGemma(ChatModel): - def __init__(self, intro, human_id, bot_id, stop_words): - super().__init__(intro, human_id, bot_id, stop_words) - - def prepare_prompt(self, messages: list): - """Prepare prompt from history messages.""" - prompt = self.intro - for msg in messages: - msg = dict(msg) - role, content = msg["role"], msg["content"] - if role == "user": - if self.human_id != "": - prompt += f"{self.human_id} {content}\n" - else: - prompt += f"{content}\n" - elif role == "assistant": - if self.bot_id != "": - prompt += f"{self.bot_id} {content}\n" - else: - prompt += f"{content}\n" - else: - prompt += f"### Unknown:\n{content}\n" - if self.bot_id != "": - prompt += f"{self.bot_id}:\n" - return prompt - - -class ChatModelNoFormat(ChatModel): - def __init__(self, intro, human_id, bot_id, stop_words): - super().__init__(intro, human_id, bot_id, stop_words) - - def prepare_prompt(self, messages: list): - """Prepare prompt from history messages.""" - prompt = "" - for msg in messages: - msg = dict(msg) - prompt += msg["content"] - return prompt - - -if __name__ == "__main__": - process_tool = ChatModelGptJ( - "", "### Instruction", "### Response", stop_words=["##", "### Instruction"] - ) diff --git a/llm_on_ray/inference/chat_template_process.py b/llm_on_ray/inference/chat_template_process.py new file mode 100644 index 000000000..25ff1056d --- /dev/null +++ b/llm_on_ray/inference/chat_template_process.py @@ -0,0 +1,77 @@ +# +# Copyright 2023 The LLM-on-Ray Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + + +class ChatModel: + + def __init__(self, tokenizer) -> None: + self.tokenizer = tokenizer + + def get_prompt(self, messages): + """Generate response based on messages.""" + prompt = self.prepare_prompt(messages) + return prompt + + + def _extract_messages(self, messages): + texts, images = [], [] + for message in messages: + if message["role"] == "user" and isinstance(message["content"], list): + texts.append({"role": "user", "content": message["content"][0]["text"]}) + images.append( + {"role": "user", "content": message["content"][1]["image_url"]["url"]} + ) + else: + texts.append(message) + return texts, images + + def _prepare_image(self, messages: list): + """Prepare image from history messages.""" + from PIL import Image + import requests + from io import BytesIO + import base64 + import re + + # prepare images + images: List = [] + if isinstance(messages[0], List): + for i in range(len(messages)): + for msg in messages[i]: + msg = dict(msg) + content = msg["content"] + if "url" not in content: + continue + is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0 + if is_data: + encoded_str = re.sub("^data:image/.+;base64,", "", content["url"]) + images[i].append(Image.open(BytesIO(base64.b64decode(encoded_str)))) + else: + images[i].append(Image.open(requests.get(content["url"], stream=True).raw)) + elif isinstance(messages[0], dict): + for msg in messages: + msg = dict(msg) + content = msg["content"] + if "url" not in content: + continue + is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0 + if is_data: + encoded_str = re.sub("^data:image/.+;base64,", "", content["url"]) + images.append(Image.open(BytesIO(base64.b64decode(encoded_str)))) + else: + images.append(Image.open(requests.get(content["url"], stream=True).raw)) + + return images \ No newline at end of file diff --git a/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml b/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml index 3c93da6b6..eff253e46 100644 --- a/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml +++ b/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: "cpu" +device: cpu ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/models/bloom-560m.yaml b/llm_on_ray/inference/models/bloom-560m.yaml index 92dbb8809..be5e9414e 100644 --- a/llm_on_ray/inference/models/bloom-560m.yaml +++ b/llm_on_ray/inference/models/bloom-560m.yaml @@ -6,9 +6,9 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: CPU +device: cpu ipex: - enabled: false + enabled: true precision: bf16 model_description: model_id_or_path: bigscience/bloom-560m diff --git a/llm_on_ray/inference/models/deplot.yaml b/llm_on_ray/inference/models/deplot.yaml index 6b518def9..8f8edd47c 100644 --- a/llm_on_ray/inference/models/deplot.yaml +++ b/llm_on_ray/inference/models/deplot.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: "cpu" +device: cpu ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/models/falcon-7b.yaml b/llm_on_ray/inference/models/falcon-7b.yaml index 5a59e6e47..5801f12be 100644 --- a/llm_on_ray/inference/models/falcon-7b.yaml +++ b/llm_on_ray/inference/models/falcon-7b.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: "cpu" +device: cpu ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/models/fuyu8b.yaml 
b/llm_on_ray/inference/models/fuyu8b.yaml index c45affbae..e62303d83 100644 --- a/llm_on_ray/inference/models/fuyu8b.yaml +++ b/llm_on_ray/inference/models/fuyu8b.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: "cpu" +device: cpu ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/models/gemma-2b.yaml b/llm_on_ray/inference/models/gemma-2b.yaml index c08bd571b..a27b6bc0f 100644 --- a/llm_on_ray/inference/models/gemma-2b.yaml +++ b/llm_on_ray/inference/models/gemma-2b.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 2 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: CPU +device: cpu ipex: enabled: true precision: bf16 diff --git a/llm_on_ray/inference/models/gpt-j-6b.yaml b/llm_on_ray/inference/models/gpt-j-6b.yaml index b2a7a04df..6560f1623 100644 --- a/llm_on_ray/inference/models/gpt-j-6b.yaml +++ b/llm_on_ray/inference/models/gpt-j-6b.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: "cpu" +device: cpu ipex: # false here for ci coverage enabled: false diff --git a/llm_on_ray/inference/models/gpt2.yaml b/llm_on_ray/inference/models/gpt2.yaml index 2d44b9882..96737288f 100644 --- a/llm_on_ray/inference/models/gpt2.yaml +++ b/llm_on_ray/inference/models/gpt2.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: CPU +device: cpu ipex: enabled: true precision: bf16 diff --git a/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml b/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml index 4f2ed0194..7fdae3933 100644 --- a/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml +++ b/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: "cpu" +device: cpu ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml b/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml index ab901eb95..ea50f6af7 100644 --- a/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml +++ b/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml @@ -5,13 +5,13 @@ cpus_per_worker: 48 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: CPU +device: cpu ipex: enabled: false precision: bf16 model_description: model_id_or_path: mistralai/Mistral-7B-Instruct-v0.2 - bigdl: false + ipexllm: false tokenizer_name_or_path: mistralai/Mistral-7B-Instruct-v0.2 config: trust_remote_code: true diff --git a/llm_on_ray/inference/models/mpt-7b.yaml b/llm_on_ray/inference/models/mpt-7b.yaml index 80f062a82..89ce086ed 100644 --- a/llm_on_ray/inference/models/mpt-7b.yaml +++ b/llm_on_ray/inference/models/mpt-7b.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: CPU +device: cpu ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml b/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml index 670654715..bd49ce189 100644 --- a/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml +++ b/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: CPU +device: cpu ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/models/sqlcoder-7b-2.yaml b/llm_on_ray/inference/models/sqlcoder-7b-2.yaml index be85f8463..6d12b35df 100644 --- a/llm_on_ray/inference/models/sqlcoder-7b-2.yaml 
+++ b/llm_on_ray/inference/models/sqlcoder-7b-2.yaml @@ -5,7 +5,7 @@ cpus_per_worker: 22 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: "cpu" +device: cpu ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/models/starcoder.yaml b/llm_on_ray/inference/models/starcoder.yaml index 2044cf109..adbc91fc0 100644 --- a/llm_on_ray/inference/models/starcoder.yaml +++ b/llm_on_ray/inference/models/starcoder.yaml @@ -9,7 +9,7 @@ workers_per_group: 2 ipex: enabled: false precision: bf16 -device: "cpu" +device: cpu model_description: model_id_or_path: bigcode/starcoder tokenizer_name_or_path: bigcode/starcoder diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index 92d8435a0..f32946b26 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -437,52 +437,4 @@ async def openai_call( else: yield await self.handle_non_streaming(prompts, config) - def _extract_messages(self, messages): - texts, images = [], [] - for message in messages: - if message["role"] == "user" and isinstance(message["content"], list): - texts.append({"role": "user", "content": message["content"][0]["text"]}) - images.append( - {"role": "user", "content": message["content"][1]["image_url"]["url"]} - ) - else: - texts.append(message) - return texts, images - - def _prepare_image(self, messages: list): - """Prepare image from history messages.""" - from PIL import Image - import requests - from io import BytesIO - import base64 - import re - - # prepare images - images: List = [] - if isinstance(messages[0], List): - for i in range(len(messages)): - for msg in messages[i]: - msg = dict(msg) - content = msg["content"] - if "url" not in content: - continue - is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0 - if is_data: - encoded_str = re.sub("^data:image/.+;base64,", "", content["url"]) - images[i].append(Image.open(BytesIO(base64.b64decode(encoded_str)))) - else: - images[i].append(Image.open(requests.get(content["url"], stream=True).raw)) - elif isinstance(messages[0], dict): - for msg in messages: - msg = dict(msg) - content = msg["content"] - if "url" not in content: - continue - is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0 - if is_data: - encoded_str = re.sub("^data:image/.+;base64,", "", content["url"]) - images.append(Image.open(BytesIO(base64.b64decode(encoded_str)))) - else: - images.append(Image.open(requests.get(content["url"], stream=True).raw)) - return images diff --git a/llm_on_ray/ui/start_ui.py b/llm_on_ray/ui/start_ui.py index c30851a8e..9f76a2696 100644 --- a/llm_on_ray/ui/start_ui.py +++ b/llm_on_ray/ui/start_ui.py @@ -31,7 +31,7 @@ from ray.util import queue from llm_on_ray.inference.inference_config import all_models, ModelDescription, Prompt from llm_on_ray.inference.inference_config import InferenceConfig as FinetunedConfig -from llm_on_ray.inference.chat_process import ( +from llm_on_ray.inference.chat_template_process import ( ChatModelGptJ, ChatModelLLama, ChatModelwithImage, From 5d11e45ee48793fcd772605b0c0d8cbf4799e174 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Fri, 26 Apr 2024 07:43:10 +0000 Subject: [PATCH 09/47] update Signed-off-by: minmingzhu --- llm_on_ray/inference/chat_template_process.py | 63 ++++++++++++-- llm_on_ray/inference/predictor_deployment.py | 82 ++++++------------- 2 files changed, 82 insertions(+), 63 deletions(-) diff --git a/llm_on_ray/inference/chat_template_process.py 
b/llm_on_ray/inference/chat_template_process.py index 25ff1056d..c39a10b84 100644 --- a/llm_on_ray/inference/chat_template_process.py +++ b/llm_on_ray/inference/chat_template_process.py @@ -13,18 +13,71 @@ # See the License for the specific language governing permissions and # limitations under the License. # +from typing import List, Union +from llm_on_ray.inference.api_openai_backend.openai_protocol import ChatMessage -class ChatModel: + +class ChatTemplatePreprocess: def __init__(self, tokenizer) -> None: self.tokenizer = tokenizer - def get_prompt(self, messages): - """Generate response based on messages.""" - prompt = self.prepare_prompt(messages) - return prompt + def get_prompt(self, input: List, is_mllm=False): + """Generate response based on input.""" + if self.predictor.infer_conf.model_description.chat_template is not None: + self.predictor.tokenizer.chat_template = ( + self.predictor.infer_conf.model_description.chat_template + ) + elif self.predictor.tokenizer.chat_template is None: + self.predictor.tokenizer.chat_template = ( + self.predictor.infer_conf.model_description.default_chat_template + ) + if is_mllm: + if isinstance(input, List): + if isinstance(input, list) and input and isinstance(input[0], ChatMessage): + messages = [] + for chat_message in input: + message = { + "role": chat_message.role, + "content": chat_message.content, + } + messages.append(message) + texts, images = self._extract_messages(messages) + elif isinstance(input, list) and input and isinstance(input[0], dict): + texts, images = self._extract_messages(input) + elif isinstance(input, list) and input and isinstance(input[0], list): + texts, images = [self._extract_messages(p) for p in input] + + image = self._prepare_image(images) + prompt = self.predictor.tokenizer.apply_chat_template(texts, tokenize=False) + return prompt, image + else: + if isinstance(input, list) and input and isinstance(input[0], dict): + prompt = self.predictor.tokenizer.apply_chat_template(input, tokenize=False) + elif isinstance(input, list) and input and isinstance(input[0], list): + prompt = [ + self.predictor.tokenizer.apply_chat_template(t, tokenize=False) + for t in input + ] + elif isinstance(input, list) and input and isinstance(input[0], ChatMessage): + messages = [] + for chat_message in input: + message = {"role": chat_message.role, "content": chat_message.content} + messages.append(message) + prompt = self.predictor.tokenizer.apply_chat_template( + messages, tokenize=False + ) + elif isinstance(input, list) and input and isinstance(input[0], str): + prompt = input + elif isinstance(input, str): + prompt = input + else: + raise TypeError( + f"Unsupported type {type(input)} for text. Expected dict or list of dicts." 
+ ) + return prompt def _extract_messages(self, messages): texts, images = [], [] diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index f32946b26..d5e211c37 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -26,6 +26,8 @@ from starlette.requests import Request from starlette.responses import StreamingResponse, JSONResponse from fastapi import HTTPException + +from llm_on_ray.inference.chat_template_process import ChatTemplatePreprocess from llm_on_ray.inference.inference_config import InferenceConfig from llm_on_ray.inference.api_openai_backend.openai_protocol import ( ChatMessage, @@ -82,6 +84,8 @@ def __init__( self.predictor = TransformerPredictor(infer_conf) self.loop = asyncio.get_running_loop() + self.process_tool = ChatTemplatePreprocess(self.predictor.tokenizer) + def consume_streamer(self, streamer): for text in streamer: @@ -285,13 +289,12 @@ async def handle_static_batch(self, prompts: List[str], **config: Dict[str, Any] preprocessing_time=0, ) - # TODO:Abstract the preprocess_prompts function into a class for handling chat templates - def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=None): + def preprocess_prompts(self, input: Union[str, List], tools=None, tool_choice=None): """ Preprocesses the input prompts. Args: - input (Union[str, List[dict]]): The input prompt(s) to be preprocessed. + input (Union[str, List[str]]): The input prompt(s) to be preprocessed. tools (List[str]): The list of tools to be used. tool_choice: The choice of tool to be used. @@ -310,7 +313,10 @@ def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=No """ if isinstance(input, str): return input - elif isinstance(input, list): + elif isinstance(input, List): + prompts = [] + images = [] + prompt_format = get_prompt_format(input) if prompt_format == PromptFormat.CHAT_FORMAT: # Process the input prompts with tools @@ -327,63 +333,23 @@ def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=No m.content = self.openai_tools_prompter.content_from_assistant(m) # type: ignore elif m.tool_call_id is not None: # type: ignore m.content = self.openai_tools_prompter.content_from_tool(m) # type: ignore - - if self.predictor.infer_conf.model_description.chat_template is not None: - self.predictor.tokenizer.chat_template = ( - self.predictor.infer_conf.model_description.chat_template - ) - elif self.predictor.tokenizer.chat_template is None: - self.predictor.tokenizer.chat_template = ( - self.predictor.infer_conf.model_description.default_chat_template - ) - - if self.is_mllm: - if isinstance(input, list): - if isinstance(input, list) and input and isinstance(input[0], ChatMessage): - messages = [] - for chat_message in input: - message = { - "role": chat_message.role, - "content": chat_message.content, - } - messages.append(message) - texts, images = self._extract_messages(messages) - elif isinstance(input, list) and input and isinstance(input[0], dict): - texts, images = self._extract_messages(input) - elif isinstance(input, list) and input and isinstance(input[0], list): - texts, images = [self._extract_messages(p) for p in input] - - image = self._prepare_image(images) - prompt = self.predictor.tokenizer.apply_chat_template(texts, tokenize=False) - return prompt, image - else: - if isinstance(input, list) and input and isinstance(input[0], dict): - prompt = self.predictor.tokenizer.apply_chat_template(input, 
tokenize=False) - elif isinstance(input, list) and input and isinstance(input[0], list): - prompt = [ - self.predictor.tokenizer.apply_chat_template(t, tokenize=False) - for t in input - ] - elif isinstance(input, list) and input and isinstance(input[0], ChatMessage): - messages = [] - for chat_message in input: - message = {"role": chat_message.role, "content": chat_message.content} - messages.append(message) - prompt = self.predictor.tokenizer.apply_chat_template( - messages, tokenize=False - ) - elif isinstance(input, list) and input and isinstance(input[0], str): - prompt = input - elif isinstance(input, str): - prompt = input + # Process the input prompts with MLLM tool + if self.process_tool is not None: + if self.is_mllm: + input, image = self.process_tool.get_prompt(input, self.is_mllm) + prompts.append(input) + images.extend(image) + return prompts, images else: - raise TypeError( - f"Unsupported type {type(input)} for text. Expected dict or list of dicts." - ) - return prompt + prompt = self.process_tool.get_prompt(input) + return prompt + else: + prompts.extend(input) elif prompt_format == PromptFormat.PROMPTS_FORMAT: + prompts.extend(input) + else: raise HTTPException(400, "Invalid prompt format.") - return input + return prompts else: raise HTTPException(400, "Invalid prompt format.") From 62ab1bfbf810221ecc76e4b602bb10da652a008c Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Sat, 27 Apr 2024 22:09:17 +0800 Subject: [PATCH 10/47] update --- llm_on_ray/inference/chat_template_process.py | 14 +++++--------- llm_on_ray/inference/predictor_deployment.py | 10 ++++++---- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/llm_on_ray/inference/chat_template_process.py b/llm_on_ray/inference/chat_template_process.py index c39a10b84..388c850f5 100644 --- a/llm_on_ray/inference/chat_template_process.py +++ b/llm_on_ray/inference/chat_template_process.py @@ -19,9 +19,8 @@ class ChatTemplatePreprocess: - - def __init__(self, tokenizer) -> None: - self.tokenizer = tokenizer + def __init__(self, predictor) -> None: + self.predictor = predictor def get_prompt(self, input: List, is_mllm=False): """Generate response based on input.""" @@ -58,17 +57,14 @@ def get_prompt(self, input: List, is_mllm=False): prompt = self.predictor.tokenizer.apply_chat_template(input, tokenize=False) elif isinstance(input, list) and input and isinstance(input[0], list): prompt = [ - self.predictor.tokenizer.apply_chat_template(t, tokenize=False) - for t in input + self.predictor.tokenizer.apply_chat_template(t, tokenize=False) for t in input ] elif isinstance(input, list) and input and isinstance(input[0], ChatMessage): messages = [] for chat_message in input: message = {"role": chat_message.role, "content": chat_message.content} messages.append(message) - prompt = self.predictor.tokenizer.apply_chat_template( - messages, tokenize=False - ) + prompt = self.predictor.tokenizer.apply_chat_template(messages, tokenize=False) elif isinstance(input, list) and input and isinstance(input[0], str): prompt = input elif isinstance(input, str): @@ -127,4 +123,4 @@ def _prepare_image(self, messages: list): else: images.append(Image.open(requests.get(content["url"], stream=True).raw)) - return images \ No newline at end of file + return images diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index d5e211c37..6f371681a 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -84,8 +84,7 @@ def __init__( 
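For illustration, here is a minimal stand-alone driver for the reworked ChatTemplatePreprocess, which after this change holds the whole predictor so that get_prompt() can reach both predictor.tokenizer and predictor.infer_conf. This is a sketch under stated assumptions, not code from the patch series: the SimpleNamespace stubs, the gpt2 tokenizer, and the toy fallback template are invented for the example, and a transformers version with chat-template support is assumed.

from types import SimpleNamespace
from transformers import AutoTokenizer
from llm_on_ray.inference.chat_template_process import ChatTemplatePreprocess

# Stub objects that only carry the attributes get_prompt() reads; a real deployment
# passes a TransformerPredictor built from an InferenceConfig instead.
tokenizer = AutoTokenizer.from_pretrained("gpt2")  # assumed example model
model_description = SimpleNamespace(
    chat_template=None,  # no per-model override, so the fallback chain is exercised
    default_chat_template=(
        "{% for message in messages %}"
        "{{ '### ' + message['role'] + ': ' + message['content'] }}"
        "{% endfor %}"
    ),
)
predictor = SimpleNamespace(
    tokenizer=tokenizer,
    infer_conf=SimpleNamespace(model_description=model_description),
)

process_tool = ChatTemplatePreprocess(predictor)
prompt = process_tool.get_prompt(
    [{"role": "user", "content": "What's the weather like in Boston today?"}]
)
print(prompt)  # "### user: What's the weather like in Boston today?"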
self.predictor = TransformerPredictor(infer_conf) self.loop = asyncio.get_running_loop() - self.process_tool = ChatTemplatePreprocess(self.predictor.tokenizer) - + self.process_tool = ChatTemplatePreprocess(self.predictor) def consume_streamer(self, streamer): for text in streamer: @@ -311,6 +310,10 @@ def preprocess_prompts(self, input: Union[str, List], tools=None, tool_choice=No Raises: HTTPException: If the input prompt format is invalid or not supported. """ + logger.info("preprocess_prompts") + logger.info(input) + logger.info(type(input)) + if isinstance(input, str): return input elif isinstance(input, List): @@ -366,6 +369,7 @@ async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSON ) streaming_response = json_request["stream"] if "stream" in json_request else False input = json_request["text"] if "text" in json_request else "" + if input == "": return JSONResponse( status_code=400, @@ -402,5 +406,3 @@ async def openai_call( yield result else: yield await self.handle_non_streaming(prompts, config) - - From cc356f6fa68c14e52302c8511290f40458233616 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Sun, 28 Apr 2024 11:54:12 +0800 Subject: [PATCH 11/47] update --- .../inference/api_server_openai/query_http_requests_tool.py | 3 +-- llm_on_ray/inference/models/gpt-j-6b.yaml | 2 +- llm_on_ray/inference/predictor_deployment.py | 5 ----- llm_on_ray/ui/start_ui.py | 6 +----- 4 files changed, 3 insertions(+), 13 deletions(-) diff --git a/examples/inference/api_server_openai/query_http_requests_tool.py b/examples/inference/api_server_openai/query_http_requests_tool.py index 217f2b792..c9efd222d 100644 --- a/examples/inference/api_server_openai/query_http_requests_tool.py +++ b/examples/inference/api_server_openai/query_http_requests_tool.py @@ -73,7 +73,6 @@ messages = [ [ - {"role": "user", "content": "You are a helpful assistant"}, {"role": "user", "content": "What's the weather like in Boston today?"}, ], ] @@ -81,7 +80,7 @@ proxies = {"http": None, "https": None} for message in messages: - print(f"User: {message[1]['content']}") + print(f"User: {message[0]['content']}") print("Assistant:", end=" ", flush=True) body = { diff --git a/llm_on_ray/inference/models/gpt-j-6b.yaml b/llm_on_ray/inference/models/gpt-j-6b.yaml index 6560f1623..9719b2f7e 100644 --- a/llm_on_ray/inference/models/gpt-j-6b.yaml +++ b/llm_on_ray/inference/models/gpt-j-6b.yaml @@ -14,4 +14,4 @@ ipex: model_description: model_id_or_path: EleutherAI/gpt-j-6b tokenizer_name_or_path: EleutherAI/gpt-j-6b - gpt_base_model: true \ No newline at end of file + gpt_base_model: true diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index 6f371681a..2e642bfff 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -310,9 +310,6 @@ def preprocess_prompts(self, input: Union[str, List], tools=None, tool_choice=No Raises: HTTPException: If the input prompt format is invalid or not supported. 
""" - logger.info("preprocess_prompts") - logger.info(input) - logger.info(type(input)) if isinstance(input, str): return input @@ -357,7 +354,6 @@ def preprocess_prompts(self, input: Union[str, List], tools=None, tool_choice=No raise HTTPException(400, "Invalid prompt format.") async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSONResponse, str]: - logger.info("PredictorDeployment call") self.use_openai = False try: @@ -376,7 +372,6 @@ async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSON content="Empty prompt is not supported.", ) config = json_request["config"] if "config" in json_request else {} - logger.info(input) # return prompt or list of prompts preprocessed prompts = self.preprocess_prompts(input) diff --git a/llm_on_ray/ui/start_ui.py b/llm_on_ray/ui/start_ui.py index 9f76a2696..5cdece259 100644 --- a/llm_on_ray/ui/start_ui.py +++ b/llm_on_ray/ui/start_ui.py @@ -31,11 +31,7 @@ from ray.util import queue from llm_on_ray.inference.inference_config import all_models, ModelDescription, Prompt from llm_on_ray.inference.inference_config import InferenceConfig as FinetunedConfig -from llm_on_ray.inference.chat_template_process import ( - ChatModelGptJ, - ChatModelLLama, - ChatModelwithImage, -) + from llm_on_ray.inference.predictor_deployment import PredictorDeployment from llm_on_ray.ui.html_format import cpu_memory_html, ray_status_html, custom_css from langchain.vectorstores import FAISS From 11718e80f7f8b355bcfe3f6401b82b72571244ee Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 29 Apr 2024 17:22:59 +0800 Subject: [PATCH 12/47] update --- llm_on_ray/inference/chat_template_process.py | 105 ++++++------------ llm_on_ray/inference/inference_config.py | 16 +-- llm_on_ray/inference/models/gemma-2b.yaml | 1 + .../inference/models/neural-chat-7b-v3-1.yaml | 3 +- llm_on_ray/inference/utils.py | 4 +- 5 files changed, 47 insertions(+), 82 deletions(-) diff --git a/llm_on_ray/inference/chat_template_process.py b/llm_on_ray/inference/chat_template_process.py index 388c850f5..2f7a64d27 100644 --- a/llm_on_ray/inference/chat_template_process.py +++ b/llm_on_ray/inference/chat_template_process.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -from typing import List, Union +from typing import List from llm_on_ray.inference.api_openai_backend.openai_protocol import ChatMessage @@ -24,56 +24,31 @@ def __init__(self, predictor) -> None: def get_prompt(self, input: List, is_mllm=False): """Generate response based on input.""" - if self.predictor.infer_conf.model_description.chat_template is not None: - self.predictor.tokenizer.chat_template = ( - self.predictor.infer_conf.model_description.chat_template + self.predictor.tokenizer.chat_template = ( + self.predictor.infer_conf.model_description.chat_template + or self.predictor.tokenizer.chat_template + or self.predictor.infer_conf.model_description.default_chat_template + ) + + if isinstance(input, list) and input and isinstance(input[0], (ChatMessage, dict)): + messages = ( + [dict(chat_message) for chat_message in input] + if isinstance(input[0], ChatMessage) + else input ) - elif self.predictor.tokenizer.chat_template is None: - self.predictor.tokenizer.chat_template = ( - self.predictor.infer_conf.model_description.default_chat_template + prompt = self.predictor.tokenizer.apply_chat_template( + messages, add_generation_prompt=True, tokenize=False ) - - if is_mllm: - if isinstance(input, List): - if isinstance(input, list) and input and isinstance(input[0], ChatMessage): - messages = [] - for chat_message in input: - message = { - "role": chat_message.role, - "content": chat_message.content, - } - messages.append(message) - texts, images = self._extract_messages(messages) - elif isinstance(input, list) and input and isinstance(input[0], dict): - texts, images = self._extract_messages(input) - elif isinstance(input, list) and input and isinstance(input[0], list): - texts, images = [self._extract_messages(p) for p in input] - + if is_mllm: + texts, images = self._extract_messages(messages) image = self._prepare_image(images) - prompt = self.predictor.tokenizer.apply_chat_template(texts, tokenize=False) - return prompt, image - else: - if isinstance(input, list) and input and isinstance(input[0], dict): - prompt = self.predictor.tokenizer.apply_chat_template(input, tokenize=False) - elif isinstance(input, list) and input and isinstance(input[0], list): - prompt = [ - self.predictor.tokenizer.apply_chat_template(t, tokenize=False) for t in input - ] - elif isinstance(input, list) and input and isinstance(input[0], ChatMessage): - messages = [] - for chat_message in input: - message = {"role": chat_message.role, "content": chat_message.content} - messages.append(message) - prompt = self.predictor.tokenizer.apply_chat_template(messages, tokenize=False) - elif isinstance(input, list) and input and isinstance(input[0], str): - prompt = input - elif isinstance(input, str): - prompt = input - else: - raise TypeError( - f"Unsupported type {type(input)} for text. Expected dict or list of dicts." + prompt = self.predictor.tokenizer.apply_chat_template( + texts, add_generation_prompt=True, tokenize=False ) - return prompt + return prompt, image + return prompt + + raise TypeError(f"Unsupported type {type(input)} for text. 
Expected dict or list of dicts.") def _extract_messages(self, messages): texts, images = [], [] @@ -97,30 +72,16 @@ def _prepare_image(self, messages: list): # prepare images images: List = [] - if isinstance(messages[0], List): - for i in range(len(messages)): - for msg in messages[i]: - msg = dict(msg) - content = msg["content"] - if "url" not in content: - continue - is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0 - if is_data: - encoded_str = re.sub("^data:image/.+;base64,", "", content["url"]) - images[i].append(Image.open(BytesIO(base64.b64decode(encoded_str)))) - else: - images[i].append(Image.open(requests.get(content["url"], stream=True).raw)) - elif isinstance(messages[0], dict): - for msg in messages: - msg = dict(msg) - content = msg["content"] - if "url" not in content: - continue - is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0 - if is_data: - encoded_str = re.sub("^data:image/.+;base64,", "", content["url"]) - images.append(Image.open(BytesIO(base64.b64decode(encoded_str)))) - else: - images.append(Image.open(requests.get(content["url"], stream=True).raw)) + for msg in messages: + msg = dict(msg) + content = msg["content"] + if "url" not in content: + continue + is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0 + if is_data: + encoded_str = re.sub("^data:image/.+;base64,", "", content["url"]) + images.append(Image.open(BytesIO(base64.b64decode(encoded_str)))) + else: + images.append(Image.open(requests.get(content["url"], stream=True).raw)) return images diff --git a/llm_on_ray/inference/inference_config.py b/llm_on_ray/inference/inference_config.py index 6842fe63e..e1ea2e2c3 100644 --- a/llm_on_ray/inference/inference_config.py +++ b/llm_on_ray/inference/inference_config.py @@ -115,20 +115,22 @@ class ModelDescription(BaseModel): chat_model_with_image: bool = False chat_template: Union[str, None] = None default_chat_template: str = ( - "{{ bos_token }}" + "Below is an instruction that describes a task. Write a response that appropriately completes the request." 
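To make the revised default template concrete, a hand-worked rendering for a single user turn is sketched below. It assumes the template continues as in the context lines that follow and that add_generation_prompt is passed; the expected output is worked out by hand, not captured from a running deployment.

messages = [{"role": "user", "content": "What's the weather like in Boston today?"}]
# tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
# is expected to produce (wrapped here for readability; the template inserts no
# separator of its own between the preamble and the turns):
# "Below is an instruction that describes a task. Write a response that appropriately
#  completes the request.### Instruction: What's the weather like in Boston today?### Response:\n"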
"{% if messages[0]['role'] == 'system' %}" - "{{ raise_exception('System role not supported') }}" - "{% endif %}" - "{% for message in messages %}" + "{% set loop_messages = messages[1:] %}" + "{% set system_message = messages[0]['content'] %}" + "{% else %}{% set loop_messages = messages %}" + "{% set system_message = false %}{% endif %}" + "{% for message in loop_messages %}" "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}" "{% endif %}" "{% if message['role'] == 'user' %}" - "{{ '### Instruction: ' + message['content'] + eos_token }}" + "{{ '### Instruction: ' + message['content'].strip() }}" "{% elif message['role'] == 'assistant' %}" - "{{ '### Response:' + message['content'] + eos_token }}" + "{{ '### Response:' + message['content'].strip() }}" "{% endif %}{% endfor %}" - "{{'### End \n'}}" + "{% if add_generation_prompt %}{{'### Response:\n'}}{% endif %}" ) @validator("quantization_type") diff --git a/llm_on_ray/inference/models/gemma-2b.yaml b/llm_on_ray/inference/models/gemma-2b.yaml index a27b6bc0f..09e971081 100644 --- a/llm_on_ray/inference/models/gemma-2b.yaml +++ b/llm_on_ray/inference/models/gemma-2b.yaml @@ -15,3 +15,4 @@ model_description: tokenizer_name_or_path: google/gemma-2b config: use_auth_token: ' ' + chat_template: "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}" diff --git a/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml b/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml index bd49ce189..08945e865 100644 --- a/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml +++ b/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml @@ -13,4 +13,5 @@ ipex: model_description: model_id_or_path: Intel/neural-chat-7b-v3-1 tokenizer_name_or_path: Intel/neural-chat-7b-v3-1 - chat_template: "{{ bos_token }}'### System:You are a chatbot developed by Intel. Please answer all questions to the best of your ability.\n'{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '### User: ' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ '### Assistant:' + message['content'] + eos_token }}{% endif %}{% endfor %}" + chat_template: "'### System:You are a chatbot developed by Intel. 
Please answer all questions to the best of your ability.\n'{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:]}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '### User: ' + message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ '### Assistant:' + message['content'].strip() }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{'### Assistant:\n'}}{% endif %}" + diff --git a/llm_on_ray/inference/utils.py b/llm_on_ray/inference/utils.py index 5a8db0401..56b9146e5 100644 --- a/llm_on_ray/inference/utils.py +++ b/llm_on_ray/inference/utils.py @@ -162,13 +162,13 @@ def is_cpu_without_ipex(infer_conf: InferenceConfig) -> bool: return (not infer_conf.ipex.enabled) and infer_conf.device == DEVICE_CPU -def get_prompt_format(input: Union[List[str], List[dict], List[List[dict]], List[ChatMessage]]): +def get_prompt_format(input: Union[List[str], List[dict], List[ChatMessage]]): chat_format = True prompts_format = True for item in input: if isinstance(item, str): chat_format = False - elif isinstance(item, dict) or isinstance(item, ChatMessage) or isinstance(item, list): + elif isinstance(item, dict) or isinstance(item, ChatMessage): prompts_format = False else: chat_format = False From d254f266ccd06c9cceec4b04c95ad93eafe1c9b8 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 29 Apr 2024 17:43:58 +0800 Subject: [PATCH 13/47] update yaml file --- llm_on_ray/inference/models/gpt2.yaml | 1 + llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml | 1 + .../inference/models/ipex-llm/mistral-7b-v0.1-ipex-llm.yaml | 1 + llm_on_ray/inference/models/mistral-7b-v0.1.yaml | 1 + llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml | 1 - llm_on_ray/inference/models/opt-125m.yaml | 1 + llm_on_ray/inference/models/sqlcoder-7b-2.yaml | 1 + llm_on_ray/inference/models/starcoder.yaml | 1 + 8 files changed, 7 insertions(+), 1 deletion(-) diff --git a/llm_on_ray/inference/models/gpt2.yaml b/llm_on_ray/inference/models/gpt2.yaml index 96737288f..354f1c348 100644 --- a/llm_on_ray/inference/models/gpt2.yaml +++ b/llm_on_ray/inference/models/gpt2.yaml @@ -14,3 +14,4 @@ model_description: model_id_or_path: gpt2 tokenizer_name_or_path: gpt2 gpt_base_model: true + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" \ No newline at end of file diff --git a/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml index 00ff121c5..35fadb820 100644 --- a/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml +++ b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml @@ -14,3 +14,4 @@ ipex: model_description: model_id_or_path: Intel/neural-chat-7b-v3-3 tokenizer_name_or_path: 
Intel/neural-chat-7b-v3-3 + chat_template: "'### System:You are a chatbot developed by Intel. Please answer all questions to the best of your ability.\n'{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:]}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '### User: ' + message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ '### Assistant:' + message['content'].strip() }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{'### Assistant:\n'}}{% endif %}" diff --git a/llm_on_ray/inference/models/ipex-llm/mistral-7b-v0.1-ipex-llm.yaml b/llm_on_ray/inference/models/ipex-llm/mistral-7b-v0.1-ipex-llm.yaml index 5ec652bba..2ad30d0b8 100644 --- a/llm_on_ray/inference/models/ipex-llm/mistral-7b-v0.1-ipex-llm.yaml +++ b/llm_on_ray/inference/models/ipex-llm/mistral-7b-v0.1-ipex-llm.yaml @@ -17,3 +17,4 @@ model_description: config: trust_remote_code: true load_in_4bit: true + chat_template: "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"" diff --git a/llm_on_ray/inference/models/mistral-7b-v0.1.yaml b/llm_on_ray/inference/models/mistral-7b-v0.1.yaml index 0ed664efd..db2eec1e4 100644 --- a/llm_on_ray/inference/models/mistral-7b-v0.1.yaml +++ b/llm_on_ray/inference/models/mistral-7b-v0.1.yaml @@ -16,3 +16,4 @@ model_description: tokenizer_name_or_path: mistralai/Mistral-7B-v0.1 config: trust_remote_code: true + chat_template: "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" diff --git a/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml b/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml index 08945e865..973aa46f7 100644 --- a/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml +++ b/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml @@ -14,4 +14,3 @@ model_description: model_id_or_path: Intel/neural-chat-7b-v3-1 tokenizer_name_or_path: Intel/neural-chat-7b-v3-1 chat_template: "'### System:You are a chatbot developed by Intel. 
Please answer all questions to the best of your ability.\n'{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:]}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '### User: ' + message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ '### Assistant:' + message['content'].strip() }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{'### Assistant:\n'}}{% endif %}" - diff --git a/llm_on_ray/inference/models/opt-125m.yaml b/llm_on_ray/inference/models/opt-125m.yaml index 65a6d6bf7..171b6fcd3 100644 --- a/llm_on_ray/inference/models/opt-125m.yaml +++ b/llm_on_ray/inference/models/opt-125m.yaml @@ -13,3 +13,4 @@ ipex: model_description: model_id_or_path: facebook/opt-125m tokenizer_name_or_path: facebook/opt-125m + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" \ No newline at end of file diff --git a/llm_on_ray/inference/models/sqlcoder-7b-2.yaml b/llm_on_ray/inference/models/sqlcoder-7b-2.yaml index 6d12b35df..d723a185a 100644 --- a/llm_on_ray/inference/models/sqlcoder-7b-2.yaml +++ b/llm_on_ray/inference/models/sqlcoder-7b-2.yaml @@ -14,3 +14,4 @@ model_description: tokenizer_name_or_path: defog/sqlcoder-7b-2 config: use_auth_token: '' + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" \ No newline at end of file diff --git a/llm_on_ray/inference/models/starcoder.yaml b/llm_on_ray/inference/models/starcoder.yaml index adbc91fc0..a42967c7f 100644 --- a/llm_on_ray/inference/models/starcoder.yaml +++ b/llm_on_ray/inference/models/starcoder.yaml @@ -15,3 +15,4 @@ model_description: tokenizer_name_or_path: bigcode/starcoder config: use_auth_token: '' + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 
'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" \ No newline at end of file From 94f061a2c44c500d31634d738af1ca782f43f093 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 29 Apr 2024 22:07:05 +0800 Subject: [PATCH 14/47] update yaml --- llm_on_ray/inference/models/CodeLlama-7b-hf.yaml | 1 + llm_on_ray/inference/models/deplot.yaml | 1 + llm_on_ray/inference/models/falcon-7b.yaml | 1 + llm_on_ray/inference/models/fuyu8b.yaml | 1 + 4 files changed, 4 insertions(+) diff --git a/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml b/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml index eff253e46..a5828f6ec 100644 --- a/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml +++ b/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml @@ -13,3 +13,4 @@ ipex: model_description: model_id_or_path: codellama/CodeLlama-7b-hf tokenizer_name_or_path: codellama/CodeLlama-7b-hf + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" \ No newline at end of file diff --git a/llm_on_ray/inference/models/deplot.yaml b/llm_on_ray/inference/models/deplot.yaml index 8f8edd47c..52e0f1a91 100644 --- a/llm_on_ray/inference/models/deplot.yaml +++ b/llm_on_ray/inference/models/deplot.yaml @@ -14,3 +14,4 @@ model_description: model_id_or_path: google/deplot tokenizer_name_or_path: google/deplot chat_model_with_image: true + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" \ No newline at end of file diff --git a/llm_on_ray/inference/models/falcon-7b.yaml b/llm_on_ray/inference/models/falcon-7b.yaml index 5801f12be..3f879bfe1 100644 --- a/llm_on_ray/inference/models/falcon-7b.yaml +++ b/llm_on_ray/inference/models/falcon-7b.yaml @@ -13,3 +13,4 @@ ipex: model_description: model_id_or_path: tiiuae/falcon-7b tokenizer_name_or_path: tiiuae/falcon-7b + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ 
message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" \ No newline at end of file diff --git a/llm_on_ray/inference/models/fuyu8b.yaml b/llm_on_ray/inference/models/fuyu8b.yaml index e62303d83..2b5504d76 100644 --- a/llm_on_ray/inference/models/fuyu8b.yaml +++ b/llm_on_ray/inference/models/fuyu8b.yaml @@ -14,3 +14,4 @@ model_description: model_id_or_path: adept/fuyu-8b tokenizer_name_or_path: adept/fuyu-8b chat_model_with_image: true + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" \ No newline at end of file From 06c65791062df79e922c9c8ceb764177af49cdcc Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 29 Apr 2024 22:12:05 +0800 Subject: [PATCH 15/47] format yaml --- llm_on_ray/inference/models/CodeLlama-7b-hf.yaml | 3 ++- llm_on_ray/inference/models/deplot.yaml | 2 +- llm_on_ray/inference/models/falcon-7b.yaml | 2 +- llm_on_ray/inference/models/fuyu8b.yaml | 2 +- llm_on_ray/inference/models/gpt2.yaml | 2 +- llm_on_ray/inference/models/opt-125m.yaml | 2 +- llm_on_ray/inference/models/sqlcoder-7b-2.yaml | 2 +- llm_on_ray/inference/models/starcoder.yaml | 2 +- llm_on_ray/inference/predictor_deployment.py | 7 ++++++- 9 files changed, 15 insertions(+), 9 deletions(-) diff --git a/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml b/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml index a5828f6ec..9d243226f 100644 --- a/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml +++ b/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml @@ -13,4 +13,5 @@ ipex: model_description: model_id_or_path: codellama/CodeLlama-7b-hf tokenizer_name_or_path: codellama/CodeLlama-7b-hf - chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" \ No newline at end of file + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% 
elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" + \ No newline at end of file diff --git a/llm_on_ray/inference/models/deplot.yaml b/llm_on_ray/inference/models/deplot.yaml index 52e0f1a91..bb4ea7ec5 100644 --- a/llm_on_ray/inference/models/deplot.yaml +++ b/llm_on_ray/inference/models/deplot.yaml @@ -14,4 +14,4 @@ model_description: model_id_or_path: google/deplot tokenizer_name_or_path: google/deplot chat_model_with_image: true - chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" \ No newline at end of file + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" diff --git a/llm_on_ray/inference/models/falcon-7b.yaml b/llm_on_ray/inference/models/falcon-7b.yaml index 3f879bfe1..88a088350 100644 --- a/llm_on_ray/inference/models/falcon-7b.yaml +++ b/llm_on_ray/inference/models/falcon-7b.yaml @@ -13,4 +13,4 @@ ipex: model_description: model_id_or_path: tiiuae/falcon-7b tokenizer_name_or_path: tiiuae/falcon-7b - chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" \ No newline at end of file + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate 
user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" diff --git a/llm_on_ray/inference/models/fuyu8b.yaml b/llm_on_ray/inference/models/fuyu8b.yaml index 2b5504d76..7c4977adc 100644 --- a/llm_on_ray/inference/models/fuyu8b.yaml +++ b/llm_on_ray/inference/models/fuyu8b.yaml @@ -14,4 +14,4 @@ model_description: model_id_or_path: adept/fuyu-8b tokenizer_name_or_path: adept/fuyu-8b chat_model_with_image: true - chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" \ No newline at end of file + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" diff --git a/llm_on_ray/inference/models/gpt2.yaml b/llm_on_ray/inference/models/gpt2.yaml index 354f1c348..ca008cba2 100644 --- a/llm_on_ray/inference/models/gpt2.yaml +++ b/llm_on_ray/inference/models/gpt2.yaml @@ -14,4 +14,4 @@ model_description: model_id_or_path: gpt2 tokenizer_name_or_path: gpt2 gpt_base_model: true - chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" \ No newline at end of file + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ 
raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" diff --git a/llm_on_ray/inference/models/opt-125m.yaml b/llm_on_ray/inference/models/opt-125m.yaml index 171b6fcd3..92fd30260 100644 --- a/llm_on_ray/inference/models/opt-125m.yaml +++ b/llm_on_ray/inference/models/opt-125m.yaml @@ -13,4 +13,4 @@ ipex: model_description: model_id_or_path: facebook/opt-125m tokenizer_name_or_path: facebook/opt-125m - chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" \ No newline at end of file + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" diff --git a/llm_on_ray/inference/models/sqlcoder-7b-2.yaml b/llm_on_ray/inference/models/sqlcoder-7b-2.yaml index d723a185a..0a1b43766 100644 --- a/llm_on_ray/inference/models/sqlcoder-7b-2.yaml +++ b/llm_on_ray/inference/models/sqlcoder-7b-2.yaml @@ -14,4 +14,4 @@ model_description: tokenizer_name_or_path: defog/sqlcoder-7b-2 config: use_auth_token: '' - chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" \ No newline at end of file + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" diff --git a/llm_on_ray/inference/models/starcoder.yaml b/llm_on_ray/inference/models/starcoder.yaml index a42967c7f..660b10ba8 100644 --- 
a/llm_on_ray/inference/models/starcoder.yaml +++ b/llm_on_ray/inference/models/starcoder.yaml @@ -15,4 +15,4 @@ model_description: tokenizer_name_or_path: bigcode/starcoder config: use_auth_token: '' - chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" \ No newline at end of file + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index 2e642bfff..f92baca85 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -391,9 +391,14 @@ async def openai_call( tool_choice=None, ): self.use_openai = True - + print("openai_call") + print(input) + print(type(input)) # return prompt or list of prompts preprocessed prompts = self.preprocess_prompts(input, tools, tool_choice) + print("preprocess_prompts") + print(prompts) + print(type(prompts)) # Handle streaming response if streaming_response: From c5766a10dd763444438bcfd5a8b50719f539e6fa Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 29 Apr 2024 22:16:05 +0800 Subject: [PATCH 16/47] update --- llm_on_ray/inference/models/CodeLlama-7b-hf.yaml | 1 - .../models/template/inference_config_template.yaml | 7 +------ llm_on_ray/inference/utils.py | 2 +- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml b/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml index 9d243226f..8a2ef79fd 100644 --- a/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml +++ b/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml @@ -14,4 +14,3 @@ model_description: model_id_or_path: codellama/CodeLlama-7b-hf tokenizer_name_or_path: codellama/CodeLlama-7b-hf chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" - \ No newline at end of file diff --git a/llm_on_ray/inference/models/template/inference_config_template.yaml 
b/llm_on_ray/inference/models/template/inference_config_template.yaml index 137ddb2dc..1e6726a12 100644 --- a/llm_on_ray/inference/models/template/inference_config_template.yaml +++ b/llm_on_ray/inference/models/template/inference_config_template.yaml @@ -13,7 +13,7 @@ ipex: precision: bf16 model_description: model_id_or_path: null - ipexllm:: false + ipexllm: false tokenizer_name_or_path: null chat_processor: null gpt_base_model: false @@ -22,11 +22,6 @@ model_description: peft_model_id_or_path: null peft_type: null use_hpu_graphs: true - prompt: - intro: '' - human_id: '' - bot_id: '' - stop_words: [] config: trust_remote_code: false use_auth_token: null diff --git a/llm_on_ray/inference/utils.py b/llm_on_ray/inference/utils.py index 56b9146e5..91e311088 100644 --- a/llm_on_ray/inference/utils.py +++ b/llm_on_ray/inference/utils.py @@ -166,7 +166,7 @@ def get_prompt_format(input: Union[List[str], List[dict], List[ChatMessage]]): chat_format = True prompts_format = True for item in input: - if isinstance(item, str): + if isinstance(item, str) or isinstance(item, list): chat_format = False elif isinstance(item, dict) or isinstance(item, ChatMessage): prompts_format = False From dad4224f77ba7c978aef9f5ab2f3303876c41a60 Mon Sep 17 00:00:00 2001 From: minmingzhu <45281494+minmingzhu@users.noreply.github.com> Date: Mon, 29 Apr 2024 22:28:06 +0800 Subject: [PATCH 17/47] Update mpt_deltatuner.yaml --- .github/workflows/config/mpt_deltatuner.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/config/mpt_deltatuner.yaml b/.github/workflows/config/mpt_deltatuner.yaml index 7399d587b..e0c0d6946 100644 --- a/.github/workflows/config/mpt_deltatuner.yaml +++ b/.github/workflows/config/mpt_deltatuner.yaml @@ -13,5 +13,7 @@ ipex: model_description: model_id_or_path: mosaicml/mpt-7b tokenizer_name_or_path: EleutherAI/gpt-neox-20b + peft_model_id_or_path: nathan0/mpt-7b-deltatuner-model + peft_type: deltatuner config: trust_remote_code: true From f28f4cdb175e1d54dd9e280b6ae8bbfbdafb1119 Mon Sep 17 00:00:00 2001 From: minmingzhu <45281494+minmingzhu@users.noreply.github.com> Date: Mon, 29 Apr 2024 23:44:19 +0800 Subject: [PATCH 18/47] Update neural-chat-7b-v3-1.yaml --- llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml b/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml index 973aa46f7..be7b8d611 100644 --- a/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml +++ b/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml @@ -13,4 +13,4 @@ ipex: model_description: model_id_or_path: Intel/neural-chat-7b-v3-1 tokenizer_name_or_path: Intel/neural-chat-7b-v3-1 - chat_template: "'### System:You are a chatbot developed by Intel. 
Please answer all questions to the best of your ability.\n'{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:]}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '### User: ' + message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ '### Assistant:' + message['content'].strip() }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{'### Assistant:\n'}}{% endif %}" + chat_template: "'### System:You are a chatbot developed by Intel. Please answer all questions to the best of your ability.\n'{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:]%}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '### User: ' + message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ '### Assistant:' + message['content'].strip() }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{'### Assistant:\n'}}{% endif %}" From eec2124c2f26e96e5acd0f5e2e00d3a70f48b471 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Tue, 30 Apr 2024 14:08:54 +0800 Subject: [PATCH 19/47] update --- .github/workflows/config/gpt2-ci.yaml | 1 + .github/workflows/config/opt-125m-ci.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/config/gpt2-ci.yaml b/.github/workflows/config/gpt2-ci.yaml index e9bed1366..7ed3f6972 100644 --- a/.github/workflows/config/gpt2-ci.yaml +++ b/.github/workflows/config/gpt2-ci.yaml @@ -13,3 +13,4 @@ model_description: model_id_or_path: gpt2 tokenizer_name_or_path: gpt2 gpt_base_model: true + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" diff --git a/.github/workflows/config/opt-125m-ci.yaml b/.github/workflows/config/opt-125m-ci.yaml index e5ab095a6..96c9c345b 100644 --- a/.github/workflows/config/opt-125m-ci.yaml +++ b/.github/workflows/config/opt-125m-ci.yaml @@ -13,3 +13,4 @@ ipex: model_description: model_id_or_path: facebook/opt-125m tokenizer_name_or_path: facebook/opt-125m + chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ 
message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" From 419aea3ced2aff42998d7013960874cb7731b1c5 Mon Sep 17 00:00:00 2001 From: minmingzhu <45281494+minmingzhu@users.noreply.github.com> Date: Mon, 6 May 2024 09:56:35 +0800 Subject: [PATCH 20/47] Update predictor_deployment.py --- llm_on_ray/inference/predictor_deployment.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index f92baca85..2e642bfff 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -391,14 +391,9 @@ async def openai_call( tool_choice=None, ): self.use_openai = True - print("openai_call") - print(input) - print(type(input)) + # return prompt or list of prompts preprocessed prompts = self.preprocess_prompts(input, tools, tool_choice) - print("preprocess_prompts") - print(prompts) - print(type(prompts)) # Handle streaming response if streaming_response: From dc6bb3bbfbbcbea1e26593839d5725466eb18e98 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 1 Apr 2024 02:29:30 +0000 Subject: [PATCH 21/47] implement fine-tuning chat template function Signed-off-by: minmingzhu --- .../common/dataprocesser/general_processer.py | 91 +++++-------------- llm_on_ray/finetune/finetune.py | 1 + llm_on_ray/finetune/finetune.yaml | 2 + llm_on_ray/finetune/finetune_config.py | 9 +- 4 files changed, 32 insertions(+), 71 deletions(-) diff --git a/llm_on_ray/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py index b963611e7..8eea6f2c0 100644 --- a/llm_on_ray/common/dataprocesser/general_processer.py +++ b/llm_on_ray/common/dataprocesser/general_processer.py @@ -23,53 +23,9 @@ from llm_on_ray.common.dataprocesser import DataProcesser -INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request." 
-INSTRUCTION_KEY = "### Instruction:" -INPUT_KEY = "Input:" RESPONSE_KEY = "### Response:" -END_KEY = "### End" RESPONSE_KEY_NL = f"{RESPONSE_KEY}\n" -PROMPT_NO_INPUT_FORMAT = """{intro} - -{instruction_key} -{instruction} - -{response_key} -{response} - -{end_key}""".format( - intro=INTRO_BLURB, - instruction_key=INSTRUCTION_KEY, - instruction="{instruction}", - response_key=RESPONSE_KEY, - response="{response}", - end_key=END_KEY, -) - -PROMPT_WITH_INPUT_FORMAT = """{intro} - -{instruction_key} -{instruction} - -{input_key} -{input} - -{response_key} -{response} - -{end_key}""".format( - intro=INTRO_BLURB, - instruction_key=INSTRUCTION_KEY, - instruction="{instruction}", - input_key=INPUT_KEY, - input="{input}", - response_key=RESPONSE_KEY, - response="{response}", - end_key=END_KEY, -) -TEXT_COLUMN_NAME = "text" - class DataCollatorForCompletionOnlyLM(transformers.DataCollatorForLanguageModeling): def torch_call(self, examples): @@ -103,6 +59,7 @@ def prepare(self, tokenizer, dataset): per_device_train_batch_size = self.config.get("per_device_train_batch_size") per_device_eval_batch_size = self.config.get("per_device_eval_batch_size") max_length = self.config.get("max_length") + custom_chat_template = self.config.get("custom_chat_template") group = self.config.get("group") block_size = self.config.get("block_size") shuffle = self.config.get("shuffle") @@ -114,35 +71,29 @@ def prepare(self, tokenizer, dataset): if isinstance(dataset, datasets.DatasetDict): column_names = dataset["train"].column_names - if column_names and TEXT_COLUMN_NAME not in column_names: - - def prompt(rec): - instruction = rec["instruction"] - response = rec["response"] - context = rec.get("context") - if not instruction: - raise ValueError(f"Expected an instruction in: {rec}") - if not response: - raise ValueError(f"Expected a response in: {rec}") - if context: - rec["text"] = PROMPT_WITH_INPUT_FORMAT.format( - instruction=instruction, response=response, input=context + def tokenize_function(examples): + if self.config.get("is_base_model"): + if custom_chat_template: + new_tokenizer = tokenizer.apply_chat_template( + examples, + chat_template=custom_chat_template, + tokenize=True, + max_length=max_length, ) else: - rec["text"] = PROMPT_NO_INPUT_FORMAT.format( - instruction=instruction, response=response + new_tokenizer = tokenizer.apply_chat_template( + examples, + chat_template=self.config.get("default_chat_template"), + tokenize=True, + max_length=max_length, ) - return rec - - dataset = dataset.map( - prompt, - load_from_cache_file=False, - desc="Prompt", - ) - column_names += [TEXT_COLUMN_NAME] - - def tokenize_function(examples): - return tokenizer(examples[TEXT_COLUMN_NAME], max_length=max_length) + else: + new_tokenizer = tokenizer.apply_chat_template( + examples, tokenize=False, max_length=max_length + ) + print(new_tokenizer) + print(new_tokenizer.default_chat_template) + return new_tokenizer tokenized_datasets = dataset.map( tokenize_function, diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index 37c0481d6..febb97231 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -177,6 +177,7 @@ def train_func(config: Dict[str, Any]): config={ "name": tokenizer_name, "config": config["General"]["config"], + "custom_chat_template": config["General"]["custom_chat_template"], } ) diff --git a/llm_on_ray/finetune/finetune.yaml b/llm_on_ray/finetune/finetune.yaml index 1f1cc46ca..d8e46331a 100644 --- a/llm_on_ray/finetune/finetune.yaml +++ 
b/llm_on_ray/finetune/finetune.yaml @@ -1,5 +1,6 @@ General: base_model: EleutherAI/gpt-j-6b + is_base_model: false gpt_base_model: true output_dir: /tmp/llm-ray/output checkpoint_dir: null @@ -12,6 +13,7 @@ General: lora_alpha: 32 lora_dropout: 0.1 enable_gradient_checkpointing: false + custom_chat_template: null Dataset: train_file: examples/data/sample_finetune_data_small.jsonl group: true diff --git a/llm_on_ray/finetune/finetune_config.py b/llm_on_ray/finetune/finetune_config.py index a01095c16..3a638aeab 100644 --- a/llm_on_ray/finetune/finetune_config.py +++ b/llm_on_ray/finetune/finetune_config.py @@ -17,7 +17,6 @@ from pydantic import BaseModel, validator from typing import Optional, List - PRECISION_BF16 = "bf16" PRECISION_FP16 = "fp16" PRECISION_NO = "no" @@ -60,6 +59,14 @@ class General(BaseModel): lora_config: Optional[LoraConfig] = None deltatuner_config: Optional[DeltatunerConfig] = None enable_gradient_checkpointing: bool = False + custom_chat_template: Optional[str] = None + default_chat_template: str = ( + "{{'### Below is an instruction that describes a task. " + "Write a response that appropriately completes the request. \n'}}" + "{% for message in messages %}{{'### Instruction: ' + message['instruction'] " + "+ ' Input:' + message['context'] + ' ### Response:' + message['response'] " + "+ '### End \n'}}{% endfor %}" + ) class Dataset(BaseModel): From 22b0ae5ae2169fd227415f0a78008e270d9e8d01 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Tue, 2 Apr 2024 08:25:34 +0000 Subject: [PATCH 22/47] update Signed-off-by: minmingzhu --- llm_on_ray/common/trainer/default_trainer.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/llm_on_ray/common/trainer/default_trainer.py b/llm_on_ray/common/trainer/default_trainer.py index 366d6f28b..e3800333c 100644 --- a/llm_on_ray/common/trainer/default_trainer.py +++ b/llm_on_ray/common/trainer/default_trainer.py @@ -33,6 +33,7 @@ class DefaultTrainer(Trainer): def __init__(self, config): self.model = None + self.tokenizer = None self.config = config dataprocesser_config = config.get("dataprocesser") dataprocesser_type = dataprocesser_config.get("type") @@ -121,7 +122,7 @@ def _get_lr_scheduler( def prepare(self, model, tokenizer, dataset, optimizer, accelerator): self._coordinate(accelerator) - + self.tokenizer = tokenizer embedding_size = model.get_input_embeddings().weight.shape[0] logger.info(f"model embedding size: {embedding_size}") if len(tokenizer) > embedding_size: @@ -288,6 +289,11 @@ def train(self): is_main_process=self.accelerator.is_main_process, save_function=self.accelerator.save, ) + self.tokenizer.save_pretrained( + output, + is_main_process=self.accelerator.is_main_process, + save_function=self.accelerator.save, + ) logger.info(f"finish save model to {output}") self.accelerator.wait_for_everyone() From 1768e2afc793e06573b050316e5dbe4aa7d5950a Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 8 Apr 2024 05:41:44 +0000 Subject: [PATCH 23/47] update Signed-off-by: minmingzhu --- .../common/dataprocesser/general_processer.py | 41 ++++++++++++++----- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/llm_on_ray/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py index 8eea6f2c0..5fafa3694 100644 --- a/llm_on_ray/common/dataprocesser/general_processer.py +++ b/llm_on_ray/common/dataprocesser/general_processer.py @@ -60,6 +60,8 @@ def prepare(self, tokenizer, dataset): per_device_eval_batch_size = self.config.get("per_device_eval_batch_size") 
max_length = self.config.get("max_length") custom_chat_template = self.config.get("custom_chat_template") + model_default_chat_template = self.config.get("model_default_chat_template") + group = self.config.get("group") block_size = self.config.get("block_size") shuffle = self.config.get("shuffle") @@ -74,25 +76,44 @@ def prepare(self, tokenizer, dataset): def tokenize_function(examples): if self.config.get("is_base_model"): if custom_chat_template: - new_tokenizer = tokenizer.apply_chat_template( + tokenizer.chat_template = custom_chat_template + new_tokenizer = tokenizer.apply_chat_template( examples, - chat_template=custom_chat_template, - tokenize=True, + tokenize=False, max_length=max_length, ) else: + tokenizer.chat_template = self.config.get("default_chat_template") new_tokenizer = tokenizer.apply_chat_template( examples, - chat_template=self.config.get("default_chat_template"), - tokenize=True, + tokenize=False, max_length=max_length, ) else: - new_tokenizer = tokenizer.apply_chat_template( - examples, tokenize=False, max_length=max_length - ) - print(new_tokenizer) - print(new_tokenizer.default_chat_template) + if model_default_chat_template: + tokenizer.chat_template = model_default_chat_template + new_tokenizer = tokenizer.apply_chat_template( + examples, + tokenize=False, + max_length=max_length, + ) + else: + new_messages = [ + { + "role": "user", + "content": "instruction: " + + examples["instruction"] + + " context: " + + examples["context"], + }, + {"role": "assistant", "content": "response: " + examples["response"]}, + ] + + new_tokenizer = tokenizer.apply_chat_template( + new_messages, + tokenize=False, + max_length=max_length, + ) return new_tokenizer tokenized_datasets = dataset.map( From 2f256e5bb53e91e1c4eb864c22539a0d682223fd Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 8 Apr 2024 05:59:44 +0000 Subject: [PATCH 24/47] update Signed-off-by: minmingzhu --- llm_on_ray/finetune/finetune.py | 5 ++++- llm_on_ray/finetune/finetune_config.py | 11 ++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index febb97231..502a985f6 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -177,7 +177,6 @@ def train_func(config: Dict[str, Any]): config={ "name": tokenizer_name, "config": config["General"]["config"], - "custom_chat_template": config["General"]["custom_chat_template"], } ) @@ -249,6 +248,10 @@ def train_func(config: Dict[str, Any]): "group": config["Dataset"].get("group", True), "block_size": config["Dataset"].get("block_size", 512), "shuffle": config["Dataset"].get("shuffle", False), + "is_base_model": config["General"]["is_base_model"], + "custom_chat_template": config["General"]["custom_chat_template"], + "default_chat_template": config["General"]["default_chat_template"], + "model_default_chat_template": config["General"]["model_default_chat_template"], }, "lr_scheduler": { "enable": True, diff --git a/llm_on_ray/finetune/finetune_config.py b/llm_on_ray/finetune/finetune_config.py index 3a638aeab..fe6f2b34b 100644 --- a/llm_on_ray/finetune/finetune_config.py +++ b/llm_on_ray/finetune/finetune_config.py @@ -61,12 +61,13 @@ class General(BaseModel): enable_gradient_checkpointing: bool = False custom_chat_template: Optional[str] = None default_chat_template: str = ( - "{{'### Below is an instruction that describes a task. " - "Write a response that appropriately completes the request. 
\n'}}" - "{% for message in messages %}{{'### Instruction: ' + message['instruction'] " - "+ ' Input:' + message['context'] + ' ### Response:' + message['response'] " - "+ '### End \n'}}{% endfor %}" + "{{'### Below is an instruction that describes a task." + "Write a response that appropriately completes the request. '}}" + "{{'### Instruction: ' + messages['instruction'] " + "+ ' Input:' + messages['context'] + ' ### Response:' + messages['response'] " + "+ '### End \n'}}" ) + model_default_chat_template: Optional[str] = None class Dataset(BaseModel): From 0e5aca8cca3602038867b067ab540fcd5d26140b Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 8 Apr 2024 06:18:40 +0000 Subject: [PATCH 25/47] integrate gbt for transformer 4.26.0 Signed-off-by: minmingzhu --- .../common/dataprocesser/general_processer.py | 61 +++++++++++++++++++ llm_on_ray/finetune/finetune.py | 1 + 2 files changed, 62 insertions(+) diff --git a/llm_on_ray/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py index 5fafa3694..3aef3b7b7 100644 --- a/llm_on_ray/common/dataprocesser/general_processer.py +++ b/llm_on_ray/common/dataprocesser/general_processer.py @@ -23,9 +23,53 @@ from llm_on_ray.common.dataprocesser import DataProcesser +INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request." +INSTRUCTION_KEY = "### Instruction:" +INPUT_KEY = "Input:" RESPONSE_KEY = "### Response:" +END_KEY = "### End" RESPONSE_KEY_NL = f"{RESPONSE_KEY}\n" +PROMPT_NO_INPUT_FORMAT = """{intro} + +{instruction_key} +{instruction} + +{response_key} +{response} + +{end_key}""".format( + intro=INTRO_BLURB, + instruction_key=INSTRUCTION_KEY, + instruction="{instruction}", + response_key=RESPONSE_KEY, + response="{response}", + end_key=END_KEY, +) + +PROMPT_WITH_INPUT_FORMAT = """{intro} + +{instruction_key} +{instruction} + +{input_key} +{input} + +{response_key} +{response} + +{end_key}""".format( + intro=INTRO_BLURB, + instruction_key=INSTRUCTION_KEY, + instruction="{instruction}", + input_key=INPUT_KEY, + input="{input}", + response_key=RESPONSE_KEY, + response="{response}", + end_key=END_KEY, +) +TEXT_COLUMN_NAME = "text" + class DataCollatorForCompletionOnlyLM(transformers.DataCollatorForLanguageModeling): def torch_call(self, examples): @@ -74,6 +118,23 @@ def prepare(self, tokenizer, dataset): column_names = dataset["train"].column_names def tokenize_function(examples): + if self.config.get("gpt_base_model"): + instruction = examples["instruction"] + response = examples["response"] + context = examples.get("context") + if not instruction: + raise ValueError(f"Expected an instruction in: {examples}") + if not response: + raise ValueError(f"Expected a response in: {examples}") + if context: + examples["text"] = PROMPT_WITH_INPUT_FORMAT.format( + instruction=instruction, response=response, input=context + ) + else: + examples["text"] = PROMPT_NO_INPUT_FORMAT.format( + instruction=instruction, response=response + ) + return tokenizer(examples["text"], max_length=max_length, truncation=True) if self.config.get("is_base_model"): if custom_chat_template: tokenizer.chat_template = custom_chat_template diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index 502a985f6..ac679d0d7 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -248,6 +248,7 @@ def train_func(config: Dict[str, Any]): "group": config["Dataset"].get("group", True), "block_size": 
config["Dataset"].get("block_size", 512), "shuffle": config["Dataset"].get("shuffle", False), + "gpt_base_model": config["General"].get("gpt_base_model", False), "is_base_model": config["General"]["is_base_model"], "custom_chat_template": config["General"]["custom_chat_template"], "default_chat_template": config["General"]["default_chat_template"], From df9e84e5339b2ff078108e1a1558c5b37de22a79 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 8 Apr 2024 06:40:23 +0000 Subject: [PATCH 26/47] update Signed-off-by: minmingzhu --- .../common/dataprocesser/general_processer.py | 85 ++++++++++--------- 1 file changed, 43 insertions(+), 42 deletions(-) diff --git a/llm_on_ray/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py index 3aef3b7b7..9163b19df 100644 --- a/llm_on_ray/common/dataprocesser/general_processer.py +++ b/llm_on_ray/common/dataprocesser/general_processer.py @@ -127,55 +127,56 @@ def tokenize_function(examples): if not response: raise ValueError(f"Expected a response in: {examples}") if context: - examples["text"] = PROMPT_WITH_INPUT_FORMAT.format( + new_message = PROMPT_WITH_INPUT_FORMAT.format( instruction=instruction, response=response, input=context ) else: - examples["text"] = PROMPT_NO_INPUT_FORMAT.format( + new_message = PROMPT_NO_INPUT_FORMAT.format( instruction=instruction, response=response ) - return tokenizer(examples["text"], max_length=max_length, truncation=True) - if self.config.get("is_base_model"): - if custom_chat_template: - tokenizer.chat_template = custom_chat_template - new_tokenizer = tokenizer.apply_chat_template( - examples, - tokenize=False, - max_length=max_length, - ) - else: - tokenizer.chat_template = self.config.get("default_chat_template") - new_tokenizer = tokenizer.apply_chat_template( - examples, - tokenize=False, - max_length=max_length, - ) + return tokenizer.tokenize(new_message, max_length=max_length) else: - if model_default_chat_template: - tokenizer.chat_template = model_default_chat_template - new_tokenizer = tokenizer.apply_chat_template( - examples, - tokenize=False, - max_length=max_length, - ) + if self.config.get("is_base_model"): + if custom_chat_template: + tokenizer.chat_template = custom_chat_template + new_tokenizer = tokenizer.apply_chat_template( + examples, + tokenize=False, + max_length=max_length, + ) + else: + tokenizer.chat_template = self.config.get("default_chat_template") + new_tokenizer = tokenizer.apply_chat_template( + examples, + tokenize=False, + max_length=max_length, + ) else: - new_messages = [ - { - "role": "user", - "content": "instruction: " - + examples["instruction"] - + " context: " - + examples["context"], - }, - {"role": "assistant", "content": "response: " + examples["response"]}, - ] - - new_tokenizer = tokenizer.apply_chat_template( - new_messages, - tokenize=False, - max_length=max_length, - ) - return new_tokenizer + if model_default_chat_template: + tokenizer.chat_template = model_default_chat_template + new_tokenizer = tokenizer.apply_chat_template( + examples, + tokenize=False, + max_length=max_length, + ) + else: + new_messages = [ + { + "role": "user", + "content": "instruction: " + + examples["instruction"] + + " context: " + + examples["context"], + }, + {"role": "assistant", "content": "response: " + examples["response"]}, + ] + + new_tokenizer = tokenizer.apply_chat_template( + new_messages, + tokenize=False, + max_length=max_length, + ) + return new_tokenizer tokenized_datasets = dataset.map( tokenize_function, From 
0a603790f8097935b304a838bea4b5776595c0aa Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 8 Apr 2024 08:56:23 +0000 Subject: [PATCH 27/47] update Signed-off-by: minmingzhu --- llm_on_ray/common/dataprocesser/general_processer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llm_on_ray/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py index 9163b19df..84af7973c 100644 --- a/llm_on_ray/common/dataprocesser/general_processer.py +++ b/llm_on_ray/common/dataprocesser/general_processer.py @@ -134,7 +134,7 @@ def tokenize_function(examples): new_message = PROMPT_NO_INPUT_FORMAT.format( instruction=instruction, response=response ) - return tokenizer.tokenize(new_message, max_length=max_length) + return tokenizer(new_message, max_length=max_length) else: if self.config.get("is_base_model"): if custom_chat_template: @@ -176,7 +176,7 @@ def tokenize_function(examples): tokenize=False, max_length=max_length, ) - return new_tokenizer + return tokenizer(new_tokenizer, max_length=max_length) tokenized_datasets = dataset.map( tokenize_function, From b2429930d7896975ca202f8110cae8e5436a08cc Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Tue, 9 Apr 2024 07:10:41 +0000 Subject: [PATCH 28/47] 1. remove is_base_model tag 2. modify chat template Signed-off-by: minmingzhu --- .../common/dataprocesser/general_processer.py | 72 ++++++++----------- llm_on_ray/finetune/finetune.py | 4 +- llm_on_ray/finetune/finetune.yaml | 1 - llm_on_ray/finetune/finetune_config.py | 22 ++++-- llm_on_ray/finetune/models/mpt-7b.yaml | 1 + 5 files changed, 48 insertions(+), 52 deletions(-) diff --git a/llm_on_ray/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py index 84af7973c..b7282ca7e 100644 --- a/llm_on_ray/common/dataprocesser/general_processer.py +++ b/llm_on_ray/common/dataprocesser/general_processer.py @@ -104,7 +104,6 @@ def prepare(self, tokenizer, dataset): per_device_eval_batch_size = self.config.get("per_device_eval_batch_size") max_length = self.config.get("max_length") custom_chat_template = self.config.get("custom_chat_template") - model_default_chat_template = self.config.get("model_default_chat_template") group = self.config.get("group") block_size = self.config.get("block_size") @@ -136,46 +135,37 @@ def tokenize_function(examples): ) return tokenizer(new_message, max_length=max_length) else: - if self.config.get("is_base_model"): - if custom_chat_template: - tokenizer.chat_template = custom_chat_template - new_tokenizer = tokenizer.apply_chat_template( - examples, - tokenize=False, - max_length=max_length, - ) - else: - tokenizer.chat_template = self.config.get("default_chat_template") - new_tokenizer = tokenizer.apply_chat_template( - examples, - tokenize=False, - max_length=max_length, - ) + new_messages = [ + { + "role": "user", + "content": INTRO_BLURB + "\n\n" + + "###Instruction:\n" + + examples["instruction"] + "\n\n" + + "###context:\n" + + examples["context"] + "\n\n", + }, + {"role": "assistant", "content": examples["response"]}, + ] + if custom_chat_template: + tokenizer.chat_template = custom_chat_template + new_tokenizer = tokenizer.apply_chat_template( + new_messages, + tokenize=False, + max_length=max_length, + ) + elif tokenizer.chat_template is not None: + new_tokenizer = tokenizer.apply_chat_template( + new_messages, + tokenize=False, + max_length=max_length, + ) else: - if model_default_chat_template: - tokenizer.chat_template = model_default_chat_template - new_tokenizer = 
tokenizer.apply_chat_template( - examples, - tokenize=False, - max_length=max_length, - ) - else: - new_messages = [ - { - "role": "user", - "content": "instruction: " - + examples["instruction"] - + " context: " - + examples["context"], - }, - {"role": "assistant", "content": "response: " + examples["response"]}, - ] - - new_tokenizer = tokenizer.apply_chat_template( - new_messages, - tokenize=False, - max_length=max_length, - ) + tokenizer.chat_template = self.config.get("chat_template") + new_tokenizer = tokenizer.apply_chat_template( + new_messages, + tokenize=False, + max_length=max_length, + ) return tokenizer(new_tokenizer, max_length=max_length) tokenized_datasets = dataset.map( @@ -197,7 +187,7 @@ def group_texts(examples): total_length = (total_length // block_size) * block_size # Split by chunks of max_len. result = { - k: [t[i : i + block_size] for i in range(0, total_length, block_size)] + k: [t[i: i + block_size] for i in range(0, total_length, block_size)] for k, t in concatenated_examples.items() } result["labels"] = result["input_ids"].copy() diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index ac679d0d7..02a6e189c 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -249,10 +249,8 @@ def train_func(config: Dict[str, Any]): "block_size": config["Dataset"].get("block_size", 512), "shuffle": config["Dataset"].get("shuffle", False), "gpt_base_model": config["General"].get("gpt_base_model", False), - "is_base_model": config["General"]["is_base_model"], "custom_chat_template": config["General"]["custom_chat_template"], - "default_chat_template": config["General"]["default_chat_template"], - "model_default_chat_template": config["General"]["model_default_chat_template"], + "chat_template": config["General"]["chat_template"], }, "lr_scheduler": { "enable": True, diff --git a/llm_on_ray/finetune/finetune.yaml b/llm_on_ray/finetune/finetune.yaml index d8e46331a..1dee1ebf8 100644 --- a/llm_on_ray/finetune/finetune.yaml +++ b/llm_on_ray/finetune/finetune.yaml @@ -1,6 +1,5 @@ General: base_model: EleutherAI/gpt-j-6b - is_base_model: false gpt_base_model: true output_dir: /tmp/llm-ray/output checkpoint_dir: null diff --git a/llm_on_ray/finetune/finetune_config.py b/llm_on_ray/finetune/finetune_config.py index fe6f2b34b..4817f8c30 100644 --- a/llm_on_ray/finetune/finetune_config.py +++ b/llm_on_ray/finetune/finetune_config.py @@ -60,14 +60,22 @@ class General(BaseModel): deltatuner_config: Optional[DeltatunerConfig] = None enable_gradient_checkpointing: bool = False custom_chat_template: Optional[str] = None - default_chat_template: str = ( - "{{'### Below is an instruction that describes a task." - "Write a response that appropriately completes the request. 
'}}" - "{{'### Instruction: ' + messages['instruction'] " - "+ ' Input:' + messages['context'] + ' ### Response:' + messages['response'] " - "+ '### End \n'}}" + chat_template: Optional[str] = ( + "{{ bos_token }}" + "{% if messages[0]['role'] == 'system' %}" + "{{ raise_exception('System role not supported') }}" + "{% endif %}" + "{% for message in messages %}" + "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" + "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}" + "{% endif %}" + "{% if message['role'] == 'user' %}" + "{{ '### Instruction: ' + message['content'] + eos_token }}" + "{% elif message['role'] == 'assistant' %}" + "{{ '### Response:' + message['content'] + eos_token }}" + "{% endif %}{% endfor %}" + "{{'### End \n'}}" ) - model_default_chat_template: Optional[str] = None class Dataset(BaseModel): diff --git a/llm_on_ray/finetune/models/mpt-7b.yaml b/llm_on_ray/finetune/models/mpt-7b.yaml index 5bceeee4d..eb2f8f119 100644 --- a/llm_on_ray/finetune/models/mpt-7b.yaml +++ b/llm_on_ray/finetune/models/mpt-7b.yaml @@ -1,6 +1,7 @@ General: base_model: mosaicml/mpt-7b tokenizer_name: EleutherAI/gpt-neox-20b + is_base_model: false gpt_base_model: false output_dir: /tmp/llm-ray/output checkpoint_dir: /tmp/llm-ray/checkpoint From 5afd1586b9bbe268819e1a865b7952a3f599864c Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Wed, 10 Apr 2024 01:25:29 +0000 Subject: [PATCH 29/47] update Signed-off-by: minmingzhu --- .../common/dataprocesser/general_processer.py | 114 +++++++++--------- pyproject.toml | 3 +- 2 files changed, 61 insertions(+), 56 deletions(-) diff --git a/llm_on_ray/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py index b7282ca7e..7bf0d2804 100644 --- a/llm_on_ray/common/dataprocesser/general_processer.py +++ b/llm_on_ray/common/dataprocesser/general_processer.py @@ -99,11 +99,67 @@ def torch_call(self, examples): class GeneralProcesser(DataProcesser): + def tokenize_function(self, examples, tokenizer): + print(examples) + if self.config.get("gpt_base_model"): + instruction = examples["instruction"] + response = examples["response"] + context = examples.get("context") + if not instruction: + raise ValueError(f"Expected an instruction in: {examples}") + if not response: + raise ValueError(f"Expected a response in: {examples}") + if context: + new_message = PROMPT_WITH_INPUT_FORMAT.format( + instruction=instruction, response=response, input=context + ) + else: + new_message = PROMPT_NO_INPUT_FORMAT.format( + instruction=instruction, response=response + ) + return tokenizer(new_message, max_length=self.config.get("max_length")) + else: + new_messages = [ + { + "role": "user", + "content": "###Instruction:\n" + + examples["instruction"] + "\n\n" + + "###context:\n" + + examples["context"] + "\n\n", + }, + {"role": "assistant", "content": examples["response"] + "\n\n"}, + ] + print(new_messages) + if self.config.get("custom_chat_template") is not None: + print("custom_chat_template") + tokenizer.chat_template = self.config.get("custom_chat_template") + new_tokenizer = tokenizer.apply_chat_template( + new_messages, + tokenize=False, + max_length=self.config.get("max_length"), + ) + elif tokenizer.chat_template is not None: + print("tokenizer.chat_template") + new_tokenizer = tokenizer.apply_chat_template( + new_messages, + tokenize=False, + max_length=self.config.get("max_length"), + ) + else: + print("chat_template") + tokenizer.chat_template = self.config.get("chat_template") + 
new_tokenizer = tokenizer.apply_chat_template( + new_messages, + tokenize=False, + max_length=self.config.get("max_length"), + ) + tokenizer = tokenizer(new_tokenizer, max_length=self.config.get("max_length")) + print(tokenizer) + return tokenizer + def prepare(self, tokenizer, dataset): per_device_train_batch_size = self.config.get("per_device_train_batch_size") per_device_eval_batch_size = self.config.get("per_device_eval_batch_size") - max_length = self.config.get("max_length") - custom_chat_template = self.config.get("custom_chat_template") group = self.config.get("group") block_size = self.config.get("block_size") @@ -116,60 +172,8 @@ def prepare(self, tokenizer, dataset): if isinstance(dataset, datasets.DatasetDict): column_names = dataset["train"].column_names - def tokenize_function(examples): - if self.config.get("gpt_base_model"): - instruction = examples["instruction"] - response = examples["response"] - context = examples.get("context") - if not instruction: - raise ValueError(f"Expected an instruction in: {examples}") - if not response: - raise ValueError(f"Expected a response in: {examples}") - if context: - new_message = PROMPT_WITH_INPUT_FORMAT.format( - instruction=instruction, response=response, input=context - ) - else: - new_message = PROMPT_NO_INPUT_FORMAT.format( - instruction=instruction, response=response - ) - return tokenizer(new_message, max_length=max_length) - else: - new_messages = [ - { - "role": "user", - "content": INTRO_BLURB + "\n\n" - + "###Instruction:\n" - + examples["instruction"] + "\n\n" - + "###context:\n" - + examples["context"] + "\n\n", - }, - {"role": "assistant", "content": examples["response"]}, - ] - if custom_chat_template: - tokenizer.chat_template = custom_chat_template - new_tokenizer = tokenizer.apply_chat_template( - new_messages, - tokenize=False, - max_length=max_length, - ) - elif tokenizer.chat_template is not None: - new_tokenizer = tokenizer.apply_chat_template( - new_messages, - tokenize=False, - max_length=max_length, - ) - else: - tokenizer.chat_template = self.config.get("chat_template") - new_tokenizer = tokenizer.apply_chat_template( - new_messages, - tokenize=False, - max_length=max_length, - ) - return tokenizer(new_tokenizer, max_length=max_length) - tokenized_datasets = dataset.map( - tokenize_function, + lambda examples: self.tokenize_function(examples, tokenizer), remove_columns=column_names, load_from_cache_file=False, desc="Tokenize dataset", diff --git a/pyproject.toml b/pyproject.toml index b319045cc..a18574675 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,8 @@ dependencies = [ "py-cpuinfo", "pydantic-yaml", "async_timeout", - "typer" + "typer", + "jinja2>=3.0.0" ] [project.optional-dependencies] From bbf79251ea367ece3602b0a82c73471074f8c477 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Wed, 10 Apr 2024 08:32:22 +0000 Subject: [PATCH 30/47] 1. update doc/finetune_parameters.md 2. add unit test Signed-off-by: minmingzhu --- docs/finetune_parameters.md | 2 + .../common/dataprocesser/general_processer.py | 15 +- tests/finetune/test_chat_template.py | 139 ++++++++++++++++++ 3 files changed, 145 insertions(+), 11 deletions(-) create mode 100644 tests/finetune/test_chat_template.py diff --git a/docs/finetune_parameters.md b/docs/finetune_parameters.md index 5d24f42e6..f9432b11b 100644 --- a/docs/finetune_parameters.md +++ b/docs/finetune_parameters.md @@ -15,6 +15,8 @@ The following are the parameters supported in the finetuning workflow. |lora_config|task_type: CAUSAL_LM
r: 8<br>lora_alpha: 32<br>lora_dropout: 0.1|Will be passed to the LoraConfig `__init__()` method, then it'll be used as config to build Peft model object.| |deltatuner_config|"algo": "lora"<br>"denas": True
"best_model_structure": "/path/to/best_structure_of_deltatuner_model"|Will be passed to the DeltaTunerArguments `__init__()` method, then it'll be used as config to build [Deltatuner model](https://github.com/intel/e2eAIOK/tree/main/e2eAIOK/deltatuner) object.| |enable_gradient_checkpointing|False|enable gradient checkpointing to save GPU memory, but will cost more compute runtime| +|chat_template|"{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '### Instruction: ' + message['content'] + eos_token }}{{ '### Response:' + message['content'] + eos_token }}{% endif %}{% endfor %}{{'### End \n'}}"|LLM-on-Ray default chat default.| +|custom_chat_template|None|User-defined chat template.| ## Dataset Parameters diff --git a/llm_on_ray/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py index 7bf0d2804..12f172bd7 100644 --- a/llm_on_ray/common/dataprocesser/general_processer.py +++ b/llm_on_ray/common/dataprocesser/general_processer.py @@ -100,7 +100,6 @@ def torch_call(self, examples): class GeneralProcesser(DataProcesser): def tokenize_function(self, examples, tokenizer): - print(examples) if self.config.get("gpt_base_model"): instruction = examples["instruction"] response = examples["response"] @@ -117,7 +116,7 @@ def tokenize_function(self, examples, tokenizer): new_message = PROMPT_NO_INPUT_FORMAT.format( instruction=instruction, response=response ) - return tokenizer(new_message, max_length=self.config.get("max_length")) + return tokenizer(new_message, add_special_tokens=False, max_length=self.config.get("max_length")) else: new_messages = [ { @@ -129,32 +128,26 @@ def tokenize_function(self, examples, tokenizer): }, {"role": "assistant", "content": examples["response"] + "\n\n"}, ] - print(new_messages) if self.config.get("custom_chat_template") is not None: - print("custom_chat_template") tokenizer.chat_template = self.config.get("custom_chat_template") new_tokenizer = tokenizer.apply_chat_template( new_messages, tokenize=False, - max_length=self.config.get("max_length"), ) elif tokenizer.chat_template is not None: - print("tokenizer.chat_template") new_tokenizer = tokenizer.apply_chat_template( new_messages, tokenize=False, - max_length=self.config.get("max_length"), ) else: - print("chat_template") tokenizer.chat_template = self.config.get("chat_template") new_tokenizer = tokenizer.apply_chat_template( new_messages, tokenize=False, - max_length=self.config.get("max_length"), ) - tokenizer = tokenizer(new_tokenizer, max_length=self.config.get("max_length")) - print(tokenizer) + tokenizer = tokenizer(new_tokenizer, + add_special_tokens=False, + max_length=self.config.get("max_length")) return tokenizer def prepare(self, tokenizer, dataset): diff --git a/tests/finetune/test_chat_template.py b/tests/finetune/test_chat_template.py new file mode 100644 index 000000000..7cdda115c --- /dev/null +++ b/tests/finetune/test_chat_template.py @@ -0,0 +1,139 @@ +import unittest + +import transformers +from transformers import AutoTokenizer +from llm_on_ray.common.dataprocesser.general_processer import GeneralProcesser + + +class TestTokenizeFunction(unittest.TestCase): + def setUp(self): + self.tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-hf') + 
self.config = { + 'gpt_base_model': True, + 'max_length': 512, + 'trust_remote_code': False, + 'chat_template': "Below is an instruction that describes a task. Write a response that appropriately " + "completes the request\n {% if messages[0]['role'] == 'system' %}{{ raise_exception(" + "'System role not supported') }}{% endif %}{% for message in messages %}{% if (message[" + "'role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles " + "must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] " + "== 'user' %}{{ '### Instruction: ' + message['content'] }}{% elif message['role'] == " + "'assistant' %}{{ '### Response: ' + message['content'] }}{% endif %}{% endfor %}{{'### " + "End \n'}}", + } + self.processer = GeneralProcesser(self.config) + + def test_tokenize_function_with_gpt_model(self): + self.tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-j-6b') + + examples = \ + { + "instruction": "Test instruction", + "response": "Test response", + "context": "Test context", + } + + # Verify the format of the result + expected_result = 'Below is an instruction that describes a task. Write a response that '\ + 'appropriately completes the request.\n'\ + '\n'\ + '### Instruction:\n'\ + 'Test instruction\n'\ + '\n'\ + 'Input:\n'\ + 'Test context\n'\ + '\n'\ + '### Response:\n'\ + 'Test response\n'\ + '\n'\ + '### End' + + result = self.processer.tokenize_function(examples, self.tokenizer) + self.assertEqual(self.tokenizer.decode(result['input_ids']), expected_result) + + def test_tokenize_function_with_custom_chat_template(self): + examples = \ + { + "instruction": "Test instruction", + "response": "Test response", + "context": "Test context", + } + + # Verify the format of the result + expected_result = '<|im_start|>user\n' \ + '###Instruction:\n' \ + 'Test instruction\n' \ + '\n' \ + '###context:\n' \ + 'Test context\n' \ + '\n' \ + '<|im_end|><|im_start|>assistant\n' \ + 'Test response\n' \ + '\n' \ + '<|im_end|>' + # Set custom chat template + self.config['custom_chat_template'] = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n'"\ + "+ message['content'] + '<|im_end|>'}}{% endfor %}" + + self.config['gpt_base_model'] = False + result = self.processer.tokenize_function(examples, self.tokenizer) + self.assertEqual(self.tokenizer.decode(result['input_ids']), expected_result) + + def test_tokenize_function_with_chat_template(self): + examples = \ + { + "instruction": "Test instruction", + "response": "Test response", + "context": "Test context", + } + + # Verify the format of the result + expected_result = 'Below is an instruction that describes a task. 
Write a response that '\ + 'appropriately completes the request\n'\ + '### Instruction: ###Instruction:\n'\ + 'Test instruction\n'\ + '\n'\ + '###context:\n'\ + 'Test context\n'\ + '\n'\ + '### Response: Test response\n'\ + '\n'\ + '### End \n'\ + + self.config['gpt_base_model'] = False + result = self.processer.tokenize_function(examples, self.tokenizer) + self.assertEqual(self.tokenizer.decode(result['input_ids']), expected_result) + + def test_tokenize_function_with_default_chat_template(self): + self.tokenizer = AutoTokenizer.from_pretrained('google/gemma-2b-it') + examples = \ + { + "instruction": "Test instruction", + "response": "Test response", + "context": "Test context", + } + + chat_example = [ + { + "role": "user", + "content": "###Instruction:\nTest instruction\n\n###context:\nTest context\n\n", + + }, + { + "role": "assistant", + "content": "Test response\n\n", + } + ] + + # Verify the format of the result + expected_result = self.tokenizer.apply_chat_template(chat_example, + tokenize=False, + max_length=self.config.get("max_length")) + + self.config['gpt_base_model'] = False + result = self.processer.tokenize_function(examples, self.tokenizer) + self.assertEqual(self.tokenizer.decode(result['input_ids']), expected_result) + + +if __name__ == '__main__': + unittest.main() From c026adfcb10382a998081b104b8db9eeaef9afb9 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Wed, 10 Apr 2024 08:44:21 +0000 Subject: [PATCH 31/47] update Signed-off-by: minmingzhu --- docs/finetune_parameters.md | 3 +-- llm_on_ray/common/dataprocesser/general_processer.py | 6 +++--- llm_on_ray/finetune/finetune.py | 6 +++--- llm_on_ray/finetune/finetune_config.py | 4 ++-- 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/docs/finetune_parameters.md b/docs/finetune_parameters.md index f9432b11b..ee3615d5e 100644 --- a/docs/finetune_parameters.md +++ b/docs/finetune_parameters.md @@ -15,8 +15,7 @@ The following are the parameters supported in the finetuning workflow. |lora_config|task_type: CAUSAL_LM
r: 8<br>lora_alpha: 32<br>lora_dropout: 0.1|Will be passed to the LoraConfig `__init__()` method, then it'll be used as config to build Peft model object.| |deltatuner_config|"algo": "lora"<br>"denas": True
"best_model_structure": "/path/to/best_structure_of_deltatuner_model"|Will be passed to the DeltaTunerArguments `__init__()` method, then it'll be used as config to build [Deltatuner model](https://github.com/intel/e2eAIOK/tree/main/e2eAIOK/deltatuner) object.| |enable_gradient_checkpointing|False|enable gradient checkpointing to save GPU memory, but will cost more compute runtime| -|chat_template|"{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '### Instruction: ' + message['content'] + eos_token }}{{ '### Response:' + message['content'] + eos_token }}{% endif %}{% endfor %}{{'### End \n'}}"|LLM-on-Ray default chat default.| -|custom_chat_template|None|User-defined chat template.| +|chat_template|None|User-defined chat template.| ## Dataset Parameters diff --git a/llm_on_ray/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py index 12f172bd7..92c94338c 100644 --- a/llm_on_ray/common/dataprocesser/general_processer.py +++ b/llm_on_ray/common/dataprocesser/general_processer.py @@ -128,8 +128,8 @@ def tokenize_function(self, examples, tokenizer): }, {"role": "assistant", "content": examples["response"] + "\n\n"}, ] - if self.config.get("custom_chat_template") is not None: - tokenizer.chat_template = self.config.get("custom_chat_template") + if self.config.get("chat_template") is not None: + tokenizer.chat_template = self.config.get("chat_template") new_tokenizer = tokenizer.apply_chat_template( new_messages, tokenize=False, @@ -140,7 +140,7 @@ def tokenize_function(self, examples, tokenizer): tokenize=False, ) else: - tokenizer.chat_template = self.config.get("chat_template") + tokenizer.chat_template = self.config.get("default_chat_template") new_tokenizer = tokenizer.apply_chat_template( new_messages, tokenize=False, diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index 02a6e189c..9f25d9583 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -14,7 +14,7 @@ # limitations under the License. 
# -#!/usr/bin/env python +# !/usr/bin/env python import os import argparse @@ -249,8 +249,8 @@ def train_func(config: Dict[str, Any]): "block_size": config["Dataset"].get("block_size", 512), "shuffle": config["Dataset"].get("shuffle", False), "gpt_base_model": config["General"].get("gpt_base_model", False), - "custom_chat_template": config["General"]["custom_chat_template"], "chat_template": config["General"]["chat_template"], + "default_chat_template": config["General"]["default_chat_template"], }, "lr_scheduler": { "enable": True, @@ -358,7 +358,7 @@ def main(external_config=None): if "xpu" in ipex.__version__: num_cpus = ( - resources_per_worker["CPU"] * num_training_workers + 1 + resources_per_worker["CPU"] * num_training_workers + 1 ) # additional 1 for head worker ray.init(num_cpus=num_cpus, runtime_env=runtime_env) else: diff --git a/llm_on_ray/finetune/finetune_config.py b/llm_on_ray/finetune/finetune_config.py index 4817f8c30..98597901d 100644 --- a/llm_on_ray/finetune/finetune_config.py +++ b/llm_on_ray/finetune/finetune_config.py @@ -59,8 +59,8 @@ class General(BaseModel): lora_config: Optional[LoraConfig] = None deltatuner_config: Optional[DeltatunerConfig] = None enable_gradient_checkpointing: bool = False - custom_chat_template: Optional[str] = None - chat_template: Optional[str] = ( + chat_template: Optional[str] = None + default_chat_template: str = ( "{{ bos_token }}" "{% if messages[0]['role'] == 'system' %}" "{{ raise_exception('System role not supported') }}" From d51880bc24b89a2b8f86cd604422871fe76166ff Mon Sep 17 00:00:00 2001 From: Xiaochang Wu Date: Tue, 9 Apr 2024 09:19:24 +0800 Subject: [PATCH 32/47] Support latest Ray 2.10 release (#158) * update * fix blocking * update Signed-off-by: Wu, Xiaochang * update Signed-off-by: Wu, Xiaochang * fix setup and getting started Signed-off-by: Wu, Xiaochang * update Signed-off-by: Wu, Xiaochang * update Signed-off-by: Wu, Xiaochang * nit Signed-off-by: Wu, Xiaochang * Add dependencies for tests and update pyproject.toml Signed-off-by: Wu, Xiaochang * Update dependencies and test workflow Signed-off-by: Wu, Xiaochang * Update dependencies and fix torch_dist.py Signed-off-by: Wu, Xiaochang * Update OpenAI SDK installation and start ray cluster Signed-off-by: Wu, Xiaochang --------- Signed-off-by: Wu, Xiaochang --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a18574675..451d2649d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,9 +34,9 @@ dependencies = [ "deltatuner==1.1.9", "py-cpuinfo", "pydantic-yaml", - "async_timeout", - "typer", - "jinja2>=3.0.0" + "async-timeout", + "jinja2>=3.0.0", + "typer" ] [project.optional-dependencies] From 63d2ef82e2080f373bf3c1fb16f1a078881d6d69 Mon Sep 17 00:00:00 2001 From: yutianchen Date: Tue, 9 Apr 2024 15:38:35 +0800 Subject: [PATCH 33/47] [Tests] Add query single test (#156) * single test * single test * single test * single test * fix hang error --- tests/inference/test_query_single.py | 107 +++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 tests/inference/test_query_single.py diff --git a/tests/inference/test_query_single.py b/tests/inference/test_query_single.py new file mode 100644 index 000000000..1c32f6b73 --- /dev/null +++ b/tests/inference/test_query_single.py @@ -0,0 +1,107 @@ +import subprocess +import pytest +import os + +os.environ["no_proxy"] = "localhost,127.0.0.1" + + +def start_serve(model_name): + current_path = os.path.dirname(os.path.abspath(__file__)) 
+ + config_path = os.path.join( + current_path, "../../.github/workflows/config/" + model_name + "-ci.yaml" + ) + + cmd_serve = ["llm_on_ray-serve", "--config_file", config_path, "--simple"] + + result_serve = subprocess.run(cmd_serve, capture_output=True, text=True) + + # Ensure there are no errors in the serve script execution + assert result_serve.returncode == 0, print( + "\n" + "Serve error stderr message: " + "\n", result_serve.stderr + ) + + # Print the output of subprocess.run for checking if output is expected + print("\n" + "Serve message: " + "\n", result_serve.stdout) + + # Ensure there are no errors in the serve script execution + assert "Error" not in result_serve.stderr + + +def script_with_args( + base_url, model_name, streaming_response, max_new_tokens, temperature, top_p, top_k +): + current_path = os.path.dirname(os.path.abspath(__file__)) + + os.path.join(current_path, "../../.github/workflows/config/" + model_name + "-ci.yaml") + + example_query_single_path = os.path.join( + current_path, "../../examples/inference/api_server_simple/query_single.py" + ) + + cmd_single = [ + "python", + example_query_single_path, + "--model_endpoint", + base_url + model_name, + ] + + if streaming_response: + cmd_single.append("--streaming_response") + + if max_new_tokens is not None: + cmd_single.extend(["--max_new_tokens", str(max_new_tokens)]) + + if temperature is not None: + cmd_single.extend(["--temperature", str(temperature)]) + + if top_p is not None: + cmd_single.extend(["--top_p", str(top_p)]) + + if top_k is not None: + cmd_single.extend(["--top_k", str(top_k)]) + + result_query_single = subprocess.run(cmd_single, capture_output=True, text=True) + + # Print the output of subprocess.run for checking if output is expected + print(result_query_single) + + # Ensure there are no errors in the OpenAI API query script execution + assert "Error" not in result_query_single.stderr + + # Returncode should be 0 when there is no exception + assert result_query_single.returncode == 0 + + +executed_models = {} + + +# Parametrize the test function with different combinations of parameters +# TODO: more models and combinations will be added and tested. 
+@pytest.mark.parametrize( + "base_url,model_name,streaming_response,max_new_tokens,temperature,top_p, top_k", + [ + (base_url, model_name, streaming_response, max_new_tokens, temperature, top_p, top_k) + for base_url in ["http://localhost:8000/"] + for model_name in ["gpt2"] + for streaming_response in [None] + for max_new_tokens in [None] + for temperature in [None] + for top_p in [None] + for top_k in [None] + ], +) +def test_script( + base_url, model_name, streaming_response, max_new_tokens, temperature, top_p, top_k +): + global executed_models + + # Check if this modelname has already executed start_serve + if model_name not in executed_models: + start_serve(model_name) + # Mark this modelname has already executed start_serve + executed_models[model_name] = True + + script_with_args( + base_url, model_name, streaming_response, max_new_tokens, temperature, top_p, top_k + ) From 05d63ef49a3c4570060245a3f104acc3a86eee16 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Wed, 10 Apr 2024 08:50:32 +0000 Subject: [PATCH 34/47] format Signed-off-by: minmingzhu --- .../common/dataprocesser/general_processer.py | 20 +- llm_on_ray/finetune/finetune.py | 2 +- tests/finetune/test_chat_template.py | 180 +++++++++--------- 3 files changed, 104 insertions(+), 98 deletions(-) diff --git a/llm_on_ray/common/dataprocesser/general_processer.py b/llm_on_ray/common/dataprocesser/general_processer.py index 92c94338c..b2727e97b 100644 --- a/llm_on_ray/common/dataprocesser/general_processer.py +++ b/llm_on_ray/common/dataprocesser/general_processer.py @@ -116,15 +116,19 @@ def tokenize_function(self, examples, tokenizer): new_message = PROMPT_NO_INPUT_FORMAT.format( instruction=instruction, response=response ) - return tokenizer(new_message, add_special_tokens=False, max_length=self.config.get("max_length")) + return tokenizer( + new_message, add_special_tokens=False, max_length=self.config.get("max_length") + ) else: new_messages = [ { "role": "user", "content": "###Instruction:\n" - + examples["instruction"] + "\n\n" - + "###context:\n" - + examples["context"] + "\n\n", + + examples["instruction"] + + "\n\n" + + "###context:\n" + + examples["context"] + + "\n\n", }, {"role": "assistant", "content": examples["response"] + "\n\n"}, ] @@ -145,9 +149,9 @@ def tokenize_function(self, examples, tokenizer): new_messages, tokenize=False, ) - tokenizer = tokenizer(new_tokenizer, - add_special_tokens=False, - max_length=self.config.get("max_length")) + tokenizer = tokenizer( + new_tokenizer, add_special_tokens=False, max_length=self.config.get("max_length") + ) return tokenizer def prepare(self, tokenizer, dataset): @@ -184,7 +188,7 @@ def group_texts(examples): total_length = (total_length // block_size) * block_size # Split by chunks of max_len. 
result = { - k: [t[i: i + block_size] for i in range(0, total_length, block_size)] + k: [t[i : i + block_size] for i in range(0, total_length, block_size)] for k, t in concatenated_examples.items() } result["labels"] = result["input_ids"].copy() diff --git a/llm_on_ray/finetune/finetune.py b/llm_on_ray/finetune/finetune.py index 9f25d9583..14422967b 100644 --- a/llm_on_ray/finetune/finetune.py +++ b/llm_on_ray/finetune/finetune.py @@ -358,7 +358,7 @@ def main(external_config=None): if "xpu" in ipex.__version__: num_cpus = ( - resources_per_worker["CPU"] * num_training_workers + 1 + resources_per_worker["CPU"] * num_training_workers + 1 ) # additional 1 for head worker ray.init(num_cpus=num_cpus, runtime_env=runtime_env) else: diff --git a/tests/finetune/test_chat_template.py b/tests/finetune/test_chat_template.py index 7cdda115c..2270a5781 100644 --- a/tests/finetune/test_chat_template.py +++ b/tests/finetune/test_chat_template.py @@ -7,133 +7,135 @@ class TestTokenizeFunction(unittest.TestCase): def setUp(self): - self.tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-hf') + self.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") self.config = { - 'gpt_base_model': True, - 'max_length': 512, - 'trust_remote_code': False, - 'chat_template': "Below is an instruction that describes a task. Write a response that appropriately " - "completes the request\n {% if messages[0]['role'] == 'system' %}{{ raise_exception(" - "'System role not supported') }}{% endif %}{% for message in messages %}{% if (message[" - "'role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles " - "must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] " - "== 'user' %}{{ '### Instruction: ' + message['content'] }}{% elif message['role'] == " - "'assistant' %}{{ '### Response: ' + message['content'] }}{% endif %}{% endfor %}{{'### " - "End \n'}}", + "gpt_base_model": True, + "max_length": 512, + "trust_remote_code": False, + "chat_template": "Below is an instruction that describes a task. Write a response that appropriately " + "completes the request\n {% if messages[0]['role'] == 'system' %}{{ raise_exception(" + "'System role not supported') }}{% endif %}{% for message in messages %}{% if (message[" + "'role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles " + "must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] " + "== 'user' %}{{ '### Instruction: ' + message['content'] }}{% elif message['role'] == " + "'assistant' %}{{ '### Response: ' + message['content'] }}{% endif %}{% endfor %}{{'### " + "End \n'}}", } self.processer = GeneralProcesser(self.config) def test_tokenize_function_with_gpt_model(self): - self.tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-j-6b') + self.tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6b") - examples = \ - { - "instruction": "Test instruction", - "response": "Test response", - "context": "Test context", - } + examples = { + "instruction": "Test instruction", + "response": "Test response", + "context": "Test context", + } # Verify the format of the result - expected_result = 'Below is an instruction that describes a task. 
Write a response that '\ - 'appropriately completes the request.\n'\ - '\n'\ - '### Instruction:\n'\ - 'Test instruction\n'\ - '\n'\ - 'Input:\n'\ - 'Test context\n'\ - '\n'\ - '### Response:\n'\ - 'Test response\n'\ - '\n'\ - '### End' + expected_result = ( + "Below is an instruction that describes a task. Write a response that " + "appropriately completes the request.\n" + "\n" + "### Instruction:\n" + "Test instruction\n" + "\n" + "Input:\n" + "Test context\n" + "\n" + "### Response:\n" + "Test response\n" + "\n" + "### End" + ) result = self.processer.tokenize_function(examples, self.tokenizer) - self.assertEqual(self.tokenizer.decode(result['input_ids']), expected_result) + self.assertEqual(self.tokenizer.decode(result["input_ids"]), expected_result) def test_tokenize_function_with_custom_chat_template(self): - examples = \ - { - "instruction": "Test instruction", - "response": "Test response", - "context": "Test context", - } + examples = { + "instruction": "Test instruction", + "response": "Test response", + "context": "Test context", + } # Verify the format of the result - expected_result = '<|im_start|>user\n' \ - '###Instruction:\n' \ - 'Test instruction\n' \ - '\n' \ - '###context:\n' \ - 'Test context\n' \ - '\n' \ - '<|im_end|><|im_start|>assistant\n' \ - 'Test response\n' \ - '\n' \ - '<|im_end|>' + expected_result = ( + "<|im_start|>user\n" + "###Instruction:\n" + "Test instruction\n" + "\n" + "###context:\n" + "Test context\n" + "\n" + "<|im_end|><|im_start|>assistant\n" + "Test response\n" + "\n" + "<|im_end|>" + ) # Set custom chat template - self.config['custom_chat_template'] = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n'"\ - "+ message['content'] + '<|im_end|>'}}{% endfor %}" + self.config["custom_chat_template"] = ( + "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n'" + "+ message['content'] + '<|im_end|>'}}{% endfor %}" + ) - self.config['gpt_base_model'] = False + self.config["gpt_base_model"] = False result = self.processer.tokenize_function(examples, self.tokenizer) - self.assertEqual(self.tokenizer.decode(result['input_ids']), expected_result) + self.assertEqual(self.tokenizer.decode(result["input_ids"]), expected_result) def test_tokenize_function_with_chat_template(self): - examples = \ - { - "instruction": "Test instruction", - "response": "Test response", - "context": "Test context", - } + examples = { + "instruction": "Test instruction", + "response": "Test response", + "context": "Test context", + } # Verify the format of the result - expected_result = 'Below is an instruction that describes a task. Write a response that '\ - 'appropriately completes the request\n'\ - '### Instruction: ###Instruction:\n'\ - 'Test instruction\n'\ - '\n'\ - '###context:\n'\ - 'Test context\n'\ - '\n'\ - '### Response: Test response\n'\ - '\n'\ - '### End \n'\ - - self.config['gpt_base_model'] = False + expected_result = ( + "Below is an instruction that describes a task. 
Write a response that " + "appropriately completes the request\n" + "### Instruction: ###Instruction:\n" + "Test instruction\n" + "\n" + "###context:\n" + "Test context\n" + "\n" + "### Response: Test response\n" + "\n" + "### End \n" + ) + self.config["gpt_base_model"] = False result = self.processer.tokenize_function(examples, self.tokenizer) - self.assertEqual(self.tokenizer.decode(result['input_ids']), expected_result) + self.assertEqual(self.tokenizer.decode(result["input_ids"]), expected_result) def test_tokenize_function_with_default_chat_template(self): - self.tokenizer = AutoTokenizer.from_pretrained('google/gemma-2b-it') - examples = \ - { - "instruction": "Test instruction", - "response": "Test response", - "context": "Test context", - } + self.tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it") + examples = { + "instruction": "Test instruction", + "response": "Test response", + "context": "Test context", + } chat_example = [ { "role": "user", "content": "###Instruction:\nTest instruction\n\n###context:\nTest context\n\n", - }, { "role": "assistant", "content": "Test response\n\n", - } + }, ] # Verify the format of the result - expected_result = self.tokenizer.apply_chat_template(chat_example, - tokenize=False, - max_length=self.config.get("max_length")) + expected_result = self.tokenizer.apply_chat_template( + chat_example, tokenize=False, max_length=self.config.get("max_length") + ) - self.config['gpt_base_model'] = False + self.config["gpt_base_model"] = False result = self.processer.tokenize_function(examples, self.tokenizer) - self.assertEqual(self.tokenizer.decode(result['input_ids']), expected_result) + self.assertEqual(self.tokenizer.decode(result["input_ids"]), expected_result) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() From 3f0b7bcbdfcf5f9f25b80d1ee0680ce29aa59331 Mon Sep 17 00:00:00 2001 From: minmingzhu <45281494+minmingzhu@users.noreply.github.com> Date: Wed, 10 Apr 2024 11:22:03 +0000 Subject: [PATCH 35/47] [Finetune] use base model mpt-7b instead of mpt-7b-chat (#181) * use base model mpt-7b instead of mpt-7b-chat Signed-off-by: minmingzhu * manual setting specify tokenizer Signed-off-by: minmingzhu * update Signed-off-by: minmingzhu * update doc/finetune_parameters.md Signed-off-by: minmingzhu --------- Signed-off-by: minmingzhu --- llm_on_ray/finetune/models/mpt-7b.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/llm_on_ray/finetune/models/mpt-7b.yaml b/llm_on_ray/finetune/models/mpt-7b.yaml index eb2f8f119..5bceeee4d 100644 --- a/llm_on_ray/finetune/models/mpt-7b.yaml +++ b/llm_on_ray/finetune/models/mpt-7b.yaml @@ -1,7 +1,6 @@ General: base_model: mosaicml/mpt-7b tokenizer_name: EleutherAI/gpt-neox-20b - is_base_model: false gpt_base_model: false output_dir: /tmp/llm-ray/output checkpoint_dir: /tmp/llm-ray/checkpoint From 42ecf6375dabbbd6de736db0b77cd22e158acf54 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 22 Apr 2024 06:27:14 +0000 Subject: [PATCH 36/47] fix license issues Signed-off-by: minmingzhu --- tests/finetune/test_chat_template.py | 15 +++++++++++++++ tests/inference/test_query_single.py | 16 ++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/tests/finetune/test_chat_template.py b/tests/finetune/test_chat_template.py index 2270a5781..a416d8f7b 100644 --- a/tests/finetune/test_chat_template.py +++ b/tests/finetune/test_chat_template.py @@ -1,3 +1,18 @@ +# +# Copyright 2023 The LLM-on-Ray Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# import unittest import transformers diff --git a/tests/inference/test_query_single.py b/tests/inference/test_query_single.py index 1c32f6b73..d48727a30 100644 --- a/tests/inference/test_query_single.py +++ b/tests/inference/test_query_single.py @@ -1,3 +1,19 @@ +# +# Copyright 2023 The LLM-on-Ray Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + import subprocess import pytest import os From 85520e90df55c34e239234c30a0a38f49005dd28 Mon Sep 17 00:00:00 2001 From: minmingzhu <45281494+minmingzhu@users.noreply.github.com> Date: Mon, 22 Apr 2024 14:18:31 +0800 Subject: [PATCH 37/47] Update finetune.yaml --- llm_on_ray/finetune/finetune.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/llm_on_ray/finetune/finetune.yaml b/llm_on_ray/finetune/finetune.yaml index 1dee1ebf8..1f1cc46ca 100644 --- a/llm_on_ray/finetune/finetune.yaml +++ b/llm_on_ray/finetune/finetune.yaml @@ -12,7 +12,6 @@ General: lora_alpha: 32 lora_dropout: 0.1 enable_gradient_checkpointing: false - custom_chat_template: null Dataset: train_file: examples/data/sample_finetune_data_small.jsonl group: true From 968e61627edbad2069b88c6f82d0e58427016c36 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Fri, 19 Apr 2024 14:36:48 +0000 Subject: [PATCH 38/47] integrate inference chat template Signed-off-by: minmingzhu --- .../inference/models/CodeLlama-7b-hf.yaml | 2 +- llm_on_ray/inference/models/bloom-560m.yaml | 4 +- llm_on_ray/inference/models/deplot.yaml | 2 +- llm_on_ray/inference/models/falcon-7b.yaml | 2 +- llm_on_ray/inference/models/fuyu8b.yaml | 2 +- llm_on_ray/inference/models/gemma-2b.yaml | 2 +- llm_on_ray/inference/models/gpt-j-6b.yaml | 2 +- llm_on_ray/inference/models/gpt2.yaml | 3 +- .../inference/models/llama-2-7b-chat-hf.yaml | 2 +- .../models/mistral-7b-Instruct-v0.2.yaml | 4 +- .../inference/models/mistral-7b-v0.1.yaml | 4 +- llm_on_ray/inference/models/mpt-7b.yaml | 2 +- .../inference/models/neural-chat-7b-v3-1.yaml | 2 +- llm_on_ray/inference/models/opt-125m.yaml | 2 +- .../inference/models/sqlcoder-7b-2.yaml | 2 +- llm_on_ray/inference/models/starcoder.yaml | 2 +- llm_on_ray/inference/predictor_deployment.py | 83 ++++++++++++++++--- 17 files changed, 93 insertions(+), 29 deletions(-) diff --git a/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml b/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml index 8a2ef79fd..5cad7e6aa 100644 --- a/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml +++ b/llm_on_ray/inference/models/CodeLlama-7b-hf.yaml @@ -6,7 +6,7 @@ 
cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: "cpu" ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/models/bloom-560m.yaml b/llm_on_ray/inference/models/bloom-560m.yaml index be5e9414e..92dbb8809 100644 --- a/llm_on_ray/inference/models/bloom-560m.yaml +++ b/llm_on_ray/inference/models/bloom-560m.yaml @@ -6,9 +6,9 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: CPU ipex: - enabled: true + enabled: false precision: bf16 model_description: model_id_or_path: bigscience/bloom-560m diff --git a/llm_on_ray/inference/models/deplot.yaml b/llm_on_ray/inference/models/deplot.yaml index bb4ea7ec5..acfbe3e87 100644 --- a/llm_on_ray/inference/models/deplot.yaml +++ b/llm_on_ray/inference/models/deplot.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: "cpu" ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/models/falcon-7b.yaml b/llm_on_ray/inference/models/falcon-7b.yaml index 88a088350..fbbbdda08 100644 --- a/llm_on_ray/inference/models/falcon-7b.yaml +++ b/llm_on_ray/inference/models/falcon-7b.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: "cpu" ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/models/fuyu8b.yaml b/llm_on_ray/inference/models/fuyu8b.yaml index 7c4977adc..3f5fa7ab7 100644 --- a/llm_on_ray/inference/models/fuyu8b.yaml +++ b/llm_on_ray/inference/models/fuyu8b.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: "cpu" ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/models/gemma-2b.yaml b/llm_on_ray/inference/models/gemma-2b.yaml index 09e971081..b6d16b18c 100644 --- a/llm_on_ray/inference/models/gemma-2b.yaml +++ b/llm_on_ray/inference/models/gemma-2b.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 2 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: CPU ipex: enabled: true precision: bf16 diff --git a/llm_on_ray/inference/models/gpt-j-6b.yaml b/llm_on_ray/inference/models/gpt-j-6b.yaml index 9719b2f7e..3bdb9997f 100644 --- a/llm_on_ray/inference/models/gpt-j-6b.yaml +++ b/llm_on_ray/inference/models/gpt-j-6b.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: "cpu" ipex: # false here for ci coverage enabled: false diff --git a/llm_on_ray/inference/models/gpt2.yaml b/llm_on_ray/inference/models/gpt2.yaml index ca008cba2..cddc45cd8 100644 --- a/llm_on_ray/inference/models/gpt2.yaml +++ b/llm_on_ray/inference/models/gpt2.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: CPU ipex: enabled: true precision: bf16 @@ -15,3 +15,4 @@ model_description: tokenizer_name_or_path: gpt2 gpt_base_model: true chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ 
message['content'].strip() }}{% endif %}{% endfor %}" + diff --git a/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml b/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml index 7fdae3933..4f2ed0194 100644 --- a/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml +++ b/llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: "cpu" ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml b/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml index ea50f6af7..ab901eb95 100644 --- a/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml +++ b/llm_on_ray/inference/models/mistral-7b-Instruct-v0.2.yaml @@ -5,13 +5,13 @@ cpus_per_worker: 48 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: CPU ipex: enabled: false precision: bf16 model_description: model_id_or_path: mistralai/Mistral-7B-Instruct-v0.2 - ipexllm: false + bigdl: false tokenizer_name_or_path: mistralai/Mistral-7B-Instruct-v0.2 config: trust_remote_code: true diff --git a/llm_on_ray/inference/models/mistral-7b-v0.1.yaml b/llm_on_ray/inference/models/mistral-7b-v0.1.yaml index db2eec1e4..7b5427669 100644 --- a/llm_on_ray/inference/models/mistral-7b-v0.1.yaml +++ b/llm_on_ray/inference/models/mistral-7b-v0.1.yaml @@ -6,13 +6,13 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: CPU ipex: enabled: false precision: bf16 model_description: model_id_or_path: mistralai/Mistral-7B-v0.1 - ipexllm: false + bigdl: false tokenizer_name_or_path: mistralai/Mistral-7B-v0.1 config: trust_remote_code: true diff --git a/llm_on_ray/inference/models/mpt-7b.yaml b/llm_on_ray/inference/models/mpt-7b.yaml index 89ce086ed..80f062a82 100644 --- a/llm_on_ray/inference/models/mpt-7b.yaml +++ b/llm_on_ray/inference/models/mpt-7b.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: CPU ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml b/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml index be7b8d611..2d6ac4d29 100644 --- a/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml +++ b/llm_on_ray/inference/models/neural-chat-7b-v3-1.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: CPU ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/models/opt-125m.yaml b/llm_on_ray/inference/models/opt-125m.yaml index 92fd30260..81aa7093e 100644 --- a/llm_on_ray/inference/models/opt-125m.yaml +++ b/llm_on_ray/inference/models/opt-125m.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: CPU ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/models/sqlcoder-7b-2.yaml b/llm_on_ray/inference/models/sqlcoder-7b-2.yaml index 0a1b43766..e4e629599 100644 --- a/llm_on_ray/inference/models/sqlcoder-7b-2.yaml +++ b/llm_on_ray/inference/models/sqlcoder-7b-2.yaml @@ -5,7 +5,7 @@ cpus_per_worker: 22 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: cpu +device: "cpu" ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/models/starcoder.yaml b/llm_on_ray/inference/models/starcoder.yaml index 660b10ba8..a57ae351d 100644 --- a/llm_on_ray/inference/models/starcoder.yaml +++ b/llm_on_ray/inference/models/starcoder.yaml @@ 
-9,7 +9,7 @@ workers_per_group: 2 ipex: enabled: false precision: bf16 -device: cpu +device: "cpu" model_description: model_id_or_path: bigcode/starcoder tokenizer_name_or_path: bigcode/starcoder diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index 2e642bfff..3cc22c870 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -288,12 +288,12 @@ async def handle_static_batch(self, prompts: List[str], **config: Dict[str, Any] preprocessing_time=0, ) - def preprocess_prompts(self, input: Union[str, List], tools=None, tool_choice=None): + def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=None): """ Preprocesses the input prompts. Args: - input (Union[str, List[str]]): The input prompt(s) to be preprocessed. + input (Union[str, List[dict]]): The input prompt(s) to be preprocessed. tools (List[str]): The list of tools to be used. tool_choice: The choice of tool to be used. @@ -313,10 +313,7 @@ def preprocess_prompts(self, input: Union[str, List], tools=None, tool_choice=No if isinstance(input, str): return input - elif isinstance(input, List): - prompts = [] - images = [] - + elif isinstance(input, list): prompt_format = get_prompt_format(input) if prompt_format == PromptFormat.CHAT_FORMAT: # Process the input prompts with tools @@ -344,16 +341,32 @@ def preprocess_prompts(self, input: Union[str, List], tools=None, tool_choice=No prompt = self.process_tool.get_prompt(input) return prompt else: - prompts.extend(input) + if isinstance(input, list) and input and isinstance(input[0], dict): + prompt = self.predictor.tokenizer.apply_chat_template(input, tokenize=False) + elif isinstance(input, list) and input and isinstance(input[0], list): + prompt = [self.predictor.tokenizer.apply_chat_template(t, tokenize=False) for t in input] + elif isinstance(input, list) and input and isinstance(input[0], ChatMessage): + messages = [] + for chat_message in input: + message = {"role": chat_message.role, "content": chat_message.content} + messages.append(message) + prompt = self.predictor.tokenizer.apply_chat_template(messages, tokenize=False) + elif isinstance(input, list) and input and isinstance(input[0], str): + prompt = input + elif isinstance(input, str): + prompt = input + else: + raise TypeError(f"Unsupported type {type(input)} for text. 
Expected dict or list of dicts.") + logger.info(prompt) + return prompt elif prompt_format == PromptFormat.PROMPTS_FORMAT: - prompts.extend(input) - else: raise HTTPException(400, "Invalid prompt format.") - return prompts + return input else: raise HTTPException(400, "Invalid prompt format.") async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSONResponse, str]: + logger.info("PredictorDeployment call") self.use_openai = False try: @@ -372,6 +385,7 @@ async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSON content="Empty prompt is not supported.", ) config = json_request["config"] if "config" in json_request else {} + # return prompt or list of prompts preprocessed prompts = self.preprocess_prompts(input) @@ -401,3 +415,52 @@ async def openai_call( yield result else: yield await self.handle_non_streaming(prompts, config) + + + def _extract_messages(messages): + texts, images = [], [] + for message in messages: + if message['role'] == 'user' and isinstance(message['content'], list): + texts.append({"role": "user", "content": message['content'][0]['text']}) + images.append({"role": "user", "content": message['content'][1]['image_url']['url']}) + else: + texts.append(message) + return texts, images + + def _prepare_image(self, messages: Union[List[dict], List[List[dict]]]): + """Prepare image from history messages.""" + from PIL import Image + import requests + from io import BytesIO + import base64 + import re + + # prepare images + images = [] + if isinstance(messages[0], list): + for i in len(messages): + for msg in messages[i]: + msg = dict(msg) + role, content = msg["role"], msg["content"] + if "url" not in content: + continue + is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0 + if is_data: + encoded_str = re.sub("^data:image/.+;base64,", "", content["url"]) + images[i].append(Image.open(BytesIO(base64.b64decode(encoded_str)))) + else: + images[i].append(Image.open(requests.get(content["url"], stream=True).raw)) + else: + for msg in messages: + msg = dict(msg) + role, content = msg["role"], msg["content"] + if "url" not in content: + continue + is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0 + if is_data: + encoded_str = re.sub("^data:image/.+;base64,", "", content["url"]) + images.append(Image.open(BytesIO(base64.b64decode(encoded_str)))) + else: + images.append(Image.open(requests.get(content["url"], stream=True).raw)) + + return images \ No newline at end of file From 43c333fad0d89b7c8ae9eb67e3c77f3700795194 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 22 Apr 2024 01:00:41 +0000 Subject: [PATCH 39/47] update Signed-off-by: minmingzhu --- llm_on_ray/inference/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llm_on_ray/inference/utils.py b/llm_on_ray/inference/utils.py index 91e311088..5a8db0401 100644 --- a/llm_on_ray/inference/utils.py +++ b/llm_on_ray/inference/utils.py @@ -162,13 +162,13 @@ def is_cpu_without_ipex(infer_conf: InferenceConfig) -> bool: return (not infer_conf.ipex.enabled) and infer_conf.device == DEVICE_CPU -def get_prompt_format(input: Union[List[str], List[dict], List[ChatMessage]]): +def get_prompt_format(input: Union[List[str], List[dict], List[List[dict]], List[ChatMessage]]): chat_format = True prompts_format = True for item in input: - if isinstance(item, str) or isinstance(item, list): + if isinstance(item, str): chat_format = False - elif isinstance(item, dict) or isinstance(item, ChatMessage): + elif isinstance(item, 
dict) or isinstance(item, ChatMessage) or isinstance(item, list): prompts_format = False else: chat_format = False From b5b7f28de6be40810a58e8bd72a232f67ad87203 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 22 Apr 2024 02:31:09 +0000 Subject: [PATCH 40/47] update Signed-off-by: minmingzhu --- llm_on_ray/inference/predictor_deployment.py | 41 ++++++++++++-------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index 3cc22c870..4937ed9ce 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -288,6 +288,7 @@ async def handle_static_batch(self, prompts: List[str], **config: Dict[str, Any] preprocessing_time=0, ) + # TODO:Abstract the preprocess_prompts function into a class for handling chat templates def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=None): """ Preprocesses the input prompts. @@ -344,19 +345,26 @@ def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=No if isinstance(input, list) and input and isinstance(input[0], dict): prompt = self.predictor.tokenizer.apply_chat_template(input, tokenize=False) elif isinstance(input, list) and input and isinstance(input[0], list): - prompt = [self.predictor.tokenizer.apply_chat_template(t, tokenize=False) for t in input] + prompt = [ + self.predictor.tokenizer.apply_chat_template(t, tokenize=False) + for t in input + ] elif isinstance(input, list) and input and isinstance(input[0], ChatMessage): messages = [] for chat_message in input: message = {"role": chat_message.role, "content": chat_message.content} messages.append(message) - prompt = self.predictor.tokenizer.apply_chat_template(messages, tokenize=False) + prompt = self.predictor.tokenizer.apply_chat_template( + messages, tokenize=False + ) elif isinstance(input, list) and input and isinstance(input[0], str): prompt = input elif isinstance(input, str): prompt = input else: - raise TypeError(f"Unsupported type {type(input)} for text. Expected dict or list of dicts.") + raise TypeError( + f"Unsupported type {type(input)} for text. Expected dict or list of dicts." 
+ ) logger.info(prompt) return prompt elif prompt_format == PromptFormat.PROMPTS_FORMAT: @@ -416,18 +424,19 @@ async def openai_call( else: yield await self.handle_non_streaming(prompts, config) - - def _extract_messages(messages): + def _extract_messages(self, messages): texts, images = [], [] for message in messages: - if message['role'] == 'user' and isinstance(message['content'], list): - texts.append({"role": "user", "content": message['content'][0]['text']}) - images.append({"role": "user", "content": message['content'][1]['image_url']['url']}) + if message["role"] == "user" and isinstance(message["content"], list): + texts.append({"role": "user", "content": message["content"][0]["text"]}) + images.append( + {"role": "user", "content": message["content"][1]["image_url"]["url"]} + ) else: texts.append(message) return texts, images - def _prepare_image(self, messages: Union[List[dict], List[List[dict]]]): + def _prepare_image(self, messages: list): """Prepare image from history messages.""" from PIL import Image import requests @@ -436,12 +445,12 @@ def _prepare_image(self, messages: Union[List[dict], List[List[dict]]]): import re # prepare images - images = [] - if isinstance(messages[0], list): - for i in len(messages): + images: List = [] + if isinstance(messages[0], List): + for i in range(len(messages)): for msg in messages[i]: msg = dict(msg) - role, content = msg["role"], msg["content"] + content = msg["content"] if "url" not in content: continue is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0 @@ -450,10 +459,10 @@ def _prepare_image(self, messages: Union[List[dict], List[List[dict]]]): images[i].append(Image.open(BytesIO(base64.b64decode(encoded_str)))) else: images[i].append(Image.open(requests.get(content["url"], stream=True).raw)) - else: + elif isinstance(messages[0], dict): for msg in messages: msg = dict(msg) - role, content = msg["role"], msg["content"] + content = msg["content"] if "url" not in content: continue is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0 @@ -463,4 +472,4 @@ def _prepare_image(self, messages: Union[List[dict], List[List[dict]]]): else: images.append(Image.open(requests.get(content["url"], stream=True).raw)) - return images \ No newline at end of file + return images From 9500d96bf9177e7a7fbfb4850b6aa84887255607 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 22 Apr 2024 03:24:45 +0000 Subject: [PATCH 41/47] update Signed-off-by: minmingzhu --- llm_on_ray/inference/models/opt-125m.yaml | 2 +- llm_on_ray/inference/predictor_deployment.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/llm_on_ray/inference/models/opt-125m.yaml b/llm_on_ray/inference/models/opt-125m.yaml index 81aa7093e..92fd30260 100644 --- a/llm_on_ray/inference/models/opt-125m.yaml +++ b/llm_on_ray/inference/models/opt-125m.yaml @@ -6,7 +6,7 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: CPU +device: cpu ipex: enabled: false precision: bf16 diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index 4937ed9ce..aab110727 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -311,7 +311,6 @@ def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=No Raises: HTTPException: If the input prompt format is invalid or not supported. 
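        Example:
            A chat-format request such as [{"role": "user", "content": "What is Ray?"}]
            is rendered into a single prompt string via
            self.predictor.tokenizer.apply_chat_template(input, tokenize=False);
            the exact rendered text depends on the served model's chat template
            (illustrative sketch only, not an exhaustive description of all input types).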
""" - if isinstance(input, str): return input elif isinstance(input, list): @@ -365,7 +364,6 @@ def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=No raise TypeError( f"Unsupported type {type(input)} for text. Expected dict or list of dicts." ) - logger.info(prompt) return prompt elif prompt_format == PromptFormat.PROMPTS_FORMAT: raise HTTPException(400, "Invalid prompt format.") From 0c41b8b891cb7a54d43a4d36ff09c51c276260a8 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 22 Apr 2024 03:27:15 +0000 Subject: [PATCH 42/47] update Signed-off-by: minmingzhu --- llm_on_ray/inference/models/mistral-7b-v0.1.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llm_on_ray/inference/models/mistral-7b-v0.1.yaml b/llm_on_ray/inference/models/mistral-7b-v0.1.yaml index 7b5427669..db2eec1e4 100644 --- a/llm_on_ray/inference/models/mistral-7b-v0.1.yaml +++ b/llm_on_ray/inference/models/mistral-7b-v0.1.yaml @@ -6,13 +6,13 @@ cpus_per_worker: 24 gpus_per_worker: 0 deepspeed: false workers_per_group: 2 -device: CPU +device: cpu ipex: enabled: false precision: bf16 model_description: model_id_or_path: mistralai/Mistral-7B-v0.1 - bigdl: false + ipexllm: false tokenizer_name_or_path: mistralai/Mistral-7B-v0.1 config: trust_remote_code: true From 0ff3d0bd5d83c149286e4711f74fe732a46960cb Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Wed, 24 Apr 2024 08:32:11 +0000 Subject: [PATCH 43/47] Integrate Web UI Signed-off-by: minmingzhu --- llm_on_ray/inference/inference_config.py | 1 - llm_on_ray/ui/start_ui.py | 341 +++++++++++------------ 2 files changed, 161 insertions(+), 181 deletions(-) diff --git a/llm_on_ray/inference/inference_config.py b/llm_on_ray/inference/inference_config.py index e1ea2e2c3..b7b58598a 100644 --- a/llm_on_ray/inference/inference_config.py +++ b/llm_on_ray/inference/inference_config.py @@ -191,7 +191,6 @@ def _check_workers_per_group(cls, v: int): all_models: Dict[str, InferenceConfig] = {} -base_models: Dict[str, InferenceConfig] = {} _models: Dict[str, InferenceConfig] = {} _cur = os.path.dirname(os.path.abspath(__file__)) diff --git a/llm_on_ray/ui/start_ui.py b/llm_on_ray/ui/start_ui.py index 5cdece259..7b011cb9c 100644 --- a/llm_on_ray/ui/start_ui.py +++ b/llm_on_ray/ui/start_ui.py @@ -29,8 +29,10 @@ from ray.train.base_trainer import TrainingFailedError from ray.tune.logger import LoggerCallback from ray.util import queue -from llm_on_ray.inference.inference_config import all_models, ModelDescription, Prompt -from llm_on_ray.inference.inference_config import InferenceConfig as FinetunedConfig + +from llm_on_ray.finetune.finetune_config import base_models, FinetuneConfig +from llm_on_ray.inference.inference_config import ModelDescription, all_models +from llm_on_ray.inference.inference_config import InferenceConfig from llm_on_ray.inference.predictor_deployment import PredictorDeployment from llm_on_ray.ui.html_format import cpu_memory_html, ray_status_html, custom_css @@ -108,20 +110,20 @@ def get_result(self): class ChatBotUI: def __init__( - self, - all_models: Dict[str, FinetunedConfig], - base_models: Dict[str, FinetunedConfig], - finetune_model_path: str, - finetuned_checkpoint_path: str, - repo_code_path: str, - default_data_path: str, - default_rag_path: str, - config: dict, - head_node_ip: str, - node_port: str, - node_user_name: str, - conda_env_name: str, - master_ip_port: str, + self, + all_models: Dict[str, InferenceConfig], + base_models: Dict[str, FinetuneConfig], + finetune_model_path: str, + 
finetuned_checkpoint_path: str, + repo_code_path: str, + default_data_path: str, + default_rag_path: str, + config: dict, + head_node_ip: str, + node_port: str, + node_user_name: str, + conda_env_name: str, + master_ip_port: str, ): self._all_models = all_models self._base_models = base_models @@ -147,7 +149,6 @@ def __init__( "What is Ray?", "What is chatbot?", ] - self.process_tool = None self.finetune_actor = None self.finetune_status = False self.default_rag_path = default_rag_path @@ -214,8 +215,10 @@ def user(self, user_message, history): return "", history + [[user_message, None]] def model_generate(self, prompt, request_url, model_name, config, simple_api=True): + print("model_generate") + print("prompt: ", prompt) + if simple_api: - prompt = self.process_tool.get_prompt(prompt) sample_input = {"text": prompt, "config": config, "stream": True} else: sample_input = { @@ -227,42 +230,42 @@ def model_generate(self, prompt, request_url, model_name, config, simple_api=Tru "top_p": config["top_p"], "top_k": config["top_k"], } + proxies = {"http": None, "https": None} outputs = requests.post(request_url, proxies=proxies, json=sample_input, stream=True) + outputs.raise_for_status() for output in outputs.iter_content(chunk_size=None, decode_unicode=True): - # remove context - if simple_api: - if prompt in output: - output = output[len(prompt) :] - else: - if output is None or output == "": - continue + if not simple_api: import json import re - chunk_data = re.sub("^data: ", "", output) - if chunk_data != "[DONE]": - decoded_output = json.loads(chunk_data) - if "choices" in decoded_output: - choices = decoded_output["choices"] + if output is not None and output != "": + # Get data from reponse chunk + chunk_data = re.sub("^data: ", "", output) + if chunk_data.strip() != "[DONE]": + # Get message choices from data + choices = json.loads(chunk_data)["choices"] + + # Pick content from first choice output = choices[0]["delta"].get("content", "") - else: - output = "" + + else: + output = "" yield output def bot( - self, - history, - deploy_model_endpoint, - model_endpoint, - Max_new_tokens, - Temperature, - Top_p, - Top_k, - model_name=None, - image=None, - enhance_knowledge=None, + self, + history, + deploy_model_endpoint, + model_endpoint, + Max_new_tokens, + Temperature, + Top_p, + Top_k, + model_name=None, + image=None, + enhance_knowledge=None, ): request_url = model_endpoint if model_endpoint != "" else deploy_model_endpoint simple_api = is_simple_api(request_url, model_name) @@ -295,11 +298,7 @@ def bot( for output in outputs: if len(output) != 0: time_end = time.time() - if simple_api: - history[-1][1] += output - history[-1][1] = self.process_tool.convert_output(history[-1][1]) - else: - history[-1][1] += output + history[-1][1] += output time_spend = round(time_end - time_start, 3) token_num += 1 new_token_latency = f""" @@ -310,16 +309,16 @@ def bot( yield [history, new_token_latency] def bot_test( - self, - bot_queue, - queue_id, - history, - model_endpoint, - Max_new_tokens, - Temperature, - Top_p, - Top_k, - model_name=None, + self, + bot_queue, + queue_id, + history, + model_endpoint, + Max_new_tokens, + Temperature, + Top_p, + Top_k, + model_name=None, ): request_url = model_endpoint simple_api = is_simple_api(request_url, model_name) @@ -355,19 +354,19 @@ def bot_test( bot_queue.put([queue_id, "", ""]) def bot_rag( - self, - history, - deploy_model_endpoint, - model_endpoint, - Max_new_tokens, - Temperature, - Top_p, - Top_k, - rag_selector, - rag_path, - returned_k, - 
model_name=None, - image=None, + self, + history, + deploy_model_endpoint, + model_endpoint, + Max_new_tokens, + Temperature, + Top_p, + Top_k, + rag_selector, + rag_path, + returned_k, + model_name=None, + image=None, ): enhance_knowledge = None if os.path.isabs(rag_path): @@ -413,16 +412,16 @@ def bot_rag( yield output def regenerate( - self, - db_dir, - upload_type, - input_type, - input_texts, - depth, - upload_files, - embedding_model, - splitter_chunk_size, - cpus_per_worker, + self, + db_dir, + upload_type, + input_type, + input_texts, + depth, + upload_files, + embedding_model, + splitter_chunk_size, + cpus_per_worker, ): if upload_type == "Youtube": input_texts = input_texts.split(";") @@ -501,16 +500,16 @@ def regenerate( return db_dir def send_all_bot( - self, - id, - history, - deployed_model_endpoint, - model_endpoint, - Max_new_tokens, - Temperature, - Top_p, - Top_k, - model_name, + self, + id, + history, + deployed_model_endpoint, + model_endpoint, + Max_new_tokens, + Temperature, + Top_p, + Top_k, + model_name, ): id = int(id) self.bot_queue[id] = Queue() @@ -536,21 +535,20 @@ def send_all_bot( yield res[1] def finetune( - self, - model_name, - custom_model_name, - custom_tokenizer_name, - dataset, - new_model_name, - batch_size, - num_epochs, - max_train_step, - lr, - worker_num, - cpus_per_worker_ftn, + self, + model_name, + custom_model_name, + custom_tokenizer_name, + dataset, + new_model_name, + batch_size, + num_epochs, + max_train_step, + lr, + worker_num, + cpus_per_worker_ftn, ): if model_name == "specify other models": - model_desc = None origin_model_path = custom_model_name tokenizer_path = custom_tokenizer_name if "gpt" in model_name.lower() or "pythia" in model_name.lower(): @@ -558,35 +556,33 @@ def finetune( else: gpt_base_model = False else: - model_desc = self._base_models[model_name].model_description - origin_model_path = model_desc.model_id_or_path - tokenizer_path = model_desc.tokenizer_name_or_path - gpt_base_model = model_desc.gpt_base_model + finetune_config = self._base_models[model_name] + gpt_base_model = finetune_config.General.gpt_base_model + + print(type(finetune_config)) + print(f"Finetune config: {finetune_config}") + finetune_config = finetune_config.dict() + print(type(finetune_config)) + print(f"Finetune config: {finetune_config}") last_gpt_base_model = False finetuned_model_path = os.path.join(self.finetuned_model_path, model_name, new_model_name) - finetuned_checkpoint_path = ( - os.path.join(self.finetuned_checkpoint_path, model_name, new_model_name) - if self.finetuned_checkpoint_path != "" - else None - ) - finetune_config = self.config.copy() - training_config = finetune_config.get("Training") - exist_worker = int(training_config["num_training_workers"]) - exist_cpus_per_worker_ftn = int(training_config["resources_per_worker"]["CPU"]) + exist_worker = int(finetune_config["Training"].get("num_training_workers")) + + exist_cpus_per_worker_ftn = int(finetune_config["Training"].get("resources_per_worker")["CPU"]) ray_resources = ray.available_resources() if "CPU" not in ray_resources or cpus_per_worker_ftn * worker_num + 1 > int( - ray.available_resources()["CPU"] + ray.available_resources()["CPU"] ): num_req = cpus_per_worker_ftn * worker_num + 1 num_act = int(ray.available_resources()["CPU"]) error_msg = f"Resources are not meeting the demand, required num_cpu is {num_req}, actual num_cpu is {num_act}" raise gr.Error(error_msg) if ( - worker_num != exist_worker - or cpus_per_worker_ftn != exist_cpus_per_worker_ftn - or not 
(gpt_base_model and last_gpt_base_model) + worker_num != exist_worker + or cpus_per_worker_ftn != exist_cpus_per_worker_ftn + or not (gpt_base_model and last_gpt_base_model) ): ray.shutdown() new_ray_init_config = { @@ -606,22 +602,17 @@ def finetune( if gpt_base_model: new_ray_init_config["runtime_env"]["pip"] = ["transformers==4.26.0"] else: - new_ray_init_config["runtime_env"]["pip"] = ["transformers==4.31.0"] - last_gpt_base_model = gpt_base_model - finetune_config["Training"]["num_training_workers"] = int(worker_num) - finetune_config["Training"]["resources_per_worker"]["CPU"] = int(cpus_per_worker_ftn) + new_ray_init_config["runtime_env"]["pip"] = ["transformers==4.38.1"] ray.init(**new_ray_init_config) - exist_worker = worker_num - exist_cpus_per_worker_ftn = cpus_per_worker_ftn finetune_config["Dataset"]["train_file"] = dataset - finetune_config["General"]["base_model"] = origin_model_path + # finetune_config["General"]["base_model"] = origin_model_path finetune_config["Training"]["epochs"] = num_epochs finetune_config["General"]["output_dir"] = finetuned_model_path - finetune_config["General"]["config"]["trust_remote_code"] = True - if finetuned_checkpoint_path: - finetune_config["General"]["checkpoint_dir"] = finetuned_checkpoint_path + + # if finetuned_checkpoint_path: + # finetune_config["General"]["checkpoint_dir"] = finetuned_checkpoint_path finetune_config["Training"]["batch_size"] = batch_size finetune_config["Training"]["learning_rate"] = lr if max_train_step != 0: @@ -653,6 +644,9 @@ def finetune( self.finetune_status = False # todo: a more reasonable solution is needed try: + print("Start fine-tuning") + print(finetune_config) + results = main(finetune_config) if results.metrics["done"]: self.finetune_status = True @@ -668,21 +662,18 @@ def finetune( ray.kill(self.finetune_actor) self.finetune_actor = None - new_prompt = Prompt() - new_prompt.intro = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n" - new_prompt.human_id = "\n### Instruction" - new_prompt.bot_id = "\n### Response" - new_prompt.stop_words.extend( - ["### Instruction", "# Instruction", "### Question", "##", " ="] - ) - new_model_desc = ModelDescription( - model_id_or_path=finetuned_model_path, - tokenizer_name_or_path=tokenizer_path, - prompt=new_prompt, - chat_processor=model_desc.chat_processor if model_desc is not None else "ChatModelGptJ", - ) + if finetune_config["General"].get("lora_config", None) is not None: + new_model_desc = ModelDescription( + model_id_or_path=finetune_config["General"].get("base_model"), + tokenizer_name_or_path=finetuned_model_path, + ) + else: + new_model_desc = ModelDescription( + model_id_or_path=finetuned_model_path, + tokenizer_name_or_path=finetuned_model_path, + ) new_model_desc.config.trust_remote_code = True - new_finetuned = FinetunedConfig( + new_finetuned = InferenceConfig( name=new_model_name, route_prefix="/" + new_model_name, model_description=new_model_desc, @@ -711,14 +702,14 @@ def finetune_progress(self, progress=gr.Progress()): progress( float(int(value_step) / int(total_steps)), desc="Start Training: epoch " - + str(value_epoch) - + " / " - + str(total_epochs) - + " " - + "step " - + str(value_step) - + " / " - + str(total_steps), + + str(value_epoch) + + " / " + + str(total_epochs) + + " " + + "step " + + str(value_step) + + " / " + + str(total_steps), ) except Exception: pass @@ -726,15 +717,15 @@ def finetune_progress(self, progress=gr.Progress()): return "

Completed the fine-tuning process.
" def deploy_func( - self, - model_name: str, - replica_num: int, - cpus_per_worker_deploy: int, - hpus_per_worker_deploy: int, + self, + model_name: str, + replica_num: int, + cpus_per_worker_deploy: int, + hpus_per_worker_deploy: int, ): self.shutdown_deploy() if cpus_per_worker_deploy * replica_num > int( - ray.available_resources()["CPU"] + ray.available_resources()["CPU"] ) or hpus_per_worker_deploy * replica_num > int( ray.available_resources()["HPU"] if "HPU" in ray.available_resources() else 0 ): @@ -744,20 +735,8 @@ def deploy_func( finetuned = self._all_models[model_name] model_desc = finetuned.model_description - prompt = model_desc.prompt print("model path: ", model_desc.model_id_or_path) - if model_desc.chat_processor is not None: - chat_model = getattr(sys.modules[__name__], model_desc.chat_processor, None) - if chat_model is None: - return ( - model_name - + " deployment failed. " - + model_desc.chat_processor - + " does not exist." - ) - self.process_tool = chat_model(**prompt.dict()) - finetuned_deploy = finetuned.copy(deep=True) if hpus_per_worker_deploy > 0: finetuned_deploy.device = "hpu" @@ -776,13 +755,14 @@ def deploy_func( elif "fuyu-8b" in model_name: pip_env = "transformers==4.37.2" else: - pip_env = "transformers==4.31.0" + pip_env = "transformers==4.38.1" if finetuned_deploy.device == "cpu": ray_actor_options["runtime_env"] = {"pip": [pip_env]} deployment = PredictorDeployment.options( # type: ignore num_replicas=replica_num, ray_actor_options=ray_actor_options, ).bind(finetuned_deploy) + print("deployment: ", deployment) serve.start(http_options={"host": finetuned_deploy.host, "port": finetuned_deploy.port}) serve.run( deployment, @@ -810,7 +790,7 @@ def exec_command(self, index, command, ray=False): return stdout.read().decode("utf-8"), stderr.read().decode("utf-8") def get_ray_cluster(self): - command = "ray status" + command = "/home/damon/anaconda3/bin/ray status" out, err = self.exec_command(-1, command, ray=True) try: out_words = [word for word in out.split("\n") if "CPU" in word][0] @@ -854,9 +834,9 @@ def kill_node(self, btn_txt, index): elif btn_txt == "Start": index = int(index) command = ( - "RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address=" - + self.master_ip_port - + r""" --resources='{"special_hardware": 2}'""" + "RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address=" + + self.master_ip_port + + r""" --resources='{"special_hardware": 2}'""" ) self.exec_command(index, command, ray=True) self.ray_nodes[index]["Alive"] = "True" @@ -1321,8 +1301,8 @@ def _init_ui(self): rag_input_text = gr.Textbox( label="Local file path", placeholder="Support types: " - + " ".join(recdp_support_suffix) - + ". Support multiple absolute paths, separated by ';'", + + " ".join(recdp_support_suffix) + + ". 
Support multiple absolute paths, separated by ';'", visible=True, scale=2, ) @@ -1777,7 +1757,7 @@ def _init_ui(self): ray_init_config: Dict[str, Any] = { "runtime_env": { "env_vars": { - "OMP_NUM_THREADS": "24", + "OMP_NUM_THREADS": "32", "ACCELERATE_USE_CPU": "True", "ACCELERATE_MIXED_PRECISION": "no", "CCL_WORKER_COUNT": "1", @@ -1799,9 +1779,10 @@ def _init_ui(self): default_rag_path = args.default_rag_path initial_model_list = {k: all_models[k] for k in sorted(all_models.keys())} + initial_base_model_list = {k: base_models[k] for k in sorted(base_models.keys())} ui = ChatBotUI( initial_model_list, - initial_model_list, + initial_base_model_list, finetune_model_path, finetune_checkpoint_path, repo_path, From 0ec92054ec7543287135455db0addc752c6c93bb Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Fri, 26 Apr 2024 01:33:00 +0000 Subject: [PATCH 44/47] update Signed-off-by: minmingzhu --- llm_on_ray/finetune/finetune_config.py | 20 ++- llm_on_ray/ui/start_ui.py | 171 ++++++++++++------------- 2 files changed, 102 insertions(+), 89 deletions(-) diff --git a/llm_on_ray/finetune/finetune_config.py b/llm_on_ray/finetune/finetune_config.py index 98597901d..1f7cb191e 100644 --- a/llm_on_ray/finetune/finetune_config.py +++ b/llm_on_ray/finetune/finetune_config.py @@ -13,9 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import os from pydantic import BaseModel, validator -from typing import Optional, List +from typing import Optional, List, Dict + +from pydantic_yaml import parse_yaml_raw_as PRECISION_BF16 = "bf16" PRECISION_FP16 = "fp16" @@ -162,3 +165,18 @@ class FinetuneConfig(BaseModel): General: General Dataset: Dataset Training: Training + +base_models: Dict[str, FinetuneConfig] = {} +_models: Dict[str, FinetuneConfig] = {} + +_cur = os.path.dirname(os.path.abspath(__file__)) +_models_folder = _cur + "/models" +for model_file in os.listdir(_models_folder): + file_path = _models_folder + "/" + model_file + if os.path.isdir(file_path): + continue + with open(file_path, "r") as f: + m: FinetuneConfig = parse_yaml_raw_as(FinetuneConfig, f) + _models[m.name] = m + +all_models = _models.copy() diff --git a/llm_on_ray/ui/start_ui.py b/llm_on_ray/ui/start_ui.py index 7b011cb9c..58965627b 100644 --- a/llm_on_ray/ui/start_ui.py +++ b/llm_on_ray/ui/start_ui.py @@ -215,9 +215,6 @@ def user(self, user_message, history): return "", history + [[user_message, None]] def model_generate(self, prompt, request_url, model_name, config, simple_api=True): - print("model_generate") - print("prompt: ", prompt) - if simple_api: sample_input = {"text": prompt, "config": config, "stream": True} else: @@ -255,17 +252,17 @@ def model_generate(self, prompt, request_url, model_name, config, simple_api=Tru yield output def bot( - self, - history, - deploy_model_endpoint, - model_endpoint, - Max_new_tokens, - Temperature, - Top_p, - Top_k, - model_name=None, - image=None, - enhance_knowledge=None, + self, + history, + deploy_model_endpoint, + model_endpoint, + Max_new_tokens, + Temperature, + Top_p, + Top_k, + model_name=None, + image=None, + enhance_knowledge=None, ): request_url = model_endpoint if model_endpoint != "" else deploy_model_endpoint simple_api = is_simple_api(request_url, model_name) @@ -309,16 +306,16 @@ def bot( yield [history, new_token_latency] def bot_test( - self, - bot_queue, - queue_id, - history, - model_endpoint, - Max_new_tokens, - Temperature, - Top_p, - Top_k, - model_name=None, + self, + bot_queue, + queue_id, + history, + 
model_endpoint, + Max_new_tokens, + Temperature, + Top_p, + Top_k, + model_name=None, ): request_url = model_endpoint simple_api = is_simple_api(request_url, model_name) @@ -354,19 +351,19 @@ def bot_test( bot_queue.put([queue_id, "", ""]) def bot_rag( - self, - history, - deploy_model_endpoint, - model_endpoint, - Max_new_tokens, - Temperature, - Top_p, - Top_k, - rag_selector, - rag_path, - returned_k, - model_name=None, - image=None, + self, + history, + deploy_model_endpoint, + model_endpoint, + Max_new_tokens, + Temperature, + Top_p, + Top_k, + rag_selector, + rag_path, + returned_k, + model_name=None, + image=None, ): enhance_knowledge = None if os.path.isabs(rag_path): @@ -412,16 +409,16 @@ def bot_rag( yield output def regenerate( - self, - db_dir, - upload_type, - input_type, - input_texts, - depth, - upload_files, - embedding_model, - splitter_chunk_size, - cpus_per_worker, + self, + db_dir, + upload_type, + input_type, + input_texts, + depth, + upload_files, + embedding_model, + splitter_chunk_size, + cpus_per_worker, ): if upload_type == "Youtube": input_texts = input_texts.split(";") @@ -500,16 +497,16 @@ def regenerate( return db_dir def send_all_bot( - self, - id, - history, - deployed_model_endpoint, - model_endpoint, - Max_new_tokens, - Temperature, - Top_p, - Top_k, - model_name, + self, + id, + history, + deployed_model_endpoint, + model_endpoint, + Max_new_tokens, + Temperature, + Top_p, + Top_k, + model_name, ): id = int(id) self.bot_queue[id] = Queue() @@ -535,18 +532,18 @@ def send_all_bot( yield res[1] def finetune( - self, - model_name, - custom_model_name, - custom_tokenizer_name, - dataset, - new_model_name, - batch_size, - num_epochs, - max_train_step, - lr, - worker_num, - cpus_per_worker_ftn, + self, + model_name, + custom_model_name, + custom_tokenizer_name, + dataset, + new_model_name, + batch_size, + num_epochs, + max_train_step, + lr, + worker_num, + cpus_per_worker_ftn, ): if model_name == "specify other models": origin_model_path = custom_model_name @@ -559,11 +556,8 @@ def finetune( finetune_config = self._base_models[model_name] gpt_base_model = finetune_config.General.gpt_base_model - print(type(finetune_config)) - print(f"Finetune config: {finetune_config}") + finetune_config = finetune_config.dict() - print(type(finetune_config)) - print(f"Finetune config: {finetune_config}") last_gpt_base_model = False finetuned_model_path = os.path.join(self.finetuned_model_path, model_name, new_model_name) @@ -573,16 +567,16 @@ def finetune( ray_resources = ray.available_resources() if "CPU" not in ray_resources or cpus_per_worker_ftn * worker_num + 1 > int( - ray.available_resources()["CPU"] + ray.available_resources()["CPU"] ): num_req = cpus_per_worker_ftn * worker_num + 1 num_act = int(ray.available_resources()["CPU"]) error_msg = f"Resources are not meeting the demand, required num_cpu is {num_req}, actual num_cpu is {num_act}" raise gr.Error(error_msg) if ( - worker_num != exist_worker - or cpus_per_worker_ftn != exist_cpus_per_worker_ftn - or not (gpt_base_model and last_gpt_base_model) + worker_num != exist_worker + or cpus_per_worker_ftn != exist_cpus_per_worker_ftn + or not (gpt_base_model and last_gpt_base_model) ): ray.shutdown() new_ray_init_config = { @@ -607,12 +601,13 @@ def finetune( ray.init(**new_ray_init_config) finetune_config["Dataset"]["train_file"] = dataset - # finetune_config["General"]["base_model"] = origin_model_path + if origin_model_path is not None: + finetune_config["General"]["base_model"] = origin_model_path + if tokenizer_path 
is not None: + finetune_config["General"]["tokenizer_name"] = tokenizer_path finetune_config["Training"]["epochs"] = num_epochs finetune_config["General"]["output_dir"] = finetuned_model_path - # if finetuned_checkpoint_path: - # finetune_config["General"]["checkpoint_dir"] = finetuned_checkpoint_path finetune_config["Training"]["batch_size"] = batch_size finetune_config["Training"]["learning_rate"] = lr if max_train_step != 0: @@ -666,6 +661,7 @@ def finetune( new_model_desc = ModelDescription( model_id_or_path=finetune_config["General"].get("base_model"), tokenizer_name_or_path=finetuned_model_path, + peft_model_id_or_path=finetuned_model_path, ) else: new_model_desc = ModelDescription( @@ -762,7 +758,6 @@ def deploy_func( num_replicas=replica_num, ray_actor_options=ray_actor_options, ).bind(finetuned_deploy) - print("deployment: ", deployment) serve.start(http_options={"host": finetuned_deploy.host, "port": finetuned_deploy.port}) serve.run( deployment, @@ -790,7 +785,7 @@ def exec_command(self, index, command, ray=False): return stdout.read().decode("utf-8"), stderr.read().decode("utf-8") def get_ray_cluster(self): - command = "/home/damon/anaconda3/bin/ray status" + command = "ray status" out, err = self.exec_command(-1, command, ray=True) try: out_words = [word for word in out.split("\n") if "CPU" in word][0] @@ -834,9 +829,9 @@ def kill_node(self, btn_txt, index): elif btn_txt == "Start": index = int(index) command = ( - "RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address=" - + self.master_ip_port - + r""" --resources='{"special_hardware": 2}'""" + "RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address=" + + self.master_ip_port + + r""" --resources='{"special_hardware": 2}'""" ) self.exec_command(index, command, ray=True) self.ray_nodes[index]["Alive"] = "True" @@ -1301,8 +1296,8 @@ def _init_ui(self): rag_input_text = gr.Textbox( label="Local file path", placeholder="Support types: " - + " ".join(recdp_support_suffix) - + ". Support multiple absolute paths, separated by ';'", + + " ".join(recdp_support_suffix) + + ". 
Support multiple absolute paths, separated by ';'", visible=True, scale=2, ) @@ -1757,7 +1752,7 @@ def _init_ui(self): ray_init_config: Dict[str, Any] = { "runtime_env": { "env_vars": { - "OMP_NUM_THREADS": "32", + "OMP_NUM_THREADS": "24", "ACCELERATE_USE_CPU": "True", "ACCELERATE_MIXED_PRECISION": "no", "CCL_WORKER_COUNT": "1", From a3284947e06fe50537d96e7a0929b91e3278c2ed Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Fri, 26 Apr 2024 02:02:13 +0000 Subject: [PATCH 45/47] update Signed-off-by: minmingzhu --- llm_on_ray/finetune/finetune_config.py | 3 +- llm_on_ray/ui/start_ui.py | 65 +++++++++++++------------- 2 files changed, 35 insertions(+), 33 deletions(-) diff --git a/llm_on_ray/finetune/finetune_config.py b/llm_on_ray/finetune/finetune_config.py index 1f7cb191e..2cb9d0ce8 100644 --- a/llm_on_ray/finetune/finetune_config.py +++ b/llm_on_ray/finetune/finetune_config.py @@ -166,6 +166,7 @@ class FinetuneConfig(BaseModel): Dataset: Dataset Training: Training + base_models: Dict[str, FinetuneConfig] = {} _models: Dict[str, FinetuneConfig] = {} @@ -177,6 +178,6 @@ class FinetuneConfig(BaseModel): continue with open(file_path, "r") as f: m: FinetuneConfig = parse_yaml_raw_as(FinetuneConfig, f) - _models[m.name] = m + _models[m.General.base_model] = m all_models = _models.copy() diff --git a/llm_on_ray/ui/start_ui.py b/llm_on_ray/ui/start_ui.py index 58965627b..e7188b283 100644 --- a/llm_on_ray/ui/start_ui.py +++ b/llm_on_ray/ui/start_ui.py @@ -110,20 +110,20 @@ def get_result(self): class ChatBotUI: def __init__( - self, - all_models: Dict[str, InferenceConfig], - base_models: Dict[str, FinetuneConfig], - finetune_model_path: str, - finetuned_checkpoint_path: str, - repo_code_path: str, - default_data_path: str, - default_rag_path: str, - config: dict, - head_node_ip: str, - node_port: str, - node_user_name: str, - conda_env_name: str, - master_ip_port: str, + self, + all_models: Dict[str, InferenceConfig], + base_models: Dict[str, FinetuneConfig], + finetune_model_path: str, + finetuned_checkpoint_path: str, + repo_code_path: str, + default_data_path: str, + default_rag_path: str, + config: dict, + head_node_ip: str, + node_port: str, + node_user_name: str, + conda_env_name: str, + master_ip_port: str, ): self._all_models = all_models self._base_models = base_models @@ -556,14 +556,15 @@ def finetune( finetune_config = self._base_models[model_name] gpt_base_model = finetune_config.General.gpt_base_model - finetune_config = finetune_config.dict() last_gpt_base_model = False finetuned_model_path = os.path.join(self.finetuned_model_path, model_name, new_model_name) exist_worker = int(finetune_config["Training"].get("num_training_workers")) - exist_cpus_per_worker_ftn = int(finetune_config["Training"].get("resources_per_worker")["CPU"]) + exist_cpus_per_worker_ftn = int( + finetune_config["Training"].get("resources_per_worker")["CPU"] + ) ray_resources = ray.available_resources() if "CPU" not in ray_resources or cpus_per_worker_ftn * worker_num + 1 > int( @@ -602,9 +603,9 @@ def finetune( finetune_config["Dataset"]["train_file"] = dataset if origin_model_path is not None: - finetune_config["General"]["base_model"] = origin_model_path + finetune_config["General"]["base_model"] = origin_model_path if tokenizer_path is not None: - finetune_config["General"]["tokenizer_name"] = tokenizer_path + finetune_config["General"]["tokenizer_name"] = tokenizer_path finetune_config["Training"]["epochs"] = num_epochs finetune_config["General"]["output_dir"] = finetuned_model_path @@ -698,14 +699,14 @@ def 
finetune_progress(self, progress=gr.Progress()): progress( float(int(value_step) / int(total_steps)), desc="Start Training: epoch " - + str(value_epoch) - + " / " - + str(total_epochs) - + " " - + "step " - + str(value_step) - + " / " - + str(total_steps), + + str(value_epoch) + + " / " + + str(total_epochs) + + " " + + "step " + + str(value_step) + + " / " + + str(total_steps), ) except Exception: pass @@ -713,15 +714,15 @@ def finetune_progress(self, progress=gr.Progress()): return "
Completed the fine-tuning process.
" def deploy_func( - self, - model_name: str, - replica_num: int, - cpus_per_worker_deploy: int, - hpus_per_worker_deploy: int, + self, + model_name: str, + replica_num: int, + cpus_per_worker_deploy: int, + hpus_per_worker_deploy: int, ): self.shutdown_deploy() if cpus_per_worker_deploy * replica_num > int( - ray.available_resources()["CPU"] + ray.available_resources()["CPU"] ) or hpus_per_worker_deploy * replica_num > int( ray.available_resources()["HPU"] if "HPU" in ray.available_resources() else 0 ): From a8e7b384e6215d622a96c3a8ef13a5638a4abc42 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 29 Apr 2024 16:09:46 +0800 Subject: [PATCH 46/47] update --- .../api_server_simple/query_single.py | 7 +- llm_on_ray/finetune/finetune_config.py | 2 +- llm_on_ray/inference/chat_template_process.py | 3 - llm_on_ray/inference/models/gpt2.yaml | 1 - llm_on_ray/inference/predictor_deployment.py | 80 ++----------------- llm_on_ray/inference/utils.py | 4 +- 6 files changed, 14 insertions(+), 83 deletions(-) diff --git a/examples/inference/api_server_simple/query_single.py b/examples/inference/api_server_simple/query_single.py index 62bb4dc45..b6d935c9a 100644 --- a/examples/inference/api_server_simple/query_single.py +++ b/examples/inference/api_server_simple/query_single.py @@ -55,7 +55,12 @@ ) args = parser.parse_args() -prompt = "Once upon a time," +# prompt = "Once upon a time," +prompt = [ + {"role": "user", "content": "Which is bigger, the moon or the sun?"}, +] + + config: Dict[str, Union[int, float]] = {} if args.max_new_tokens: config["max_new_tokens"] = int(args.max_new_tokens) diff --git a/llm_on_ray/finetune/finetune_config.py b/llm_on_ray/finetune/finetune_config.py index 2cb9d0ce8..136b698eb 100644 --- a/llm_on_ray/finetune/finetune_config.py +++ b/llm_on_ray/finetune/finetune_config.py @@ -64,7 +64,7 @@ class General(BaseModel): enable_gradient_checkpointing: bool = False chat_template: Optional[str] = None default_chat_template: str = ( - "{{ bos_token }}" + "Below is an instruction that describes a task. Write a response that appropriately completes the request." "{% if messages[0]['role'] == 'system' %}" "{{ raise_exception('System role not supported') }}" "{% endif %}" diff --git a/llm_on_ray/inference/chat_template_process.py b/llm_on_ray/inference/chat_template_process.py index 2f7a64d27..851004b01 100644 --- a/llm_on_ray/inference/chat_template_process.py +++ b/llm_on_ray/inference/chat_template_process.py @@ -14,7 +14,6 @@ # limitations under the License. 
# from typing import List - from llm_on_ray.inference.api_openai_backend.openai_protocol import ChatMessage @@ -63,14 +62,12 @@ def _extract_messages(self, messages): return texts, images def _prepare_image(self, messages: list): - """Prepare image from history messages.""" from PIL import Image import requests from io import BytesIO import base64 import re - # prepare images images: List = [] for msg in messages: msg = dict(msg) diff --git a/llm_on_ray/inference/models/gpt2.yaml b/llm_on_ray/inference/models/gpt2.yaml index cddc45cd8..9ad098c24 100644 --- a/llm_on_ray/inference/models/gpt2.yaml +++ b/llm_on_ray/inference/models/gpt2.yaml @@ -15,4 +15,3 @@ model_description: tokenizer_name_or_path: gpt2 gpt_base_model: true chat_template: "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ message['content'].strip() }}{% endif %}{% endfor %}" - diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index aab110727..502ad150f 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -340,31 +340,6 @@ def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=No else: prompt = self.process_tool.get_prompt(input) return prompt - else: - if isinstance(input, list) and input and isinstance(input[0], dict): - prompt = self.predictor.tokenizer.apply_chat_template(input, tokenize=False) - elif isinstance(input, list) and input and isinstance(input[0], list): - prompt = [ - self.predictor.tokenizer.apply_chat_template(t, tokenize=False) - for t in input - ] - elif isinstance(input, list) and input and isinstance(input[0], ChatMessage): - messages = [] - for chat_message in input: - message = {"role": chat_message.role, "content": chat_message.content} - messages.append(message) - prompt = self.predictor.tokenizer.apply_chat_template( - messages, tokenize=False - ) - elif isinstance(input, list) and input and isinstance(input[0], str): - prompt = input - elif isinstance(input, str): - prompt = input - else: - raise TypeError( - f"Unsupported type {type(input)} for text. Expected dict or list of dicts." 
- ) - return prompt elif prompt_format == PromptFormat.PROMPTS_FORMAT: raise HTTPException(400, "Invalid prompt format.") return input @@ -411,9 +386,14 @@ async def openai_call( tool_choice=None, ): self.use_openai = True + print("openai_call") + print(input) + print(type(input)) # return prompt or list of prompts preprocessed prompts = self.preprocess_prompts(input, tools, tool_choice) + print(prompts) + print(type(prompts)) # Handle streaming response if streaming_response: @@ -421,53 +401,3 @@ async def openai_call( yield result else: yield await self.handle_non_streaming(prompts, config) - - def _extract_messages(self, messages): - texts, images = [], [] - for message in messages: - if message["role"] == "user" and isinstance(message["content"], list): - texts.append({"role": "user", "content": message["content"][0]["text"]}) - images.append( - {"role": "user", "content": message["content"][1]["image_url"]["url"]} - ) - else: - texts.append(message) - return texts, images - - def _prepare_image(self, messages: list): - """Prepare image from history messages.""" - from PIL import Image - import requests - from io import BytesIO - import base64 - import re - - # prepare images - images: List = [] - if isinstance(messages[0], List): - for i in range(len(messages)): - for msg in messages[i]: - msg = dict(msg) - content = msg["content"] - if "url" not in content: - continue - is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0 - if is_data: - encoded_str = re.sub("^data:image/.+;base64,", "", content["url"]) - images[i].append(Image.open(BytesIO(base64.b64decode(encoded_str)))) - else: - images[i].append(Image.open(requests.get(content["url"], stream=True).raw)) - elif isinstance(messages[0], dict): - for msg in messages: - msg = dict(msg) - content = msg["content"] - if "url" not in content: - continue - is_data = len(re.findall("^data:image/.+;base64,", content["url"])) > 0 - if is_data: - encoded_str = re.sub("^data:image/.+;base64,", "", content["url"]) - images.append(Image.open(BytesIO(base64.b64decode(encoded_str)))) - else: - images.append(Image.open(requests.get(content["url"], stream=True).raw)) - - return images diff --git a/llm_on_ray/inference/utils.py b/llm_on_ray/inference/utils.py index 5a8db0401..56b9146e5 100644 --- a/llm_on_ray/inference/utils.py +++ b/llm_on_ray/inference/utils.py @@ -162,13 +162,13 @@ def is_cpu_without_ipex(infer_conf: InferenceConfig) -> bool: return (not infer_conf.ipex.enabled) and infer_conf.device == DEVICE_CPU -def get_prompt_format(input: Union[List[str], List[dict], List[List[dict]], List[ChatMessage]]): +def get_prompt_format(input: Union[List[str], List[dict], List[ChatMessage]]): chat_format = True prompts_format = True for item in input: if isinstance(item, str): chat_format = False - elif isinstance(item, dict) or isinstance(item, ChatMessage) or isinstance(item, list): + elif isinstance(item, dict) or isinstance(item, ChatMessage): prompts_format = False else: chat_format = False From cbae21340d16133265fd75cece7f18d1ac2935d6 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Mon, 6 May 2024 11:12:07 +0800 Subject: [PATCH 47/47] update --- llm_on_ray/inference/predictor_deployment.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py index 502ad150f..74b9430e8 100644 --- a/llm_on_ray/inference/predictor_deployment.py +++ b/llm_on_ray/inference/predictor_deployment.py @@ -311,9 +311,13 @@ def 
preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=No Raises: HTTPException: If the input prompt format is invalid or not supported. """ + if isinstance(input, str): return input - elif isinstance(input, list): + elif isinstance(input, List): + prompts = [] + images = [] + prompt_format = get_prompt_format(input) if prompt_format == PromptFormat.CHAT_FORMAT: # Process the input prompts with tools @@ -340,14 +344,17 @@ def preprocess_prompts(self, input: Union[str, list], tools=None, tool_choice=No else: prompt = self.process_tool.get_prompt(input) return prompt + else: + prompts.extend(input) elif prompt_format == PromptFormat.PROMPTS_FORMAT: + prompts.extend(input) + else: raise HTTPException(400, "Invalid prompt format.") - return input + return prompts else: raise HTTPException(400, "Invalid prompt format.") async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSONResponse, str]: - logger.info("PredictorDeployment call") self.use_openai = False try: @@ -366,7 +373,6 @@ async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSON content="Empty prompt is not supported.", ) config = json_request["config"] if "config" in json_request else {} - # return prompt or list of prompts preprocessed prompts = self.preprocess_prompts(input)