diff --git a/common/logging.py b/common/logging.py index f181ba915..c71620623 100644 --- a/common/logging.py +++ b/common/logging.py @@ -11,9 +11,9 @@ logging_config = { "version": 1, "loggers": { - "root": {"level": "DEBUG", "handlers": ["consoleHandler"]}, + "root": {"level": "INFO", "handlers": ["consoleHandler"]}, "common": { - "level": "DEBUG", + "level": "INFO", "handlers": ["consoleHandler"], "qualname": "common", "propagate": 0, @@ -22,7 +22,7 @@ "handlers": { "consoleHandler": { "class": "logging.StreamHandler", - "level": "DEBUG", + "level": "INFO", "formatter": "standardFormatter", }, }, diff --git a/dev/scripts/install-ui.sh b/dev/scripts/install-ui.sh new file mode 100644 index 000000000..67ae06a82 --- /dev/null +++ b/dev/scripts/install-ui.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +# install dependencies +pip install "gradio<=3.36.1" "langchain<=0.0.329" "langchain_community<=0.0.13" "paramiko<=3.4.0" "sentence-transformers" "faiss-cpu" + +# install pyrecdp from source +pip install 'git+https://github.com/intel/e2eAIOK.git#egg=pyrecdp&subdirectory=RecDP' diff --git a/docs/web_ui.md b/docs/web_ui.md index 3e247f06f..b1a4e0e1b 100644 --- a/docs/web_ui.md +++ b/docs/web_ui.md @@ -2,17 +2,21 @@ LLM-on-Ray introduces a Web UI, allowing users to easily finetune and deploy LLMs through a user-friendly interface. Additionally, the UI includes a chatbot application, enabling users to immediately test and refine the models. -## Setup -Please follow [setup.md](setup.md) to setup the environment first. +## Setup Base Environment +Please follow [setup.md](setup.md) to set up the base environment first. + +## Setup UI Environment +After activating the environment installed in the previous step, please run the following script to install the dependencies for the Web UI. +```bash +bash dev/scripts/install-ui.sh +``` ## Start Web UI ```bash python -u ui/start_ui.py --node_user_name $user --conda_env_name $conda_env --master_ip_port "$node_ip:6379" -# Get urls from the log -# Running on local URL: http://0.0.0.0:8080 -# Running on public URL: https://180cd5f7c31a1cfd3c.gradio.live ``` +You will get the URLs from the command line output (e.g. http://0.0.0.0:8080 for the local network and https://180cd5f7c31a1cfd3c.gradio.live for the public network). Open one of them in a web browser. ## Finetune LLMs On the `Finetune` tab, you can configure the base model, finetuning parameters, the dataset path and the new model name. Click `Start To Finetune` to start finetuning.
diff --git a/finetune/finetune.py b/finetune/finetune.py index 0815dabfe..5b2c62a6d 100644 --- a/finetune/finetune.py +++ b/finetune/finetune.py @@ -24,7 +24,7 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) import common -from finetune.finetune_config import FinetuneConfig +from finetune_config import FinetuneConfig def get_accelerate_environment_variable(mode: str, config: Union[Dict[str, Any], None]) -> dict: diff --git a/inference/api_server_openai.py b/inference/api_server_openai.py index 77831a9d2..6572059d1 100644 --- a/inference/api_server_openai.py +++ b/inference/api_server_openai.py @@ -34,8 +34,8 @@ import os from ray import serve -from inference.api_openai_backend.query_client import RouterQueryClient -from inference.api_openai_backend.router_app import Router, router_app +from api_openai_backend.query_client import RouterQueryClient +from api_openai_backend.router_app import Router, router_app def router_application(deployments): diff --git a/inference/deepspeed_predictor.py b/inference/deepspeed_predictor.py index 167181c17..1fb552055 100644 --- a/inference/deepspeed_predictor.py +++ b/inference/deepspeed_predictor.py @@ -14,7 +14,7 @@ import os from predictor import Predictor from utils import get_torch_dtype -from inference.inference_config import ( +from inference_config import ( InferenceConfig, DEVICE_CPU, DEVICE_XPU, diff --git a/inference/models/template/export_inference_config_to_yaml.py b/inference/models/template/export_inference_config_to_yaml.py index 62cfd4b75..0f2a5f9f7 100644 --- a/inference/models/template/export_inference_config_to_yaml.py +++ b/inference/models/template/export_inference_config_to_yaml.py @@ -1,6 +1,6 @@ import yaml import os -from inference.inference_config import InferenceConfig +from inference_config import InferenceConfig ic = InferenceConfig() diff --git a/inference/predictor.py b/inference/predictor.py index 4f7c9d3af..b6f126b1c 100644 --- a/inference/predictor.py +++ b/inference/predictor.py @@ -1,7 +1,7 @@ import re import torch from transformers import AutoTokenizer, StoppingCriteriaList -from inference.inference_config import InferenceConfig +from inference_config import InferenceConfig from utils import StoppingCriteriaSub from typing import List, AsyncGenerator, Union diff --git a/inference/predictor_deployment.py b/inference/predictor_deployment.py index 0ecff2585..f7228918d 100644 --- a/inference/predictor_deployment.py +++ b/inference/predictor_deployment.py @@ -23,10 +23,10 @@ from queue import Empty import torch from transformers import TextIteratorStreamer -from inference.inference_config import InferenceConfig +from inference_config import InferenceConfig from typing import Union, Dict, Any from starlette.responses import StreamingResponse, JSONResponse -from inference.api_openai_backend.openai_protocol import ModelResponse +from api_openai_backend.openai_protocol import ModelResponse from utils import get_input_format diff --git a/inference/serve.py b/inference/serve.py index e73397a79..c8e52c4c6 100644 --- a/inference/serve.py +++ b/inference/serve.py @@ -21,7 +21,7 @@ from api_server_simple import serve_run from api_server_openai import openai_serve_run from predictor_deployment import PredictorDeployment -from inference.inference_config import ModelDescription, InferenceConfig, all_models +from inference_config import ModelDescription, InferenceConfig, all_models def get_deployed_models(args): diff --git a/inference/transformer_predictor.py b/inference/transformer_predictor.py index 
553ea4eef..33c612cbf 100644 --- a/inference/transformer_predictor.py +++ b/inference/transformer_predictor.py @@ -1,7 +1,7 @@ import torch from transformers import AutoModelForCausalLM, AutoConfig from transformers import TextIteratorStreamer -from inference.inference_config import InferenceConfig, PRECISION_BF16 +from inference_config import InferenceConfig, PRECISION_BF16 from predictor import Predictor from utils import get_torch_dtype diff --git a/inference/utils.py b/inference/utils.py index 338f1d326..85c05b31e 100644 --- a/inference/utils.py +++ b/inference/utils.py @@ -16,7 +16,7 @@ from transformers import StoppingCriteria import torch -from inference.inference_config import InferenceConfig, DEVICE_CPU +from inference_config import InferenceConfig, DEVICE_CPU from typing import Dict, Any, List, Union diff --git a/inference/vllm_predictor.py b/inference/vllm_predictor.py index 6123b3906..d39105b12 100644 --- a/inference/vllm_predictor.py +++ b/inference/vllm_predictor.py @@ -1,7 +1,7 @@ import asyncio from typing import AsyncGenerator, List, Union from predictor import Predictor -from inference.inference_config import InferenceConfig, PRECISION_BF16 +from inference_config import InferenceConfig, PRECISION_BF16 from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.sampling_params import SamplingParams diff --git a/ui/start_ui.py b/ui/start_ui.py index d4c05e4a5..6621cf77c 100644 --- a/ui/start_ui.py +++ b/ui/start_ui.py @@ -19,11 +19,14 @@ import os import sys -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) -from inference.inference_config import all_models, ModelDescription, Prompt -from inference.inference_config import InferenceConfig as FinetunedConfig -from inference.chat_process import ChatModelGptJ, ChatModelLLama # noqa: F401 -from inference.predictor_deployment import PredictorDeployment +ui_folder = os.path.dirname(__file__) +sys.path.append(os.path.join(ui_folder, "..")) +sys.path.append(os.path.join(ui_folder, "../inference")) + +from predictor_deployment import PredictorDeployment +from chat_process import ChatModelGptJ, ChatModelLLama # noqa: F401 +from inference_config import all_models, ModelDescription, Prompt +from inference_config import InferenceConfig as FinetunedConfig from ray import serve import ray import gradio as gr @@ -31,8 +34,9 @@ from ray.tune import Stopper from ray.train.base_trainer import TrainingFailedError from ray.tune.logger import LoggerCallback -from multiprocessing import Process, Queue from ray.util import queue +from ray.job_config import JobConfig +from multiprocessing import Process, Queue import paramiko from html_format import cpu_memory_html, ray_status_html, custom_css from typing import Dict, List, Any @@ -109,6 +113,7 @@ def __init__( default_data_path: str, default_rag_path: str, config: dict, + ray_init_config: dict, head_node_ip: str, node_port: str, node_user_name: str, @@ -122,6 +127,7 @@ def __init__( self.repo_code_path = repo_code_path self.default_data_path = default_data_path self.config = config + self.ray_init_config = ray_init_config self.head_node_ip = head_node_ip self.node_port = node_port self.user_name = node_user_name @@ -455,7 +461,7 @@ def finetune( origin_model_path = model_desc.model_id_or_path tokenizer_path = model_desc.tokenizer_name_or_path gpt_base_model = model_desc.gpt_base_model - last_gpt_base_model = False + finetuned_model_path = os.path.join(self.finetuned_model_path, model_name, new_model_name) finetuned_checkpoint_path 
= ( os.path.join(self.finetuned_checkpoint_path, model_name, new_model_name) @@ -464,46 +470,24 @@ def finetune( ) finetune_config = self.config.copy() - training_config = finetune_config.get("Training") - exist_worker = int(training_config["num_training_workers"]) - exist_cpus_per_worker_ftn = int(training_config["resources_per_worker"]["CPU"]) + new_ray_init_config = self.ray_init_config.copy() ray_resources = ray.available_resources() if "CPU" not in ray_resources or cpus_per_worker_ftn * worker_num + 1 > int( ray.available_resources()["CPU"] ): raise gr.Error("Resources are not meeting the demand") - if ( - worker_num != exist_worker - or cpus_per_worker_ftn != exist_cpus_per_worker_ftn - or not (gpt_base_model and last_gpt_base_model) - ): - ray.shutdown() - new_ray_init_config = { - "runtime_env": { - "env_vars": { - "OMP_NUM_THREADS": str(cpus_per_worker_ftn), - "ACCELERATE_USE_CPU": "True", - "ACCELERATE_MIXED_PRECISION": "no", - "CCL_WORKER_COUNT": "1", - "CCL_LOG_LEVEL": "info", - "WORLD_SIZE": str(worker_num), - } - }, - "address": "auto", - "_node_ip_address": "127.0.0.1", - } - if gpt_base_model: - new_ray_init_config["runtime_env"]["pip"] = ["transformers==4.26.0"] - else: - new_ray_init_config["runtime_env"]["pip"] = ["transformers==4.31.0"] - last_gpt_base_model = gpt_base_model - finetune_config["Training"]["num_training_workers"] = int(worker_num) - finetune_config["Training"]["resources_per_worker"]["CPU"] = int(cpus_per_worker_ftn) - ray.init(**new_ray_init_config) - exist_worker = worker_num - exist_cpus_per_worker_ftn = cpus_per_worker_ftn + ray.shutdown() + if gpt_base_model: + new_ray_init_config["runtime_env"]["pip"] = ["transformers==4.26.0"] + else: + new_ray_init_config["runtime_env"]["pip"] = ["transformers==4.31.0"] + new_ray_init_config["runtime_env"]["env_vars"]["WORLD_SIZE"] = str(worker_num) + finetune_config["Training"]["num_training_workers"] = int(worker_num) + finetune_config["Training"]["resources_per_worker"]["CPU"] = int(cpus_per_worker_ftn) + + ray.init(**new_ray_init_config) finetune_config["Dataset"]["train_file"] = dataset finetune_config["General"]["base_model"] = origin_model_path @@ -617,6 +601,16 @@ def finetune_progress(self, progress=gr.Progress()): def deploy_func(self, model_name: str, replica_num: int, cpus_per_worker_deploy: int): self.shutdown_deploy() + ray.shutdown() + new_ray_init_config = self.ray_init_config.copy() + new_ray_init_config["job_config"] = JobConfig( + code_search_path=[ + os.path.join(ui_folder, "../finetune"), + os.path.join(ui_folder, "../inference"), + ] + ) + ray.init(**new_ray_init_config) + if cpus_per_worker_deploy * replica_num > int(ray.available_resources()["CPU"]): raise gr.Error("Resources are not meeting the demand") @@ -834,10 +828,14 @@ def _init_ui(self): gr.HTML("
" + step1 + "
") with gr.Group(): base_models_list = list(self._base_models.keys()) + # set the default value of finetuning to gpt2 + model_index = ( + base_models_list.index("gpt2") if "gpt2" in base_models_list else 0 + ) base_models_list.append("specify other models") base_model_dropdown = gr.Dropdown( base_models_list, - value=base_models_list[2], + value=base_models_list[model_index], label="Select Base Model", allow_custom_value=True, ) @@ -934,9 +932,15 @@ def _init_ui(self): with gr.Row(): with gr.Column(scale=0.8): all_models_list = list(self._all_models.keys()) + # set the default value of deployment to llama-2-7b-chat-hf + model_index = ( + all_models_list.index("llama-2-7b-chat-hf") + if "llama-2-7b-chat-hf" in all_models_list + else 0 + ) all_model_dropdown = gr.Dropdown( all_models_list, - value=all_models_list[3], + value=all_models_list[model_index], label="Select Model to Deploy", elem_classes="disable_status", allow_custom_value=True, @@ -1563,8 +1567,6 @@ def _init_ui(self): default_data_path = os.path.abspath( infer_path + os.path.sep + "../examples/data/sample_finetune_data.jsonl" ) - - sys.path.append(repo_path) from finetune.finetune import get_accelerate_environment_variable finetune_config: Dict[str, Any] = { @@ -1617,6 +1619,7 @@ def _init_ui(self): default_data_path, default_rag_path, finetune_config, + ray_init_config, head_node_ip, args.node_port, args.node_user_name,