diff --git a/llm_on_ray/inference/api_openai_backend/query_client.py b/llm_on_ray/inference/api_openai_backend/query_client.py
index 569be40dc..0e52c7960 100644
--- a/llm_on_ray/inference/api_openai_backend/query_client.py
+++ b/llm_on_ray/inference/api_openai_backend/query_client.py
@@ -37,6 +37,9 @@
 from fastapi import HTTPException
 from llm_on_ray.inference.api_openai_backend.openai_protocol import ModelCard, Prompt
 from llm_on_ray.inference.api_openai_backend.request_handler import handle_request
+from llm_on_ray.inference.logger import get_logger
+
+logger = get_logger(__name__)
 
 
 class RouterQueryClient:
@@ -54,10 +57,11 @@ async def query(self, model: str, prompt: Prompt, request_id: str, streaming_rep
         top_p = request_config.get("top_p", 1.0)
         max_new_tokens = request_config.get("max_tokens", None)
         gen_config = {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p}
-        if temperature != 1.0 or top_p != 1.0:
-            gen_config.update({"do_sample": True})
-        if request_config.get("ignore_eos", False):
-            gen_config.update({"ignore_eos": True})
+        gen_config.update({"do_sample": temperature != 1.0 or top_p != 1.0})
+        gen_config.update({"ignore_eos": request_config.get("ignore_eos", False)})
+
+        logger.debug(f"Print request_config: {request_config}")
+        # TODO: set debug mode in request_config, add and set debug mode to gen_config, since gen_config is the config to be passed down
 
         async for x in handle_request(
             model=model,
diff --git a/llm_on_ray/inference/predictor_deployment.py b/llm_on_ray/inference/predictor_deployment.py
index ed67f5119..448ce5197 100644
--- a/llm_on_ray/inference/predictor_deployment.py
+++ b/llm_on_ray/inference/predictor_deployment.py
@@ -394,8 +394,11 @@ async def __call__(self, http_request: Request) -> Union[StreamingResponse, JSON
         streaming_response = request["stream"]
         input = request["text"]
         config = request["config"]
-
+        logger.debug(f"Print config received from json: {config}")
+        logger.debug(f"Print inputs for prompts: {input}")
+        # return prompt or list of prompts preprocessed
         prompts = self.preprocess_prompts(input)
+        logger.debug(f"Print prompts from inputs: {prompts}")
 
         # Handle streaming response
         if streaming_response:
@@ -414,8 +417,12 @@ async def openai_call(
     ):
         self.use_openai = True
 
+        # TODO: Pass down config into preprocess_prompts for more logs.
+ logger.debug(f"Print config received from query_client: {config}") + logger.debug(f"Print inputs for prompts: {input}") # return prompt or list of prompts preprocessed input = self.preprocess_prompts(input, tools, tool_choice) + logger.debug(f"Print prompts from inputs: {input}") # Handle streaming response if streaming_response: diff --git a/llm_on_ray/inference/predictors/hpu_predictor.py b/llm_on_ray/inference/predictors/hpu_predictor.py index 5e19c8733..97ec1dcb1 100644 --- a/llm_on_ray/inference/predictors/hpu_predictor.py +++ b/llm_on_ray/inference/predictors/hpu_predictor.py @@ -69,6 +69,9 @@ MllmPromptInput, ) from llm_on_ray.inference.utils import decide_torch_dtype +from llm_on_ray.inference.logger import get_logger + +logger = get_logger(__name__) class HPUPredictor(Predictor): @@ -79,9 +82,16 @@ def __init__(self, infer_conf: InferenceConfig): # decide correct torch dtype for loading HF model decide_torch_dtype(infer_conf) + logger.debug(f"Print inference config: {infer_conf}") + self.use_lazy_mode = not infer_conf.hpu_model_config.torch_compile self.use_hpu_graphs = infer_conf.hpu_model_config.use_hpu_graphs + # optimize transformers for gaudi + from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi + + adapt_transformers_to_gaudi() + if infer_conf.deepspeed: # DeepSpeed is enabled, start worker group # Prepare placement group @@ -105,13 +115,6 @@ def __init__(self, infer_conf: InferenceConfig): htcore.hpu_set_env() - # Tweak transformer to optimize performance on Gaudi - from optimum.habana.transformers.modeling_utils import ( - adapt_transformers_to_gaudi, - ) - - adapt_transformers_to_gaudi() - self.device = torch.device("hpu") model = AutoModelForCausalLM.from_pretrained( model_desc.model_id_or_path, **model_desc.config.dict() @@ -219,6 +222,7 @@ def generate(self, input: GenerateInput, **config) -> GenerateOutput: def streaming_generate(self, prompt, streamer, **config): self._process_config(config) + # Q1: Why it is handled here when using both deepspeed and hpu? if self.infer_conf.deepspeed: self.deepspeed_workers[0].streaming_generate.remote(prompt, streamer, **config) for worker in self.deepspeed_workers[1:]: @@ -284,10 +288,6 @@ def load_model_and_tokenizer(self): self.world_size = int(os.environ["WORLD_SIZE"]) self.local_rank = int(os.environ["LOCAL_RANK"]) self.device = torch.device("hpu") - # optimize transformers for gaudi - from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi - - adapt_transformers_to_gaudi() self.load_model() model_desc = self.infer_conf.model_description self.tokenizer = load_tokenizer(self.model, model_desc.tokenizer_name_or_path) diff --git a/llm_on_ray/inference/serve.py b/llm_on_ray/inference/serve.py index ecd3bdee8..5304c7b3b 100644 --- a/llm_on_ray/inference/serve.py +++ b/llm_on_ray/inference/serve.py @@ -25,6 +25,9 @@ InferenceConfig, all_models, ) +from llm_on_ray.inference.logger import get_logger + +logger = get_logger(__name__) def get_deployed_models(args): @@ -41,14 +44,16 @@ def get_deployed_models(args): set(all_models_name) ), f"models must be a subset of {all_models_name} predefined by inference/models/*.yaml, but found {models}." 
             model_list = {model: all_models[model] for model in models}
+            logger.debug(
+                f"--config_file is not set while --models is set, serving model(s): {model_list}"
+            )
         else:
             model_list = all_models
+            logger.debug(f"--config_file and --models are not set, serving all models: {model_list}")
     else:
-        # config_file has precedence over others
-        if args.config_file:
-            print("Reading from config file, " + args.config_file)
-            with open(args.config_file, "r") as f:
-                infer_conf = parse_yaml_raw_as(InferenceConfig, f)
+        print("DEBUG:serve.py: Reading from config file, " + args.config_file)
+        with open(args.config_file, "r") as f:
+            infer_conf = parse_yaml_raw_as(InferenceConfig, f)
         model_list = {}
         model_list[infer_conf.name] = infer_conf
 
@@ -147,6 +152,8 @@ def main(argv=None):
     ray.init(address="auto")
 
     deployments, model_list = get_deployed_models(args)
+    logger.debug(f"Service is running with deployments: {str(deployments)}")
+    logger.debug(f"Service is running models: {str(model_list)}")
     if args.simple:
         # provide simple model endpoint
         # models can be served to customed URLs according to configuration files.
@@ -156,8 +163,6 @@ def main(argv=None):
         # all models are served under the same URL and then accessed
         # through model_id, so it needs to pass in a unified URL.
         host = "127.0.0.1" if args.serve_local_only else "0.0.0.0"
-        print("Service is running with deployments:" + str(deployments))
-        print("Service is running models:" + str(model_list))
         openai_serve_run(deployments, model_list, host, "/", args.port, args.max_ongoing_requests)
 
     msg = "Service is deployed successfully."