diff --git a/llm_on_ray/inference/api_server_openai.py b/llm_on_ray/inference/api_server_openai.py
index 4bc42e99b..a91f9f112 100644
--- a/llm_on_ray/inference/api_server_openai.py
+++ b/llm_on_ray/inference/api_server_openai.py
@@ -57,12 +57,11 @@ def router_application(deployments, max_concurrent_queries):
 
 def openai_serve_run(deployments, host, route_prefix, port, max_concurrent_queries):
     router_app = router_application(deployments, max_concurrent_queries)
+    serve.start(http_options={"host": host, "port": port})
     serve.run(
         router_app,
         name="router",
         route_prefix=route_prefix,
-        host=host,
-        _blocking=True,
     ).options(
         stream=True,
         use_new_handle_api=True,
diff --git a/llm_on_ray/inference/api_server_simple.py b/llm_on_ray/inference/api_server_simple.py
index 0663700d8..f2cf0a1e7 100644
--- a/llm_on_ray/inference/api_server_simple.py
+++ b/llm_on_ray/inference/api_server_simple.py
@@ -22,11 +22,10 @@ def serve_run(deployments, model_list):
     for model_id, infer_conf in model_list.items():
         print("deploy model: ", model_id)
         deployment = deployments[model_id]
+
+        serve.start(http_options={"host": infer_conf.host, "port": infer_conf.port})
         serve.run(
             deployment,
-            _blocking=True,
-            host=infer_conf.host,
-            port=infer_conf.port,
             name=infer_conf.name,
             route_prefix=infer_conf.route_prefix,
         )
diff --git a/llm_on_ray/inference/torch_dist.py b/llm_on_ray/inference/torch_dist.py
index c99baf0c0..91358db03 100644
--- a/llm_on_ray/inference/torch_dist.py
+++ b/llm_on_ray/inference/torch_dist.py
@@ -44,7 +44,7 @@
 import ray
 from ray.actor import ActorHandle
 from ray.train._internal.utils import get_address_and_port
-from ray.air._internal.torch_utils import get_device
+from ray.air._internal.torch_utils import get_devices
 from ray._private.accelerators.hpu import HPU_PACKAGE_AVAILABLE
 
 if HPU_PACKAGE_AVAILABLE:
@@ -212,7 +212,7 @@ def _shutdown_torch_distributed():
         return
 
     # Clean up cuda memory.
-    devices = get_device()
+    devices = get_devices()
     for device in devices:
         with torch.cuda.device(device):
             torch.cuda.empty_cache()
diff --git a/pyproject.toml b/pyproject.toml
index 4cd11d4a5..e9462638f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,11 +21,10 @@ dependencies = [
     "accelerate",
     "datasets>=2.14.6",
     "numpy",
-    "ray<2.10",
+    "ray>=2.10",
+    "ray[serve,tune]>=2.10",
     "typing>=3.7.4.3",
     "tabulate",
-    "ray[tune]",
-    "ray[serve]",
     "gymnasium",
     "dm-tree",
     "tensorboard",
@@ -35,7 +34,8 @@ dependencies = [
     "deltatuner==1.1.9",
     "py-cpuinfo",
     "pydantic-yaml",
-    "async-timeout"
+    "async_timeout",
+    "typer"
 ]
 
 [project.optional-dependencies]
@@ -85,4 +85,4 @@ llm_on_ray-pretrain = "llm_on_ray.pretrain.pretrain:main"
 llm_on_ray-megatron_deepspeed_pretrain = "llm_on_ray.pretrain.megatron_deepspeed_pretrain:main"
 
 [tool.black]
-line-length = 100
+line-length = 100
\ No newline at end of file