Merged
3 changes: 1 addition & 2 deletions llm_on_ray/inference/api_server_openai.py
@@ -57,12 +57,11 @@ def router_application(deployments, max_concurrent_queries):
 def openai_serve_run(deployments, host, route_prefix, port, max_concurrent_queries):
     router_app = router_application(deployments, max_concurrent_queries)

+    serve.start(http_options={"host": host, "port": port})
     serve.run(
         router_app,
         name="router",
         route_prefix=route_prefix,
-        host=host,
-        _blocking=True,
     ).options(
         stream=True,
         use_new_handle_api=True,
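For readers migrating similar code, here is a minimal sketch of what openai_serve_run looks like after this change, assuming Ray Serve >= 2.10, where the HTTP host and port are configured through serve.start(http_options=...) and serve.run no longer accepts host or _blocking. router_application is the factory defined earlier in this file; the returned address and the omission of the trailing .options(...) call are illustrative choices, not code from the PR.

from ray import serve


def openai_serve_run(deployments, host, route_prefix, port, max_concurrent_queries):
    # Build the Serve router app from the deployments (router_application is defined above in this file).
    router_app = router_application(deployments, max_concurrent_queries)

    # Ray >= 2.10: the HTTP proxy is configured here instead of via serve.run(host=...).
    serve.start(http_options={"host": host, "port": port})
    serve.run(
        router_app,
        name="router",
        route_prefix=route_prefix,
    )
    # Illustrative convenience return; not part of the diff shown above.
    return f"http://{host}:{port}{route_prefix}"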
5 changes: 2 additions & 3 deletions llm_on_ray/inference/api_server_simple.py
@@ -22,11 +22,10 @@ def serve_run(deployments, model_list):
     for model_id, infer_conf in model_list.items():
         print("deploy model: ", model_id)
         deployment = deployments[model_id]

+        serve.start(http_options={"host": infer_conf.host, "port": infer_conf.port})
         serve.run(
             deployment,
-            _blocking=True,
Contributor comment:
Can we pass keep_serve_terminal through to the new blocking parameter in serve.run? It would also be better to rename keep_serve_terminal to blocking in our script, which would be more consistent and easier to understand.

@xwu-intel (Author), Apr 8, 2024:
Let's keep the current logic for this PR, since we print something after this call. A separate PR can address the parameter; in the new version, the user should press Ctrl-C to stop the program after the blocking call.
-            host=infer_conf.host,
-            port=infer_conf.port,
             name=infer_conf.name,
             route_prefix=infer_conf.route_prefix,
         )
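The reviewer's deferred suggestion could look roughly like the sketch below, assuming Ray >= 2.10 where serve.run exposes a public blocking parameter. The blocking argument stands in for the script's keep_serve_terminal flag, only the last deployment blocks so the loop can still deploy every model, and none of this is code from the PR.

from ray import serve


def serve_run(deployments, model_list, blocking=False):
    model_ids = list(model_list)
    for i, model_id in enumerate(model_ids):
        infer_conf = model_list[model_id]
        print("deploy model: ", model_id)
        deployment = deployments[model_id]

        serve.start(http_options={"host": infer_conf.host, "port": infer_conf.port})
        serve.run(
            deployment,
            name=infer_conf.name,
            route_prefix=infer_conf.route_prefix,
            # Block only on the final deployment; with blocking=True the call does not
            # return, so the user stops serving with Ctrl-C.
            blocking=blocking and i == len(model_ids) - 1,
        )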
4 changes: 2 additions & 2 deletions llm_on_ray/inference/torch_dist.py
@@ -44,7 +44,7 @@
 import ray
 from ray.actor import ActorHandle
 from ray.train._internal.utils import get_address_and_port
-from ray.air._internal.torch_utils import get_device
+from ray.air._internal.torch_utils import get_devices
 from ray._private.accelerators.hpu import HPU_PACKAGE_AVAILABLE

 if HPU_PACKAGE_AVAILABLE:
@@ -212,7 +212,7 @@ def _shutdown_torch_distributed():
         return

     # Clean up cuda memory.
-    devices = get_device()
+    devices = get_devices()
     for device in devices:
         with torch.cuda.device(device):
             torch.cuda.empty_cache()
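Because the PR pins Ray >= 2.10 (see pyproject.toml below), the rename can be adopted directly. As a hedged sketch only (not taken from this PR), code that has to tolerate both Ray versions could wrap the renamed internal helper, assuming the pre-2.10 get_device() may return either a single torch.device or a list, while get_devices() in Ray >= 2.10 returns a list.

import torch

try:
    from ray.air._internal.torch_utils import get_devices  # Ray >= 2.10
except ImportError:  # older Ray only ships get_device()
    from ray.air._internal.torch_utils import get_device as _get_device

    def get_devices():
        # Normalize the old return type to a list of devices.
        result = _get_device()
        return result if isinstance(result, list) else [result]


def empty_cuda_caches() -> None:
    # Release cached CUDA memory on every device assigned to this worker.
    if not torch.cuda.is_available():
        return
    for device in get_devices():
        with torch.cuda.device(device):
            torch.cuda.empty_cache()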
10 changes: 5 additions & 5 deletions pyproject.toml
@@ -21,11 +21,10 @@ dependencies = [
"accelerate",
"datasets>=2.14.6",
"numpy",
"ray<2.10",
"ray>=2.10",
"ray[serve,tune]>=2.10",
"typing>=3.7.4.3",
"tabulate",
"ray[tune]",
"ray[serve]",
"gymnasium",
"dm-tree",
"tensorboard",
@@ -35,7 +34,8 @@ dependencies = [
"deltatuner==1.1.9",
"py-cpuinfo",
"pydantic-yaml",
"async-timeout"
"async_timeout",
"typer"
]

[project.optional-dependencies]
@@ -85,4 +85,4 @@ llm_on_ray-pretrain = "llm_on_ray.pretrain.pretrain:main"
 llm_on_ray-megatron_deepspeed_pretrain = "llm_on_ray.pretrain.megatron_deepspeed_pretrain:main"

 [tool.black]
-line-length = 100
+line-length = 100
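Environments built against the old ray<2.10 pin will not have the serve.start/serve.run behavior used above. A hedged sketch of a startup guard (not part of this PR, and assuming the packaging library is available) could fail fast with an actionable message:

import importlib.metadata

from packaging.version import Version

MIN_RAY = Version("2.10")


def check_ray_version() -> None:
    # Abort early if the installed Ray predates the APIs this project now relies on.
    installed = Version(importlib.metadata.version("ray"))
    if installed < MIN_RAY:
        raise RuntimeError(
            f"ray>={MIN_RAY} is required, but {installed} is installed; "
            "upgrade with: pip install --upgrade 'ray[serve,tune]>=2.10'"
        )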