Merged
3 changes: 1 addition & 2 deletions llm_on_ray/inference/api_server_openai.py
@@ -57,12 +57,11 @@ def router_application(deployments, max_concurrent_queries):
 def openai_serve_run(deployments, host, route_prefix, port, max_concurrent_queries):
     router_app = router_application(deployments, max_concurrent_queries)

+    serve.start(http_options={"host": host, "port": port})
     serve.run(
         router_app,
         name="router",
         route_prefix=route_prefix,
-        host=host,
-        _blocking=True,
     ).options(
         stream=True,
         use_new_handle_api=True,
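For readers migrating similar code, here is a minimal sketch of what openai_serve_run looks like after this change, assuming Ray Serve >= 2.10, where the HTTP host and port are configured through serve.start(http_options=...) and serve.run no longer accepts host or _blocking. router_application is the factory defined earlier in this file; the returned address and the omission of the trailing .options(...) call are illustrative choices, not code from the PR.

from ray import serve


def openai_serve_run(deployments, host, route_prefix, port, max_concurrent_queries):
    # Build the Serve router app from the deployments (router_application is defined above in this file).
    router_app = router_application(deployments, max_concurrent_queries)

    # Ray >= 2.10: the HTTP proxy is configured here instead of via serve.run(host=...).
    serve.start(http_options={"host": host, "port": port})
    serve.run(
        router_app,
        name="router",
        route_prefix=route_prefix,
    )
    # Illustrative convenience return; not part of the diff shown above.
    return f"http://{host}:{port}{route_prefix}"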
5 changes: 2 additions & 3 deletions llm_on_ray/inference/api_server_simple.py
@@ -22,11 +22,10 @@ def serve_run(deployments, model_list):
     for model_id, infer_conf in model_list.items():
         print("deploy model: ", model_id)
         deployment = deployments[model_id]

+        serve.start(http_options={"host": infer_conf.host, "port": infer_conf.port})
         serve.run(
             deployment,
-            _blocking=True,
Contributor comment:
Can we pass keep_serve_terminal through to the new blocking parameter in serve.run? It would also be better to rename keep_serve_terminal to blocking in our script, which would be more consistent and easier to understand.

@xwu-intel (Author), Apr 8, 2024:
Let's keep the current logic for this PR, since we print something after this call. A separate PR can address the parameter; in the new version, the user should press Ctrl-C to stop the program after the blocking call.
-            host=infer_conf.host,
-            port=infer_conf.port,
             name=infer_conf.name,
             route_prefix=infer_conf.route_prefix,
         )
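The reviewer's deferred suggestion could look roughly like the sketch below, assuming Ray >= 2.10 where serve.run exposes a public blocking parameter. The blocking argument stands in for the script's keep_serve_terminal flag, only the last deployment blocks so the loop can still deploy every model, and none of this is code from the PR.

from ray import serve


def serve_run(deployments, model_list, blocking=False):
    model_ids = list(model_list)
    for i, model_id in enumerate(model_ids):
        infer_conf = model_list[model_id]
        print("deploy model: ", model_id)
        deployment = deployments[model_id]

        serve.start(http_options={"host": infer_conf.host, "port": infer_conf.port})
        serve.run(
            deployment,
            name=infer_conf.name,
            route_prefix=infer_conf.route_prefix,
            # Block only on the final deployment; with blocking=True the call does not
            # return, so the user stops serving with Ctrl-C.
            blocking=blocking and i == len(model_ids) - 1,
        )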
4 changes: 2 additions & 2 deletions llm_on_ray/inference/torch_dist.py
@@ -44,7 +44,7 @@
 import ray
 from ray.actor import ActorHandle
 from ray.train._internal.utils import get_address_and_port
-from ray.air._internal.torch_utils import get_device
+from ray.air._internal.torch_utils import get_devices
 from ray._private.accelerators.hpu import HPU_PACKAGE_AVAILABLE

 if HPU_PACKAGE_AVAILABLE:
@@ -212,7 +212,7 @@ def _shutdown_torch_distributed():
         return

     # Clean up cuda memory.
-    devices = get_device()
+    devices = get_devices()
     for device in devices:
         with torch.cuda.device(device):
             torch.cuda.empty_cache()
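Because the PR pins Ray >= 2.10 (see pyproject.toml below), the rename can be adopted directly. As a hedged sketch only (not taken from this PR), code that has to tolerate both Ray versions could wrap the renamed internal helper, assuming the pre-2.10 get_device() may return either a single torch.device or a list, while get_devices() in Ray >= 2.10 returns a list.

import torch

try:
    from ray.air._internal.torch_utils import get_devices  # Ray >= 2.10
except ImportError:  # older Ray only ships get_device()
    from ray.air._internal.torch_utils import get_device as _get_device

    def get_devices():
        # Normalize the old return type to a list of devices.
        result = _get_device()
        return result if isinstance(result, list) else [result]


def empty_cuda_caches() -> None:
    # Release cached CUDA memory on every device assigned to this worker.
    if not torch.cuda.is_available():
        return
    for device in get_devices():
        with torch.cuda.device(device):
            torch.cuda.empty_cache()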
10 changes: 5 additions & 5 deletions pyproject.toml
@@ -21,11 +21,10 @@ dependencies = [
"accelerate",
"datasets>=2.14.6",
"numpy",
"ray<2.10",
"ray>=2.10",
"ray[serve,tune]>=2.10",
"typing>=3.7.4.3",
"tabulate",
"ray[tune]",
"ray[serve]",
"gymnasium",
"dm-tree",
"tensorboard",
@@ -35,7 +34,8 @@ dependencies = [
"deltatuner==1.1.9",
"py-cpuinfo",
"pydantic-yaml",
"async-timeout"
"async_timeout",
"typer"
]

[project.optional-dependencies]
@@ -85,4 +85,4 @@ llm_on_ray-pretrain = "llm_on_ray.pretrain.pretrain:main"
 llm_on_ray-megatron_deepspeed_pretrain = "llm_on_ray.pretrain.megatron_deepspeed_pretrain:main"

 [tool.black]
-line-length = 100
+line-length = 100
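Environments built against the old ray<2.10 pin will not have the serve.start/serve.run behavior used above. A hedged sketch of a startup guard (not part of this PR, and assuming the packaging library is available) could fail fast with an actionable message:

import importlib.metadata

from packaging.version import Version

MIN_RAY = Version("2.10")


def check_ray_version() -> None:
    # Abort early if the installed Ray predates the APIs this project now relies on.
    installed = Version(importlib.metadata.version("ray"))
    if installed < MIN_RAY:
        raise RuntimeError(
            f"ray>={MIN_RAY} is required, but {installed} is installed; "
            "upgrade with: pip install --upgrade 'ray[serve,tune]>=2.10'"
        )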