From f5c84b60dc96fdefca1a2d9abf431f426c36248c Mon Sep 17 00:00:00 2001 From: ChristianZaccaria Date: Mon, 25 Sep 2023 12:25:12 +0100 Subject: [PATCH 1/2] Check for dashboard readiness after cluster is ready --- src/codeflare_sdk/cluster/cluster.py | 18 +++++++++++++----- tests/unit_test.py | 2 +- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index 99b11582d..d9659079b 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -274,19 +274,27 @@ def wait_ready(self, timeout: Optional[int] = None): dashboard_ready = False status = None time = 0 - while not ready or not dashboard_ready: + while not ready: status, ready = self.status(print_to_console=False) - dashboard_ready = self.is_dashboard_ready() if status == CodeFlareClusterStatus.UNKNOWN: print( "WARNING: Current cluster status is unknown, have you run cluster.up yet?" ) - if not ready or not dashboard_ready: + if not ready: + if timeout and time >= timeout: + raise TimeoutError(f"wait() timed out after waiting {timeout}s for cluster to be ready") + sleep(5) + time += 5 + print("Requested cluster is up and running!") + + while not dashboard_ready: + dashboard_ready = self.is_dashboard_ready() + if not dashboard_ready: if timeout and time >= timeout: - raise TimeoutError(f"wait() timed out after waiting {timeout}s") + raise TimeoutError(f"wait() timed out after waiting {timeout}s for dashboard to be ready") sleep(5) time += 5 - print("Requested cluster and dashboard are up and running!") + print("Dashboard is ready!") def details(self, print_to_console: bool = True) -> RayCluster: cluster = _copy_to_ray(self) diff --git a/tests/unit_test.py b/tests/unit_test.py index 4a8e2f441..0b8e79bb3 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -1794,7 +1794,7 @@ def test_wait_ready(mocker, capsys): captured = capsys.readouterr() assert ( captured.out - == "Waiting for requested resources to be set up...\nRequested cluster and dashboard are up and running!\n" + == "Waiting for requested resources to be set up...\nRequested cluster is up and running!\nDashboard is ready!\n" ) From 758a891f51f2ee3b49cec825d158e29e2eb4642d Mon Sep 17 00:00:00 2001 From: ChristianZaccaria Date: Tue, 26 Sep 2023 17:27:28 +0100 Subject: [PATCH 2/2] Add argument dashboard_check bool and checks --- src/codeflare_sdk/cluster/cluster.py | 15 ++++++++++----- tests/unit_test.py | 6 ++++++ 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index d9659079b..5d00cdae8 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -264,7 +264,7 @@ def is_dashboard_ready(self) -> bool: else: return False - def wait_ready(self, timeout: Optional[int] = None): + def wait_ready(self, timeout: Optional[int] = None, dashboard_check: bool = True): """ Waits for requested cluster to be ready, up to an optional timeout (s). Checks every five seconds. @@ -282,19 +282,24 @@ def wait_ready(self, timeout: Optional[int] = None): ) if not ready: if timeout and time >= timeout: - raise TimeoutError(f"wait() timed out after waiting {timeout}s for cluster to be ready") + raise TimeoutError( + f"wait() timed out after waiting {timeout}s for cluster to be ready" + ) sleep(5) time += 5 print("Requested cluster is up and running!") - while not dashboard_ready: + while dashboard_check and not dashboard_ready: dashboard_ready = self.is_dashboard_ready() if not dashboard_ready: if timeout and time >= timeout: - raise TimeoutError(f"wait() timed out after waiting {timeout}s for dashboard to be ready") + raise TimeoutError( + f"wait() timed out after waiting {timeout}s for dashboard to be ready" + ) sleep(5) time += 5 - print("Dashboard is ready!") + if dashboard_ready: + print("Dashboard is ready!") def details(self, print_to_console: bool = True) -> RayCluster: cluster = _copy_to_ray(self) diff --git a/tests/unit_test.py b/tests/unit_test.py index 0b8e79bb3..78925226a 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -1796,6 +1796,12 @@ def test_wait_ready(mocker, capsys): captured.out == "Waiting for requested resources to be set up...\nRequested cluster is up and running!\nDashboard is ready!\n" ) + cf.wait_ready(dashboard_check=False) + captured = capsys.readouterr() + assert ( + captured.out + == "Waiting for requested resources to be set up...\nRequested cluster is up and running!\n" + ) def test_jobdefinition_coverage():