-
Notifications
You must be signed in to change notification settings - Fork 38
Poll Based Waiting for Job Completion #670
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
7c633a6
45ef45b
cc12ebb
a6c5799
e3fe1f3
c624565
fb8715a
98b745f
d1eafc8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,112 @@ | ||
| # BSD 2-Clause License | ||
| # | ||
| # Copyright (c) 2021-2024, Hewlett Packard Enterprise | ||
| # All rights reserved. | ||
| # | ||
| # Redistribution and use in source and binary forms, with or without | ||
| # modification, are permitted provided that the following conditions are met: | ||
| # | ||
| # 1. Redistributions of source code must retain the above copyright notice, this | ||
| # list of conditions and the following disclaimer. | ||
| # | ||
| # 2. Redistributions in binary form must reproduce the above copyright notice, | ||
| # this list of conditions and the following disclaimer in the documentation | ||
| # and/or other materials provided with the distribution. | ||
| # | ||
| # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | ||
| # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
| # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | ||
| # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE | ||
| # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
| # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | ||
| # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | ||
| # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | ||
| # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
| # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import time | ||
| import typing as t | ||
|
|
||
Seconds = t.NewType("Seconds", float)


class SynchronousTimeInterval:
    """A utility class to represent and synchronously block the execution of a
    thread for an interval of time.

    The interval starts counting down as soon as the instance is created.
    """

    def __init__(self, delta: float | None) -> None:
        """Initialize a new `SynchronousTimeInterval` interval

        :param delta: The difference in time the interval represents in
            seconds. If `None`, the interval will represent an infinite amount
            of time.
        :raises ValueError: The `delta` is negative or is not a number (NaN)
        """
        if delta is None:
            delta = float("inf")
        elif delta != delta:
            # NaN is the only value that is not equal to itself. It has to be
            # rejected explicitly: `nan < 0` is `False`, so it would slip past
            # the negativity check below and produce an interval that never
            # expires and cannot be slept on.
            raise ValueError("Timeout value cannot be NaN")
        elif delta < 0:
            raise ValueError("Timeout value cannot be less than 0")
        # The amount of time, in seconds, the interval spans.
        self._delta = Seconds(delta)
        # The `time.perf_counter` reading taken when the interval was created.
        self._start = time.perf_counter()

    @property
    def delta(self) -> Seconds:
        """The difference in time the interval represents

        :returns: The difference in time the interval represents
        """
        return self._delta

    @property
    def elapsed(self) -> Seconds:
        """The amount of time that has passed since the interval was created

        :returns: The amount of time that has passed since the interval was
            created
        """
        return Seconds(time.perf_counter() - self._start)

    @property
    def remaining(self) -> Seconds:
        """The amount of time remaining in the interval

        :returns: The amount of time remaining in the interval, never less
            than zero
        """
        # Clamp at zero so an interval that has run out never reports a
        # negative amount of time.
        return Seconds(max(self.delta - self.elapsed, 0.0))

    @property
    def expired(self) -> bool:
        """Whether the interval has fully elapsed

        :returns: `True` if there is no time remaining in the interval,
            `False` otherwise
        """
        return self.remaining <= 0

    @property
    def infinite(self) -> bool:
        """Return true if the timeout interval is infinitely long

        :returns: `True` if the delta is infinite, `False` otherwise
        """
        # An infinite delta never shrinks, so the stored delta alone answers
        # the question without having to read the clock.
        return self._delta == float("inf")

    def new_interval(self) -> SynchronousTimeInterval:
        """Make a new timeout with the same interval

        The new interval starts counting from the moment it is created.

        :returns: The new time interval
        """
        return type(self)(self.delta)

    def block(self) -> None:
        """Block the thread until the timeout completes

        :raises RuntimeError: The thread would be blocked forever
        """
        if self.infinite:
            raise RuntimeError("Cannot block thread forever")
        time.sleep(self.remaining)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,8 +24,6 @@ | |
| # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
| # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
|
|
||
| # pylint: disable=too-many-lines | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import datetime | ||
|
|
@@ -39,9 +37,11 @@ | |
|
|
||
| from smartsim._core import dispatch | ||
| from smartsim._core.config import CONFIG | ||
| from smartsim._core.control import interval as _interval | ||
MattToast marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| from smartsim._core.control.launch_history import LaunchHistory as _LaunchHistory | ||
| from smartsim._core.utils import helpers as _helpers | ||
| from smartsim.error import errors | ||
| from smartsim.status import InvalidJobStatus, JobStatus | ||
| from smartsim.status import TERMINAL_STATUSES, InvalidJobStatus, JobStatus | ||
|
|
||
| from ._core import Generator, Manifest, previewrenderer | ||
| from .entity import TelemetryConfiguration | ||
|
|
@@ -254,6 +254,84 @@ def get_status( | |
| stats = (stats_map.get(i, InvalidJobStatus.NEVER_STARTED) for i in ids) | ||
| return tuple(stats) | ||
|
|
||
def wait(
    self, *ids: LaunchedJobID, timeout: float | None = None, verbose: bool = True
) -> None:
    """Pause the caller until every given launched job has reached a
    terminal status.

    :param ids: The ids of the launched jobs to wait for.
    :param timeout: The max time to wait for all of the launched jobs to end.
    :param verbose: Whether found statuses should be displayed in the console.
    :raises ValueError: No IDs were provided.
    :raises TimeoutError: Propagated from the status polling when the jobs
        do not all reach a terminal status before `timeout` runs out.
    """
    if ids:
        # The polled statuses themselves are of no interest here; only
        # returning without an exception matters.
        self._poll_for_statuses(
            ids, TERMINAL_STATUSES, timeout=timeout, verbose=verbose
        )
        return
    raise ValueError("No job ids to wait on provided")
|
|
||
def _poll_for_statuses(
    self,
    ids: t.Sequence[LaunchedJobID],
    statuses: t.Collection[JobStatus],
    timeout: float | None = None,
    interval: float = 5.0,
    verbose: bool = True,
) -> dict[LaunchedJobID, JobStatus | InvalidJobStatus]:
    """Poll the experiment's launchers for the statuses of the launched
    jobs with the provided ids, until the status of every job changes to
    one of the provided statuses.

    The launchers are always polled at least once, and are polled one last
    time after the final wait, so a job that reaches one of the statuses
    right before the timeout runs out is still reported.

    :param ids: The ids of the launched jobs to wait for.
    :param statuses: A collection of statuses to poll for. Any
        `InvalidJobStatus` also ends the polling of a job.
    :param timeout: The maximum amount of time, in seconds, to spend
        waiting for all jobs to reach one of the supplied statuses. If not
        supplied or `None`, the experiment will poll indefinitely.
    :param interval: The time, in seconds, to wait between polling
        launchers. The last wait is shortened so that it does not run past
        `timeout`.
    :param verbose: Whether or not to log polled states to the console.
    :raises ValueError: The interval between polling launchers is infinite,
        or `timeout` or `interval` is negative
    :raises TimeoutError: The timeout was exceeded before every job reached
        one of the supplied statuses.
    :returns: A mapping of ids to the status they entered that ended
        polling.
    """
    method_timeout = _interval.SynchronousTimeInterval(timeout)
    iter_timeout = _interval.SynchronousTimeInterval(interval)
    # Fail fast: an infinite polling interval could never be slept on.
    if iter_timeout.infinite:
        raise ValueError("Polling interval cannot be infinite")

    terminal = frozenset(itertools.chain(statuses, InvalidJobStatus))
    log = logger.info if verbose else lambda *_, **__: None
    final: dict[LaunchedJobID, JobStatus | InvalidJobStatus] = {}

    def is_finished(
        id_: LaunchedJobID, status: JobStatus | InvalidJobStatus
    ) -> bool:
        # Report the polled status, then tell the caller whether it ends the
        # polling of this job.
        job_title = f"Job({id_}): "
        if done := status in terminal:
            log(f"{job_title}Finished with status '{status.value}'")
        else:
            log(f"{job_title}Running with status '{status.value}'")
        return done

    # Jobs that have not yet been seen in one of the requested statuses.
    pending = tuple(ids)
    while pending:
        # Restart the per-iteration clock so that the time spent polling
        # counts against the interval.
        iter_timeout = iter_timeout.new_interval()
        stats = zip(pending, self.get_status(*pending))
        is_done = _helpers.group_by(_helpers.pack_params(is_finished), stats)
        final |= dict(is_done.get(True, ()))
        pending = tuple(id_ for id_, _ in is_done.get(False, ()))
        # The expiry check comes after the poll, not before it: this
        # guarantees at least one poll (e.g. `timeout=0`) and a final poll
        # after the last, shortened, wait.
        if not pending or method_timeout.expired:
            break
        # Sleep for whichever runs out first: the polling interval or the
        # overall timeout.
        shorter = (
            iter_timeout
            if iter_timeout.remaining < method_timeout.remaining
            else method_timeout
        )
        shorter.block()
    if pending:
        raise TimeoutError(
            f"Job ID(s) {', '.join(map(str, pending))} failed to reach "
            "terminal status before timeout"
        )
    return final
|
|
||
| @_contextualize | ||
| def _generate( | ||
| self, generator: Generator, job: Job, job_index: int | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.