diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 88f270ba78..99f8cab2b9 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -21,13 +21,11 @@ build: fi pre_create_environment: - git clone --depth 1 https://github.com/CrayLabs/SmartRedis.git smartredis - - git clone --depth 1 https://github.com/CrayLabs/SmartDashboard.git smartdashboard post_create_environment: - python -m pip install .[dev,docs] - cd smartredis; python -m pip install . - cd smartredis/doc; doxygen Doxyfile_c; doxygen Doxyfile_cpp; doxygen Doxyfile_fortran - ln -s smartredis/examples ./examples - - cd smartdashboard; python -m pip install . pre_build: - pip install typing_extensions==4.8.0 - pip install pydantic==1.10.13 diff --git a/conftest.py b/conftest.py index ae0a0d06ef..721f99a4d3 100644 --- a/conftest.py +++ b/conftest.py @@ -26,7 +26,6 @@ from __future__ import annotations -import asyncio from collections import defaultdict from dataclasses import dataclass import json @@ -43,7 +42,6 @@ import uuid import warnings from subprocess import run -import time import psutil import pytest @@ -51,10 +49,8 @@ import smartsim from smartsim import Experiment from smartsim._core.launcher.dragon.dragonConnector import DragonConnector -from smartsim._core.launcher.dragon.dragonLauncher import DragonLauncher from smartsim._core.config import CONFIG from smartsim._core.config.config import Config -from smartsim._core.utils.telemetry.telemetry import JobEntity from smartsim.database import Orchestrator from smartsim.entity import Model from smartsim.error import SSConfigError, SSInternalError @@ -68,6 +64,7 @@ RunSettings, SrunSettings, ) +from collections.abc import Callable, Collection logger = get_logger(__name__) @@ -83,7 +80,7 @@ test_alloc_specs_path = os.getenv("SMARTSIM_TEST_ALLOC_SPEC_SHEET_PATH", None) test_ports = CONFIG.test_ports test_account = CONFIG.test_account or "" -test_batch_resources: t.Dict[t.Any, t.Any] = CONFIG.test_batch_resources +test_batch_resources: dict[t.Any, t.Any] = CONFIG.test_batch_resources test_output_dirs = 0 mpi_app_exe = None built_mpi_app = False @@ -173,7 +170,7 @@ def pytest_sessionfinish( kill_all_test_spawned_processes() -def build_mpi_app() -> t.Optional[pathlib.Path]: +def build_mpi_app() -> pathlib.Path | None: global built_mpi_app built_mpi_app = True cc = shutil.which("cc") @@ -194,7 +191,7 @@ def build_mpi_app() -> t.Optional[pathlib.Path]: return None @pytest.fixture(scope="session") -def mpi_app_path() -> t.Optional[pathlib.Path]: +def mpi_app_path() -> pathlib.Path | None: """Return path to MPI app if it was built return None if it could not or will not be built @@ -227,7 +224,7 @@ def kill_all_test_spawned_processes() -> None: -def get_hostlist() -> t.Optional[t.List[str]]: +def get_hostlist() -> list[str] | None: global test_hostlist if not test_hostlist: if "PBS_NODEFILE" in os.environ and test_launcher == "pals": @@ -255,14 +252,14 @@ def get_hostlist() -> t.Optional[t.List[str]]: return test_hostlist -def _parse_hostlist_file(path: str) -> t.List[str]: +def _parse_hostlist_file(path: str) -> list[str]: with open(path, "r", encoding="utf-8") as nodefile: return list({line.strip() for line in nodefile.readlines()}) @pytest.fixture(scope="session") -def alloc_specs() -> t.Dict[str, t.Any]: - specs: t.Dict[str, t.Any] = {} +def alloc_specs() -> dict[str, t.Any]: + specs: dict[str, t.Any] = {} if test_alloc_specs_path: try: with open(test_alloc_specs_path, encoding="utf-8") as spec_file: @@ -297,7 +294,7 @@ def _reset(): ) -def _find_free_port(ports: 
t.Collection[int]) -> int: +def _find_free_port(ports: Collection[int]) -> int: with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: for port in ports: try: @@ -314,7 +311,7 @@ def _find_free_port(ports: t.Collection[int]) -> int: @pytest.fixture(scope="session") -def wlmutils() -> t.Type[WLMUtils]: +def wlmutils() -> type[WLMUtils]: return WLMUtils @@ -339,22 +336,22 @@ def get_test_account() -> str: return get_account() @staticmethod - def get_test_interface() -> t.List[str]: + def get_test_interface() -> list[str]: return test_nic @staticmethod - def get_test_hostlist() -> t.Optional[t.List[str]]: + def get_test_hostlist() -> list[str] | None: return get_hostlist() @staticmethod - def get_batch_resources() -> t.Dict: + def get_batch_resources() -> dict: return test_batch_resources @staticmethod def get_base_run_settings( - exe: str, args: t.List[str], nodes: int = 1, ntasks: int = 1, **kwargs: t.Any + exe: str, args: list[str], nodes: int = 1, ntasks: int = 1, **kwargs: t.Any ) -> RunSettings: - run_args: t.Dict[str, t.Union[int, str, float, None]] = {} + run_args: dict[str, int | str | float | None] = {} if test_launcher == "slurm": run_args = {"--nodes": nodes, "--ntasks": ntasks, "--time": "00:10:00"} @@ -395,9 +392,9 @@ def get_base_run_settings( @staticmethod def get_run_settings( - exe: str, args: t.List[str], nodes: int = 1, ntasks: int = 1, **kwargs: t.Any + exe: str, args: list[str], nodes: int = 1, ntasks: int = 1, **kwargs: t.Any ) -> RunSettings: - run_args: t.Dict[str, t.Union[int, str, float, None]] = {} + run_args: dict[str, int | str | float | None] = {} if test_launcher == "slurm": run_args = {"nodes": nodes, "ntasks": ntasks, "time": "00:10:00"} @@ -427,7 +424,7 @@ def get_run_settings( return RunSettings(exe, args) @staticmethod - def choose_host(rs: RunSettings) -> t.Optional[str]: + def choose_host(rs: RunSettings) -> str | None: if isinstance(rs, (MpirunSettings, MpiexecSettings)): hl = get_hostlist() if hl is not None: @@ -454,13 +451,13 @@ def check_output_dir() -> None: @pytest.fixture -def dbutils() -> t.Type[DBUtils]: +def dbutils() -> type[DBUtils]: return DBUtils class DBUtils: @staticmethod - def get_db_configs() -> t.Dict[str, t.Any]: + def get_db_configs() -> dict[str, t.Any]: config_settings = { "enable_checkpoints": 1, "set_max_memory": "3gb", @@ -474,7 +471,7 @@ def get_db_configs() -> t.Dict[str, t.Any]: return config_settings @staticmethod - def get_smartsim_error_db_configs() -> t.Dict[str, t.Any]: + def get_smartsim_error_db_configs() -> dict[str, t.Any]: bad_configs = { "save": [ "-1", # frequency must be positive @@ -501,8 +498,8 @@ def get_smartsim_error_db_configs() -> t.Dict[str, t.Any]: return bad_configs @staticmethod - def get_type_error_db_configs() -> t.Dict[t.Union[int, str], t.Any]: - bad_configs: t.Dict[t.Union[int, str], t.Any] = { + def get_type_error_db_configs() -> dict[int | str, t.Any]: + bad_configs: dict[int | str, t.Any] = { "save": [2, True, ["2"]], # frequency must be specified as a string "maxmemory": [99, True, ["99"]], # memory form must be a string "maxclients": [3, True, ["3"]], # number of clients must be a string @@ -523,9 +520,9 @@ def get_type_error_db_configs() -> t.Dict[t.Union[int, str], t.Any]: @staticmethod def get_config_edit_method( db: Orchestrator, config_setting: str - ) -> t.Optional[t.Callable[..., None]]: + ) -> Callable[..., None] | None: """Get a db configuration file edit method from a str""" - config_edit_methods: t.Dict[str, t.Callable[..., None]] = { + config_edit_methods: dict[str, 
Callable[..., None]] = { "enable_checkpoints": db.enable_checkpoints, "set_max_memory": db.set_max_memory, "set_eviction_strategy": db.set_eviction_strategy, @@ -568,7 +565,7 @@ def test_dir(request: pytest.FixtureRequest) -> str: @pytest.fixture -def fileutils() -> t.Type[FileUtils]: +def fileutils() -> type[FileUtils]: return FileUtils @@ -593,7 +590,7 @@ def get_test_dir_path(dirname: str) -> str: @staticmethod def make_test_file( - file_name: str, file_dir: str, file_content: t.Optional[str] = None + file_name: str, file_dir: str, file_content: str | None = None ) -> str: """Create a dummy file in the test output directory. @@ -613,7 +610,7 @@ def make_test_file( @pytest.fixture -def mlutils() -> t.Type[MLUtils]: +def mlutils() -> type[MLUtils]: return MLUtils @@ -628,21 +625,21 @@ def get_test_num_gpus() -> int: @pytest.fixture -def coloutils() -> t.Type[ColoUtils]: +def coloutils() -> type[ColoUtils]: return ColoUtils class ColoUtils: @staticmethod def setup_test_colo( - fileutils: t.Type[FileUtils], + fileutils: type[FileUtils], db_type: str, exp: Experiment, application_file: str, - db_args: t.Dict[str, t.Any], - colo_settings: t.Optional[RunSettings] = None, + db_args: dict[str, t.Any], + colo_settings: RunSettings | None = None, colo_model_name: str = "colocated_model", - port: t.Optional[int] = None, + port: int | None = None, on_wlm: bool = False, ) -> Model: """Setup database needed for the colo pinning tests""" @@ -670,7 +667,7 @@ def setup_test_colo( socket_name = f"{colo_model_name}_{socket_suffix}.socket" db_args["unix_socket"] = os.path.join(tmp_dir, socket_name) - colocate_fun: t.Dict[str, t.Callable[..., None]] = { + colocate_fun: dict[str, Callable[..., None]] = { "tcp": colo_model.colocate_db_tcp, "deprecated": colo_model.colocate_db, "uds": colo_model.colocate_db_uds, @@ -706,149 +703,13 @@ def config() -> Config: return CONFIG -class MockSink: - """Telemetry sink that writes console output for testing purposes""" - - def __init__(self, delay_ms: int = 0) -> None: - self._delay_ms = delay_ms - self.num_saves = 0 - self.args: t.Any = None - - async def save(self, *args: t.Any) -> None: - """Save all arguments as console logged messages""" - self.num_saves += 1 - if self._delay_ms: - # mimic slow collection.... 
- delay_s = self._delay_ms / 1000 - await asyncio.sleep(delay_s) - self.args = args - - -@pytest.fixture -def mock_sink() -> t.Type[MockSink]: - return MockSink - - -@pytest.fixture -def mock_con() -> t.Callable[[int, int], t.Iterable[t.Any]]: - """Generates mock db connection telemetry""" - - def _mock_con(min: int = 1, max: int = 254) -> t.Iterable[t.Any]: - for i in range(min, max): - yield [ - {"addr": f"127.0.0.{i}:1234", "id": f"ABC{i}"}, - {"addr": f"127.0.0.{i}:2345", "id": f"XYZ{i}"}, - ] - - return _mock_con - - -@pytest.fixture -def mock_mem() -> t.Callable[[int, int], t.Iterable[t.Any]]: - """Generates mock db memory usage telemetry""" - - def _mock_mem(min: int = 1, max: int = 1000) -> t.Iterable[t.Any]: - for i in range(min, max): - yield { - "total_system_memory": 1000 * i, - "used_memory": 1111 * i, - "used_memory_peak": 1234 * i, - } - - return _mock_mem - - -@pytest.fixture -def mock_redis() -> t.Callable[..., t.Any]: - def _mock_redis( - conn_side_effect=None, - mem_stats=None, - client_stats=None, - coll_side_effect=None, - ): - """Generate a mock object for the redis.Redis contract""" - - class MockConn: - def __init__(self, *args: t.Any, **kwargs: t.Any) -> None: - if conn_side_effect is not None: - conn_side_effect() - - async def info(self, *args: t.Any, **kwargs: t.Any) -> t.Dict[str, t.Any]: - if coll_side_effect: - await coll_side_effect() - - if mem_stats: - return next(mem_stats) - return { - "total_system_memory": "111", - "used_memory": "222", - "used_memory_peak": "333", - } - - async def client_list( - self, *args: t.Any, **kwargs: t.Any - ) -> t.Dict[str, t.Any]: - if coll_side_effect: - await coll_side_effect() - - if client_stats: - return next(client_stats) - return {"addr": "127.0.0.1", "id": "111"} - - async def ping(self): - return True - - return MockConn - - return _mock_redis - - -class MockCollectorEntityFunc(t.Protocol): - @staticmethod - def __call__( - host: str = "127.0.0.1", - port: int = 6379, - name: str = "", - type: str = "", - telemetry_on: bool = False, - ) -> "JobEntity": ... 
- - -@pytest.fixture -def mock_entity(test_dir: str) -> MockCollectorEntityFunc: - def _mock_entity( - host: str = "127.0.0.1", - port: int = 6379, - name: str = "", - type: str = "", - telemetry_on: bool = False, - ) -> "JobEntity": - test_path = pathlib.Path(test_dir) - - entity = JobEntity() - entity.name = name if name else str(uuid.uuid4()) - entity.status_dir = str(test_path / entity.name) - entity.type = type - entity.telemetry_on = True - entity.collectors = { - "client": "", - "client_count": "", - "memory": "", - } - entity.config = { - "host": host, - "port": str(port), - } - entity.telemetry_on = telemetry_on - return entity - return _mock_entity class CountingCallable: def __init__(self) -> None: self._num: int = 0 - self._details: t.List[t.Tuple[t.Tuple[t.Any, ...], t.Dict[str, t.Any]]] = [] + self._details: list[tuple[tuple[t.Any, ...], dict[str, t.Any]]] = [] def __call__(self, *args: t.Any, **kwargs: t.Any) -> t.Any: self._num += 1 @@ -859,12 +720,12 @@ def num_calls(self) -> int: return self._num @property - def details(self) -> t.List[t.Tuple[t.Tuple[t.Any, ...], t.Dict[str, t.Any]]]: + def details(self) -> list[tuple[tuple[t.Any, ...], dict[str, t.Any]]]: return self._details ## Reuse database across tests -database_registry: t.DefaultDict[str, t.Optional[Orchestrator]] = defaultdict(lambda: None) +database_registry: defaultdict[str, Orchestrator | None] = defaultdict(lambda: None) @pytest.fixture(scope="function") def local_experiment(test_dir: str) -> smartsim.Experiment: @@ -898,13 +759,13 @@ class DBConfiguration: name: str launcher: str num_nodes: int - interface: t.Union[str,t.List[str]] - hostlist: t.Optional[t.List[str]] + interface: str | list[str] + hostlist: list[str] | None port: int @dataclass class PrepareDatabaseOutput: - orchestrator: t.Optional[Orchestrator] # The actual orchestrator object + orchestrator: Orchestrator | None # The actual orchestrator object new_db: bool # True if a new database was created when calling prepare_db # Reuse databases @@ -957,7 +818,7 @@ def clustered_db(wlmutils: WLMUtils) -> t.Generator[DBConfiguration, None, None] @pytest.fixture -def register_new_db() -> t.Callable[[DBConfiguration], Orchestrator]: +def register_new_db() -> Callable[[DBConfiguration], Orchestrator]: def _register_new_db( config: DBConfiguration ) -> Orchestrator: @@ -985,11 +846,11 @@ def _register_new_db( @pytest.fixture(scope="function") def prepare_db( - register_new_db: t.Callable[ + register_new_db: Callable[ [DBConfiguration], Orchestrator ] -) -> t.Callable[ +) -> Callable[ [DBConfiguration], PrepareDatabaseOutput ]: diff --git a/doc/api/smartsim_api.rst b/doc/api/smartsim_api.rst index 91e2c2f0fc..10247ed510 100644 --- a/doc/api/smartsim_api.rst +++ b/doc/api/smartsim_api.rst @@ -27,7 +27,6 @@ Experiment Experiment.reconnect_orchestrator Experiment.preview Experiment.summary - Experiment.telemetry .. autoclass:: Experiment :show-inheritance: @@ -368,7 +367,6 @@ Orchestrator Orchestrator.set_max_clients Orchestrator.set_max_message_size Orchestrator.set_db_conf - Orchestrator.telemetry Orchestrator.checkpoint_file Orchestrator.batch diff --git a/doc/changelog.md b/doc/changelog.md index 433d542cee..88f9cbad4a 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -1,11 +1,9 @@ # Changelog -Listed here are the changes between each release of SmartSim, -SmartRedis and SmartDashboard. +Listed here are the changes between each release of SmartSim and SmartRedis. 
Jump to: - {ref}`SmartRedis changelog` -- {ref}`SmartDashboard changelog` ## SmartSim @@ -13,8 +11,12 @@ To be released at some point in the future Description +- Modernize typing syntax to Python 3.10+ standards +- **BREAKING CHANGE**: Removed telemetry functionality, LaunchedManifest tracking + classes, and SmartDashboard integration - Update copyright headers from 2021-2024 to 2021-2025 across the entire codebase -- Python 3.12 is now supported; where available, installed TensorFlow version is now 2.16.2, PyTorch is 2.7.1. +- Python 3.12 is now supported; where available, installed TensorFlow version + is now 2.16.2, PyTorch is 2.7.1. - Drop Python 3.9 support - Terminate LSF and LSB support - Implement workaround for Tensorflow that allows RedisAI to build with GCC-14 @@ -23,20 +25,47 @@ Description Detailed Notes -- Copyright headers have been updated from "2021-2024" to "2021-2025" across 271 files - including Python source files, configuration files, documentation, tests, Docker files, - shell scripts, and other supporting files to reflect the new year. +- Modernized typing syntax to use Python 3.10+ standards, replacing + `Union[X, Y]` with `X | Y`, `Optional[X]` with `X | None`, and generic + collections (`List[X]` → `list[X]`, `Dict[X, Y]` → `dict[X, Y]`, etc.). + ([SmartSim-PR791](https://github.com/CrayLabs/SmartSim/pull/791)) +- **BREAKING CHANGE**: Removed telemetry functionality, LaunchedManifest tracking + system, and SmartDashboard integration. + This includes complete removal of the telemetry monitor and collection system, + telemetry configuration classes (`TelemetryConfiguration`, + `ExperimentTelemetryConfiguration`), all telemetry-related API methods + (`Experiment.telemetry`, `Orchestrator.telemetry`), telemetry collectors and + sinks, and the `watchdog` dependency. Also removed SmartDashboard integration + and CLI plugin, along with the indirect entrypoint launching mechanism. + Additionally removed the `LaunchedManifest`, `_LaunchedManifestMetadata`, and + `LaunchedManifestBuilder` classes that were used for telemetry data collection + during entity launches. Simplified the controller launch workflow by removing + telemetry metadata tracking and launch manifest serialization. Cleaned up the + `serialize.py` module by removing orphaned telemetry functions (80% code + reduction), preserving only essential type definitions. Updated all test files + to remove LaunchedManifest dependencies and deleted obsolete telemetry test + files. The core `Manifest` class for entity organization remains unchanged, + maintaining backward compatibility for entity management while removing the + telemetry overhead. Enhanced the metadata directory system to use a centralized + `.smartsim/metadata/` structure for job output files with entity-specific + subdirectories (`ensemble/{name}`, `model/{name}`, `database/{name}`) and + proper symlink management. + ([SmartSim-PR789](https://github.com/CrayLabs/SmartSim/pull/789)) +- Copyright headers have been updated from "2021-2024" to "2021-2025" across + 271 files including Python source files, configuration files, documentation, + tests, Docker files, shell scripts, and other supporting files to reflect the + new year. ([SmartSim-PR790](https://github.com/CrayLabs/SmartSim/pull/790)) -- Python 3.12 is now supported. TensorFlow 2.16.2 and PyTorch 2.7.1 library files - are installed as part of `smart build` process when available. On Mac, ONNX runtime - 1.22.0 is now installed, together with ONNX 1.16. +- Python 3.12 is now supported. 
TensorFlow 2.16.2 and PyTorch 2.7.1 library + files are installed as part of `smart build` process when available. On Mac, + ONNX runtime 1.22.0 is now installed, together with ONNX 1.16. ([SmartSim-PR785](https://github.com/CrayLabs/SmartSim/pull/785)) - Python 3.9 will not be supported anymore, the last stable version of SmartSim with support for Python 3.9 will be 0.8. ([SmartSim-PR781](https://github.com/CrayLabs/SmartSim/pull/781)) - After the supercomputer Summit was decommissioned, a decision was made to - terminate SmartSim's support of the LSF launcher and LSB scheduler. If - this impacts your work, please contact us. + terminate SmartSim's support of the LSF launcher and LSB scheduler. If this + impacts your work, please contact us. ([SmartSim-PR780](https://github.com/CrayLabs/SmartSim/pull/780)) - Fix typos in the `train_surrogate` tutorial documentation. ([SmartSim-PR758](https://github.com/CrayLabs/SmartSim/pull/758)) @@ -1104,12 +1133,3 @@ Description: ```{include} ../smartredis/doc/changelog.md :start-line: 2 ``` - ------------------------------------------------------------------------- - -(smartdashboard-changelog)= -## SmartDashboard - -```{include} ../smartdashboard/doc/changelog.md -:start-line: 2 -``` diff --git a/doc/index.rst b/doc/index.rst index 4c64712b23..e6f6f0c3ba 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -55,12 +55,6 @@ sr_advanced_topics api/smartredis_api -.. toctree:: - :maxdepth: 2 - :caption: SmartDashboard - - smartdashboard - .. toctree:: :maxdepth: 2 :caption: Reference diff --git a/doc/smartdashboard.rst b/doc/smartdashboard.rst deleted file mode 100644 index 532fa6db08..0000000000 --- a/doc/smartdashboard.rst +++ /dev/null @@ -1,7 +0,0 @@ - -************** -SmartDashboard -************** - -.. include:: ../smartdashboard/doc/overview.rst - :start-line: 4 \ No newline at end of file diff --git a/docker/docs/dev/Dockerfile b/docker/docs/dev/Dockerfile index eac9c5e4d0..a45bc099d0 100644 --- a/docker/docs/dev/Dockerfile +++ b/docker/docs/dev/Dockerfile @@ -48,12 +48,6 @@ RUN git clone https://github.com/CrayLabs/SmartRedis.git --branch develop --dept && python -m pip install . \ && rm -rf ~/.cache/pip -# Install smartdashboard -RUN git clone https://github.com/CrayLabs/SmartDashboard.git --branch develop --depth=1 smartdashboard \ - && cd smartdashboard \ - && python -m pip install . 
\ - && rm -rf ~/.cache/pip - # Install docs dependencies and SmartSim RUN NO_CHECKS=1 SMARTSIM_SUFFIX=dev python -m pip install .[docs] diff --git a/setup.py b/setup.py index c618fb0076..97d142628a 100644 --- a/setup.py +++ b/setup.py @@ -176,7 +176,6 @@ class BuildError(Exception): "GitPython<=3.1.43", "protobuf<=3.20.3", "jinja2>=3.1.2", - "watchdog>4,<5", "pydantic>2", "pyzmq>=25.1.2", "pygithub>=2.3.0", diff --git a/smartsim/_core/_cli/build.py b/smartsim/_core/_cli/build.py index 18863e7d19..e3ce64f231 100644 --- a/smartsim/_core/_cli/build.py +++ b/smartsim/_core/_cli/build.py @@ -31,7 +31,7 @@ import re import shutil import textwrap -import typing as t +from collections.abc import Callable, Collection from pathlib import Path from tabulate import tabulate @@ -139,7 +139,7 @@ def build_redis_ai( def parse_requirement( requirement: str, -) -> t.Tuple[str, t.Optional[str], t.Callable[[Version_], bool]]: +) -> tuple[str, str | None, Callable[[Version_], bool]]: operators = { "==": operator.eq, "<=": operator.le, @@ -199,10 +199,10 @@ def check_ml_python_packages(packages: MLPackageCollection) -> None: def _format_incompatible_python_env_message( - missing: t.Collection[str], conflicting: t.Collection[str] + missing: Collection[str], conflicting: Collection[str] ) -> str: indent = "\n\t" - fmt_list: t.Callable[[str, t.Collection[str]], str] = lambda n, l: ( + fmt_list: Callable[[str, Collection[str]], str] = lambda n, l: ( f"{n}:{indent}{indent.join(l)}" if l else "" ) missing_str = fmt_list("Missing", missing) @@ -237,7 +237,7 @@ def _configure_keydb_build(versions: Versioner) -> None: # pylint: disable-next=too-many-statements def execute( - args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None, / + args: argparse.Namespace, _unparsed_args: list[str] | None = None, / ) -> int: # Unpack various arguments diff --git a/smartsim/_core/_cli/clean.py b/smartsim/_core/_cli/clean.py index 2a60e7b362..eec3549e21 100644 --- a/smartsim/_core/_cli/clean.py +++ b/smartsim/_core/_cli/clean.py @@ -25,7 +25,6 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import argparse -import typing as t from smartsim._core._cli.utils import clean, get_install_path @@ -41,13 +40,13 @@ def configure_parser(parser: argparse.ArgumentParser) -> None: def execute( - args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None, / + args: argparse.Namespace, _unparsed_args: list[str] | None = None, / ) -> int: return clean(get_install_path() / "_core", _all=args.clobber) def execute_all( - args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None, / + args: argparse.Namespace, _unparsed_args: list[str] | None = None, / ) -> int: args.clobber = True return execute(args) diff --git a/smartsim/_core/_cli/cli.py b/smartsim/_core/_cli/cli.py index f44f66d049..ce7a490110 100644 --- a/smartsim/_core/_cli/cli.py +++ b/smartsim/_core/_cli/cli.py @@ -28,7 +28,6 @@ import argparse import os -import typing as t from smartsim._core._cli.build import configure_parser as build_parser from smartsim._core._cli.build import execute as build_execute @@ -47,8 +46,8 @@ class SmartCli: - def __init__(self, menu: t.List[MenuItemConfig]) -> None: - self.menu: t.Dict[str, MenuItemConfig] = {} + def __init__(self, menu: list[MenuItemConfig]) -> None: + self.menu: dict[str, MenuItemConfig] = {} self.parser = argparse.ArgumentParser( prog="smart", description="SmartSim command line interface", @@ -62,9 +61,11 @@ def __init__(self, menu: t.List[MenuItemConfig]) -> None: ) self.register_menu_items(menu) - self.register_menu_items([plugin() for plugin in plugins]) + # Register plugin menu items (currently empty since all plugins were removed) + plugin_items = [plugin() for plugin in plugins] + self.register_menu_items(plugin_items) - def execute(self, cli_args: t.List[str]) -> int: + def execute(self, cli_args: list[str]) -> int: if len(cli_args) < 2: self.parser.print_help() return os.EX_USAGE @@ -99,7 +100,7 @@ def _register_menu_item(self, item: MenuItemConfig) -> None: self.menu[item.command] = item - def register_menu_items(self, menu_items: t.List[MenuItemConfig]) -> None: + def register_menu_items(self, menu_items: list[MenuItemConfig]) -> None: for item in menu_items: self._register_menu_item(item) diff --git a/smartsim/_core/_cli/dbcli.py b/smartsim/_core/_cli/dbcli.py index cbf7f59b06..53f980301f 100644 --- a/smartsim/_core/_cli/dbcli.py +++ b/smartsim/_core/_cli/dbcli.py @@ -26,13 +26,12 @@ import argparse import os -import typing as t from smartsim._core._cli.utils import get_db_path def execute( - _args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None, / + _args: argparse.Namespace, _unparsed_args: list[str] | None = None, / ) -> int: if db_path := get_db_path(): print(db_path) diff --git a/smartsim/_core/_cli/info.py b/smartsim/_core/_cli/info.py index c08fcb1a35..a72c73f64d 100644 --- a/smartsim/_core/_cli/info.py +++ b/smartsim/_core/_cli/info.py @@ -2,7 +2,6 @@ import importlib.metadata import os import pathlib -import typing as t from tabulate import tabulate @@ -14,7 +13,7 @@ def execute( - _args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None, / + _args: argparse.Namespace, _unparsed_args: list[str] | None = None, / ) -> int: print("\nSmart Python Packages:") print( @@ -72,7 +71,7 @@ def execute( return os.EX_OK -def _fmt_installed_db(db_path: t.Optional[pathlib.Path]) -> str: +def _fmt_installed_db(db_path: pathlib.Path | None) -> str: if db_path is None: return _MISSING_DEP db_name, _ = db_path.name.split("-", 1) diff --git a/smartsim/_core/_cli/plugin.py b/smartsim/_core/_cli/plugin.py index 
32c69b7e91..f59db02019 100644 --- a/smartsim/_core/_cli/plugin.py +++ b/smartsim/_core/_cli/plugin.py @@ -3,7 +3,7 @@ import os import subprocess as sp import sys -import typing as t +from collections.abc import Callable import smartsim.log from smartsim._core._cli.utils import SMART_LOGGER_FORMAT, MenuItemConfig @@ -14,10 +14,8 @@ def dynamic_execute( cmd: str, plugin_name: str -) -> t.Callable[[argparse.Namespace, t.List[str]], int]: - def process_execute( - _args: argparse.Namespace, unparsed_args: t.List[str], / - ) -> int: +) -> Callable[[argparse.Namespace, list[str]], int]: + def process_execute(_args: argparse.Namespace, unparsed_args: list[str], /) -> int: try: spec = importlib.util.find_spec(cmd) if spec is None: @@ -38,18 +36,5 @@ def process_execute( return process_execute -def dashboard() -> MenuItemConfig: - return MenuItemConfig( - "dashboard", - ( - "Start the SmartSim dashboard to monitor experiment output from a " - "graphical user interface. This requires that the SmartSim Dashboard " - "Package be installed. For more infromation please visit " - "https://github.com/CrayLabs/SmartDashboard" - ), - dynamic_execute("smartdashboard", "Dashboard"), - is_plugin=True, - ) - - -plugins = (dashboard,) +# No plugins currently available +plugins: tuple[Callable[[], MenuItemConfig], ...] = () diff --git a/smartsim/_core/_cli/scripts/dragon_install.py b/smartsim/_core/_cli/scripts/dragon_install.py index cfdc51a9bb..45a06f6e57 100644 --- a/smartsim/_core/_cli/scripts/dragon_install.py +++ b/smartsim/_core/_cli/scripts/dragon_install.py @@ -2,6 +2,7 @@ import pathlib import sys import typing as t +from collections.abc import Collection from github import Github from github.GitReleaseAsset import GitReleaseAsset @@ -83,7 +84,7 @@ def _pin_filter(asset_name: str) -> bool: return f"dragon-{dragon_pin()}" in asset_name -def _get_release_assets() -> t.Collection[GitReleaseAsset]: +def _get_release_assets() -> Collection[GitReleaseAsset]: """Retrieve a collection of available assets for all releases that satisfy the dragon version pin @@ -107,7 +108,7 @@ def _get_release_assets() -> t.Collection[GitReleaseAsset]: return assets -def filter_assets(assets: t.Collection[GitReleaseAsset]) -> t.Optional[GitReleaseAsset]: +def filter_assets(assets: Collection[GitReleaseAsset]) -> GitReleaseAsset | None: """Filter the available release assets so that HSTA agents are used when run on a Cray EX platform @@ -191,7 +192,7 @@ def install_package(asset_dir: pathlib.Path) -> int: def cleanup( - archive_path: t.Optional[pathlib.Path] = None, + archive_path: pathlib.Path | None = None, ) -> None: """Delete the downloaded asset and any files extracted during installation @@ -201,7 +202,7 @@ def cleanup( logger.debug(f"Deleted archive: {archive_path}") -def install_dragon(extraction_dir: t.Union[str, os.PathLike[str]]) -> int: +def install_dragon(extraction_dir: str | os.PathLike[str]) -> int: """Retrieve a dragon runtime appropriate for the current platform and install to the current python environment :param extraction_dir: path for download and extraction of assets @@ -211,8 +212,8 @@ def install_dragon(extraction_dir: t.Union[str, os.PathLike[str]]) -> int: return 1 extraction_dir = pathlib.Path(extraction_dir) - filename: t.Optional[pathlib.Path] = None - asset_dir: t.Optional[pathlib.Path] = None + filename: pathlib.Path | None = None + asset_dir: pathlib.Path | None = None try: asset_info = retrieve_asset_info() diff --git a/smartsim/_core/_cli/site.py b/smartsim/_core/_cli/site.py index 
076fc0de72..e2c8e28139 100644 --- a/smartsim/_core/_cli/site.py +++ b/smartsim/_core/_cli/site.py @@ -26,11 +26,10 @@ import argparse import os -import typing as t from smartsim._core._cli.utils import get_install_path -def execute(_args: argparse.Namespace, _unparsed_args: t.List[str], /) -> int: +def execute(_args: argparse.Namespace, _unparsed_args: list[str], /) -> int: print(get_install_path()) return os.EX_OK diff --git a/smartsim/_core/_cli/teardown.py b/smartsim/_core/_cli/teardown.py index 8e900b0e6f..9d4d325728 100644 --- a/smartsim/_core/_cli/teardown.py +++ b/smartsim/_core/_cli/teardown.py @@ -27,7 +27,6 @@ import argparse import os import subprocess -import typing as t from smartsim._core.config import CONFIG @@ -66,7 +65,7 @@ def _do_dragon_teardown() -> int: def execute( - args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None, / + args: argparse.Namespace, _unparsed_args: list[str] | None = None, / ) -> int: if args.dragon: return _do_dragon_teardown() diff --git a/smartsim/_core/_cli/utils.py b/smartsim/_core/_cli/utils.py index 1e55c90173..44a668b6e2 100644 --- a/smartsim/_core/_cli/utils.py +++ b/smartsim/_core/_cli/utils.py @@ -29,8 +29,8 @@ import shutil import subprocess as sp import sys -import typing as t from argparse import ArgumentParser, Namespace +from collections.abc import Callable from pathlib import Path from smartsim._core._install.buildenv import SetupError @@ -118,7 +118,7 @@ def clean(core_path: Path, _all: bool = False) -> int: return os.EX_OK -def get_db_path() -> t.Optional[Path]: +def get_db_path() -> Path | None: bin_path = get_install_path() / "_core" / "bin" for option in bin_path.iterdir(): if option.name in ("redis-cli", "keydb-cli"): @@ -126,8 +126,8 @@ def get_db_path() -> t.Optional[Path]: return None -_CliHandler = t.Callable[[Namespace, t.List[str]], int] -_CliParseConfigurator = t.Callable[[ArgumentParser], None] +_CliHandler = Callable[[Namespace, list[str]], int] +_CliParseConfigurator = Callable[[ArgumentParser], None] class MenuItemConfig: @@ -136,7 +136,7 @@ def __init__( cmd: str, description: str, handler: _CliHandler, - configurator: t.Optional[_CliParseConfigurator] = None, + configurator: _CliParseConfigurator | None = None, is_plugin: bool = False, ): self.command = cmd diff --git a/smartsim/_core/_cli/validate.py b/smartsim/_core/_cli/validate.py index 6d145a1987..bf1c48eed4 100644 --- a/smartsim/_core/_cli/validate.py +++ b/smartsim/_core/_cli/validate.py @@ -31,6 +31,7 @@ import os.path import tempfile import typing as t +from collections.abc import Callable, Mapping from types import TracebackType import numpy as np @@ -68,9 +69,9 @@ class _VerificationTempDir(_TemporaryDirectory): def __exit__( self, - exc: t.Optional[t.Type[BaseException]], - value: t.Optional[BaseException], - tb: t.Optional[TracebackType], + exc: type[BaseException] | None, + value: BaseException | None, + tb: TracebackType | None, ) -> None: if not value: # Yay, no error! 
Clean up as normal super().__exit__(exc, value, tb) @@ -79,7 +80,7 @@ def __exit__( def execute( - args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None, / + args: argparse.Namespace, _unparsed_args: list[str] | None = None, / ) -> int: """Validate the SmartSim installation works as expected given a simple experiment @@ -143,14 +144,13 @@ def configure_parser(parser: argparse.ArgumentParser) -> None: def test_install( location: str, - port: t.Optional[int], + port: int | None, device: Device, with_tf: bool, with_pt: bool, with_onnx: bool, ) -> None: exp = Experiment("ValidationExperiment", exp_path=location, launcher="local") - exp.telemetry.disable() port = find_free_port() if port is None else port with _make_managed_local_orc(exp, port) as client: @@ -170,9 +170,7 @@ def test_install( @contextlib.contextmanager -def _env_vars_set_to( - evars: t.Mapping[str, t.Optional[str]] -) -> t.Generator[None, None, None]: +def _env_vars_set_to(evars: Mapping[str, str | None]) -> t.Generator[None, None, None]: envvars = tuple((var, os.environ.pop(var, None), val) for var, val in evars.items()) for var, _, tmpval in envvars: _set_or_del_env_var(var, tmpval) @@ -183,7 +181,7 @@ def _env_vars_set_to( _set_or_del_env_var(var, origval) -def _set_or_del_env_var(var: str, val: t.Optional[str]) -> None: +def _set_or_del_env_var(var: str, val: str | None) -> None: if val is not None: os.environ[var] = val else: @@ -222,7 +220,7 @@ def _test_tf_install(client: Client, tmp_dir: str, device: Device) -> None: client.get_tensor("keras-output") -def _build_tf_frozen_model(tmp_dir: str) -> t.Tuple[str, t.List[str], t.List[str]]: +def _build_tf_frozen_model(tmp_dir: str) -> tuple[str, list[str], list[str]]: from tensorflow import keras # pylint: disable=no-name-in-module @@ -251,7 +249,7 @@ def _test_torch_install(client: Client, device: Device) -> None: class Net(nn.Module): def __init__(self) -> None: super().__init__() - self.conv: t.Callable[..., torch.Tensor] = nn.Conv2d(1, 1, 3) + self.conv: Callable[..., torch.Tensor] = nn.Conv2d(1, 1, 3) def forward(self, x: torch.Tensor) -> torch.Tensor: return self.conv(x) diff --git a/smartsim/_core/_install/buildenv.py b/smartsim/_core/_install/buildenv.py index 463b9c4136..f453187e70 100644 --- a/smartsim/_core/_install/buildenv.py +++ b/smartsim/_core/_install/buildenv.py @@ -64,7 +64,7 @@ class Version_(str): @staticmethod def _convert_to_version( - vers: t.Union[str, Iterable[Version], Version], + vers: str | Iterable[Version] | Version, ) -> t.Any: if isinstance(vers, Version): return vers @@ -172,7 +172,7 @@ class Versioner: ) REDISAI_BRANCH = get_env("SMARTSIM_REDISAI_BRANCH", f"v{REDISAI}") - def as_dict(self, db_name: DbEngine = "REDIS") -> t.Dict[str, t.Tuple[str, ...]]: + def as_dict(self, db_name: DbEngine = "REDIS") -> dict[str, tuple[str, ...]]: pkg_map = { "SMARTSIM": self.SMARTSIM, db_name: self.REDIS, @@ -259,7 +259,7 @@ def check_dependencies(self) -> None: for dep in deps: self.check_build_dependency(dep) - def __call__(self) -> t.Dict[str, str]: + def __call__(self) -> dict[str, str]: # return the build env for the build process env = os.environ.copy() env.update( @@ -272,8 +272,8 @@ def __call__(self) -> t.Dict[str, str]: ) return env - def as_dict(self) -> t.Dict[str, t.List[str]]: - variables: t.List[str] = [ + def as_dict(self) -> dict[str, list[str]]: + variables: list[str] = [ "CC", "CXX", "CFLAGS", @@ -283,7 +283,7 @@ def as_dict(self) -> t.Dict[str, t.List[str]]: "PYTHON_VERSION", "PLATFORM", ] - values: t.List[str] = [ + 
values: list[str] = [ self.CC, self.CXX, self.CFLAGS, @@ -316,7 +316,7 @@ def is_macos(cls) -> bool: return cls.PLATFORM == "darwin" @staticmethod - def get_cudnn_env() -> t.Optional[t.Dict[str, str]]: + def get_cudnn_env() -> dict[str, str] | None: """Collect the environment variables needed for Caffe (Pytorch) and throw an error if they are not found diff --git a/smartsim/_core/_install/builder.py b/smartsim/_core/_install/builder.py index 2bb5a99026..bae2db8968 100644 --- a/smartsim/_core/_install/builder.py +++ b/smartsim/_core/_install/builder.py @@ -38,12 +38,9 @@ from smartsim._core._install.utils import retrieve from smartsim._core.utils import expand_exe_path -if t.TYPE_CHECKING: - from typing_extensions import Never - # TODO: check cmake version and use system if possible to avoid conflicts -_PathLike = t.Union[str, "os.PathLike[str]"] +_PathLike = str | os.PathLike[str] _T = t.TypeVar("_T") _U = t.TypeVar("_U") @@ -67,7 +64,7 @@ class Builder: def __init__( self, - env: t.Dict[str, str], + env: dict[str, str], jobs: int = 1, verbose: bool = False, ) -> None: @@ -99,7 +96,7 @@ def __init__( self.jobs = jobs @property - def out(self) -> t.Optional[int]: + def out(self) -> int | None: return None if self.verbose else subprocess.DEVNULL # implemented in base classes @@ -115,16 +112,12 @@ def binary_path(binary: str) -> str: raise BuildError(f"{binary} not found in PATH") @staticmethod - def copy_file( - src: t.Union[str, Path], dst: t.Union[str, Path], set_exe: bool = False - ) -> None: + def copy_file(src: str | Path, dst: str | Path, set_exe: bool = False) -> None: shutil.copyfile(src, dst) if set_exe: Path(dst).chmod(stat.S_IXUSR | stat.S_IWUSR | stat.S_IRUSR) - def copy_dir( - self, src: t.Union[str, Path], dst: t.Union[str, Path], set_exe: bool = False - ) -> None: + def copy_dir(self, src: str | Path, dst: str | Path, set_exe: bool = False) -> None: src = Path(src) dst = Path(dst) dst.mkdir(exist_ok=True) @@ -144,10 +137,10 @@ def cleanup(self) -> None: def run_command( self, - cmd: t.List[str], + cmd: list[str], shell: bool = False, - out: t.Optional[int] = None, - cwd: t.Union[str, Path, None] = None, + out: int | None = None, + cwd: str | Path | None = None, ) -> None: # option to manually disable output if necessary if not out: @@ -179,7 +172,7 @@ class DatabaseBuilder(Builder): def __init__( self, - build_env: t.Optional[t.Dict[str, str]] = None, + build_env: dict[str, str] | None = None, malloc: str = "libc", jobs: int = 1, verbose: bool = False, diff --git a/smartsim/_core/_install/mlpackages.py b/smartsim/_core/_install/mlpackages.py index b5bae58452..baf978d36e 100644 --- a/smartsim/_core/_install/mlpackages.py +++ b/smartsim/_core/_install/mlpackages.py @@ -31,7 +31,7 @@ import subprocess import sys import typing as t -from collections.abc import MutableMapping +from collections.abc import MutableMapping, Sequence from dataclasses import dataclass from tabulate import tabulate @@ -73,9 +73,9 @@ class MLPackage: name: str version: str pip_index: str - python_packages: t.List[str] + python_packages: list[str] lib_source: PathLike - rai_patches: t.Tuple[RAIPatch, ...] = () + rai_patches: tuple[RAIPatch, ...] 
= () def retrieve(self, destination: PathLike) -> None: """Retrieve an archive and/or repository for the package @@ -105,7 +105,7 @@ class MLPackageCollection(MutableMapping[str, MLPackage]): Define a collection of MLPackages available for a specific platform """ - def __init__(self, platform: Platform, ml_packages: t.Sequence[MLPackage]): + def __init__(self, platform: Platform, ml_packages: Sequence[MLPackage]): self.platform = platform self._ml_packages = {pkg.name: pkg for pkg in ml_packages} @@ -173,7 +173,7 @@ def __str__(self, tablefmt: str = "github") -> str: def load_platform_configs( config_file_path: pathlib.Path, -) -> t.Dict[Platform, MLPackageCollection]: +) -> dict[Platform, MLPackageCollection]: """Create MLPackageCollections from JSON files in directory :param config_file_path: Directory with JSON files describing the diff --git a/smartsim/_core/_install/platform.py b/smartsim/_core/_install/platform.py index 60d704101d..0b5fe6142c 100644 --- a/smartsim/_core/_install/platform.py +++ b/smartsim/_core/_install/platform.py @@ -29,7 +29,6 @@ import os import pathlib import platform -import typing as t from dataclasses import dataclass from typing_extensions import Self @@ -98,7 +97,7 @@ def from_str(cls, str_: str) -> "Device": return cls(str_) @classmethod - def detect_cuda_version(cls) -> t.Optional["Device"]: + def detect_cuda_version(cls) -> "Device | None": """Find the enum based on environment CUDA :return: Enum for the version of CUDA currently available @@ -112,7 +111,7 @@ def detect_cuda_version(cls) -> t.Optional["Device"]: return None @classmethod - def detect_rocm_version(cls) -> t.Optional["Device"]: + def detect_rocm_version(cls) -> "Device | None": """Find the enum based on environment ROCm :return: Enum for the version of ROCm currently available @@ -149,7 +148,7 @@ def is_rocm(self) -> bool: return self in cls.rocm_enums() @classmethod - def cuda_enums(cls) -> t.Tuple["Device", ...]: + def cuda_enums(cls) -> tuple["Device", ...]: """Detect all CUDA devices supported by SmartSim :return: all enums associated with CUDA @@ -157,7 +156,7 @@ def cuda_enums(cls) -> t.Tuple["Device", ...]: return tuple(device for device in cls if "cuda" in device.value) @classmethod - def rocm_enums(cls) -> t.Tuple["Device", ...]: + def rocm_enums(cls) -> tuple["Device", ...]: """Detect all ROCm devices supported by SmartSim :return: all enums associated with ROCm diff --git a/smartsim/_core/_install/redisaiBuilder.py b/smartsim/_core/_install/redisaiBuilder.py index dc8872e03e..253d00eeb3 100644 --- a/smartsim/_core/_install/redisaiBuilder.py +++ b/smartsim/_core/_install/redisaiBuilder.py @@ -59,9 +59,9 @@ def __init__( build_env: BuildEnv, main_build_path: pathlib.Path, verbose: bool = False, - source: t.Union[ - str, pathlib.Path - ] = "https://github.com/RedisAI/redis-inference-optimization.git", + source: ( + str | pathlib.Path + ) = "https://github.com/RedisAI/redis-inference-optimization.git", version: str = "v1.2.7", ) -> None: @@ -196,7 +196,7 @@ def _set_execute(target: pathlib.Path) -> None: @staticmethod def _find_closest_object( start_path: pathlib.Path, target_obj: str - ) -> t.Optional[pathlib.Path]: + ) -> pathlib.Path | None: queue = deque([start_path]) while queue: current_dir = queue.popleft() @@ -234,7 +234,7 @@ def _prepare_packages(self) -> None: for file in actual_root.iterdir(): file.rename(target_dir / file.name) - def run_command(self, cmd: t.Union[str, t.List[str]], cwd: pathlib.Path) -> None: + def run_command(self, cmd: str | list[str], cwd: 
pathlib.Path) -> None: """Executor of commands usedi in the build :param cmd: The actual command to execute @@ -252,7 +252,7 @@ def run_command(self, cmd: t.Union[str, t.List[str]], cwd: pathlib.Path) -> None f"RedisAI build failed during command: {' '.join(cmd)}" ) - def _rai_cmake_cmd(self) -> t.List[str]: + def _rai_cmake_cmd(self) -> list[str]: """Build the CMake configuration command :return: CMake command with correct options @@ -281,7 +281,7 @@ def on_off(expression: bool) -> t.Literal["ON", "OFF"]: return cmd @property - def _rai_build_cmd(self) -> t.List[str]: + def _rai_build_cmd(self) -> list[str]: """Shell command to build RedisAI and modules With the CMake based install, very little needs to be done here. @@ -293,7 +293,7 @@ def _rai_build_cmd(self) -> t.List[str]: """ return "make install -j VERBOSE=1".split(" ") - def _patch_source_files(self, patches: t.Tuple[RAIPatch, ...]) -> None: + def _patch_source_files(self, patches: tuple[RAIPatch, ...]) -> None: """Apply specified RedisAI patches""" for patch in patches: with fileinput.input( diff --git a/smartsim/_core/_install/types.py b/smartsim/_core/_install/types.py index 9f57b928b0..c3b2e6c83b 100644 --- a/smartsim/_core/_install/types.py +++ b/smartsim/_core/_install/types.py @@ -25,6 +25,5 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import pathlib -import typing as t -PathLike = t.Union[str, pathlib.Path] +PathLike = str | pathlib.Path diff --git a/smartsim/_core/_install/utils/retrieve.py b/smartsim/_core/_install/utils/retrieve.py index bc1da7d3e2..b5f0195764 100644 --- a/smartsim/_core/_install/utils/retrieve.py +++ b/smartsim/_core/_install/utils/retrieve.py @@ -51,8 +51,8 @@ class _TqdmUpTo(tqdm): # type: ignore[type-arg] """ def update_to( - self, num_blocks: int = 1, bsize: int = 1, tsize: t.Optional[int] = None - ) -> t.Optional[bool]: + self, num_blocks: int = 1, bsize: int = 1, tsize: int | None = None + ) -> bool | None: """Update progress in tqdm-like way :param b: number of blocks transferred so far, defaults to 1 diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py index 775ca0efe9..ee416f7dec 100644 --- a/smartsim/_core/config/config.py +++ b/smartsim/_core/config/config.py @@ -27,6 +27,7 @@ import json import os import typing as t +from collections.abc import Sequence from functools import lru_cache from pathlib import Path @@ -175,7 +176,7 @@ def dragon_dotenv(self) -> Path: return Path(self.conf_dir / "dragon" / ".env") @property - def dragon_server_path(self) -> t.Optional[str]: + def dragon_server_path(self) -> str | None: return os.getenv( "SMARTSIM_DRAGON_SERVER_PATH", os.getenv("SMARTSIM_DRAGON_SERVER_PATH_EXP", None), @@ -218,7 +219,7 @@ def test_num_gpus(self) -> int: # pragma: no cover return int(os.environ.get("SMARTSIM_TEST_NUM_GPUS") or 1) @property - def test_ports(self) -> t.Sequence[int]: # pragma: no cover + def test_ports(self) -> Sequence[int]: # pragma: no cover min_required_ports = 25 first_port = int(os.environ.get("SMARTSIM_TEST_PORT", 6780)) num_ports = max( @@ -228,7 +229,7 @@ def test_ports(self) -> t.Sequence[int]: # pragma: no cover return range(first_port, first_port + num_ports) @property - def test_batch_resources(self) -> t.Dict[t.Any, t.Any]: # pragma: no cover + def test_batch_resources(self) -> dict[t.Any, t.Any]: # pragma: no cover resource_str = os.environ.get("SMARTSIM_TEST_BATCH_RESOURCES", "{}") resources = json.loads(resource_str) if not isinstance(resources, dict): @@ -242,7 +243,7 @@ def 
test_batch_resources(self) -> t.Dict[t.Any, t.Any]: # pragma: no cover return resources @property - def test_interface(self) -> t.List[str]: # pragma: no cover + def test_interface(self) -> list[str]: # pragma: no cover if interfaces_cfg := os.environ.get("SMARTSIM_TEST_INTERFACE", None): return interfaces_cfg.split(",") @@ -262,7 +263,7 @@ def test_interface(self) -> t.List[str]: # pragma: no cover return ["lo"] @property - def test_account(self) -> t.Optional[str]: # pragma: no cover + def test_account(self) -> str | None: # pragma: no cover # no account by default return os.environ.get("SMARTSIM_TEST_ACCOUNT", None) @@ -272,24 +273,20 @@ def test_mpi(self) -> bool: # pragma: no cover return int(os.environ.get("SMARTSIM_TEST_MPI", "1")) > 0 @property - def telemetry_frequency(self) -> int: - return int(os.environ.get("SMARTSIM_TELEMETRY_FREQUENCY", 5)) + def smartsim_base_dir(self) -> Path: + return Path(".smartsim") @property - def telemetry_enabled(self) -> bool: - return int(os.environ.get("SMARTSIM_FLAG_TELEMETRY", "1")) > 0 + def dragon_default_subdir(self) -> Path: + return self.smartsim_base_dir / "dragon" @property - def telemetry_cooldown(self) -> int: - return int(os.environ.get("SMARTSIM_TELEMETRY_COOLDOWN", 90)) + def dragon_logs_subdir(self) -> Path: + return self.dragon_default_subdir / "logs" @property - def telemetry_subdir(self) -> str: - return ".smartsim/telemetry" - - @property - def dragon_default_subdir(self) -> str: - return ".smartsim/dragon" + def metadata_subdir(self) -> Path: + return self.smartsim_base_dir / "metadata" @property def dragon_log_filename(self) -> str: diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index 15a5d7e277..cdaccdaf61 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -32,11 +32,8 @@ import pathlib import pickle import signal -import subprocess -import sys import threading import time -import typing as t from smartredis import Client, ConfigOptions @@ -45,6 +42,7 @@ from ..._core.launcher.step import Step from ..._core.utils.helpers import ( SignalInterceptionStack, + get_ts_ms, unpack_colo_db_identifier, unpack_db_identifier, ) @@ -75,17 +73,11 @@ SlurmLauncher, ) from ..launcher.launcher import Launcher -from ..utils import check_cluster_status, create_cluster, serialize -from .controller_utils import _AnonymousBatchJob, _look_up_launched_data +from ..utils import check_cluster_status, create_cluster +from .controller_utils import _AnonymousBatchJob from .job import Job from .jobmanager import JobManager -from .manifest import LaunchedManifest, LaunchedManifestBuilder, Manifest - -if t.TYPE_CHECKING: - from types import FrameType - - from ..utils.serialize import TStepLaunchMetaData - +from .manifest import Manifest logger = get_logger(__name__) @@ -106,7 +98,6 @@ def __init__(self, launcher: str = "local") -> None: """ self._jobs = JobManager(JM_LOCK) self.init_launcher(launcher) - self._telemetry_monitor: t.Optional[subprocess.Popen[bytes]] = None def start( self, @@ -124,26 +115,18 @@ def start( The controller will start the job-manager thread upon execution of all jobs. 
""" - # launch a telemetry monitor to track job progress - if CONFIG.telemetry_enabled: - self._start_telemetry_monitor(exp_path) - self._jobs.kill_on_interrupt = kill_on_interrupt # register custom signal handler for ^C (SIGINT) SignalInterceptionStack.get(signal.SIGINT).push_unique( self._jobs.signal_interrupt ) - launched = self._launch(exp_name, exp_path, manifest) + self._launch(exp_name, exp_path, manifest) # start the job manager thread if not already started if not self._jobs.actively_monitoring: self._jobs.start() - serialize.save_launch_manifest( - launched.map(_look_up_launched_data(self._launcher)) - ) - # block until all non-database jobs are complete if block: # poll handles its own keyboard interrupt as @@ -151,7 +134,7 @@ def start( self.poll(5, True, kill_on_interrupt=kill_on_interrupt) @property - def active_orchestrator_jobs(self) -> t.Dict[str, Job]: + def active_orchestrator_jobs(self) -> dict[str, Job]: """Return active orchestrator jobs.""" return {**self._jobs.db_jobs} @@ -183,9 +166,7 @@ def poll( for job in to_monitor.values(): logger.info(job) - def finished( - self, entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]] - ) -> bool: + def finished(self, entity: SmartSimEntity | EntitySequence[SmartSimEntity]) -> bool: """Return a boolean indicating wether a job has finished or not :param entity: object launched by SmartSim. @@ -210,7 +191,7 @@ def finished( ) from None def stop_entity( - self, entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]] + self, entity: SmartSimEntity | EntitySequence[SmartSimEntity] ) -> None: """Stop an instance of an entity @@ -281,7 +262,7 @@ def stop_entity_list(self, entity_list: EntitySequence[SmartSimEntity]) -> None: for entity in entity_list.entities: self.stop_entity(entity) - def get_jobs(self) -> t.Dict[str, Job]: + def get_jobs(self) -> dict[str, Job]: """Return a dictionary of completed job data :returns: dict[str, Job] @@ -290,7 +271,7 @@ def get_jobs(self) -> t.Dict[str, Job]: return self._jobs.completed def get_entity_status( - self, entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]] + self, entity: SmartSimEntity | EntitySequence[SmartSimEntity] ) -> SmartSimStatus: """Get the status of an entity @@ -307,7 +288,7 @@ def get_entity_status( def get_entity_list_status( self, entity_list: EntitySequence[SmartSimEntity] - ) -> t.List[SmartSimStatus]: + ) -> list[SmartSimStatus]: """Get the statuses of an entity list :param entity_list: entity list containing entities to @@ -336,7 +317,7 @@ def init_launcher(self, launcher: str) -> None: a supported launcher :raises TypeError: if no launcher argument is provided. 
""" - launcher_map: t.Dict[str, t.Type[Launcher]] = { + launcher_map: dict[str, type[Launcher]] = { "slurm": SlurmLauncher, "pbs": PBSLauncher, "pals": PBSLauncher, @@ -358,7 +339,7 @@ def init_launcher(self, launcher: str) -> None: @staticmethod def symlink_output_files( - job_step: Step, entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]] + job_step: Step, entity: SmartSimEntity | EntitySequence[SmartSimEntity] ) -> None: """Create symlinks for entity output files that point to the output files under the .smartsim directory @@ -370,11 +351,13 @@ def symlink_output_files( entity_out = pathlib.Path(entity.path) / f"{entity.name}.out" entity_err = pathlib.Path(entity.path) / f"{entity.name}.err" - # check if there is already a link to a previous run - if entity_out.is_symlink() or entity_err.is_symlink(): + # Remove old symlinks if they exist + if entity_out.is_symlink(): entity_out.unlink() + if entity_err.is_symlink(): entity_err.unlink() + # Ensure the output files exist (create them if they don't exist yet) historical_err.touch() historical_out.touch() @@ -387,9 +370,7 @@ def symlink_output_files( "Symlinking files failed." ) - def _launch( - self, exp_name: str, exp_path: str, manifest: Manifest - ) -> LaunchedManifest[t.Tuple[str, Step]]: + def _launch(self, _exp_name: str, exp_path: str, manifest: Manifest) -> None: """Main launching function of the controller Orchestrators are always launched first so that the @@ -400,11 +381,10 @@ def _launch( :param manifest: Manifest of deployables to launch """ - manifest_builder = LaunchedManifestBuilder[t.Tuple[str, Step]]( - exp_name=exp_name, - exp_path=exp_path, - launcher_name=str(self._launcher), - ) + # Create a unique timestamp for this launch to ensure unique metadata + # directories + launch_timestamp = get_ts_ms() + # Loop over deployables to launch and launch multiple orchestrators for orchestrator in manifest.dbs: for key in self._jobs.get_db_host_addresses(): @@ -422,28 +402,31 @@ def _launch( raise SmartSimError( "Local launcher does not support multi-host orchestrators" ) - self._launch_orchestrator(orchestrator, manifest_builder) + self._launch_orchestrator(orchestrator) if self.orchestrator_active: self._set_dbobjects(manifest) # create all steps prior to launch - steps: t.List[ - t.Tuple[Step, t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]]] - ] = [] + steps: list[tuple[Step, SmartSimEntity | EntitySequence[SmartSimEntity]]] = [] - symlink_substeps: t.List[ - t.Tuple[Step, t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]]] + symlink_substeps: list[ + tuple[Step, SmartSimEntity | EntitySequence[SmartSimEntity]] ] = [] for elist in manifest.ensembles: - ens_telem_dir = manifest_builder.run_telemetry_subdirectory / "ensemble" + # Create ensemble metadata directory + ensemble_metadata_dir = ( + pathlib.Path(exp_path) + / CONFIG.metadata_subdir + / str(launch_timestamp) + / "ensemble" + / elist.name + ) if elist.batch: - batch_step, substeps = self._create_batch_job_step(elist, ens_telem_dir) - manifest_builder.add_ensemble( - elist, [(batch_step.name, step) for step in substeps] + batch_step, substeps = self._create_batch_job_step( + elist, ensemble_metadata_dir ) - # symlink substeps to maintain directory structure for substep, substep_entity in zip(substeps, elist.models): symlink_substeps.append((substep, substep_entity)) @@ -452,29 +435,30 @@ def _launch( else: # if ensemble is to be run as separate job steps, aka not in a batch job_steps = [ - (self._create_job_step(e, ens_telem_dir / elist.name), 
e) + (self._create_job_step(e, ensemble_metadata_dir), e) for e in elist.entities ] - manifest_builder.add_ensemble( - elist, [(step.name, step) for step, _ in job_steps] - ) steps.extend(job_steps) # models themselves cannot be batch steps. If batch settings are # attached, wrap them in an anonymous batch job step for model in manifest.models: - model_telem_dir = manifest_builder.run_telemetry_subdirectory / "model" + # Create model-specific metadata directory + model_metadata_dir = ( + pathlib.Path(exp_path) + / CONFIG.metadata_subdir + / str(launch_timestamp) + / "model" + / model.name + ) if model.batch_settings: anon_entity_list = _AnonymousBatchJob(model) batch_step, substeps = self._create_batch_job_step( - anon_entity_list, model_telem_dir + anon_entity_list, model_metadata_dir ) - manifest_builder.add_model(model, (batch_step.name, batch_step)) - symlink_substeps.append((substeps[0], model)) steps.append((batch_step, model)) else: - job_step = self._create_job_step(model, model_telem_dir) - manifest_builder.add_model(model, (job_step.name, job_step)) + job_step = self._create_job_step(model, model_metadata_dir) steps.append((job_step, model)) # launch and symlink steps @@ -486,13 +470,7 @@ def _launch( for substep, entity in symlink_substeps: self.symlink_output_files(substep, entity) - return manifest_builder.finalize() - - def _launch_orchestrator( - self, - orchestrator: Orchestrator, - manifest_builder: LaunchedManifestBuilder[t.Tuple[str, Step]], - ) -> None: + def _launch_orchestrator(self, orchestrator: Orchestrator) -> None: """Launch an Orchestrator instance This function will launch the Orchestrator instance and @@ -500,21 +478,19 @@ def _launch_orchestrator( set them in the JobManager :param orchestrator: orchestrator to launch - :param manifest_builder: An `LaunchedManifestBuilder` to record the - names and `Step`s of the launched orchestrator """ orchestrator.remove_stale_files() - orc_telem_dir = manifest_builder.run_telemetry_subdirectory / "database" - # if the orchestrator was launched as a batch workload if orchestrator.batch: - orc_batch_step, substeps = self._create_batch_job_step( - orchestrator, orc_telem_dir + metadata_dir = ( + pathlib.Path(orchestrator.path) + / CONFIG.metadata_subdir + / "database" + / orchestrator.name ) - manifest_builder.add_database( - orchestrator, [(orc_batch_step.name, step) for step in substeps] + orc_batch_step, substeps = self._create_batch_job_step( + orchestrator, metadata_dir ) - self._launch_step(orc_batch_step, orchestrator) self.symlink_output_files(orc_batch_step, orchestrator) @@ -524,13 +500,16 @@ def _launch_orchestrator( # if orchestrator was run on existing allocation, locally, or in allocation else: + metadata_dir = ( + pathlib.Path(orchestrator.path) + / CONFIG.metadata_subdir + / "database" + / orchestrator.name + ) db_steps = [ - (self._create_job_step(db, orc_telem_dir / orchestrator.name), db) + (self._create_job_step(db, metadata_dir), db) for db in orchestrator.entities ] - manifest_builder.add_database( - orchestrator, [(step.name, step) for step, _ in db_steps] - ) for db_step in db_steps: self._launch_step(*db_step) self.symlink_output_files(*db_step) @@ -569,7 +548,7 @@ def _launch_orchestrator( def _launch_step( self, job_step: Step, - entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]], + entity: SmartSimEntity | EntitySequence[SmartSimEntity], ) -> None: """Use the launcher to launch a job step @@ -626,14 +605,13 @@ def _launch_step( def _create_batch_job_step( self, - entity_list: 
t.Union[Orchestrator, Ensemble, _AnonymousBatchJob], - telemetry_dir: pathlib.Path, - ) -> t.Tuple[Step, t.List[Step]]: + entity_list: Orchestrator | Ensemble | _AnonymousBatchJob, + metadata_dir: pathlib.Path, + ) -> tuple[Step, list[Step]]: """Use launcher to create batch job step :param entity_list: EntityList to launch as batch - :param telemetry_dir: Path to a directory in which the batch job step - may write telemetry events + :param metadata_dir: Metadata directory for this launch :return: batch job step instance and a list of run steps to be executed within the batch job """ @@ -642,30 +620,31 @@ def _create_batch_job_step( "EntityList must have batch settings to be launched as batch" ) - telemetry_dir = telemetry_dir / entity_list.name batch_step = self._launcher.create_step( entity_list.name, entity_list.path, entity_list.batch_settings ) batch_step.meta["entity_type"] = str(type(entity_list).__name__).lower() - batch_step.meta["status_dir"] = str(telemetry_dir) + + # Set metadata directory for batch step + status_dir = str(metadata_dir) + batch_step.meta["metadata_dir"] = status_dir substeps = [] for entity in entity_list.entities: # tells step creation not to look for an allocation entity.run_settings.in_batch = True - step = self._create_job_step(entity, telemetry_dir) + step = self._create_job_step(entity, metadata_dir) substeps.append(step) batch_step.add_to_batch(step) return batch_step, substeps def _create_job_step( - self, entity: SmartSimEntity, telemetry_dir: pathlib.Path + self, entity: SmartSimEntity, metadata_dir: pathlib.Path ) -> Step: """Create job steps for all entities with the launcher :param entity: an entity to create a step for - :param telemetry_dir: Path to a directory in which the job step - may write telemetry events + :param metadata_dir: Metadata directory for this launch :return: the job step """ # get SSDB, SSIN, SSOUT and add to entity run settings @@ -675,7 +654,9 @@ def _create_job_step( step = self._launcher.create_step(entity.name, entity.path, entity.run_settings) step.meta["entity_type"] = str(type(entity).__name__).lower() - step.meta["status_dir"] = str(telemetry_dir / entity.name) + # Set metadata directory for job step + status_dir = str(metadata_dir) + step.meta["metadata_dir"] = status_dir return step @@ -685,7 +666,7 @@ def _prep_entity_client_env(self, entity: Model) -> None: :param entity: The entity to retrieve connections from """ - client_env: t.Dict[str, t.Union[str, int, float, bool]] = {} + client_env: dict[str, str | int | float | bool] = {} address_dict = self._jobs.get_db_host_addresses() for db_id, addresses in address_dict.items(): @@ -817,9 +798,7 @@ def _orchestrator_launch_wait(self, orchestrator: Orchestrator) -> None: # launch explicitly raise - def reload_saved_db( - self, checkpoint_file: t.Union[str, os.PathLike[str]] - ) -> Orchestrator: + def reload_saved_db(self, checkpoint_file: str | os.PathLike[str]) -> Orchestrator: with JM_LOCK: if not osp.exists(checkpoint_file): @@ -921,34 +900,3 @@ def _set_dbobjects(self, manifest: Manifest) -> None: for db_script in entity.db_scripts: if db_script not in ensemble.db_scripts: set_script(db_script, client) - - def _start_telemetry_monitor(self, exp_dir: str) -> None: - """Spawns a telemetry monitor process to keep track of the life times - of the processes launched through this controller. 
- - :param exp_dir: An experiment directory - """ - if ( - self._telemetry_monitor is None - or self._telemetry_monitor.returncode is not None - ): - logger.debug("Starting telemetry monitor process") - cmd = [ - sys.executable, - "-m", - "smartsim._core.entrypoints.telemetrymonitor", - "-exp_dir", - exp_dir, - "-frequency", - str(CONFIG.telemetry_frequency), - "-cooldown", - str(CONFIG.telemetry_cooldown), - ] - # pylint: disable-next=consider-using-with - self._telemetry_monitor = subprocess.Popen( - cmd, - stderr=sys.stderr, - stdout=sys.stdout, - cwd=str(pathlib.Path(__file__).parent.parent.parent), - shell=False, - ) diff --git a/smartsim/_core/control/controller_utils.py b/smartsim/_core/control/controller_utils.py index c72d1b5811..1a09932dd3 100644 --- a/smartsim/_core/control/controller_utils.py +++ b/smartsim/_core/control/controller_utils.py @@ -26,16 +26,10 @@ from __future__ import annotations -import pathlib import typing as t -from ..._core.launcher.step import Step from ...entity import EntityList, Model from ...error import SmartSimError -from ..launcher.launcher import Launcher - -if t.TYPE_CHECKING: - from ..utils.serialize import TStepLaunchMetaData class _AnonymousBatchJob(EntityList[Model]): @@ -52,26 +46,3 @@ def __init__(self, model: Model) -> None: self.batch_settings = model.batch_settings def _initialize_entities(self, **kwargs: t.Any) -> None: ... - - -def _look_up_launched_data( - launcher: Launcher, -) -> t.Callable[[t.Tuple[str, Step]], "TStepLaunchMetaData"]: - def _unpack_launched_data(data: t.Tuple[str, Step]) -> "TStepLaunchMetaData": - # NOTE: we cannot assume that the name of the launched step - # ``launched_step_name`` is equal to the name of the step referring to - # the entity ``step.name`` as is the case when an entity list is - # launched as a batch job - launched_step_name, step = data - launched_step_map = launcher.step_mapping[launched_step_name] - out_file, err_file = step.get_output_files() - return ( - launched_step_map.step_id, - launched_step_map.task_id, - launched_step_map.managed, - out_file, - err_file, - pathlib.Path(step.meta.get("status_dir", step.cwd)), - ) - - return _unpack_launched_data diff --git a/smartsim/_core/control/job.py b/smartsim/_core/control/job.py index 3c2c230048..c96960cfcd 100644 --- a/smartsim/_core/control/job.py +++ b/smartsim/_core/control/job.py @@ -24,171 +24,12 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import pathlib import time -import typing as t -from dataclasses import dataclass from ...entity import EntitySequence, SmartSimEntity from ...status import SmartSimStatus -@dataclass(frozen=True) -class _JobKey: - """A helper class for creating unique lookup keys within the telemetry - monitor. These keys are not guaranteed to be unique across experiments, - only within an experiment (due to process ID re-use by the OS)""" - - step_id: str - """The process id of an unmanaged task""" - task_id: str - """The task id of a managed task""" - - -class JobEntity: - """An entity containing run-time SmartSimEntity metadata. The run-time metadata - is required to perform telemetry collection. The `JobEntity` satisfies the core - API necessary to use a `JobManager` to manage retrieval of managed step updates. 
- """ - - def __init__(self) -> None: - self.name: str = "" - """The entity name""" - self.path: str = "" - """The root path for entity output files""" - self.step_id: str = "" - """The process id of an unmanaged task""" - self.task_id: str = "" - """The task id of a managed task""" - self.type: str = "" - """The type of the associated `SmartSimEntity`""" - self.timestamp: int = 0 - """The timestamp when the entity was created""" - self.status_dir: str = "" - """The path configured by the experiment for the entities telemetry output""" - self.telemetry_on: bool = False - """"Flag indicating if optional telemetry is enabled for the entity""" - self.collectors: t.Dict[str, str] = {} - """Mapping of collectors enabled for the entity""" - self.config: t.Dict[str, str] = {} - """Telemetry configuration supplied by the experiment""" - self._is_complete: bool = False - """Flag indicating if the entity has completed execution""" - - @property - def is_db(self) -> bool: - """Returns `True` if the entity represents a database or database shard""" - return self.type in ["orchestrator", "dbnode"] - - @property - def is_managed(self) -> bool: - """Returns `True` if the entity is managed by a workload manager""" - return bool(self.step_id) - - @property - def key(self) -> _JobKey: - """Return a `_JobKey` that identifies an entity. - NOTE: not guaranteed to be unique over time due to reused process IDs""" - return _JobKey(self.step_id, self.task_id) - - @property - def is_complete(self) -> bool: - """Returns `True` if the entity has completed execution""" - return self._is_complete - - def check_completion_status(self) -> None: - """Check for telemetry outputs indicating the entity has completed - TODO: determine correct location to avoid exposing telemetry - implementation details into `JobEntity` - """ - # avoid touching file-system if not necessary - if self._is_complete: - return - - # status telemetry is tracked in JSON files written to disk. 
look - # for a corresponding `stop` event in the entity status directory - state_file = pathlib.Path(self.status_dir) / "stop.json" - if state_file.exists(): - self._is_complete = True - - @staticmethod - def _map_db_metadata(entity_dict: t.Dict[str, t.Any], entity: "JobEntity") -> None: - """Map DB-specific properties from a runtime manifest onto a `JobEntity` - - :param entity_dict: The raw dictionary deserialized from manifest JSON - :param entity: The entity instance to modify - """ - if entity.is_db: - # add collectors if they're configured to be enabled in the manifest - entity.collectors = { - "client": entity_dict.get("client_file", ""), - "client_count": entity_dict.get("client_count_file", ""), - "memory": entity_dict.get("memory_file", ""), - } - - entity.telemetry_on = any(entity.collectors.values()) - entity.config["host"] = entity_dict.get("hostname", "") - entity.config["port"] = entity_dict.get("port", "") - - @staticmethod - def _map_standard_metadata( - entity_type: str, - entity_dict: t.Dict[str, t.Any], - entity: "JobEntity", - exp_dir: str, - raw_experiment: t.Dict[str, t.Any], - ) -> None: - """Map universal properties from a runtime manifest onto a `JobEntity` - - :param entity_type: The type of the associated `SmartSimEntity` - :param entity_dict: The raw dictionary deserialized from manifest JSON - :param entity: The entity instance to modify - :param exp_dir: The path to the experiment working directory - :param raw_experiment: The raw experiment dictionary deserialized from - manifest JSON - """ - metadata = entity_dict["telemetry_metadata"] - status_dir = pathlib.Path(metadata.get("status_dir")) - is_dragon = raw_experiment["launcher"].lower() == "dragon" - - # all entities contain shared properties that identify the task - entity.type = entity_type - entity.name = ( - entity_dict["name"] - if not is_dragon - else entity_dict["telemetry_metadata"]["step_id"] - ) - entity.step_id = str(metadata.get("step_id") or "") - entity.task_id = str(metadata.get("task_id") or "") - entity.timestamp = int(entity_dict.get("timestamp", "0")) - entity.path = str(exp_dir) - entity.status_dir = str(status_dir) - - @classmethod - def from_manifest( - cls, - entity_type: str, - entity_dict: t.Dict[str, t.Any], - exp_dir: str, - raw_experiment: t.Dict[str, t.Any], - ) -> "JobEntity": - """Instantiate a `JobEntity` from the dictionary deserialized from manifest JSON - - :param entity_type: The type of the associated `SmartSimEntity` - :param entity_dict: The raw dictionary deserialized from manifest JSON - :param exp_dir: The path to the experiment working directory - :param raw_experiment: raw experiment deserialized from manifest JSON - """ - entity = JobEntity() - - cls._map_standard_metadata( - entity_type, entity_dict, entity, exp_dir, raw_experiment - ) - cls._map_db_metadata(entity_dict, entity) - - return entity - - class Job: """Keep track of various information for the controller. 
In doing so, continuously add various fields of information @@ -199,8 +40,8 @@ class Job: def __init__( self, job_name: str, - job_id: t.Optional[str], - entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity], JobEntity], + job_id: str | None, + entity: SmartSimEntity | EntitySequence[SmartSimEntity], launcher: str, is_task: bool, ) -> None: @@ -217,12 +58,12 @@ def __init__( self.entity = entity self.status = SmartSimStatus.STATUS_NEW # status before smartsim status mapping is applied - self.raw_status: t.Optional[str] = None - self.returncode: t.Optional[int] = None + self.raw_status: str | None = None + self.returncode: int | None = None # output is only populated if it's system related (e.g. cmd failed immediately) - self.output: t.Optional[str] = None - self.error: t.Optional[str] = None # same as output - self.hosts: t.List[str] = [] # currently only used for DB jobs + self.output: str | None = None + self.error: str | None = None # same as output + self.hosts: list[str] = [] # currently only used for DB jobs self.launched_with = launcher self.is_task = is_task self.start_time = time.time() @@ -237,9 +78,9 @@ def set_status( self, new_status: SmartSimStatus, raw_status: str, - returncode: t.Optional[int], - error: t.Optional[str] = None, - output: t.Optional[str] = None, + returncode: int | None, + error: str | None = None, + output: str | None = None, ) -> None: """Set the status of a job. @@ -263,9 +104,7 @@ def record_history(self) -> None: """Record the launching history of a job.""" self.history.record(self.jid, self.status, self.returncode, self.elapsed) - def reset( - self, new_job_name: str, new_job_id: t.Optional[str], is_task: bool - ) -> None: + def reset(self, new_job_name: str, new_job_id: str | None, is_task: bool) -> None: """Reset the job in order to be able to restart it. :param new_job_name: name of the new job step @@ -326,16 +165,16 @@ def __init__(self, runs: int = 0) -> None: :param runs: number of runs so far """ self.runs = runs - self.jids: t.Dict[int, t.Optional[str]] = {} - self.statuses: t.Dict[int, SmartSimStatus] = {} - self.returns: t.Dict[int, t.Optional[int]] = {} - self.job_times: t.Dict[int, float] = {} + self.jids: dict[int, str | None] = {} + self.statuses: dict[int, SmartSimStatus] = {} + self.returns: dict[int, int | None] = {} + self.job_times: dict[int, float] = {} def record( self, - job_id: t.Optional[str], + job_id: str | None, status: SmartSimStatus, - returncode: t.Optional[int], + returncode: int | None, job_time: float, ) -> None: """record the history of a job""" diff --git a/smartsim/_core/control/jobmanager.py b/smartsim/_core/control/jobmanager.py index b692edb8b8..d253c02c8b 100644 --- a/smartsim/_core/control/jobmanager.py +++ b/smartsim/_core/control/jobmanager.py @@ -27,7 +27,6 @@ import itertools import time -import typing as t from collections import ChainMap from threading import RLock, Thread from types import FrameType @@ -39,7 +38,7 @@ from ..config import CONFIG from ..launcher import Launcher, LocalLauncher from ..utils.network import get_ip_from_host -from .job import Job, JobEntity +from .job import Job logger = get_logger(__name__) @@ -57,19 +56,19 @@ class JobManager: wlm to query information about jobs that the user requests. 
""" - def __init__(self, lock: RLock, launcher: t.Optional[Launcher] = None) -> None: + def __init__(self, lock: RLock, launcher: Launcher | None = None) -> None: """Initialize a Jobmanager :param launcher: a Launcher object to manage jobs """ - self.monitor: t.Optional[Thread] = None + self.monitor: Thread | None = None # active jobs - self.jobs: t.Dict[str, Job] = {} - self.db_jobs: t.Dict[str, Job] = {} + self.jobs: dict[str, Job] = {} + self.db_jobs: dict[str, Job] = {} # completed jobs - self.completed: t.Dict[str, Job] = {} + self.completed: dict[str, Job] = {} self.actively_monitoring = False # on/off flag self._launcher = launcher # reference to launcher @@ -145,7 +144,7 @@ def __getitem__(self, entity_name: str) -> Job: entities = ChainMap(self.db_jobs, self.jobs, self.completed) return entities[entity_name] - def __call__(self) -> t.Dict[str, Job]: + def __call__(self) -> dict[str, Job]: """Returns dictionary all jobs for () operator :returns: Dictionary of all jobs @@ -163,8 +162,8 @@ def __contains__(self, key: str) -> bool: def add_job( self, job_name: str, - job_id: t.Optional[str], - entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity], JobEntity], + job_id: str | None, + entity: SmartSimEntity | EntitySequence[SmartSimEntity], is_task: bool = True, ) -> None: """Add a job to the job manager which holds specific jobs by type. @@ -179,8 +178,6 @@ def add_job( job = Job(job_name, job_id, entity, launcher, is_task) if isinstance(entity, (DBNode, Orchestrator)): self.db_jobs[entity.name] = job - elif isinstance(entity, JobEntity) and entity.is_db: - self.db_jobs[entity.name] = job else: self.jobs[entity.name] = job @@ -227,7 +224,7 @@ def check_jobs(self) -> None: def get_status( self, - entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]], + entity: SmartSimEntity | EntitySequence[SmartSimEntity], ) -> SmartSimStatus: """Return the status of a job. 
@@ -264,7 +261,7 @@ def query_restart(self, entity_name: str) -> bool: def restart_job( self, job_name: str, - job_id: t.Optional[str], + job_id: str | None, entity_name: str, is_task: bool = True, ) -> None: @@ -287,14 +284,14 @@ def restart_job( else: self.jobs[entity_name] = job - def get_db_host_addresses(self) -> t.Dict[str, t.List[str]]: + def get_db_host_addresses(self) -> dict[str, list[str]]: """Retrieve the list of hosts for the database for corresponding database identifiers :return: dictionary of host ip addresses """ - address_dict: t.Dict[str, t.List[str]] = {} + address_dict: dict[str, list[str]] = {} for db_job in self.db_jobs.values(): addresses = [] if isinstance(db_job.entity, (DBNode, Orchestrator)): @@ -303,7 +300,7 @@ def get_db_host_addresses(self) -> t.Dict[str, t.List[str]]: ip_addr = get_ip_from_host(combine[0]) addresses.append(":".join((ip_addr, str(combine[1])))) - dict_entry: t.List[str] = address_dict.get(db_entity.db_identifier, []) + dict_entry: list[str] = address_dict.get(db_entity.db_identifier, []) dict_entry.extend(addresses) address_dict[db_entity.db_identifier] = dict_entry @@ -327,7 +324,7 @@ def set_db_hosts(self, orchestrator: Orchestrator) -> None: else: self.db_jobs[dbnode.name].hosts = dbnode.hosts - def signal_interrupt(self, signo: int, _frame: t.Optional[FrameType]) -> None: + def signal_interrupt(self, signo: int, _frame: FrameType | None) -> None: """Custom handler for whenever SIGINT is received""" if not signo: logger.warning("Received SIGINT with no signal number") diff --git a/smartsim/_core/control/manifest.py b/smartsim/_core/control/manifest.py index f603f218ec..5154f76202 100644 --- a/smartsim/_core/control/manifest.py +++ b/smartsim/_core/control/manifest.py @@ -25,23 +25,13 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import itertools -import pathlib import typing as t -from dataclasses import dataclass, field +from collections.abc import Iterable from ...database import Orchestrator -from ...entity import DBNode, Ensemble, EntitySequence, Model, SmartSimEntity +from ...entity import Ensemble, EntitySequence, Model, SmartSimEntity from ...error import SmartSimError -from ..config import CONFIG from ..utils import helpers as _helpers -from ..utils import serialize as _serialize - -_T = t.TypeVar("_T") -_U = t.TypeVar("_U") -_AtomicLaunchableT = t.TypeVar("_AtomicLaunchableT", Model, DBNode) - -if t.TYPE_CHECKING: - import os class Manifest: @@ -54,16 +44,14 @@ class Manifest: can all be passed as arguments """ - def __init__( - self, *args: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]] - ) -> None: + def __init__(self, *args: SmartSimEntity | EntitySequence[SmartSimEntity]) -> None: self._deployables = list(args) self._check_types(self._deployables) self._check_names(self._deployables) self._check_entity_lists_nonempty() @property - def dbs(self) -> t.List[Orchestrator]: + def dbs(self) -> list[Orchestrator]: """Return a list of Orchestrator instances in Manifest :raises SmartSimError: if user added to databases to manifest @@ -73,18 +61,18 @@ def dbs(self) -> t.List[Orchestrator]: return dbs @property - def models(self) -> t.List[Model]: + def models(self) -> list[Model]: """Return Model instances in Manifest :return: model instances """ - _models: t.List[Model] = [ + _models: list[Model] = [ item for item in self._deployables if isinstance(item, Model) ] return _models @property - def ensembles(self) -> t.List[Ensemble]: + def ensembles(self) -> list[Ensemble]: """Return Ensemble instances in Manifest :return: list of ensembles @@ -92,13 +80,13 @@ def ensembles(self) -> t.List[Ensemble]: return [e for e in self._deployables if isinstance(e, Ensemble)] @property - def all_entity_lists(self) -> t.List[EntitySequence[SmartSimEntity]]: + def all_entity_lists(self) -> list[EntitySequence[SmartSimEntity]]: """All entity lists, including ensembles and exceptional ones like Orchestrator :return: list of entity lists """ - _all_entity_lists: t.List[EntitySequence[SmartSimEntity]] = list(self.ensembles) + _all_entity_lists: list[EntitySequence[SmartSimEntity]] = list(self.ensembles) for db in self.dbs: _all_entity_lists.append(db) @@ -114,7 +102,7 @@ def has_deployable(self) -> bool: return bool(self._deployables) @staticmethod - def _check_names(deployables: t.List[t.Any]) -> None: + def _check_names(deployables: list[t.Any]) -> None: used = [] for deployable in deployables: name = getattr(deployable, "name", None) @@ -125,7 +113,7 @@ def _check_names(deployables: t.List[t.Any]) -> None: used.append(name) @staticmethod - def _check_types(deployables: t.List[t.Any]) -> None: + def _check_types(deployables: list[t.Any]) -> None: for deployable in deployables: if not isinstance(deployable, (SmartSimEntity, EntitySequence)): raise TypeError( @@ -183,139 +171,9 @@ def __str__(self) -> str: @property def has_db_objects(self) -> bool: """Check if any entity has DBObjects to set""" - ents: t.Iterable[t.Union[Model, Ensemble]] = itertools.chain( + ents: Iterable[Model | Ensemble] = itertools.chain( self.models, self.ensembles, (member for ens in self.ensembles for member in ens.entities), ) return any(any(ent.db_models) or any(ent.db_scripts) for ent in ents) - - -class _LaunchedManifestMetadata(t.NamedTuple): - run_id: str - exp_name: str - exp_path: str - launcher_name: str - - @property - def 
exp_telemetry_subdirectory(self) -> pathlib.Path: - return _format_exp_telemetry_path(self.exp_path) - - @property - def run_telemetry_subdirectory(self) -> pathlib.Path: - return _format_run_telemetry_path(self.exp_path, self.exp_name, self.run_id) - - @property - def manifest_file_path(self) -> pathlib.Path: - return self.exp_telemetry_subdirectory / _serialize.MANIFEST_FILENAME - - -@dataclass(frozen=True) -class LaunchedManifest(t.Generic[_T]): - """Immutable manifest mapping launched entities or collections of launched - entities to other pieces of external data. This is commonly used to map a - launch-able entity to its constructed ``Step`` instance without assuming - that ``step.name == job.name`` or querying the ``JobManager`` which itself - can be ephemeral. - """ - - metadata: _LaunchedManifestMetadata - models: t.Tuple[t.Tuple[Model, _T], ...] - ensembles: t.Tuple[t.Tuple[Ensemble, t.Tuple[t.Tuple[Model, _T], ...]], ...] - databases: t.Tuple[t.Tuple[Orchestrator, t.Tuple[t.Tuple[DBNode, _T], ...]], ...] - - def map(self, func: t.Callable[[_T], _U]) -> "LaunchedManifest[_U]": - def _map_entity_data( - fn: t.Callable[[_T], _U], - entity_list: t.Sequence[t.Tuple[_AtomicLaunchableT, _T]], - ) -> t.Tuple[t.Tuple[_AtomicLaunchableT, _U], ...]: - return tuple((entity, fn(data)) for entity, data in entity_list) - - return LaunchedManifest( - metadata=self.metadata, - models=_map_entity_data(func, self.models), - ensembles=tuple( - (ens, _map_entity_data(func, model_data)) - for ens, model_data in self.ensembles - ), - databases=tuple( - (db_, _map_entity_data(func, node_data)) - for db_, node_data in self.databases - ), - ) - - -@dataclass(frozen=True) -class LaunchedManifestBuilder(t.Generic[_T]): - """A class comprised of mutable collections of SmartSim entities that is - used to build a ``LaunchedManifest`` while going through the launching - process. 
- """ - - exp_name: str - exp_path: str - launcher_name: str - run_id: str = field(default_factory=_helpers.create_short_id_str) - - _models: t.List[t.Tuple[Model, _T]] = field(default_factory=list, init=False) - _ensembles: t.List[t.Tuple[Ensemble, t.Tuple[t.Tuple[Model, _T], ...]]] = field( - default_factory=list, init=False - ) - _databases: t.List[t.Tuple[Orchestrator, t.Tuple[t.Tuple[DBNode, _T], ...]]] = ( - field(default_factory=list, init=False) - ) - - @property - def exp_telemetry_subdirectory(self) -> pathlib.Path: - return _format_exp_telemetry_path(self.exp_path) - - @property - def run_telemetry_subdirectory(self) -> pathlib.Path: - return _format_run_telemetry_path(self.exp_path, self.exp_name, self.run_id) - - def add_model(self, model: Model, data: _T) -> None: - self._models.append((model, data)) - - def add_ensemble(self, ens: Ensemble, data: t.Sequence[_T]) -> None: - self._ensembles.append((ens, self._entities_to_data(ens.entities, data))) - - def add_database(self, db_: Orchestrator, data: t.Sequence[_T]) -> None: - self._databases.append((db_, self._entities_to_data(db_.entities, data))) - - @staticmethod - def _entities_to_data( - entities: t.Sequence[_AtomicLaunchableT], data: t.Sequence[_T] - ) -> t.Tuple[t.Tuple[_AtomicLaunchableT, _T], ...]: - if not entities: - raise ValueError("Cannot map data to an empty entity sequence") - if len(entities) != len(data): - raise ValueError( - f"Cannot map data sequence of length {len(data)} to entity " - f"sequence of length {len(entities)}" - ) - return tuple(zip(entities, data)) - - def finalize(self) -> LaunchedManifest[_T]: - return LaunchedManifest( - metadata=_LaunchedManifestMetadata( - self.run_id, - self.exp_name, - self.exp_path, - self.launcher_name, - ), - models=tuple(self._models), - ensembles=tuple(self._ensembles), - databases=tuple(self._databases), - ) - - -def _format_exp_telemetry_path( - exp_path: t.Union[str, "os.PathLike[str]"] -) -> pathlib.Path: - return pathlib.Path(exp_path, CONFIG.telemetry_subdir) - - -def _format_run_telemetry_path( - exp_path: t.Union[str, "os.PathLike[str]"], exp_name: str, run_id: str -) -> pathlib.Path: - return _format_exp_telemetry_path(exp_path) / f"{exp_name}/{run_id}" diff --git a/smartsim/_core/control/previewrenderer.py b/smartsim/_core/control/previewrenderer.py index 857a703973..d871a3aebd 100644 --- a/smartsim/_core/control/previewrenderer.py +++ b/smartsim/_core/control/previewrenderer.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2025, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -64,7 +64,7 @@ def as_toggle(_eval_ctx: u.F, value: bool) -> str: @pass_eval_context -def get_ifname(_eval_ctx: u.F, value: t.List[str]) -> str: +def get_ifname(_eval_ctx: u.F, value: list[str]) -> str: """Extract Network Interface from orchestrator run settings.""" if value: for val in value: @@ -108,11 +108,11 @@ def render_to_file(content: str, filename: str) -> None: def render( exp: "Experiment", - manifest: t.Optional[Manifest] = None, + manifest: Manifest | None = None, verbosity_level: Verbosity = Verbosity.INFO, output_format: Format = Format.PLAINTEXT, - output_filename: t.Optional[str] = None, - active_dbjobs: t.Optional[t.Dict[str, Job]] = None, + output_filename: str | None = None, + active_dbjobs: dict[str, Job] | None = None, ) -> str: """ Render the template from the supplied entities. 
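Note: the recurring change in the hunks above and below is the move from `typing` aliases (`t.Dict`, `t.List`, `t.Optional`, `t.Union`) to built-in generics (PEP 585) and `|` unions (PEP 604). A minimal before/after sketch follows; the function name and body are made up for illustration only. With the `__future__` import, the new spellings are accepted in annotations even on interpreters older than 3.10, while non-annotation uses (such as the `TypeVar` bound further down in the dragonConnector.py hunk) still require a 3.10+ interpreter:

    from __future__ import annotations

    import typing as t

    # Old spelling, as removed throughout this diff
    def summarize_old(hosts: t.Optional[t.List[str]]) -> t.Dict[str, t.Union[int, str]]:
        return {"count": len(hosts or []), "first": (hosts or ["<none>"])[0]}

    # New spelling, as introduced throughout this diff
    def summarize_new(hosts: list[str] | None) -> dict[str, int | str]:
        return {"count": len(hosts or []), "first": (hosts or ["<none>"])[0]}

    assert summarize_old(["node1"]) == summarize_new(["node1"])
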
diff --git a/smartsim/_core/entrypoints/colocated.py b/smartsim/_core/entrypoints/colocated.py index 6615c9c76e..539bc298ea 100644 --- a/smartsim/_core/entrypoints/colocated.py +++ b/smartsim/_core/entrypoints/colocated.py @@ -30,7 +30,6 @@ import socket import sys import tempfile -import typing as t from pathlib import Path from subprocess import STDOUT from types import FrameType @@ -52,13 +51,13 @@ SIGNALS = [signal.SIGINT, signal.SIGTERM, signal.SIGQUIT, signal.SIGABRT] -def handle_signal(signo: int, _frame: t.Optional[FrameType]) -> None: +def handle_signal(signo: int, _frame: FrameType | None) -> None: if not signo: logger.warning("Received signal with no signo") cleanup() -def launch_db_model(client: Client, db_model: t.List[str]) -> str: +def launch_db_model(client: Client, db_model: list[str]) -> str: """Parse options to launch model on local cluster :param client: SmartRedis client connected to local DB @@ -122,7 +121,7 @@ def launch_db_model(client: Client, db_model: t.List[str]) -> str: return name -def launch_db_script(client: Client, db_script: t.List[str]) -> str: +def launch_db_script(client: Client, db_script: list[str]) -> str: """Parse options to launch script on local cluster :param client: SmartRedis client connected to local DB @@ -166,9 +165,9 @@ def launch_db_script(client: Client, db_script: t.List[str]) -> str: def main( network_interface: str, db_cpus: int, - command: t.List[str], - db_models: t.List[t.List[str]], - db_scripts: t.List[t.List[str]], + command: list[str], + db_models: list[list[str]], + db_scripts: list[list[str]], db_identifier: str, ) -> None: # pylint: disable=too-many-statements @@ -226,13 +225,13 @@ def main( logger.error(f"Failed to start database process: {str(e)}") raise SSInternalError("Colocated process failed to start") from e - def launch_models(client: Client, db_models: t.List[t.List[str]]) -> None: + def launch_models(client: Client, db_models: list[list[str]]) -> None: for i, db_model in enumerate(db_models): logger.debug("Uploading model") model_name = launch_db_model(client, db_model) logger.debug(f"Added model {model_name} ({i+1}/{len(db_models)})") - def launch_db_scripts(client: Client, db_scripts: t.List[t.List[str]]) -> None: + def launch_db_scripts(client: Client, db_scripts: list[list[str]]) -> None: for i, db_script in enumerate(db_scripts): logger.debug("Uploading script") script_name = launch_db_script(client, db_script) diff --git a/smartsim/_core/entrypoints/dragon.py b/smartsim/_core/entrypoints/dragon.py index 528003a89b..3ae1aca9f8 100644 --- a/smartsim/_core/entrypoints/dragon.py +++ b/smartsim/_core/entrypoints/dragon.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2025, Hewlett Packard Enterpris +# Copyright (c) 2021-2025, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -68,7 +68,7 @@ class DragonEntrypointArgs: interface: str -def handle_signal(signo: int, _frame: t.Optional[FrameType] = None) -> None: +def handle_signal(signo: int, _frame: FrameType | None = None) -> None: if not signo: logger.info("Received signal with no signo") else: @@ -99,7 +99,7 @@ def print_summary(network_interface: str, ip_address: str) -> None: def start_updater( - backend: DragonBackend, updater: t.Optional[ContextThread] + backend: DragonBackend, updater: ContextThread | None ) -> ContextThread: """Start the ``DragonBackend`` updater thread. 
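Note: the entrypoints above share one shutdown idiom: a module-level `SIGNALS` list (SIGKILL is omitted because it cannot be trapped) and a `handle_signal(signo, _frame)` callback that funnels into `cleanup()`. A self-contained sketch of that idiom follows; the `cleanup` body here is a placeholder, not the entrypoints' actual teardown logic:

    from __future__ import annotations

    import signal
    from types import FrameType

    # Catchable termination signals, mirroring the SIGNALS lists above
    SIGNALS = [signal.SIGINT, signal.SIGTERM, signal.SIGQUIT, signal.SIGABRT]

    def cleanup() -> None:
        # Placeholder for real teardown (stop child processes, close sockets, ...)
        print("cleaning up before exit")

    def handle_signal(signo: int, _frame: FrameType | None) -> None:
        if not signo:
            print("received signal with no signo")
        cleanup()

    def register_signal_handlers() -> None:
        for sig in SIGNALS:
            signal.signal(sig, handle_signal)

    if __name__ == "__main__":
        register_signal_handlers()
        signal.raise_signal(signal.SIGTERM)  # demonstrate the handler firing
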
@@ -302,7 +302,7 @@ def register_signal_handlers() -> None: signal.signal(sig, handle_signal) -def parse_arguments(args: t.List[str]) -> DragonEntrypointArgs: +def parse_arguments(args: list[str]) -> DragonEntrypointArgs: parser = argparse.ArgumentParser( prefix_chars="+", description="SmartSim Dragon Head Process" ) @@ -326,7 +326,7 @@ def parse_arguments(args: t.List[str]) -> DragonEntrypointArgs: return DragonEntrypointArgs(args_.launching_address, args_.interface) -def main(args_: t.List[str]) -> int: +def main(args_: list[str]) -> int: """Execute the dragon entrypoint as a module""" os.environ["PYTHONUNBUFFERED"] = "1" logger.info("Dragon server started") diff --git a/smartsim/_core/entrypoints/dragon_client.py b/smartsim/_core/entrypoints/dragon_client.py index e764dfb09e..eb12f9aee9 100644 --- a/smartsim/_core/entrypoints/dragon_client.py +++ b/smartsim/_core/entrypoints/dragon_client.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2025, Hewlett Packard Enterpris +# Copyright (c) 2021-2025, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -31,7 +31,6 @@ import signal import sys import time -import typing as t from pathlib import Path from types import FrameType @@ -66,13 +65,13 @@ def cleanup() -> None: logger.debug("Cleaning up") -def parse_requests(request_filepath: Path) -> t.List[DragonRequest]: +def parse_requests(request_filepath: Path) -> list[DragonRequest]: """Parse serialized requests from file :param request_filepath: Path to file with serialized requests :return: Deserialized requests """ - requests: t.List[DragonRequest] = [] + requests: list[DragonRequest] = [] try: with open(request_filepath, "r", encoding="utf-8") as request_file: req_strings = json.load(fp=request_file) @@ -91,7 +90,7 @@ def parse_requests(request_filepath: Path) -> t.List[DragonRequest]: return requests -def parse_arguments(args: t.List[str]) -> DragonClientEntrypointArgs: +def parse_arguments(args: list[str]) -> DragonClientEntrypointArgs: """Parse arguments used to run entrypoint script :param args: Arguments without name of executable @@ -111,7 +110,7 @@ def parse_arguments(args: t.List[str]) -> DragonClientEntrypointArgs: return DragonClientEntrypointArgs(submit=Path(args_.submit)) -def handle_signal(signo: int, _frame: t.Optional[FrameType] = None) -> None: +def handle_signal(signo: int, _frame: FrameType | None = None) -> None: """Handle signals sent to this process :param signo: Signal number @@ -176,7 +175,7 @@ def execute_entrypoint(args: DragonClientEntrypointArgs) -> int: return os.EX_OK -def main(args_: t.List[str]) -> int: +def main(args_: list[str]) -> int: """Execute the dragon client entrypoint as a module""" os.environ["PYTHONUNBUFFERED"] = "1" diff --git a/smartsim/_core/entrypoints/indirect.py b/smartsim/_core/entrypoints/indirect.py deleted file mode 100644 index 6626c30da1..0000000000 --- a/smartsim/_core/entrypoints/indirect.py +++ /dev/null @@ -1,252 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2025 Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. 
Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import argparse -import logging -import os -import pathlib -import signal -import sys -import typing as t -from types import FrameType - -import coloredlogs -import psutil - -import smartsim.log -from smartsim._core.utils.helpers import decode_cmd, get_ts_ms -from smartsim._core.utils.telemetry.telemetry import write_event - -STEP_PID: t.Optional[int] = None -logger = smartsim.log.get_logger(__name__) - -# kill is not catchable -SIGNALS = [signal.SIGINT, signal.SIGTERM, signal.SIGQUIT, signal.SIGABRT] - - -def main( - cmd: str, - entity_type: str, - cwd: str, - status_dir: str, -) -> int: - """This function receives an encoded step command from a SmartSim Experiment - and runs it in a subprocess. The entrypoint integrates with the telemetry - monitor by writing status update events. It is useful for wrapping - unmanaged tasks - a workload manager can be queried for a managed task - to achieve the same result. - - :param cmd: a base64 encoded cmd to execute - :param entity_type: `SmartSimEntity` entity class. Valid values - include: orchestrator, dbnode, ensemble, model - :param cwd: working directory to execute the cmd from - :param status_dir: path to the output directory for status updates - """ - global STEP_PID # pylint: disable=global-statement - proxy_pid = os.getpid() - - status_path = pathlib.Path(status_dir) - if not status_path.exists(): - status_path.mkdir(parents=True, exist_ok=True) - - if not cmd.strip(): - raise ValueError("Invalid cmd supplied") - - cleaned_cmd = decode_cmd(cmd) - ret_code: int = 1 - logger.debug("Indirect step starting") - - start_detail = f"Proxy process {proxy_pid}" - start_rc: t.Optional[int] = None - - try: - process = psutil.Popen( - cleaned_cmd, - cwd=cwd, - stdout=sys.stdout, - stderr=sys.stderr, - ) - STEP_PID = process.pid - logger.info(f"Indirect proxy {proxy_pid} child process {STEP_PID} started") - start_detail += f" started child process {STEP_PID}" - - except Exception as ex: - start_detail += f" failed to start child process. 
{ex}" - start_rc = 1 - logger.error("Failed to create process", exc_info=True) - cleanup() - return 1 - finally: - write_event( - get_ts_ms(), - proxy_pid, - "", # step_id for unmanaged task is always empty - entity_type, - "start", - status_path, - detail=start_detail, - return_code=start_rc, - ) - - logger.info(f"Waiting for child process {STEP_PID} to complete") - - try: - ret_code = process.wait() - except Exception: - logger.error("Failed to complete process", exc_info=True) - ret_code = -1 - - logger.info( - f"Indirect proxy {proxy_pid} child process {STEP_PID} complete." - f" return code: {ret_code}" - ) - msg = f"Process {STEP_PID} finished with return code: {ret_code}" - write_event( - get_ts_ms(), - proxy_pid, - "", # step_id for unmanaged task is always empty - entity_type, - "stop", - status_path, - detail=msg, - return_code=ret_code, - ) - cleanup() - - return ret_code - - -def cleanup() -> None: - """Perform cleanup required for clean termination""" - global STEP_PID # pylint: disable=global-statement - if STEP_PID is None: - return - - logger.info("Performing cleanup") - - try: - # attempt to stop the subprocess performing step-execution - if psutil.pid_exists(STEP_PID): - process = psutil.Process(STEP_PID) - process.terminate() - except psutil.NoSuchProcess: - # swallow exception to avoid overwriting outputs from cmd - ... - - except OSError as ex: - logger.warning(f"Failed to clean up step executor gracefully: {ex}") - finally: - STEP_PID = None - - -def handle_signal(signo: int, _frame: t.Optional[FrameType]) -> None: - """Helper function to ensure clean process termination""" - logger.info(f"handling signal {signo}") - if not signo: - logger.warning("Received signal with no signo") - - cleanup() - - -def register_signal_handlers() -> None: - """Register a signal handling function for all termination events""" - for sig in SIGNALS: - signal.signal(sig, handle_signal) - - -def get_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser( - prefix_chars="+", description="SmartSim Step Executor" - ) - parser.add_argument( - "+name", type=str, help="Name of the step being executed", required=True - ) - parser.add_argument( - "+command", type=str, help="The command to execute", required=True - ) - parser.add_argument( - "+entity_type", - type=str, - help="The type of entity related to the step", - required=True, - ) - parser.add_argument( - "+working_dir", - type=str, - help="The working directory of the executable", - required=True, - ) - parser.add_argument( - "+telemetry_dir", - type=str, - help="Directory for telemetry output", - required=True, - ) - return parser - - -if __name__ == "__main__": - arg_parser = get_parser() - os.environ["PYTHONUNBUFFERED"] = "1" - parsed_args = arg_parser.parse_args() - - # Set up a local private logger for when this module is run as an entry point - level = logger.getEffectiveLevel() - logger = logging.getLogger(f"{__name__}.{parsed_args.name}") - logger.propagate = False - logger.setLevel(level) - - fh = logging.FileHandler(f"{parsed_args.name}.indirect.log") - coloredlogs.HostNameFilter.install(fh) - fh.setFormatter( - logging.Formatter( - smartsim.log.DEFAULT_LOG_FORMAT, - datefmt=smartsim.log.DEFAULT_DATE_FORMAT, - ) - ) - logger.addHandler(fh) - - try: - logger.debug("Starting indirect step execution") - - # make sure to register the cleanup before the start the process - # so our signaller will be able to stop the database process. 
- register_signal_handlers() - - rc = main( - cmd=parsed_args.command, - entity_type=parsed_args.entity_type, - cwd=parsed_args.working_dir, - status_dir=parsed_args.telemetry_dir, - ) - sys.exit(rc) - - # gracefully exit the processes in the distributed application that - # we do not want to have start a colocated process. Only one process - # per node should be running. - except Exception as e: - logger.exception(f"An unexpected error caused step execution to fail: {e}") - sys.exit(1) diff --git a/smartsim/_core/entrypoints/redis.py b/smartsim/_core/entrypoints/redis.py index 130b3ce91c..88e45da0ce 100644 --- a/smartsim/_core/entrypoints/redis.py +++ b/smartsim/_core/entrypoints/redis.py @@ -29,7 +29,6 @@ import os import signal import textwrap -import typing as t from subprocess import PIPE, STDOUT from types import FrameType @@ -45,19 +44,19 @@ Redis/KeyDB entrypoint script """ -DBPID: t.Optional[int] = None +DBPID: int | None = None # kill is not catchable SIGNALS = [signal.SIGINT, signal.SIGQUIT, signal.SIGTERM, signal.SIGABRT] -def handle_signal(signo: int, _frame: t.Optional[FrameType]) -> None: +def handle_signal(signo: int, _frame: FrameType | None) -> None: if not signo: logger.warning("Received signal with no signo") cleanup() -def build_bind_args(source_addr: str, *addrs: str) -> t.Tuple[str, ...]: +def build_bind_args(source_addr: str, *addrs: str) -> tuple[str, ...]: return ( "--bind", source_addr, @@ -68,14 +67,14 @@ def build_bind_args(source_addr: str, *addrs: str) -> t.Tuple[str, ...]: ) -def build_cluster_args(shard_data: LaunchedShardData) -> t.Tuple[str, ...]: +def build_cluster_args(shard_data: LaunchedShardData) -> tuple[str, ...]: if cluster_conf_file := shard_data.cluster_conf_file: return ("--cluster-enabled", "yes", "--cluster-config-file", cluster_conf_file) return () def print_summary( - cmd: t.List[str], network_interface: str, shard_data: LaunchedShardData + cmd: list[str], network_interface: str, shard_data: LaunchedShardData ) -> None: print( textwrap.dedent(f"""\ diff --git a/smartsim/_core/entrypoints/telemetrymonitor.py b/smartsim/_core/entrypoints/telemetrymonitor.py deleted file mode 100644 index dc61858e39..0000000000 --- a/smartsim/_core/entrypoints/telemetrymonitor.py +++ /dev/null @@ -1,172 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2025 Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import argparse -import asyncio -import logging -import os -import os.path -import pathlib -import signal -import sys -import typing as t -from types import FrameType - -import smartsim._core.config as cfg -from smartsim._core.utils.telemetry.telemetry import ( - TelemetryMonitor, - TelemetryMonitorArgs, -) -from smartsim.log import DEFAULT_LOG_FORMAT, HostnameFilter - -"""Telemetry Monitor entrypoint -Starts a long-running, standalone process that hosts a `TelemetryMonitor`""" - - -logger = logging.getLogger("TelemetryMonitor") - - -def register_signal_handlers( - handle_signal: t.Callable[[int, t.Optional[FrameType]], None] -) -> None: - """Register a signal handling function for all termination events - - :param handle_signal: the function to execute when a term signal is received - """ - # NOTE: omitting kill because it is not catchable - term_signals = [signal.SIGINT, signal.SIGQUIT, signal.SIGTERM, signal.SIGABRT] - for signal_num in term_signals: - signal.signal(signal_num, handle_signal) - - -def get_parser() -> argparse.ArgumentParser: - """Instantiate a parser to process command line arguments - - :returns: An argument parser ready to accept required telemetry monitor parameters - """ - arg_parser = argparse.ArgumentParser(description="SmartSim Telemetry Monitor") - arg_parser.add_argument( - "-exp_dir", - type=str, - help="Experiment root directory", - required=True, - ) - arg_parser.add_argument( - "-frequency", - type=float, - help="Frequency of telemetry updates (in seconds))", - required=True, - ) - arg_parser.add_argument( - "-cooldown", - type=int, - help="Default lifetime of telemetry monitor (in seconds) before auto-shutdown", - default=cfg.CONFIG.telemetry_cooldown, - ) - arg_parser.add_argument( - "-loglevel", - type=int, - help="Logging level", - default=logging.INFO, - ) - return arg_parser - - -def parse_arguments() -> TelemetryMonitorArgs: - """Parse the command line arguments and return an instance - of TelemetryMonitorArgs populated with the CLI inputs - - :returns: `TelemetryMonitorArgs` instance populated with command line arguments - """ - parser = get_parser() - parsed_args = parser.parse_args() - return TelemetryMonitorArgs( - parsed_args.exp_dir, - parsed_args.frequency, - parsed_args.cooldown, - parsed_args.loglevel, - ) - - -def configure_logger(logger_: logging.Logger, log_level_: int, exp_dir: str) -> None: - """Configure the telemetry monitor logger to write logs to the - target output file path passed as an argument to the entrypoint - - :param logger_: logger to configure - :param log_level_: log level to apply to the python logging system - :param exp_dir: root path to experiment outputs - """ - logger_.setLevel(log_level_) - logger_.propagate = False - - # use a standard subdirectory of the experiment output path for logs - telemetry_dir = pathlib.Path(exp_dir) / cfg.CONFIG.telemetry_subdir - - # all telemetry monitor logs are written to file in addition to stdout - log_path = telemetry_dir / "logs/telemetrymonitor.out" - 
log_path.parent.mkdir(parents=True, exist_ok=True) - file_handler = logging.FileHandler(log_path, "a") - - # HostnameFilter is required to enrich log context to use DEFAULT_LOG_FORMAT - file_handler.addFilter(HostnameFilter()) - - formatter = logging.Formatter(DEFAULT_LOG_FORMAT) - file_handler.setFormatter(formatter) - logger_.addHandler(file_handler) - - -if __name__ == "__main__": - """Prepare the telemetry monitor process using command line arguments. - - Sample usage: - python -m smartsim._core.entrypoints.telemetrymonitor -exp_dir - -frequency 30 -cooldown 90 -loglevel INFO - The experiment id is generated during experiment startup - and can be found in the manifest.json in /.smartsim/telemetry - """ - os.environ["PYTHONUNBUFFERED"] = "1" - - args = parse_arguments() - configure_logger(logger, args.log_level, args.exp_dir) - - telemetry_monitor = TelemetryMonitor(args) - - # Must register cleanup before the main loop is running - def cleanup_telemetry_monitor(_signo: int, _frame: t.Optional[FrameType]) -> None: - """Create an enclosure on `manifest_observer` to avoid global variables""" - logger.info("Shutdown signal received by telemetry monitor entrypoint") - telemetry_monitor.cleanup() - - register_signal_handlers(cleanup_telemetry_monitor) - - try: - asyncio.run(telemetry_monitor.run()) - sys.exit(0) - except Exception: - logger.exception( - "Shutting down telemetry monitor due to unexpected error", exc_info=True - ) - - sys.exit(1) diff --git a/smartsim/_core/generation/generator.py b/smartsim/_core/generation/generator.py index 5e937a69ba..95b85f9b41 100644 --- a/smartsim/_core/generation/generator.py +++ b/smartsim/_core/generation/generator.py @@ -108,7 +108,7 @@ def generate_experiment(self, *args: t.Any) -> None: self._gen_entity_list_dir(generator_manifest.ensembles) self._gen_entity_dirs(generator_manifest.models) - def set_tag(self, tag: str, regex: t.Optional[str] = None) -> None: + def set_tag(self, tag: str, regex: str | None = None) -> None: """Set the tag used for tagging input files Set a tag or a regular expression for the @@ -153,7 +153,7 @@ def _gen_exp_dir(self) -> None: dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S") log_file.write(f"Generation start date and time: {dt_string}\n") - def _gen_orc_dir(self, orchestrator_list: t.List[Orchestrator]) -> None: + def _gen_orc_dir(self, orchestrator_list: list[Orchestrator]) -> None: """Create the directory that will hold the error, output and configuration files for the orchestrator. 
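Note: the generator hunks above pair `shutil.rmtree(..., ignore_errors=True)` with `mkdir(exist_ok=self.overwrite, parents=True)` so that regenerating into an existing directory only succeeds when overwriting is allowed. A self-contained sketch of one way to express that behaviour; `prepare_output_dir` is an illustrative name, not the generator's API:

    from __future__ import annotations

    import pathlib
    import shutil

    def prepare_output_dir(path: str, overwrite: bool) -> pathlib.Path:
        """Create an output directory, optionally clearing a previous run."""
        target = pathlib.Path(path)
        if overwrite:
            # drop stale contents left over from an earlier run
            shutil.rmtree(target, ignore_errors=True)
        # with overwrite=False an existing directory raises FileExistsError
        target.mkdir(exist_ok=overwrite, parents=True)
        return target

    if __name__ == "__main__":
        print(prepare_output_dir("/tmp/example_output_dir", overwrite=True))
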
@@ -169,7 +169,7 @@ def _gen_orc_dir(self, orchestrator_list: t.List[Orchestrator]) -> None: shutil.rmtree(orc_path, ignore_errors=True) pathlib.Path(orc_path).mkdir(exist_ok=self.overwrite, parents=True) - def _gen_entity_list_dir(self, entity_lists: t.List[Ensemble]) -> None: + def _gen_entity_list_dir(self, entity_lists: list[Ensemble]) -> None: """Generate directories for Ensemble instances :param entity_lists: list of Ensemble instances @@ -192,8 +192,8 @@ def _gen_entity_list_dir(self, entity_lists: t.List[Ensemble]) -> None: def _gen_entity_dirs( self, - entities: t.List[Model], - entity_list: t.Optional[Ensemble] = None, + entities: list[Model], + entity_list: Ensemble | None = None, ) -> None: """Generate directories for Entity instances @@ -269,7 +269,7 @@ def _build_tagged_files(tagged: TaggedFilesHierarchy) -> None: self._log_params(entity, files_to_params) def _log_params( - self, entity: Model, files_to_params: t.Dict[str, t.Dict[str, str]] + self, entity: Model, files_to_params: dict[str, dict[str, str]] ) -> None: """Log which files were modified during generation @@ -278,8 +278,8 @@ def _log_params( :param entity: the model being generated :param files_to_params: a dict connecting each file to its parameter settings """ - used_params: t.Dict[str, str] = {} - file_to_tables: t.Dict[str, str] = {} + used_params: dict[str, str] = {} + file_to_tables: dict[str, str] = {} for file, params in files_to_params.items(): used_params.update(params) table = tabulate(params.items(), headers=["Name", "Value"]) diff --git a/smartsim/_core/generation/modelwriter.py b/smartsim/_core/generation/modelwriter.py index 7502a16224..b7bee66e78 100644 --- a/smartsim/_core/generation/modelwriter.py +++ b/smartsim/_core/generation/modelwriter.py @@ -26,7 +26,7 @@ import collections import re -import typing as t +from collections import defaultdict from smartsim.error.errors import SmartSimError @@ -40,9 +40,9 @@ class ModelWriter: def __init__(self) -> None: self.tag = ";" self.regex = "(;[^;]+;)" - self.lines: t.List[str] = [] + self.lines: list[str] = [] - def set_tag(self, tag: str, regex: t.Optional[str] = None) -> None: + def set_tag(self, tag: str, regex: str | None = None) -> None: """Set the tag for the modelwriter to search for within tagged files attached to an entity. @@ -59,10 +59,10 @@ def set_tag(self, tag: str, regex: t.Optional[str] = None) -> None: def configure_tagged_model_files( self, - tagged_files: t.List[str], - params: t.Dict[str, str], + tagged_files: list[str], + params: dict[str, str], make_missing_tags_fatal: bool = False, - ) -> t.Dict[str, t.Dict[str, str]]: + ) -> dict[str, dict[str, str]]: """Read, write and configure tagged files attached to a Model instance. @@ -71,7 +71,7 @@ def configure_tagged_model_files( :param make_missing_tags_fatal: raise an error if a tag is missing :returns: A dict connecting each file to its parameter settings """ - files_to_tags: t.Dict[str, t.Dict[str, str]] = {} + files_to_tags: dict[str, dict[str, str]] = {} for tagged_file in tagged_files: self._set_lines(tagged_file) used_tags = self._replace_tags(params, make_missing_tags_fatal) @@ -105,8 +105,8 @@ def _write_changes(self, file_path: str) -> None: raise ParameterWriterError(file_path, read=False) from e def _replace_tags( - self, params: t.Dict[str, str], make_fatal: bool = False - ) -> t.Dict[str, str]: + self, params: dict[str, str], make_fatal: bool = False + ) -> dict[str, str]: """Replace the tagged parameters within the file attached to this model. 
The tag defaults to ";" @@ -116,8 +116,8 @@ def _replace_tags( :returns: A dict of parameter names and values set for the file """ edited = [] - unused_tags: t.DefaultDict[str, t.List[int]] = collections.defaultdict(list) - used_params: t.Dict[str, str] = {} + unused_tags: defaultdict[str, list[int]] = collections.defaultdict(list) + used_params: dict[str, str] = {} for i, line in enumerate(self.lines, 1): while search := re.search(self.regex, line): tagged_line = search.group(0) @@ -144,9 +144,7 @@ def _replace_tags( self.lines = edited return used_params - def _is_ensemble_spec( - self, tagged_line: str, model_params: t.Dict[str, str] - ) -> bool: + def _is_ensemble_spec(self, tagged_line: str, model_params: dict[str, str]) -> bool: split_tag = tagged_line.split(self.tag) prev_val = split_tag[1] if prev_val in model_params.keys(): diff --git a/smartsim/_core/launcher/colocated.py b/smartsim/_core/launcher/colocated.py index 4de156b65f..3f7e7cfd2a 100644 --- a/smartsim/_core/launcher/colocated.py +++ b/smartsim/_core/launcher/colocated.py @@ -34,7 +34,7 @@ def write_colocated_launch_script( - file_name: str, db_log: str, colocated_settings: t.Dict[str, t.Any] + file_name: str, db_log: str, colocated_settings: dict[str, t.Any] ) -> None: """Write the colocated launch script @@ -80,11 +80,11 @@ def write_colocated_launch_script( def _build_colocated_wrapper_cmd( db_log: str, cpus: int = 1, - rai_args: t.Optional[t.Dict[str, str]] = None, - extra_db_args: t.Optional[t.Dict[str, str]] = None, + rai_args: dict[str, str] | None = None, + extra_db_args: dict[str, str] | None = None, port: int = 6780, - ifname: t.Optional[t.Union[str, t.List[str]]] = None, - custom_pinning: t.Optional[str] = None, + ifname: str | list[str] | None = None, + custom_pinning: str | None = None, **kwargs: t.Any, ) -> str: """Build the command use to run a colocated DB application @@ -189,7 +189,7 @@ def _build_colocated_wrapper_cmd( return " ".join(cmd) -def _build_db_model_cmd(db_models: t.List[DBModel]) -> t.List[str]: +def _build_db_model_cmd(db_models: list[DBModel]) -> list[str]: cmd = [] for db_model in db_models: cmd.append("+db_model") @@ -219,7 +219,7 @@ def _build_db_model_cmd(db_models: t.List[DBModel]) -> t.List[str]: return cmd -def _build_db_script_cmd(db_scripts: t.List[DBScript]) -> t.List[str]: +def _build_db_script_cmd(db_scripts: list[DBScript]) -> list[str]: cmd = [] for db_script in db_scripts: cmd.append("+db_script") diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 6fc2ab8dca..18364676e9 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -45,7 +45,6 @@ # pylint: enable=import-error # isort: on -from ...._core.config import get_config from ...._core.schemas import ( DragonHandshakeRequest, DragonHandshakeResponse, @@ -79,19 +78,19 @@ def __str__(self) -> str: class ProcessGroupInfo: status: SmartSimStatus """Status of step""" - process_group: t.Optional[dragon_process_group.ProcessGroup] = None + process_group: dragon_process_group.ProcessGroup | None = None """Internal Process Group object, None for finished or not started steps""" - puids: t.Optional[t.List[t.Optional[int]]] = None # puids can be None + puids: list[int | None] | None = None # puids can be None """List of Process UIDS belonging to the ProcessGroup""" - return_codes: t.Optional[t.List[int]] = None + return_codes: list[int] | None = None """List of return codes of completed processes""" - hosts: 
t.List[str] = field(default_factory=list) + hosts: list[str] = field(default_factory=list) """List of hosts on which the Process Group """ - redir_workers: t.Optional[dragon_process_group.ProcessGroup] = None + redir_workers: dragon_process_group.ProcessGroup | None = None """Workers used to redirect stdout and stderr to file""" @property - def smartsim_info(self) -> t.Tuple[SmartSimStatus, t.Optional[t.List[int]]]: + def smartsim_info(self) -> tuple[SmartSimStatus, list[int] | None]: """Information needed by SmartSim Launcher and Job Manager""" return (self.status, self.return_codes) @@ -146,7 +145,7 @@ class DragonBackend: def __init__(self, pid: int) -> None: self._pid = pid """PID of dragon executable which launched this server""" - self._group_infos: t.Dict[str, ProcessGroupInfo] = {} + self._group_infos: dict[str, ProcessGroupInfo] = {} """ProcessGroup execution state information""" self._queue_lock = RLock() """Lock that needs to be acquired to access internal queues""" @@ -160,9 +159,9 @@ def __init__(self, pid: int) -> None: """Steps waiting for execution""" self._stop_requests: t.Deque[DragonStopRequest] = collections.deque() """Stop requests which have not been processed yet""" - self._running_steps: t.List[str] = [] + self._running_steps: list[str] = [] """List of currently running steps""" - self._completed_steps: t.List[str] = [] + self._completed_steps: list[str] = [] """List of completed steps""" self._last_beat: float = 0.0 """Time at which the last heartbeat was set""" @@ -175,14 +174,9 @@ def __init__(self, pid: int) -> None: """Whether the server can shut down""" self._frontend_shutdown: bool = False """Whether the server frontend should shut down when the backend does""" - self._shutdown_initiation_time: t.Optional[float] = None + self._shutdown_initiation_time: float | None = None """The time at which the server initiated shutdown""" - smartsim_config = get_config() - self._cooldown_period = ( - smartsim_config.telemetry_frequency * 2 + 5 - if smartsim_config.telemetry_enabled - else 5 - ) + self._cooldown_period = 5 """Time in seconds needed to server to complete shutdown""" self._view = DragonBackendView(self) @@ -213,14 +207,14 @@ def _initialize_hosts(self) -> None: self._nodes = [ dragon_machine.Node(node) for node in dragon_machine.System().nodes ] - self._hosts: t.List[str] = sorted(node.hostname for node in self._nodes) + self._hosts: list[str] = sorted(node.hostname for node in self._nodes) self._cpus = [node.num_cpus for node in self._nodes] self._gpus = [node.num_gpus for node in self._nodes] """List of hosts available in allocation""" self._free_hosts: t.Deque[str] = collections.deque(self._hosts) """List of hosts on which steps can be launched""" - self._allocated_hosts: t.Dict[str, str] = {} + self._allocated_hosts: dict[str, str] = {} """Mapping of hosts on which a step is already running to step ID""" def __str__(self) -> str: @@ -288,9 +282,7 @@ def current_time(self) -> float: """Current time for DragonBackend object, in seconds since the Epoch""" return time.time() - def _can_honor_policy( - self, request: DragonRunRequest - ) -> t.Tuple[bool, t.Optional[str]]: + def _can_honor_policy(self, request: DragonRunRequest) -> tuple[bool, str | None]: """Check if the policy can be honored with resources available in the allocation. 
:param request: DragonRunRequest containing policy information @@ -316,7 +308,7 @@ def _can_honor_policy( return True, None - def _can_honor(self, request: DragonRunRequest) -> t.Tuple[bool, t.Optional[str]]: + def _can_honor(self, request: DragonRunRequest) -> tuple[bool, str | None]: """Check if request can be honored with resources available in the allocation. Currently only checks for total number of nodes, @@ -339,7 +331,7 @@ def _can_honor(self, request: DragonRunRequest) -> t.Tuple[bool, t.Optional[str] def _allocate_step( self, step_id: str, request: DragonRunRequest - ) -> t.Optional[t.List[str]]: + ) -> list[str] | None: num_hosts: int = request.nodes with self._queue_lock: @@ -355,10 +347,10 @@ def _allocate_step( @staticmethod def _create_redirect_workers( global_policy: dragon_policy.Policy, - policies: t.List[dragon_policy.Policy], - puids: t.List[int], - out_file: t.Optional[str], - err_file: t.Optional[str], + policies: list[dragon_policy.Policy], + puids: list[int], + out_file: str | None, + err_file: str | None, ) -> dragon_process_group.ProcessGroup: grp_redir = dragon_process_group.ProcessGroup( restart=False, policy=global_policy, pmi_enabled=False @@ -439,8 +431,8 @@ def create_run_policy( run_request: DragonRunRequest = request affinity = dragon_policy.Policy.Affinity.DEFAULT - cpu_affinity: t.List[int] = [] - gpu_affinity: t.List[int] = [] + cpu_affinity: list[int] = [] + gpu_affinity: list[int] = [] # Customize policy only if the client requested it, otherwise use default if run_request.policy is not None: @@ -743,7 +735,7 @@ def host_desc(self) -> str: @staticmethod def _proc_group_info_table_line( step_id: str, proc_group_info: ProcessGroupInfo - ) -> t.List[str]: + ) -> list[str]: table_line = [step_id, f"{proc_group_info.status.value}"] if proc_group_info.hosts is not None: diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py index e43865b285..72a2512f76 100644 --- a/smartsim/_core/launcher/dragon/dragonConnector.py +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -35,6 +35,7 @@ import sys import typing as t from collections import defaultdict +from collections.abc import Iterable from pathlib import Path from threading import RLock @@ -59,7 +60,7 @@ logger = get_logger(__name__) -_SchemaT = t.TypeVar("_SchemaT", bound=t.Union[DragonRequest, DragonResponse]) +_SchemaT = t.TypeVar("_SchemaT", bound=DragonRequest | DragonResponse) DRG_LOCK = RLock() @@ -73,17 +74,17 @@ def __init__(self) -> None: self._context: zmq.Context[t.Any] = zmq.Context.instance() self._context.setsockopt(zmq.REQ_CORRELATE, 1) self._context.setsockopt(zmq.REQ_RELAXED, 1) - self._authenticator: t.Optional[zmq.auth.thread.ThreadAuthenticator] = None + self._authenticator: zmq.auth.thread.ThreadAuthenticator | None = None config = get_config() self._reset_timeout(config.dragon_server_timeout) - self._dragon_head_socket: t.Optional[zmq.Socket[t.Any]] = None - self._dragon_head_process: t.Optional[subprocess.Popen[bytes]] = None + self._dragon_head_socket: zmq.Socket[t.Any] | None = None + self._dragon_head_process: subprocess.Popen[bytes] | None = None # Returned by dragon head, useful if shutdown is to be requested # but process was started by another connector - self._dragon_head_pid: t.Optional[int] = None + self._dragon_head_pid: int | None = None self._dragon_server_path = config.dragon_server_path logger.debug(f"Dragon Server path was set to {self._dragon_server_path}") - self._env_vars: t.Dict[str, str] = {} + 
self._env_vars: dict[str, str] = {} if self._dragon_server_path is None: raise SmartSimError( "DragonConnector could not find the dragon server path. " @@ -218,7 +219,7 @@ def _connect_to_existing_server(self, path: Path) -> None: def _start_connector_socket(self, socket_addr: str) -> zmq.Socket[t.Any]: config = get_config() - connector_socket: t.Optional[zmq.Socket[t.Any]] = None + connector_socket: zmq.Socket[t.Any] | None = None self._reset_timeout(config.dragon_server_startup_timeout) self._get_new_authenticator(-1) connector_socket = dragonSockets.get_secure_socket(self._context, zmq.REP, True) @@ -229,7 +230,7 @@ def _start_connector_socket(self, socket_addr: str) -> zmq.Socket[t.Any]: return connector_socket - def load_persisted_env(self) -> t.Dict[str, str]: + def load_persisted_env(self) -> dict[str, str]: """Load key-value pairs from a .env file created during dragon installation :return: Key-value pairs stored in .env file""" @@ -251,7 +252,7 @@ def load_persisted_env(self) -> t.Dict[str, str]: return self._env_vars - def merge_persisted_env(self, current_env: t.Dict[str, str]) -> t.Dict[str, str]: + def merge_persisted_env(self, current_env: dict[str, str]) -> dict[str, str]: """Combine the current environment variable set with the dragon .env by adding Dragon-specific values and prepending any new values to existing keys @@ -259,7 +260,7 @@ def merge_persisted_env(self, current_env: t.Dict[str, str]) -> t.Dict[str, str] :return: Merged environment """ # ensure we start w/a complete env from current env state - merged_env: t.Dict[str, str] = {**current_env} + merged_env: dict[str, str] = {**current_env} # copy all the values for dragon straight into merged_env merged_env.update( @@ -416,8 +417,8 @@ def send_request(self, request: DragonRequest, flags: int = 0) -> DragonResponse @staticmethod def _parse_launched_dragon_server_info_from_iterable( - stream: t.Iterable[str], num_dragon_envs: t.Optional[int] = None - ) -> t.List[t.Dict[str, str]]: + stream: Iterable[str], num_dragon_envs: int | None = None + ) -> list[dict[str, str]]: lines = (line.strip() for line in stream) lines = (line for line in lines if line) tokenized = (line.split(maxsplit=1) for line in lines) @@ -441,9 +442,9 @@ def _parse_launched_dragon_server_info_from_iterable( @classmethod def _parse_launched_dragon_server_info_from_files( cls, - file_paths: t.List[t.Union[str, "os.PathLike[str]"]], - num_dragon_envs: t.Optional[int] = None, - ) -> t.List[t.Dict[str, str]]: + file_paths: list[str | os.PathLike[str]], + num_dragon_envs: int | None = None, + ) -> list[dict[str, str]]: with fileinput.FileInput(file_paths) as ifstream: dragon_envs = cls._parse_launched_dragon_server_info_from_iterable( ifstream, num_dragon_envs @@ -468,16 +469,16 @@ def _send_req_with_socket( return response -def _assert_schema_type(obj: object, typ: t.Type[_SchemaT], /) -> _SchemaT: +def _assert_schema_type(obj: object, typ: type[_SchemaT], /) -> _SchemaT: if not isinstance(obj, typ): raise TypeError(f"Expected schema of type `{typ}`, but got {type(obj)}") return obj def _dragon_cleanup( - server_socket: t.Optional[zmq.Socket[t.Any]] = None, - server_process_pid: t.Optional[int] = 0, - server_authenticator: t.Optional[zmq.auth.thread.ThreadAuthenticator] = None, + server_socket: zmq.Socket[t.Any] | None = None, + server_process_pid: int | None = 0, + server_authenticator: zmq.auth.thread.ThreadAuthenticator | None = None, ) -> None: """Clean up resources used by the launcher. 
:param server_socket: (optional) Socket used to connect to dragon environment @@ -519,7 +520,7 @@ def _dragon_cleanup( print("Authenticator shutdown is complete") -def _resolve_dragon_path(fallback: t.Union[str, "os.PathLike[str]"]) -> Path: +def _resolve_dragon_path(fallback: str | os.PathLike[str]) -> Path: dragon_server_path = get_config().dragon_server_path or os.path.join( fallback, ".smartsim", "dragon" ) diff --git a/smartsim/_core/launcher/dragon/dragonLauncher.py b/smartsim/_core/launcher/dragon/dragonLauncher.py index 911625800e..666f091049 100644 --- a/smartsim/_core/launcher/dragon/dragonLauncher.py +++ b/smartsim/_core/launcher/dragon/dragonLauncher.py @@ -27,7 +27,6 @@ from __future__ import annotations import os -import typing as t from smartsim._core.schemas.dragonRequests import DragonRunPolicy @@ -92,7 +91,7 @@ def cleanup(self) -> None: # RunSettings types supported by this launcher @property - def supported_rs(self) -> t.Dict[t.Type[SettingsBase], t.Type[Step]]: + def supported_rs(self) -> dict[type[SettingsBase], type[Step]]: # RunSettings types supported by this launcher return { DragonRunSettings: DragonStep, @@ -106,7 +105,7 @@ def add_step_to_mapping_table(self, name: str, step_map: StepMap) -> None: if step_map.step_id is None: return - sublauncher: t.Optional[t.Union[SlurmLauncher, PBSLauncher]] = None + sublauncher: SlurmLauncher | PBSLauncher | None = None if step_map.step_id.startswith("SLURM-"): sublauncher = self._slurm_launcher elif step_map.step_id.startswith("PBS-"): @@ -121,7 +120,7 @@ def add_step_to_mapping_table(self, name: str, step_map: StepMap) -> None: ) sublauncher.add_step_to_mapping_table(name, sublauncher_step_map) - def run(self, step: Step) -> t.Optional[str]: + def run(self, step: Step) -> str | None: """Run a job step through Slurm :param step: a job step instance @@ -140,7 +139,7 @@ def run(self, step: Step) -> t.Optional[str]: if isinstance(step, DragonBatchStep): # wait for batch step to submit successfully - sublauncher_step_id: t.Optional[str] = None + sublauncher_step_id: str | None = None return_code, out, err = self.task_manager.start_and_wait(cmd, step.cwd) if return_code != 0: raise LauncherError(f"Sbatch submission failed\n {out}\n {err}") @@ -241,7 +240,7 @@ def stop(self, step_name: str) -> StepInfo: def _unprefix_step_id(step_id: str) -> str: return step_id.split("-", maxsplit=1)[1] - def _get_managed_step_update(self, step_ids: t.List[str]) -> t.List[StepInfo]: + def _get_managed_step_update(self, step_ids: list[str]) -> list[StepInfo]: """Get step updates for Dragon-managed jobs :param step_ids: list of job step ids @@ -250,9 +249,9 @@ def _get_managed_step_update(self, step_ids: t.List[str]) -> t.List[StepInfo]: step_id_updates: dict[str, StepInfo] = {} - dragon_step_ids: t.List[str] = [] - slurm_step_ids: t.List[str] = [] - pbs_step_ids: t.List[str] = [] + dragon_step_ids: list[str] = [] + slurm_step_ids: list[str] = [] + pbs_step_ids: list[str] = [] for step_id in step_ids: if step_id.startswith("SLURM-"): slurm_step_ids.append(step_id) @@ -321,7 +320,7 @@ def __str__(self) -> str: return "Dragon" -def _assert_schema_type(obj: object, typ: t.Type[_SchemaT], /) -> _SchemaT: +def _assert_schema_type(obj: object, typ: type[_SchemaT], /) -> _SchemaT: if not isinstance(obj, typ): raise TypeError(f"Expected schema of type `{typ}`, but got {type(obj)}") return obj diff --git a/smartsim/_core/launcher/dragon/dragonSockets.py b/smartsim/_core/launcher/dragon/dragonSockets.py index ae669acdd2..6b2dcb96ac 100644 --- 
a/smartsim/_core/launcher/dragon/dragonSockets.py +++ b/smartsim/_core/launcher/dragon/dragonSockets.py @@ -42,7 +42,7 @@ logger = get_logger(__name__) -AUTHENTICATOR: t.Optional["zmq.auth.thread.ThreadAuthenticator"] = None +AUTHENTICATOR: "zmq.auth.thread.ThreadAuthenticator | None" = None def as_server( diff --git a/smartsim/_core/launcher/launcher.py b/smartsim/_core/launcher/launcher.py index 87ab468cdd..70e7900d5e 100644 --- a/smartsim/_core/launcher/launcher.py +++ b/smartsim/_core/launcher/launcher.py @@ -25,7 +25,6 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import abc -import typing as t from ..._core.launcher.stepMapping import StepMap from ...error import AllocationError, LauncherError, SSUnsupportedError @@ -54,16 +53,16 @@ def create_step(self, name: str, cwd: str, step_settings: SettingsBase) -> Step: @abc.abstractmethod def get_step_update( - self, step_names: t.List[str] - ) -> t.List[t.Tuple[str, t.Union[StepInfo, None]]]: + self, step_names: list[str] + ) -> list[tuple[str, StepInfo | None]]: raise NotImplementedError @abc.abstractmethod - def get_step_nodes(self, step_names: t.List[str]) -> t.List[t.List[str]]: + def get_step_nodes(self, step_names: list[str]) -> list[list[str]]: raise NotImplementedError @abc.abstractmethod - def run(self, step: Step) -> t.Optional[str]: + def run(self, step: Step) -> str | None: raise NotImplementedError @abc.abstractmethod @@ -93,7 +92,7 @@ def __init__(self) -> None: @property @abc.abstractmethod - def supported_rs(self) -> t.Dict[t.Type[SettingsBase], t.Type[Step]]: + def supported_rs(self) -> dict[type[SettingsBase], type[Step]]: raise NotImplementedError # every launcher utilizing this interface must have a map @@ -125,19 +124,19 @@ def create_step( # don't need to be covered here. def get_step_nodes( - self, step_names: t.List[str] - ) -> t.List[t.List[str]]: # pragma: no cover + self, step_names: list[str] + ) -> list[list[str]]: # pragma: no cover raise SSUnsupportedError("Node acquisition not supported for this launcher") def get_step_update( - self, step_names: t.List[str] - ) -> t.List[t.Tuple[str, t.Union[StepInfo, None]]]: # cov-wlm + self, step_names: list[str] + ) -> list[tuple[str, StepInfo | None]]: # cov-wlm """Get update for a list of job steps :param step_names: list of job steps to get updates for :return: list of name, job update tuples """ - updates: t.List[t.Tuple[str, t.Union[StepInfo, None]]] = [] + updates: list[tuple[str, StepInfo | None]] = [] # get updates of jobs managed by workload manager (PBS, Slurm, etc) # this is primarily batch jobs. 
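
Illustration of the recurring annotation change in these hunks: the `typing` aliases are swapped for the built-in generics of PEP 585 and the `X | Y` unions of PEP 604. Both spellings describe the same types, but the new form assumes Python 3.9/3.10+ wherever annotations are evaluated at runtime. A minimal sketch with a hypothetical helper, shown only to contrast the two styles:

import typing as t


def updates_old(step_names: t.List[str]) -> t.List[t.Tuple[str, t.Optional[int]]]:
    # Pre-3.9 spelling via the typing module.
    return [(name, None) for name in step_names]


def updates_new(step_names: list[str]) -> list[tuple[str, int | None]]:
    # Same contract using built-in generics and PEP 604 unions.
    return [(name, None) for name in step_names]


assert updates_old(["step_0"]) == updates_new(["step_0"])
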
@@ -161,8 +160,8 @@ def get_step_update( return updates def _get_unmanaged_step_update( - self, task_ids: t.List[str] - ) -> t.List[UnmanagedStepInfo]: # cov-wlm + self, task_ids: list[str] + ) -> list[UnmanagedStepInfo]: # cov-wlm """Get step updates for Popen managed jobs :param task_ids: task id to check @@ -178,6 +177,6 @@ def _get_unmanaged_step_update( # pylint: disable-next=no-self-use def _get_managed_step_update( self, - step_ids: t.List[str], # pylint: disable=unused-argument - ) -> t.List[StepInfo]: # pragma: no cover + step_ids: list[str], # pylint: disable=unused-argument + ) -> list[StepInfo]: # pragma: no cover return [] diff --git a/smartsim/_core/launcher/local/local.py b/smartsim/_core/launcher/local/local.py index 2fc4700215..6cff067ce9 100644 --- a/smartsim/_core/launcher/local/local.py +++ b/smartsim/_core/launcher/local/local.py @@ -24,7 +24,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import typing as t from ....settings import RunSettings, SettingsBase from ..launcher import Launcher @@ -54,8 +53,8 @@ def create_step(self, name: str, cwd: str, step_settings: SettingsBase) -> Step: return LocalStep(name, cwd, step_settings) def get_step_update( - self, step_names: t.List[str] - ) -> t.List[t.Tuple[str, t.Optional[StepInfo]]]: + self, step_names: list[str] + ) -> list[tuple[str, StepInfo | None]]: """Get status updates of each job step name provided :param step_names: list of step_names @@ -63,7 +62,7 @@ def get_step_update( """ # step ids are process ids of the tasks # as there is no WLM intermediary - updates: t.List[t.Tuple[str, t.Optional[StepInfo]]] = [] + updates: list[tuple[str, StepInfo | None]] = [] s_names, s_ids = self.step_mapping.get_ids(step_names, managed=False) for step_name, step_id in zip(s_names, s_ids): status, ret_code, out, err = self.task_manager.get_task_update(str(step_id)) @@ -72,7 +71,7 @@ def get_step_update( updates.append(update) return updates - def get_step_nodes(self, step_names: t.List[str]) -> t.List[t.List[str]]: + def get_step_nodes(self, step_names: list[str]) -> list[list[str]]: """Return the address of nodes assigned to the step :param step_names: list of step_names diff --git a/smartsim/_core/launcher/pbs/pbsCommands.py b/smartsim/_core/launcher/pbs/pbsCommands.py index a0eb8a988e..de3f402f5e 100644 --- a/smartsim/_core/launcher/pbs/pbsCommands.py +++ b/smartsim/_core/launcher/pbs/pbsCommands.py @@ -24,12 +24,11 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import typing as t from ...utils.shell import execute_cmd -def qstat(args: t.List[str]) -> t.Tuple[str, str]: +def qstat(args: list[str]) -> tuple[str, str]: """Calls PBS qstat with args :param args: List of command arguments @@ -40,7 +39,7 @@ def qstat(args: t.List[str]) -> t.Tuple[str, str]: return out, error -def qsub(args: t.List[str]) -> t.Tuple[str, str]: +def qsub(args: list[str]) -> tuple[str, str]: """Calls PBS qsub with args :param args: List of command arguments @@ -51,7 +50,7 @@ def qsub(args: t.List[str]) -> t.Tuple[str, str]: return out, error -def qdel(args: t.List[str]) -> t.Tuple[int, str, str]: +def qdel(args: list[str]) -> tuple[int, str, str]: """Calls PBS qdel with args. returncode is also supplied in this function. 
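
The PBS helpers above change only their annotations: `qstat` and `qsub` still hand back `(stdout, stderr)` strings, while `qdel` also surfaces the return code. A usage sketch, assuming a working PBS installation; the job id is a placeholder:

from smartsim._core.launcher.pbs.pbsCommands import qdel, qstat

out, err = qstat(["-f", "-F", "json"])       # stdout/stderr of qstat as strings
rc, out, err = qdel(["1234.pbs-server"])     # placeholder job id, for illustration only
if rc != 0:
    print(f"qdel reported an error: {err}")
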
diff --git a/smartsim/_core/launcher/pbs/pbsLauncher.py b/smartsim/_core/launcher/pbs/pbsLauncher.py index 6907c13de7..f3d312fbeb 100644 --- a/smartsim/_core/launcher/pbs/pbsLauncher.py +++ b/smartsim/_core/launcher/pbs/pbsLauncher.py @@ -25,7 +25,6 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import time -import typing as t from ....error import LauncherError from ....log import get_logger @@ -76,7 +75,7 @@ class PBSLauncher(WLMLauncher): # init in WLMLauncher, launcher.py @property - def supported_rs(self) -> t.Dict[t.Type[SettingsBase], t.Type[Step]]: + def supported_rs(self) -> dict[type[SettingsBase], type[Step]]: # RunSettings types supported by this launcher return { AprunSettings: AprunStep, @@ -88,7 +87,7 @@ def supported_rs(self) -> t.Dict[t.Type[SettingsBase], t.Type[Step]]: PalsMpiexecSettings: MpiexecStep, } - def run(self, step: Step) -> t.Optional[str]: + def run(self, step: Step) -> str | None: """Run a job step through PBSPro :param step: a job step instance @@ -99,8 +98,8 @@ def run(self, step: Step) -> t.Optional[str]: self.task_manager.start() cmd_list = step.get_launch_cmd() - step_id: t.Optional[str] = None - task_id: t.Optional[str] = None + step_id: str | None = None + task_id: str | None = None if isinstance(step, QsubBatchStep): # wait for batch step to submit successfully return_code, out, err = self.task_manager.start_and_wait(cmd_list, step.cwd) @@ -162,7 +161,7 @@ def _get_pbs_step_id(step: Step, interval: int = 2) -> str: TODO: change this to use ``qstat -a -u user`` """ time.sleep(interval) - step_id: t.Optional[str] = None + step_id: str | None = None trials = CONFIG.wlm_trials while trials > 0: output, _ = qstat(["-f", "-F", "json"]) @@ -176,13 +175,13 @@ def _get_pbs_step_id(step: Step, interval: int = 2) -> str: raise LauncherError("Could not find id of launched job step") return step_id - def _get_managed_step_update(self, step_ids: t.List[str]) -> t.List[StepInfo]: + def _get_managed_step_update(self, step_ids: list[str]) -> list[StepInfo]: """Get step updates for WLM managed jobs :param step_ids: list of job step ids :return: list of updates for managed jobs """ - updates: t.List[StepInfo] = [] + updates: list[StepInfo] = [] qstat_out, _ = qstat(step_ids) stats = [parse_qstat_jobid(qstat_out, str(step_id)) for step_id in step_ids] diff --git a/smartsim/_core/launcher/pbs/pbsParser.py b/smartsim/_core/launcher/pbs/pbsParser.py index 8ded7c3800..4439c52faf 100644 --- a/smartsim/_core/launcher/pbs/pbsParser.py +++ b/smartsim/_core/launcher/pbs/pbsParser.py @@ -57,7 +57,7 @@ def parse_qsub_error(output: str) -> str: return base_err -def parse_qstat_jobid(output: str, job_id: str) -> t.Optional[str]: +def parse_qstat_jobid(output: str, job_id: str) -> str | None: """Parse and return output of the qstat command run with options to obtain job status. @@ -76,7 +76,7 @@ def parse_qstat_jobid(output: str, job_id: str) -> t.Optional[str]: return result -def parse_qstat_jobid_json(output: str, job_id: str) -> t.Optional[str]: +def parse_qstat_jobid_json(output: str, job_id: str) -> str | None: """Parse and return output of the qstat command run with JSON options to obtain job status. 
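
The next hunk reads the JSON emitted by `qstat -f -F json`: a top-level "Jobs" mapping keyed by job id, each entry carrying a "job_state" field. A self-contained sketch of that lookup over made-up data, with plain `json.loads` standing in for SmartSim's `load_and_clean_json`:

import json

# Made-up payload shaped like the lookups in parse_qstat_jobid_json.
sample = json.dumps({"Jobs": {"1234.pbs": {"job_state": "R"}}})


def job_state(output: str, job_id: str) -> str | None:
    jobs: dict[str, dict[str, str]] = json.loads(output).get("Jobs", {})
    job = jobs.get(job_id)
    return None if job is None else str(job.get("job_state"))


assert job_state(sample, "1234.pbs") == "R"
assert job_state(sample, "9999.pbs") is None
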
@@ -89,13 +89,13 @@ def parse_qstat_jobid_json(output: str, job_id: str) -> t.Optional[str]: if "Jobs" not in out_json: return None jobs: dict[str, t.Any] = out_json["Jobs"] - job: t.Optional[dict[str, t.Any]] = jobs.get(job_id, None) + job: dict[str, t.Any] | None = jobs.get(job_id, None) if job is None: return None return str(job.get("job_state", None)) -def parse_qstat_nodes(output: str) -> t.List[str]: +def parse_qstat_nodes(output: str) -> list[str]: """Parse and return the qstat command run with options to obtain node list. @@ -107,7 +107,7 @@ def parse_qstat_nodes(output: str) -> t.List[str]: :param output: output of the qstat command in JSON format :return: compute nodes of the allocation or job """ - nodes: t.List[str] = [] + nodes: list[str] = [] out_json = load_and_clean_json(output) if "Jobs" not in out_json: return nodes @@ -122,14 +122,14 @@ def parse_qstat_nodes(output: str) -> t.List[str]: return list(sorted(set(nodes))) -def parse_step_id_from_qstat(output: str, step_name: str) -> t.Optional[str]: +def parse_step_id_from_qstat(output: str, step_name: str) -> str | None: """Parse and return the step id from a qstat command :param output: output qstat :param step_name: the name of the step to query :return: the step_id """ - step_id: t.Optional[str] = None + step_id: str | None = None out_json = load_and_clean_json(output) if "Jobs" not in out_json: diff --git a/smartsim/_core/launcher/sge/sgeCommands.py b/smartsim/_core/launcher/sge/sgeCommands.py index c9160b6ac7..710b4ec7ca 100644 --- a/smartsim/_core/launcher/sge/sgeCommands.py +++ b/smartsim/_core/launcher/sge/sgeCommands.py @@ -24,12 +24,11 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import typing as t from ...utils.shell import execute_cmd -def qstat(args: t.List[str]) -> t.Tuple[str, str]: +def qstat(args: list[str]) -> tuple[str, str]: """Calls SGE qstat with args :param args: List of command arguments @@ -40,7 +39,7 @@ def qstat(args: t.List[str]) -> t.Tuple[str, str]: return out, error -def qsub(args: t.List[str]) -> t.Tuple[str, str]: +def qsub(args: list[str]) -> tuple[str, str]: """Calls SGE qsub with args :param args: List of command arguments @@ -51,7 +50,7 @@ def qsub(args: t.List[str]) -> t.Tuple[str, str]: return out, error -def qdel(args: t.List[str]) -> t.Tuple[int, str, str]: +def qdel(args: list[str]) -> tuple[int, str, str]: """Calls SGE qdel with args. returncode is also supplied in this function. @@ -64,7 +63,7 @@ def qdel(args: t.List[str]) -> t.Tuple[int, str, str]: return returncode, out, error -def qacct(args: t.List[str]) -> t.Tuple[int, str, str]: +def qacct(args: list[str]) -> tuple[int, str, str]: """Calls SGE qacct with args. returncode is also supplied in this function. diff --git a/smartsim/_core/launcher/sge/sgeLauncher.py b/smartsim/_core/launcher/sge/sgeLauncher.py index 920fab4d74..f6b4558ce7 100644 --- a/smartsim/_core/launcher/sge/sgeLauncher.py +++ b/smartsim/_core/launcher/sge/sgeLauncher.py @@ -25,7 +25,6 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import time -import typing as t from ....error import LauncherError from ....log import get_logger @@ -69,7 +68,7 @@ class SGELauncher(WLMLauncher): # init in WLMLauncher, launcher.py @property - def supported_rs(self) -> t.Dict[t.Type[SettingsBase], t.Type[Step]]: + def supported_rs(self) -> dict[type[SettingsBase], type[Step]]: # RunSettings types supported by this launcher return { SgeQsubBatchSettings: SgeQsubBatchStep, @@ -79,7 +78,7 @@ def supported_rs(self) -> t.Dict[t.Type[SettingsBase], t.Type[Step]]: RunSettings: LocalStep, } - def run(self, step: Step) -> t.Optional[str]: + def run(self, step: Step) -> str | None: """Run a job step through SGE :param step: a job step instance @@ -90,8 +89,8 @@ def run(self, step: Step) -> t.Optional[str]: self.task_manager.start() cmd_list = step.get_launch_cmd() - step_id: t.Optional[str] = None - task_id: t.Optional[str] = None + step_id: str | None = None + task_id: str | None = None if isinstance(step, SgeQsubBatchStep): # wait for batch step to submit successfully return_code, out, err = self.task_manager.start_and_wait(cmd_list, step.cwd) @@ -141,13 +140,13 @@ def stop(self, step_name: str) -> StepInfo: ) # set status to cancelled instead of failed return step_info - def _get_managed_step_update(self, step_ids: t.List[str]) -> t.List[StepInfo]: + def _get_managed_step_update(self, step_ids: list[str]) -> list[StepInfo]: """Get step updates for WLM managed jobs :param step_ids: list of job step ids :return: list of updates for managed jobs """ - updates: t.List[StepInfo] = [] + updates: list[StepInfo] = [] qstat_out, _ = qstat(["-xml"]) stats = [parse_qstat_jobid_xml(qstat_out, str(step_id)) for step_id in step_ids] diff --git a/smartsim/_core/launcher/sge/sgeParser.py b/smartsim/_core/launcher/sge/sgeParser.py index ec811d53b2..de03c54161 100644 --- a/smartsim/_core/launcher/sge/sgeParser.py +++ b/smartsim/_core/launcher/sge/sgeParser.py @@ -24,7 +24,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import typing as t import xml.etree.ElementTree as ET @@ -57,7 +56,7 @@ def parse_qsub_error(output: str) -> str: return base_err -def parse_qstat_jobid_xml(output: str, job_id: str) -> t.Optional[str]: +def parse_qstat_jobid_xml(output: str, job_id: str) -> str | None: """Parse and return output of the qstat command run with XML options to obtain job status. @@ -78,7 +77,7 @@ def parse_qstat_jobid_xml(output: str, job_id: str) -> t.Optional[str]: return None -def parse_qacct_job_output(output: str, field_name: str) -> t.Union[str, int]: +def parse_qacct_job_output(output: str, field_name: str) -> str | int: """Parse the output from qacct for a single job :param output: The raw text output from qacct diff --git a/smartsim/_core/launcher/slurm/slurmCommands.py b/smartsim/_core/launcher/slurm/slurmCommands.py index ee043c759d..08da33fc18 100644 --- a/smartsim/_core/launcher/slurm/slurmCommands.py +++ b/smartsim/_core/launcher/slurm/slurmCommands.py @@ -24,7 +24,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import typing as t from ....error import LauncherError from ....log import get_logger @@ -34,7 +33,7 @@ logger = get_logger(__name__) -def sstat(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, str]: +def sstat(args: list[str], *, raise_on_err: bool = False) -> tuple[str, str]: """Calls sstat with args :param args: List of command arguments @@ -44,7 +43,7 @@ def sstat(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, str] return out, err -def sacct(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, str]: +def sacct(args: list[str], *, raise_on_err: bool = False) -> tuple[str, str]: """Calls sacct with args :param args: List of command arguments @@ -54,7 +53,7 @@ def sacct(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, str] return out, err -def salloc(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, str]: +def salloc(args: list[str], *, raise_on_err: bool = False) -> tuple[str, str]: """Calls slurm salloc with args :param args: List of command arguments @@ -64,7 +63,7 @@ def salloc(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, str return out, err -def sinfo(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, str]: +def sinfo(args: list[str], *, raise_on_err: bool = False) -> tuple[str, str]: """Calls slurm sinfo with args :param args: List of command arguments @@ -74,7 +73,7 @@ def sinfo(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, str] return out, err -def scontrol(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, str]: +def scontrol(args: list[str], *, raise_on_err: bool = False) -> tuple[str, str]: """Calls slurm scontrol with args :param args: List of command arguments @@ -84,7 +83,7 @@ def scontrol(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, s return out, err -def scancel(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[int, str, str]: +def scancel(args: list[str], *, raise_on_err: bool = False) -> tuple[int, str, str]: """Calls slurm scancel with args. returncode is also supplied in this function. 
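
The Slurm wrappers above share a keyword-only `raise_on_err` flag and, per the next hunk, a common `_execute_slurm_cmd` helper that resolves the binary and returns `(returncode, stdout, stderr)`, presumably raising when `raise_on_err` is set and the command fails. A standard-library sketch of that pattern, not SmartSim's implementation:

import shutil
import subprocess


def run_wlm_command(
    command: str, args: list[str], *, raise_on_err: bool = False
) -> tuple[int, str, str]:
    # Resolve the executable first so a missing WLM fails loudly and early.
    exe = shutil.which(command)
    if exe is None:
        raise FileNotFoundError(f"{command} was not found on PATH")
    proc = subprocess.run([exe, *args], capture_output=True, text=True, check=False)
    if raise_on_err and proc.returncode != 0:
        raise RuntimeError(f"{command} failed: {proc.stderr.strip()}")
    return proc.returncode, proc.stdout, proc.stderr


# Example: rc, out, err = run_wlm_command("sinfo", ["--version"])
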
@@ -106,8 +105,8 @@ def _find_slurm_command(cmd: str) -> str: def _execute_slurm_cmd( - command: str, args: t.List[str], raise_on_err: bool = False -) -> t.Tuple[int, str, str]: + command: str, args: list[str], raise_on_err: bool = False +) -> tuple[int, str, str]: cmd_exe = _find_slurm_command(command) cmd = [cmd_exe] + args returncode, out, error = execute_cmd(cmd) diff --git a/smartsim/_core/launcher/slurm/slurmLauncher.py b/smartsim/_core/launcher/slurm/slurmLauncher.py index dba0cd5edb..5b8bda6f59 100644 --- a/smartsim/_core/launcher/slurm/slurmLauncher.py +++ b/smartsim/_core/launcher/slurm/slurmLauncher.py @@ -26,7 +26,6 @@ import os import time -import typing as t from shutil import which from ....error import LauncherError @@ -74,7 +73,7 @@ class SlurmLauncher(WLMLauncher): # RunSettings types supported by this launcher @property - def supported_rs(self) -> t.Dict[t.Type[SettingsBase], t.Type[Step]]: + def supported_rs(self) -> dict[type[SettingsBase], type[Step]]: # RunSettings types supported by this launcher return { SrunSettings: SrunStep, @@ -85,7 +84,7 @@ def supported_rs(self) -> t.Dict[t.Type[SettingsBase], t.Type[Step]]: RunSettings: LocalStep, } - def get_step_nodes(self, step_names: t.List[str]) -> t.List[t.List[str]]: + def get_step_nodes(self, step_names: list[str]) -> list[list[str]]: """Return the compute nodes of a specific job or allocation This function returns the compute nodes of a specific job or allocation @@ -116,7 +115,7 @@ def get_step_nodes(self, step_names: t.List[str]) -> t.List[t.List[str]]: raise LauncherError("Failed to retrieve nodelist from stat") return node_lists - def run(self, step: Step) -> t.Optional[str]: + def run(self, step: Step) -> str | None: """Run a job step through Slurm :param step: a job step instance @@ -230,7 +229,7 @@ def _get_slurm_step_id(step: Step, interval: int = 2) -> str: m2-119225.1|119225.1| """ time.sleep(interval) - step_id: t.Optional[str] = None + step_id: str | None = None trials = CONFIG.wlm_trials while trials > 0: output, _ = sacct( @@ -247,7 +246,7 @@ def _get_slurm_step_id(step: Step, interval: int = 2) -> str: raise LauncherError("Could not find id of launched job step") return step_id - def _get_managed_step_update(self, step_ids: t.List[str]) -> t.List[StepInfo]: + def _get_managed_step_update(self, step_ids: list[str]) -> list[StepInfo]: """Get step updates for WLM managed jobs :param step_ids: list of job step ids @@ -262,7 +261,7 @@ def _get_managed_step_update(self, step_ids: t.List[str]) -> t.List[StepInfo]: stat_tuples = [parse_sacct(sacct_out, step_id) for step_id in step_ids] # create SlurmStepInfo objects to return - updates: t.List[StepInfo] = [] + updates: list[StepInfo] = [] for stat_tuple, step_id in zip(stat_tuples, step_ids): _rc = int(stat_tuple[1]) if stat_tuple[1] else None info = SlurmStepInfo(stat_tuple[0], _rc) @@ -301,5 +300,5 @@ def __str__(self) -> str: return "Slurm" -def _create_step_id_str(step_ids: t.List[str]) -> str: +def _create_step_id_str(step_ids: list[str]) -> str: return ",".join(step_ids) diff --git a/smartsim/_core/launcher/slurm/slurmParser.py b/smartsim/_core/launcher/slurm/slurmParser.py index 29ce003171..ee1732b36e 100644 --- a/smartsim/_core/launcher/slurm/slurmParser.py +++ b/smartsim/_core/launcher/slurm/slurmParser.py @@ -24,7 +24,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import typing as t from shutil import which """ @@ -32,14 +31,14 @@ """ -def parse_salloc(output: str) -> t.Optional[str]: +def parse_salloc(output: str) -> str | None: for line in output.split("\n"): if line.startswith("salloc: Granted job allocation"): return line.split()[-1] return None -def parse_salloc_error(output: str) -> t.Optional[str]: +def parse_salloc_error(output: str) -> str | None: """Parse and return error output of a failed salloc command :param output: stderr output of salloc command @@ -81,14 +80,14 @@ def jobid_exact_match(parsed_id: str, job_id: str) -> bool: return parsed_id.split(".")[0] == job_id -def parse_sacct(output: str, job_id: str) -> t.Tuple[str, t.Optional[str]]: +def parse_sacct(output: str, job_id: str) -> tuple[str, str | None]: """Parse and return output of the sacct command :param output: output of the sacct command :param job_id: allocation id or job step id :return: status and returncode """ - result: t.Tuple[str, t.Optional[str]] = ("PENDING", None) + result: tuple[str, str | None] = ("PENDING", None) for line in output.split("\n"): parts = line.split("|") if len(parts) >= 3: @@ -100,7 +99,7 @@ def parse_sacct(output: str, job_id: str) -> t.Tuple[str, t.Optional[str]]: return result -def parse_sstat_nodes(output: str, job_id: str) -> t.List[str]: +def parse_sstat_nodes(output: str, job_id: str) -> list[str]: """Parse and return the sstat command This function parses and returns the nodes of @@ -121,7 +120,7 @@ def parse_sstat_nodes(output: str, job_id: str) -> t.List[str]: return list(set(nodes)) -def parse_step_id_from_sacct(output: str, step_name: str) -> t.Optional[str]: +def parse_step_id_from_sacct(output: str, step_name: str) -> str | None: """Parse and return the step id from a sacct command :param output: output of sacct --noheader -p diff --git a/smartsim/_core/launcher/step/alpsStep.py b/smartsim/_core/launcher/step/alpsStep.py index 7f77acd8a7..d102f53336 100644 --- a/smartsim/_core/launcher/step/alpsStep.py +++ b/smartsim/_core/launcher/step/alpsStep.py @@ -26,13 +26,12 @@ import os import shutil -import typing as t from shlex import split as sh_split from ....error import AllocationError from ....log import get_logger from ....settings import AprunSettings, RunSettings, Singularity -from .step import Step, proxyable_launch_cmd +from .step import Step logger = get_logger(__name__) @@ -46,19 +45,18 @@ def __init__(self, name: str, cwd: str, run_settings: AprunSettings) -> None: :param run_settings: run settings for entity """ super().__init__(name, cwd, run_settings) - self.alloc: t.Optional[str] = None + self.alloc: str | None = None if not run_settings.in_batch: self._set_alloc() self.run_settings = run_settings - def _get_mpmd(self) -> t.List[RunSettings]: + def _get_mpmd(self) -> list[RunSettings]: """Temporary convenience function to return a typed list of attached RunSettings """ return self.run_settings.mpmd - @proxyable_launch_cmd - def get_launch_cmd(self) -> t.List[str]: + def get_launch_cmd(self) -> list[str]: """Get the command to launch this step :return: launch command @@ -114,7 +112,7 @@ def _set_alloc(self) -> None: "No allocation specified or found and not running in batch" ) - def _build_exe(self) -> t.List[str]: + def _build_exe(self) -> list[str]: """Build the executable for this step :return: executable list @@ -126,7 +124,7 @@ def _build_exe(self) -> t.List[str]: args = self.run_settings._exe_args # pylint: disable=protected-access return exe + args - def _make_mpmd(self) -> t.List[str]: + def _make_mpmd(self) -> 
list[str]: """Build Aprun (MPMD) executable""" exe = self.run_settings.exe diff --git a/smartsim/_core/launcher/step/dragonStep.py b/smartsim/_core/launcher/step/dragonStep.py index a5c851c4e3..60d9eefa52 100644 --- a/smartsim/_core/launcher/step/dragonStep.py +++ b/smartsim/_core/launcher/step/dragonStep.py @@ -63,7 +63,7 @@ def __init__(self, name: str, cwd: str, run_settings: DragonRunSettings) -> None def run_settings(self) -> DragonRunSettings: return t.cast(DragonRunSettings, self.step_settings) - def get_launch_cmd(self) -> t.List[str]: + def get_launch_cmd(self) -> list[str]: """Get stringified version of request needed to launch this step @@ -93,12 +93,12 @@ def get_launch_cmd(self) -> t.List[str]: return exe_cmd_and_args @staticmethod - def _get_exe_args_list(run_setting: DragonRunSettings) -> t.List[str]: + def _get_exe_args_list(run_setting: DragonRunSettings) -> list[str]: """Convenience function to encapsulate checking the runsettings.exe_args type to always return a list """ exe_args = run_setting.exe_args - args: t.List[str] = exe_args if isinstance(exe_args, list) else [exe_args] + args: list[str] = exe_args if isinstance(exe_args, list) else [exe_args] return args @@ -107,7 +107,7 @@ def __init__( self, name: str, cwd: str, - batch_settings: t.Union[SbatchSettings, QsubBatchSettings], + batch_settings: SbatchSettings | QsubBatchSettings, ) -> None: """Initialize a Slurm Sbatch step @@ -116,12 +116,12 @@ def __init__( :param batch_settings: batch settings for entity """ super().__init__(name, cwd, batch_settings) - self.steps: t.List[Step] = [] + self.steps: list[Step] = [] self.managed = True self.batch_settings = batch_settings self._request_file_name = "requests.json" - def get_launch_cmd(self) -> t.List[str]: + def get_launch_cmd(self) -> list[str]: """Get the launch command for the batch :return: launch command for the batch diff --git a/smartsim/_core/launcher/step/localStep.py b/smartsim/_core/launcher/step/localStep.py index f8feffd4e4..9ad104473d 100644 --- a/smartsim/_core/launcher/step/localStep.py +++ b/smartsim/_core/launcher/step/localStep.py @@ -26,11 +26,10 @@ import os import shutil -import typing as t from ....settings import Singularity from ....settings.base import RunSettings -from .step import Step, proxyable_launch_cmd +from .step import Step class LocalStep(Step): @@ -40,11 +39,10 @@ def __init__(self, name: str, cwd: str, run_settings: RunSettings): self._env = self._set_env() @property - def env(self) -> t.Dict[str, str]: + def env(self) -> dict[str, str]: return self._env - @proxyable_launch_cmd - def get_launch_cmd(self) -> t.List[str]: + def get_launch_cmd(self) -> list[str]: cmd = [] # Add run command and args if user specified @@ -73,7 +71,7 @@ def get_launch_cmd(self) -> t.List[str]: cmd.extend(self.run_settings.exe_args) return cmd - def _set_env(self) -> t.Dict[str, str]: + def _set_env(self) -> dict[str, str]: env = os.environ.copy() if self.run_settings.env_vars: for k, v in self.run_settings.env_vars.items(): diff --git a/smartsim/_core/launcher/step/mpiStep.py b/smartsim/_core/launcher/step/mpiStep.py index 01e83ba434..c272f59f4e 100644 --- a/smartsim/_core/launcher/step/mpiStep.py +++ b/smartsim/_core/launcher/step/mpiStep.py @@ -26,14 +26,13 @@ import os import shutil -import typing as t from shlex import split as sh_split from ....error import AllocationError, SmartSimError from ....log import get_logger from ....settings import MpiexecSettings, MpirunSettings, OrterunSettings from ....settings.base import RunSettings -from .step 
import Step, proxyable_launch_cmd +from .step import Step logger = get_logger(__name__) @@ -49,15 +48,14 @@ def __init__(self, name: str, cwd: str, run_settings: RunSettings) -> None: super().__init__(name, cwd, run_settings) - self.alloc: t.Optional[str] = None + self.alloc: str | None = None if not run_settings.in_batch: self._set_alloc() self.run_settings = run_settings _supported_launchers = ["PBS", "SLURM", "LSB", "SGE"] - @proxyable_launch_cmd - def get_launch_cmd(self) -> t.List[str]: + def get_launch_cmd(self) -> list[str]: """Get the command to launch this step :return: launch command @@ -116,16 +114,16 @@ def _set_alloc(self) -> None: "No allocation specified or found and not running in batch" ) - def _get_mpmd(self) -> t.List[RunSettings]: + def _get_mpmd(self) -> list[RunSettings]: """Temporary convenience function to return a typed list of attached RunSettings """ if hasattr(self.run_settings, "mpmd") and self.run_settings.mpmd: - rs_mpmd: t.List[RunSettings] = self.run_settings.mpmd + rs_mpmd: list[RunSettings] = self.run_settings.mpmd return rs_mpmd return [] - def _build_exe(self) -> t.List[str]: + def _build_exe(self) -> list[str]: """Build the executable for this step :return: executable list @@ -137,7 +135,7 @@ def _build_exe(self) -> t.List[str]: args = self.run_settings._exe_args # pylint: disable=protected-access return exe + args - def _make_mpmd(self) -> t.List[str]: + def _make_mpmd(self) -> list[str]: """Build mpiexec (MPMD) executable""" exe = self.run_settings.exe args = self.run_settings._exe_args # pylint: disable=protected-access diff --git a/smartsim/_core/launcher/step/pbsStep.py b/smartsim/_core/launcher/step/pbsStep.py index bc96659b42..124fb2660f 100644 --- a/smartsim/_core/launcher/step/pbsStep.py +++ b/smartsim/_core/launcher/step/pbsStep.py @@ -24,7 +24,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import typing as t from ....log import get_logger from ....settings import QsubBatchSettings @@ -42,11 +41,11 @@ def __init__(self, name: str, cwd: str, batch_settings: QsubBatchSettings) -> No :param batch_settings: batch settings for entity """ super().__init__(name, cwd, batch_settings) - self.step_cmds: t.List[t.List[str]] = [] + self.step_cmds: list[list[str]] = [] self.managed = True self.batch_settings = batch_settings - def get_launch_cmd(self) -> t.List[str]: + def get_launch_cmd(self) -> list[str]: """Get the launch command for the batch :return: launch command for the batch diff --git a/smartsim/_core/launcher/step/sgeStep.py b/smartsim/_core/launcher/step/sgeStep.py index 14225e07ca..1dc889be9a 100644 --- a/smartsim/_core/launcher/step/sgeStep.py +++ b/smartsim/_core/launcher/step/sgeStep.py @@ -24,7 +24,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import typing as t from ....log import get_logger from ....settings import SgeQsubBatchSettings @@ -44,11 +43,11 @@ def __init__( :param batch_settings: batch settings for entity """ super().__init__(name, cwd, batch_settings) - self.step_cmds: t.List[t.List[str]] = [] + self.step_cmds: list[list[str]] = [] self.managed = True self.batch_settings = batch_settings - def get_launch_cmd(self) -> t.List[str]: + def get_launch_cmd(self) -> list[str]: """Get the launch command for the batch :return: launch command for the batch diff --git a/smartsim/_core/launcher/step/slurmStep.py b/smartsim/_core/launcher/step/slurmStep.py index 5b5db499e0..a14e9b1105 100644 --- a/smartsim/_core/launcher/step/slurmStep.py +++ b/smartsim/_core/launcher/step/slurmStep.py @@ -26,7 +26,6 @@ import os import shutil -import typing as t from shlex import split as sh_split from ....error import AllocationError @@ -46,11 +45,11 @@ def __init__(self, name: str, cwd: str, batch_settings: SbatchSettings) -> None: :param batch_settings: batch settings for entity """ super().__init__(name, cwd, batch_settings) - self.step_cmds: t.List[t.List[str]] = [] + self.step_cmds: list[list[str]] = [] self.managed = True self.batch_settings = batch_settings - def get_launch_cmd(self) -> t.List[str]: + def get_launch_cmd(self) -> list[str]: """Get the launch command for the batch :return: launch command for the batch @@ -106,13 +105,13 @@ def __init__(self, name: str, cwd: str, run_settings: SrunSettings) -> None: :param run_settings: run settings for entity """ super().__init__(name, cwd, run_settings) - self.alloc: t.Optional[str] = None + self.alloc: str | None = None self.managed = True self.run_settings = run_settings if not self.run_settings.in_batch: self._set_alloc() - def get_launch_cmd(self) -> t.List[str]: + def get_launch_cmd(self) -> list[str]: """Get the command to launch this step :return: launch command @@ -124,7 +123,7 @@ def get_launch_cmd(self) -> t.List[str]: output, error = self.get_output_files() srun_cmd = [srun, "--output", output, "--error", error, "--job-name", self.name] - compound_env: t.Set[str] = set() + compound_env: set[str] = set() if self.alloc: srun_cmd += ["--jobid", str(self.alloc)] @@ -177,22 +176,22 @@ def _set_alloc(self) -> None: "No allocation specified or found and not running in batch" ) - def _get_mpmd(self) -> t.List[RunSettings]: + def _get_mpmd(self) -> list[RunSettings]: """Temporary convenience function to return a typed list of attached RunSettings """ return self.run_settings.mpmd @staticmethod - def _get_exe_args_list(run_setting: RunSettings) -> t.List[str]: + def _get_exe_args_list(run_setting: RunSettings) -> list[str]: """Convenience function to encapsulate checking the runsettings.exe_args type to always return a list """ exe_args = run_setting.exe_args - args: t.List[str] = exe_args if isinstance(exe_args, list) else [exe_args] + args: list[str] = exe_args if isinstance(exe_args, list) else [exe_args] return args - def _build_exe(self) -> t.List[str]: + def _build_exe(self) -> list[str]: """Build the executable for this step :return: executable list @@ -204,7 +203,7 @@ def _build_exe(self) -> t.List[str]: args = self._get_exe_args_list(self.run_settings) return exe + args - def _make_mpmd(self) -> t.List[str]: + def _make_mpmd(self) -> list[str]: """Build Slurm multi-prog (MPMD) executable""" exe = self.run_settings.exe args = self._get_exe_args_list(self.run_settings) diff --git a/smartsim/_core/launcher/step/step.py b/smartsim/_core/launcher/step/step.py index 
769a609081..b7bb43e7d1 100644 --- a/smartsim/_core/launcher/step/step.py +++ b/smartsim/_core/launcher/step/step.py @@ -27,20 +27,16 @@ from __future__ import annotations import copy -import functools import os.path as osp import pathlib -import sys import time -import typing as t from os import makedirs -from smartsim._core.config import CONFIG -from smartsim.error.errors import SmartSimError, UnproxyableStepError +from smartsim.error.errors import SmartSimError from ....log import get_logger from ....settings.base import RunSettings, SettingsBase -from ...utils.helpers import encode_cmd, get_base_36_repr +from ...utils.helpers import get_base_36_repr from ..colocated import write_colocated_launch_script logger = get_logger(__name__) @@ -53,14 +49,14 @@ def __init__(self, name: str, cwd: str, step_settings: SettingsBase) -> None: self.cwd = cwd self.managed = False self.step_settings = copy.deepcopy(step_settings) - self.meta: t.Dict[str, str] = {} + self.meta: dict[str, str] = {} @property - def env(self) -> t.Optional[t.Dict[str, str]]: + def env(self) -> dict[str, str] | None: """Overridable, read only property for step to specify its environment""" return None - def get_launch_cmd(self) -> t.List[str]: + def get_launch_cmd(self) -> list[str]: raise NotImplementedError @staticmethod @@ -74,10 +70,10 @@ def _ensure_output_directory_exists(output_dir: str) -> None: if not osp.exists(output_dir): pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True) - def get_output_files(self) -> t.Tuple[str, str]: + def get_output_files(self) -> tuple[str, str]: """Return two paths to error and output files based on metadata directory""" try: - output_dir = self.meta["status_dir"] + output_dir = self.meta["metadata_dir"] except KeyError as exc: raise KeyError("Status directory for this step has not been set.") from exc self._ensure_output_directory_exists(output_dir) @@ -85,9 +81,7 @@ def get_output_files(self) -> t.Tuple[str, str]: error = osp.join(output_dir, f"{self.entity_name}.err") return output, error - def get_step_file( - self, ending: str = ".sh", script_name: t.Optional[str] = None - ) -> str: + def get_step_file(self, ending: str = ".sh", script_name: str | None = None) -> str: """Get the name for a file/script created by the step class Used for Batch scripts, mpmd scripts, etc. @@ -129,61 +123,3 @@ def add_to_batch(self, step: Step) -> None: :param step: a job step instance e.g. SrunStep """ raise SmartSimError("add_to_batch not implemented for this step type") - - -_StepT = t.TypeVar("_StepT", bound=Step) - - -def proxyable_launch_cmd( - fn: t.Callable[[_StepT], t.List[str]], / -) -> t.Callable[[_StepT], t.List[str]]: - @functools.wraps(fn) - def _get_launch_cmd(self: _StepT) -> t.List[str]: - """ - Generate a launch command that executes the `JobStep` with the - indirect launching entrypoint instead of directly. The original - command is passed to the proxy as a base64 encoded string. 
- - Steps implementing `get_launch_cmd` and decorated with - `proxyable_launch_cmd` will generate status updates that can be consumed - by the telemetry monitor and dashboard""" - original_cmd_list = fn(self) - - if not CONFIG.telemetry_enabled: - return original_cmd_list - - if self.managed: - raise UnproxyableStepError( - f"Attempting to proxy managed step of type {type(self)} " - "through the unmanaged step proxy entry point" - ) - - proxy_module = "smartsim._core.entrypoints.indirect" - entity_type = self.meta["entity_type"] - status_dir = self.meta["status_dir"] - - logger.debug(f"Encoding command{' '.join(original_cmd_list)}") - - # encode the original cmd to avoid potential collisions and escaping - # errors when passing it using CLI arguments to the indirect entrypoint - encoded_cmd = encode_cmd(original_cmd_list) - - # return a new command that executes the proxy and passes - # the original command as an argument - return [ - sys.executable, - "-m", - proxy_module, - "+name", - self.name, - "+command", - encoded_cmd, - "+entity_type", - entity_type, - "+telemetry_dir", - status_dir, - "+working_dir", - self.cwd, - ] - - return _get_launch_cmd diff --git a/smartsim/_core/launcher/stepInfo.py b/smartsim/_core/launcher/stepInfo.py index ad72f71319..79ba9e56c0 100644 --- a/smartsim/_core/launcher/stepInfo.py +++ b/smartsim/_core/launcher/stepInfo.py @@ -24,7 +24,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import typing as t import psutil @@ -36,9 +35,9 @@ def __init__( self, status: SmartSimStatus, launcher_status: str = "", - returncode: t.Optional[int] = None, - output: t.Optional[str] = None, - error: t.Optional[str] = None, + returncode: int | None = None, + output: str | None = None, + error: str | None = None, ) -> None: self.status = status self.launcher_status = launcher_status @@ -53,11 +52,11 @@ def __str__(self) -> str: return info_str @property - def mapping(self) -> t.Dict[str, SmartSimStatus]: + def mapping(self) -> dict[str, SmartSimStatus]: raise NotImplementedError def _get_smartsim_status( - self, status: str, returncode: t.Optional[int] = None + self, status: str, returncode: int | None = None ) -> SmartSimStatus: """ Map the status of the WLM step to a smartsim-specific status @@ -73,7 +72,7 @@ def _get_smartsim_status( class UnmanagedStepInfo(StepInfo): @property - def mapping(self) -> t.Dict[str, SmartSimStatus]: + def mapping(self) -> dict[str, SmartSimStatus]: # see https://github.com/giampaolo/psutil/blob/master/psutil/_pslinux.py # see https://github.com/giampaolo/psutil/blob/master/psutil/_common.py return { @@ -96,9 +95,9 @@ def mapping(self) -> t.Dict[str, SmartSimStatus]: def __init__( self, status: str = "", - returncode: t.Optional[int] = None, - output: t.Optional[str] = None, - error: t.Optional[str] = None, + returncode: int | None = None, + output: str | None = None, + error: str | None = None, ) -> None: smartsim_status = self._get_smartsim_status(status) super().__init__( @@ -138,9 +137,9 @@ class SlurmStepInfo(StepInfo): # cov-slurm def __init__( self, status: str = "", - returncode: t.Optional[int] = None, - output: t.Optional[str] = None, - error: t.Optional[str] = None, + returncode: int | None = None, + output: str | None = None, + error: str | None = None, ) -> None: smartsim_status = self._get_smartsim_status(status) super().__init__( @@ -150,7 +149,7 @@ def __init__( class PBSStepInfo(StepInfo): # cov-pbs @property - def 
mapping(self) -> t.Dict[str, SmartSimStatus]: + def mapping(self) -> dict[str, SmartSimStatus]: # pylint: disable-next=line-too-long # see http://nusc.nsu.ru/wiki/lib/exe/fetch.php/doc/pbs/PBSReferenceGuide19.2.1.pdf#M11.9.90788.PBSHeading1.81.Job.States return { @@ -176,9 +175,9 @@ def mapping(self) -> t.Dict[str, SmartSimStatus]: def __init__( self, status: str = "", - returncode: t.Optional[int] = None, - output: t.Optional[str] = None, - error: t.Optional[str] = None, + returncode: int | None = None, + output: str | None = None, + error: str | None = None, ) -> None: if status == "NOTFOUND": if returncode is not None: @@ -200,7 +199,7 @@ def __init__( class SGEStepInfo(StepInfo): # cov-pbs @property - def mapping(self) -> t.Dict[str, SmartSimStatus]: + def mapping(self) -> dict[str, SmartSimStatus]: # pylint: disable-next=line-too-long # see https://manpages.ubuntu.com/manpages/jammy/man5/sge_status.5.html return { @@ -250,9 +249,9 @@ def mapping(self) -> t.Dict[str, SmartSimStatus]: def __init__( self, status: str = "", - returncode: t.Optional[int] = None, - output: t.Optional[str] = None, - error: t.Optional[str] = None, + returncode: int | None = None, + output: str | None = None, + error: str | None = None, ) -> None: if status == "NOTFOUND": if returncode is not None: diff --git a/smartsim/_core/launcher/stepMapping.py b/smartsim/_core/launcher/stepMapping.py index 50c12f8bde..b52af18a73 100644 --- a/smartsim/_core/launcher/stepMapping.py +++ b/smartsim/_core/launcher/stepMapping.py @@ -24,7 +24,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import typing as t from ...log import get_logger @@ -34,9 +33,9 @@ class StepMap: def __init__( self, - step_id: t.Optional[str] = None, - task_id: t.Optional[str] = None, - managed: t.Optional[bool] = None, + step_id: str | None = None, + task_id: str | None = None, + managed: bool | None = None, ) -> None: self.step_id = step_id self.task_id = task_id @@ -46,7 +45,7 @@ def __init__( class StepMapping: def __init__(self) -> None: # step_name : wlm_id, pid, wlm_managed? 
- self.mapping: t.Dict[str, StepMap] = {} + self.mapping: dict[str, StepMap] = {} def __getitem__(self, step_name: str) -> StepMap: return self.mapping[step_name] @@ -57,8 +56,8 @@ def __setitem__(self, step_name: str, step_map: StepMap) -> None: def add( self, step_name: str, - step_id: t.Optional[str] = None, - task_id: t.Optional[str] = None, + step_id: str | None = None, + task_id: str | None = None, managed: bool = True, ) -> None: try: @@ -68,7 +67,7 @@ def add( msg = f"Could not add step {step_name} to mapping: {e}" logger.exception(msg) - def get_task_id(self, step_id: str) -> t.Optional[str]: + def get_task_id(self, step_id: str) -> str | None: """Get the task id from the step id""" task_id = None for stepmap in self.mapping.values(): @@ -78,9 +77,9 @@ def get_task_id(self, step_id: str) -> t.Optional[str]: return task_id def get_ids( - self, step_names: t.List[str], managed: bool = True - ) -> t.Tuple[t.List[str], t.List[t.Union[str, None]]]: - ids: t.List[t.Union[str, None]] = [] + self, step_names: list[str], managed: bool = True + ) -> tuple[list[str], list[str | None]]: + ids: list[str | None] = [] names = [] for name in step_names: if name in self.mapping: diff --git a/smartsim/_core/launcher/taskManager.py b/smartsim/_core/launcher/taskManager.py index a2e9393ab8..59093166ca 100644 --- a/smartsim/_core/launcher/taskManager.py +++ b/smartsim/_core/launcher/taskManager.py @@ -27,7 +27,6 @@ from __future__ import annotations import time -import typing as t from subprocess import PIPE from threading import RLock @@ -62,10 +61,8 @@ class TaskManager: def __init__(self) -> None: """Initialize a task manager thread.""" self.actively_monitoring = False - self.task_history: t.Dict[ - str, t.Tuple[t.Optional[int], t.Optional[str], t.Optional[str]] - ] = {} - self.tasks: t.List[Task] = [] + self.task_history: dict[str, tuple[int | None, str | None, str | None]] = {} + self.tasks: list[Task] = [] self._lock = RLock() def start(self) -> None: @@ -102,9 +99,9 @@ def run(self) -> None: def start_task( self, - cmd_list: t.List[str], + cmd_list: list[str], cwd: str, - env: t.Optional[t.Dict[str, str]] = None, + env: dict[str, str] | None = None, out: int = PIPE, err: int = PIPE, ) -> str: @@ -131,11 +128,11 @@ def start_task( @staticmethod def start_and_wait( - cmd_list: t.List[str], + cmd_list: list[str], cwd: str, - env: t.Optional[t.Dict[str, str]] = None, - timeout: t.Optional[int] = None, - ) -> t.Tuple[int, str, str]: + env: dict[str, str] | None = None, + timeout: int | None = None, + ) -> tuple[int, str, str]: """Start a task not managed by the TaskManager This method is used by launchers to launch managed tasks @@ -193,7 +190,7 @@ def remove_task(self, task_id: str) -> None: def get_task_update( self, task_id: str - ) -> t.Tuple[str, t.Optional[int], t.Optional[str], t.Optional[str]]: + ) -> tuple[str, int | None, str | None, str | None]: """Get the update of a task :param task_id: task id @@ -227,9 +224,9 @@ def get_task_update( def add_task_history( self, task_id: str, - returncode: t.Optional[int] = None, - out: t.Optional[str] = None, - err: t.Optional[str] = None, + returncode: int | None = None, + out: str | None = None, + err: str | None = None, ) -> None: """Add a task to the task history @@ -263,7 +260,7 @@ def __init__(self, process: psutil.Process) -> None: self.process = process self.pid = str(self.process.pid) - def check_status(self) -> t.Optional[int]: + def check_status(self) -> int | None: """Ping the job and return the returncode if finished :return: returncode if 
finished otherwise None @@ -277,7 +274,7 @@ def check_status(self) -> t.Optional[int]: # have to rely on .kill() to stop. return self.returncode - def get_io(self) -> t.Tuple[t.Optional[str], t.Optional[str]]: + def get_io(self) -> tuple[str | None, str | None]: """Get the IO from the subprocess :return: output and error from the Popen @@ -341,7 +338,7 @@ def wait(self) -> None: self.process.wait() @property - def returncode(self) -> t.Optional[int]: + def returncode(self) -> int | None: if self.owned and isinstance(self.process, psutil.Popen): if self.process.returncode is not None: return int(self.process.returncode) diff --git a/smartsim/_core/launcher/util/launcherUtil.py b/smartsim/_core/launcher/util/launcherUtil.py index 0307bc51b4..a58eaf2e4b 100644 --- a/smartsim/_core/launcher/util/launcherUtil.py +++ b/smartsim/_core/launcher/util/launcherUtil.py @@ -24,8 +24,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import typing as t - class ComputeNode: # cov-slurm """The ComputeNode class holds resource information @@ -33,15 +31,15 @@ class ComputeNode: # cov-slurm """ def __init__( - self, node_name: t.Optional[str] = None, node_ppn: t.Optional[int] = None + self, node_name: str | None = None, node_ppn: int | None = None ) -> None: """Initialize a ComputeNode :param node_name: the name of the node :param node_ppn: the number of ppn """ - self.name: t.Optional[str] = node_name - self.ppn: t.Optional[int] = node_ppn + self.name: str | None = node_name + self.ppn: int | None = node_ppn def _is_valid_node(self) -> bool: """Check if the node is complete @@ -66,9 +64,9 @@ class Partition: # cov-slurm def __init__(self) -> None: """Initialize a system partition""" - self.name: t.Optional[str] = None - self.min_ppn: t.Optional[int] = None - self.nodes: t.Set[ComputeNode] = set() + self.name: str | None = None + self.min_ppn: int | None = None + self.nodes: set[ComputeNode] = set() def _is_valid_partition(self) -> bool: """Check if the partition is valid diff --git a/smartsim/_core/schemas/dragonRequests.py b/smartsim/_core/schemas/dragonRequests.py index 28ff30b555..f3990f4c02 100644 --- a/smartsim/_core/schemas/dragonRequests.py +++ b/smartsim/_core/schemas/dragonRequests.py @@ -43,14 +43,14 @@ class DragonRequest(BaseModel): ... 
class DragonRunPolicy(BaseModel): """Policy specifying hardware constraints when running a Dragon job""" - cpu_affinity: t.List[NonNegativeInt] = Field(default_factory=list) + cpu_affinity: list[NonNegativeInt] = Field(default_factory=list) """List of CPU indices to which the job should be pinned""" - gpu_affinity: t.List[NonNegativeInt] = Field(default_factory=list) + gpu_affinity: list[NonNegativeInt] = Field(default_factory=list) """List of GPU indices to which the job should be pinned""" @staticmethod def from_run_args( - run_args: t.Dict[str, t.Union[int, str, float, None]] + run_args: dict[str, int | str | float | None] ) -> "DragonRunPolicy": """Create a DragonRunPolicy with hardware constraints passed from a dictionary of run arguments @@ -79,23 +79,23 @@ def from_run_args( class DragonRunRequestView(DragonRequest): exe: t.Annotated[str, Field(min_length=1)] - exe_args: t.List[t.Annotated[str, Field(min_length=1)]] = [] + exe_args: list[t.Annotated[str, Field(min_length=1)]] = [] path: t.Annotated[str, Field(min_length=1)] nodes: PositiveInt = 1 tasks: PositiveInt = 1 tasks_per_node: PositiveInt = 1 - hostlist: t.Optional[t.Annotated[str, Field(min_length=1)]] = None - output_file: t.Optional[t.Annotated[str, Field(min_length=1)]] = None - error_file: t.Optional[t.Annotated[str, Field(min_length=1)]] = None - env: t.Dict[str, t.Optional[str]] = {} - name: t.Optional[t.Annotated[str, Field(min_length=1)]] = None + hostlist: t.Annotated[str, Field(min_length=1)] | None = None + output_file: t.Annotated[str, Field(min_length=1)] | None = None + error_file: t.Annotated[str, Field(min_length=1)] | None = None + env: dict[str, str | None] = {} + name: t.Annotated[str, Field(min_length=1)] | None = None pmi_enabled: bool = True @request_registry.register("run") class DragonRunRequest(DragonRunRequestView): - current_env: t.Dict[str, t.Optional[str]] = {} - policy: t.Optional[DragonRunPolicy] = None + current_env: dict[str, str | None] = {} + policy: DragonRunPolicy | None = None def __str__(self) -> str: return str(DragonRunRequestView.parse_obj(self.dict(exclude={"current_env"}))) @@ -103,7 +103,7 @@ def __str__(self) -> str: @request_registry.register("update_status") class DragonUpdateStatusRequest(DragonRequest): - step_ids: t.List[t.Annotated[str, Field(min_length=1)]] + step_ids: list[t.Annotated[str, Field(min_length=1)]] @request_registry.register("stop") diff --git a/smartsim/_core/schemas/dragonResponses.py b/smartsim/_core/schemas/dragonResponses.py index 318a4eabf9..14ffd797cc 100644 --- a/smartsim/_core/schemas/dragonResponses.py +++ b/smartsim/_core/schemas/dragonResponses.py @@ -25,6 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import typing as t +from collections.abc import Mapping from pydantic import BaseModel, Field @@ -38,7 +39,7 @@ class DragonResponse(BaseModel): - error_message: t.Optional[str] = None + error_message: str | None = None @response_registry.register("run") @@ -49,9 +50,9 @@ class DragonRunResponse(DragonResponse): @response_registry.register("status_update") class DragonUpdateStatusResponse(DragonResponse): # status is a dict: {step_id: (is_alive, returncode)} - statuses: t.Mapping[ + statuses: Mapping[ t.Annotated[str, Field(min_length=1)], - t.Tuple[SmartSimStatus, t.Optional[t.List[int]]], + tuple[SmartSimStatus, list[int] | None], ] = {} diff --git a/smartsim/_core/schemas/utils.py b/smartsim/_core/schemas/utils.py index 508ef34ed0..47daf1e050 100644 --- a/smartsim/_core/schemas/utils.py +++ b/smartsim/_core/schemas/utils.py @@ -26,6 +26,7 @@ import dataclasses import typing as t +from collections.abc import Callable, Mapping import pydantic import pydantic.dataclasses @@ -54,7 +55,7 @@ def __str__(self) -> str: def from_str( cls, str_: str, - payload_type: t.Type[_SchemaT], + payload_type: type[_SchemaT], delimiter: str = _DEFAULT_MSG_DELIM, ) -> "_Message[_SchemaT]": header, payload = str_.split(delimiter, 1) @@ -63,11 +64,11 @@ class SchemaRegistry(t.Generic[_SchemaT]): def __init__( - self, init_map: t.Optional[t.Mapping[str, t.Type[_SchemaT]]] = None + self, init_map: Mapping[str, type[_SchemaT]] | None = None ) -> None: self._map = dict(init_map) if init_map else {} - def register(self, key: str) -> t.Callable[[t.Type[_SchemaT]], t.Type[_SchemaT]]: + def register(self, key: str) -> Callable[[type[_SchemaT]], type[_SchemaT]]: if _DEFAULT_MSG_DELIM in key: _msg = f"Registry key cannot contain delimiter `{_DEFAULT_MSG_DELIM}`" raise ValueError(_msg) @@ -76,7 +77,7 @@ def register(self, key: str) -> t.Callable[[t.Type[_SchemaT]], t.Type[_SchemaT]] if key in self._map: raise KeyError(f"Key `{key}` has already been registered for this parser") - def _register(cls: t.Type[_SchemaT]) -> t.Type[_SchemaT]: + def _register(cls: type[_SchemaT]) -> type[_SchemaT]: self._map[key] = cls return cls diff --git a/smartsim/_core/utils/helpers.py b/smartsim/_core/utils/helpers.py index ff3c93e16f..eafd6ac5af 100644 --- a/smartsim/_core/utils/helpers.py +++ b/smartsim/_core/utils/helpers.py @@ -34,6 +34,7 @@ import subprocess import typing as t import uuid +from collections.abc import Callable, Iterable, Sequence from datetime import datetime from functools import lru_cache from pathlib import Path @@ -44,10 +45,10 @@ _TRedisAIBackendStr = t.Literal["tensorflow", "torch", "onnxruntime"] -_TSignalHandlerFn = t.Callable[[int, t.Optional["FrameType"]], object] +_TSignalHandlerFn = Callable[[int, "FrameType | None"], object] -def unpack_db_identifier(db_id: str, token: str) -> t.Tuple[str, str]: +def unpack_db_identifier(db_id: str, token: str) -> tuple[str, str]: """Unpack the unformatted database identifier and format for env variable suffix using the token :param db_id: the unformatted database identifier eg.
identifier_1 @@ -86,7 +87,7 @@ def check_dev_log_level() -> bool: return lvl == "developer" -def fmt_dict(value: t.Dict[str, t.Any]) -> str: +def fmt_dict(value: dict[str, t.Any]) -> str: fmt_str = "" for k, v in value.items(): fmt_str += "\t" + str(k) + " = " + str(v) @@ -130,7 +131,7 @@ def expand_exe_path(exe: str) -> str: return os.path.abspath(in_path) -def is_valid_cmd(command: t.Union[str, None]) -> bool: +def is_valid_cmd(command: str | None) -> bool: try: if command: expand_exe_path(command) @@ -173,7 +174,7 @@ def colorize( return f"\x1b[{';'.join(attr)}m{string}\x1b[0m" -def delete_elements(dictionary: t.Dict[str, t.Any], key_list: t.List[str]) -> None: +def delete_elements(dictionary: dict[str, t.Any], key_list: list[str]) -> None: """Delete elements from a dictionary. :param dictionary: the dictionary from which the elements must be deleted. :param key_list: the list of keys to delete from the dictionary. @@ -225,7 +226,7 @@ def _installed(base_path: Path, backend: str) -> bool: return backend_so.is_file() -def redis_install_base(backends_path: t.Optional[str] = None) -> Path: +def redis_install_base(backends_path: str | None = None) -> Path: # pylint: disable-next=import-outside-toplevel,cyclic-import from ..._core.config import CONFIG @@ -236,8 +237,8 @@ def redis_install_base(backends_path: t.Optional[str] = None) -> Path: def installed_redisai_backends( - backends_path: t.Optional[str] = None, -) -> t.Set[_TRedisAIBackendStr]: + backends_path: str | None = None, +) -> set[_TRedisAIBackendStr]: """Check which ML backends are available for the RedisAI module. The optional argument ``backends_path`` is needed if the backends @@ -252,7 +253,7 @@ def installed_redisai_backends( """ # import here to avoid circular import base_path = redis_install_base(backends_path) - backends: t.Set[_TRedisAIBackendStr] = { + backends: set[_TRedisAIBackendStr] = { "tensorflow", "torch", "onnxruntime", @@ -267,7 +268,7 @@ def get_ts_ms() -> int: return int(datetime.now().timestamp() * 1000) -def encode_cmd(cmd: t.Sequence[str]) -> str: +def encode_cmd(cmd: Sequence[str]) -> str: """Transform a standard command list into an encoded string safe for providing as an argument to a proxy entrypoint """ @@ -279,7 +280,7 @@ def encode_cmd(cmd: t.Sequence[str]) -> str: return encoded_cmd -def decode_cmd(encoded_cmd: str) -> t.List[str]: +def decode_cmd(encoded_cmd: str) -> list[str]: """Decode an encoded command string to the original command list format""" if not encoded_cmd.strip(): raise ValueError("Invalid cmd supplied") @@ -305,7 +306,7 @@ def check_for_utility(util_name: str) -> str: return utility -def execute_platform_cmd(cmd: str) -> t.Tuple[str, int]: +def execute_platform_cmd(cmd: str) -> tuple[str, int]: """Execute the platform check command as a subprocess :param cmd: the command to execute @@ -321,9 +322,9 @@ def execute_platform_cmd(cmd: str) -> t.Tuple[str, int]: class CrayExPlatformResult: locate_msg = "Unable to locate `{0}`." 
- def __init__(self, ldconfig: t.Optional[str], fi_info: t.Optional[str]) -> None: - self.ldconfig: t.Optional[str] = ldconfig - self.fi_info: t.Optional[str] = fi_info + def __init__(self, ldconfig: str | None, fi_info: str | None) -> None: + self.ldconfig: str | None = ldconfig + self.fi_info: str | None = fi_info self.has_pmi: bool = False self.has_pmi2: bool = False self.has_cxi: bool = False @@ -349,7 +350,7 @@ def is_cray(self) -> bool: ) @property - def failures(self) -> t.List[str]: + def failures(self) -> list[str]: """Return a list of messages describing all failed validations""" failure_messages = [] @@ -421,7 +422,7 @@ class SignalInterceptionStack(collections.abc.Collection[_TSignalHandlerFn]): def __init__( self, signalnum: int, - callbacks: t.Optional[t.Iterable[_TSignalHandlerFn]] = None, + callbacks: Iterable[_TSignalHandlerFn] | None = None, ) -> None: """Set up a ``SignalInterceptionStack`` for particular signal number. @@ -438,7 +439,7 @@ def __init__( self._callbacks = list(callbacks) if callbacks else [] self._original = signal.signal(signalnum, self) - def __call__(self, signalnum: int, frame: t.Optional["FrameType"]) -> None: + def __call__(self, signalnum: int, frame: "FrameType | None") -> None: """Handle the signal on which the interception stack was registered. End by calling the originally registered signal hander (if present). diff --git a/smartsim/_core/utils/network.py b/smartsim/_core/utils/network.py index 7c2b6f5e14..1c08c0e005 100644 --- a/smartsim/_core/utils/network.py +++ b/smartsim/_core/utils/network.py @@ -35,8 +35,8 @@ class IFConfig(t.NamedTuple): - interface: t.Optional[str] - address: t.Optional[str] + interface: str | None + address: str | None def get_ip_from_host(host: str) -> str: diff --git a/smartsim/_core/utils/redis.py b/smartsim/_core/utils/redis.py index ab7ecdea04..9b290eac29 100644 --- a/smartsim/_core/utils/redis.py +++ b/smartsim/_core/utils/redis.py @@ -46,7 +46,7 @@ logger = get_logger(__name__) -def create_cluster(hosts: t.List[str], ports: t.List[int]) -> None: # cov-wlm +def create_cluster(hosts: list[str], ports: list[int]) -> None: # cov-wlm """Connect launched cluster instances. Should only be used in the case where cluster initialization @@ -78,7 +78,7 @@ def create_cluster(hosts: t.List[str], ports: t.List[int]) -> None: # cov-wlm def check_cluster_status( - hosts: t.List[str], ports: t.List[int], trials: int = 10 + hosts: list[str], ports: list[int], trials: int = 10 ) -> None: # cov-wlm """Check that a Redis/KeyDB cluster is up and running @@ -117,7 +117,7 @@ def check_cluster_status( raise SSInternalError("Cluster setup could not be verified") -def db_is_active(hosts: t.List[str], ports: t.List[int], num_shards: int) -> bool: +def db_is_active(hosts: list[str], ports: list[int], num_shards: int) -> bool: """Check if a DB is running if the DB is clustered, check cluster status, otherwise @@ -212,7 +212,7 @@ def set_script(db_script: DBScript, client: Client) -> None: raise error -def shutdown_db_node(host_ip: str, port: int) -> t.Tuple[int, str, str]: # cov-wlm +def shutdown_db_node(host_ip: str, port: int) -> tuple[int, str, str]: # cov-wlm """Send shutdown signal to DB node. 
Should only be used in the case where cluster deallocation diff --git a/smartsim/_core/utils/security.py b/smartsim/_core/utils/security.py index c3f4600749..a65466dea2 100644 --- a/smartsim/_core/utils/security.py +++ b/smartsim/_core/utils/security.py @@ -28,7 +28,6 @@ import dataclasses import pathlib import stat -import typing as t from enum import IntEnum import zmq @@ -216,7 +215,7 @@ def _load_keypair(cls, locator: _KeyLocator, in_context: bool) -> KeyPair: key_path = locator.private if in_context else locator.public pub_key: bytes = b"" - priv_key: t.Optional[bytes] = b"" + priv_key: bytes | None = b"" if key_path.exists(): logger.debug(f"Existing key files located at {key_path}") @@ -227,7 +226,7 @@ def _load_keypair(cls, locator: _KeyLocator, in_context: bool) -> KeyPair: # avoid a `None` value in the private key when it isn't loaded return KeyPair(pub_key, priv_key or b"") - def _load_keys(self) -> t.Tuple[KeyPair, KeyPair]: + def _load_keys(self) -> tuple[KeyPair, KeyPair]: """Use ZMQ auth to load public/private key pairs for the server and client components from the standard key paths for the associated experiment @@ -270,7 +269,7 @@ def _create_keys(self) -> None: locator.private.chmod(_KeyPermissions.PRIVATE_KEY) locator.public.chmod(_KeyPermissions.PUBLIC_KEY) - def get_keys(self, create: bool = True) -> t.Tuple[KeyPair, KeyPair]: + def get_keys(self, create: bool = True) -> tuple[KeyPair, KeyPair]: """Use ZMQ auth to generate a public/private key pair for the server and client components. diff --git a/smartsim/_core/utils/serialize.py b/smartsim/_core/utils/serialize.py index 20dcec3ea4..e69de29bb2 100644 --- a/smartsim/_core/utils/serialize.py +++ b/smartsim/_core/utils/serialize.py @@ -1,265 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2025, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -from __future__ import annotations - -import json -import time -import typing as t -from pathlib import Path - -import smartsim._core._cli.utils as _utils -import smartsim.log - -if t.TYPE_CHECKING: - from smartsim._core.control.manifest import LaunchedManifest as _Manifest - from smartsim.database.orchestrator import Orchestrator - from smartsim.entity import DBNode, Ensemble, Model - from smartsim.entity.dbobject import DBModel, DBScript - from smartsim.settings.base import BatchSettings, RunSettings - - -TStepLaunchMetaData = t.Tuple[ - t.Optional[str], t.Optional[str], t.Optional[bool], str, str, Path -] - -MANIFEST_FILENAME: t.Final[str] = "manifest.json" - -_LOGGER = smartsim.log.get_logger(__name__) - - -def save_launch_manifest(manifest: _Manifest[TStepLaunchMetaData]) -> None: - manifest.metadata.run_telemetry_subdirectory.mkdir(parents=True, exist_ok=True) - exp_out, exp_err = smartsim.log.get_exp_log_paths() - - new_run = { - "run_id": manifest.metadata.run_id, - "timestamp": int(time.time_ns()), - "model": [ - _dictify_model(model, *telemetry_metadata) - for model, telemetry_metadata in manifest.models - ], - "orchestrator": [ - _dictify_db(db, nodes_info) for db, nodes_info in manifest.databases - ], - "ensemble": [ - _dictify_ensemble(ens, member_info) - for ens, member_info in manifest.ensembles - ], - } - try: - with open(manifest.metadata.manifest_file_path, "r", encoding="utf-8") as file: - manifest_dict = json.load(file) - except (FileNotFoundError, json.JSONDecodeError): - manifest_dict = { - "schema info": { - "schema_name": "entity manifest", - "version": "0.0.4", - }, - "experiment": { - "name": manifest.metadata.exp_name, - "path": manifest.metadata.exp_path, - "launcher": manifest.metadata.launcher_name, - "out_file": str(exp_out), - "err_file": str(exp_err), - }, - "runs": [new_run], - } - else: - manifest_dict["runs"].append(new_run) - finally: - with open(manifest.metadata.manifest_file_path, "w", encoding="utf-8") as file: - json.dump(manifest_dict, file, indent=2) - - -def _dictify_model( - model: Model, - step_id: t.Optional[str], - task_id: t.Optional[str], - managed: t.Optional[bool], - out_file: str, - err_file: str, - telemetry_data_path: Path, -) -> t.Dict[str, t.Any]: - colo_settings = (model.run_settings.colocated_db_settings or {}).copy() - db_scripts = t.cast("t.List[DBScript]", colo_settings.pop("db_scripts", [])) - db_models = t.cast("t.List[DBModel]", colo_settings.pop("db_models", [])) - return { - "name": model.name, - "path": model.path, - "exe_args": model.run_settings.exe_args, - "run_settings": _dictify_run_settings(model.run_settings), - "batch_settings": ( - _dictify_batch_settings(model.batch_settings) - if model.batch_settings - else {} - ), - "params": model.params, - "files": ( - { - "Symlink": model.files.link, - "Configure": model.files.tagged, - "Copy": model.files.copy, - } - if model.files - else { - "Symlink": [], - "Configure": [], - "Copy": [], - } - ), - "colocated_db": ( - { - "settings": colo_settings, - "scripts": [ - { - script.name: { - "backend": "TORCH", - "device": script.device, - } - } - for script in db_scripts - ], - "models": [ - { - model.name: { - "backend": model.backend, - "device": model.device, - } - } - for model in db_models - ], - } - if colo_settings - else {} - ), - "telemetry_metadata": { - "status_dir": str(telemetry_data_path), - "step_id": step_id, - "task_id": task_id, - "managed": managed, - }, - "out_file": out_file, - "err_file": err_file, - } - - -def _dictify_ensemble( - ens: Ensemble, - 
members: t.Sequence[t.Tuple[Model, TStepLaunchMetaData]], -) -> t.Dict[str, t.Any]: - return { - "name": ens.name, - "params": ens.params, - "batch_settings": ( - _dictify_batch_settings(ens.batch_settings) - # FIXME: Typehint here is wrong, ``ens.batch_settings`` can - # also be an empty dict for no discernible reason... - if ens.batch_settings - else {} - ), - "models": [ - _dictify_model(model, *launching_metadata) - for model, launching_metadata in members - ], - } - - -def _dictify_run_settings(run_settings: RunSettings) -> t.Dict[str, t.Any]: - # TODO: remove this downcast - if hasattr(run_settings, "mpmd") and run_settings.mpmd: - _LOGGER.warning( - "SmartSim currently cannot properly serialize all information in " - "MPMD run settings" - ) - return { - "exe": run_settings.exe, - # TODO: We should try to move this back - # "exe_args": run_settings.exe_args, - "run_command": run_settings.run_command, - "run_args": run_settings.run_args, - # TODO: We currently do not have a way to represent MPMD commands! - # Maybe add a ``"mpmd"`` key here that is a - # ``list[TDictifiedRunSettings]``? - } - - -def _dictify_batch_settings(batch_settings: BatchSettings) -> t.Dict[str, t.Any]: - return { - "batch_command": batch_settings.batch_cmd, - "batch_args": batch_settings.batch_args, - } - - -def _dictify_db( - db: Orchestrator, - nodes: t.Sequence[t.Tuple[DBNode, TStepLaunchMetaData]], -) -> t.Dict[str, t.Any]: - db_path = _utils.get_db_path() - if db_path: - db_type, _ = db_path.name.split("-", 1) - else: - db_type = "Unknown" - - return { - "name": db.name, - "type": db_type, - "interface": db._interfaces, # pylint: disable=protected-access - "shards": [ - { - **shard.to_dict(), - "conf_file": shard.cluster_conf_file, - "out_file": out_file, - "err_file": err_file, - "memory_file": ( - str(status_dir / "memory.csv") if db.telemetry.is_enabled else "" - ), - "client_file": ( - str(status_dir / "client.csv") if db.telemetry.is_enabled else "" - ), - "client_count_file": ( - str(status_dir / "client_count.csv") - if db.telemetry.is_enabled - else "" - ), - "telemetry_metadata": { - "status_dir": str(status_dir), - "step_id": step_id, - "task_id": task_id, - "managed": managed, - }, - } - for dbnode, ( - step_id, - task_id, - managed, - out_file, - err_file, - status_dir, - ) in nodes - for shard in dbnode.get_launched_shard_info() - ], - } diff --git a/smartsim/_core/utils/shell.py b/smartsim/_core/utils/shell.py index 32ff0b86fd..b1b3f35727 100644 --- a/smartsim/_core/utils/shell.py +++ b/smartsim/_core/utils/shell.py @@ -25,7 +25,6 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import time -import typing as t from subprocess import PIPE, TimeoutExpired import psutil @@ -39,13 +38,13 @@ def execute_cmd( - cmd_list: t.List[str], + cmd_list: list[str], shell: bool = False, - cwd: t.Optional[str] = None, - env: t.Optional[t.Dict[str, str]] = None, + cwd: str | None = None, + env: dict[str, str] | None = None, proc_input: str = "", - timeout: t.Optional[int] = None, -) -> t.Tuple[int, str, str]: + timeout: int | None = None, +) -> tuple[int, str, str]: """Execute a command locally :param cmd_list: list of command with arguments @@ -86,9 +85,9 @@ def execute_cmd( def execute_async_cmd( - cmd_list: t.List[str], + cmd_list: list[str], cwd: str, - env: t.Optional[t.Dict[str, str]] = None, + env: dict[str, str] | None = None, out: int = PIPE, err: int = PIPE, ) -> psutil.Popen: diff --git a/smartsim/_core/utils/telemetry/__init__.py b/smartsim/_core/utils/telemetry/__init__.py deleted file mode 100644 index f096dda3de..0000000000 --- a/smartsim/_core/utils/telemetry/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2025, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/smartsim/_core/utils/telemetry/collector.py b/smartsim/_core/utils/telemetry/collector.py deleted file mode 100644 index 395839d873..0000000000 --- a/smartsim/_core/utils/telemetry/collector.py +++ /dev/null @@ -1,482 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2025 Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import abc -import asyncio -import collections -import itertools -import logging -import typing as t - -import redis.asyncio as redisa -import redis.exceptions as redisex - -from smartsim._core.control.job import JobEntity -from smartsim._core.utils.helpers import get_ts_ms -from smartsim._core.utils.telemetry.sink import FileSink, Sink - -logger = logging.getLogger("TelemetryMonitor") - - -class Collector(abc.ABC): - """Base class for telemetry collectors. - - A Collector is used to retrieve runtime metrics about an entity.""" - - def __init__(self, entity: JobEntity, sink: Sink) -> None: - """Initialize the collector - - :param entity: entity to collect metrics on - :param sink: destination to write collected information - """ - self._entity = entity - self._sink = sink - self._enabled = True - - @property - def enabled(self) -> bool: - """Boolean indicating if the collector should perform data collection""" - return self._entity.telemetry_on - - @enabled.setter - def enabled(self, value: bool) -> None: - self._entity.telemetry_on = value - - @property - def entity(self) -> JobEntity: - """The `JobEntity` for which data is collected - :return: the entity""" - return self._entity - - @property - def sink(self) -> Sink: - """The sink where collected data is written - :return: the sink - """ - return self._sink - - @abc.abstractmethod - async def prepare(self) -> None: - """Initialization logic for the collector""" - - @abc.abstractmethod - async def collect(self) -> None: - """Execute metric collection""" - - @abc.abstractmethod - async def shutdown(self) -> None: - """Execute cleanup of resources for the collector""" - - -class _DBAddress: - """Helper class to hold and pretty-print connection details""" - - def __init__(self, host: str, port: int) -> None: - """Initialize the instance - :param host: host address for database connections - :param port: port number for database connections - """ - self.host = host.strip() if host else "" - self.port = port - self._check() - - def _check(self) -> None: - """Validate input arguments""" - if not self.host: - raise ValueError(f"{type(self).__name__} requires host") - if not self.port: - raise ValueError(f"{type(self).__name__} requires port") - - def __str__(self) -> str: - """Pretty-print the instance""" - return f"{self.host}:{self.port}" - - -class DBCollector(Collector): - """A base class for collectors that retrieve statistics from an orchestrator""" - - def __init__(self, entity: JobEntity, sink: Sink) -> None: - """Initialize the `DBCollector` - - :param entity: entity with metadata about the resource to monitor - :param sink: destination to write collected information - """ - super().__init__(entity, sink) - self._client: t.Optional[redisa.Redis[bytes]] = None - self._address = _DBAddress( - self._entity.config.get("host", ""), - int(self._entity.config.get("port", 0)), - ) - - async def _configure_client(self) -> None: - """Configure the client connection to the target database""" - try: - if not self._client: - 
self._client = redisa.Redis( - host=self._address.host, port=self._address.port - ) - except Exception as e: - logger.exception(e) - finally: - if not self._client: - logger.error( - f"{type(self).__name__} failed to connect to {self._address}" - ) - - async def prepare(self) -> None: - """Initialization logic for the DB collector. Creates a database - connection then executes the `post_prepare` callback function.""" - if self._client: - return - - await self._configure_client() - await self._post_prepare() - - @abc.abstractmethod - async def _post_prepare(self) -> None: - """Hook function to enable subclasses to perform actions - after a db client is ready""" - - @abc.abstractmethod - async def _perform_collection( - self, - ) -> t.Sequence[t.Tuple[t.Union[int, float, str], ...]]: - """Hook function for subclasses to execute custom metric retrieval. - NOTE: all implementations return an iterable of metrics to avoid - adding extraneous base class code to differentiate the results - - :return: an iterable containing individual metric collection results - """ - - async def collect(self) -> None: - """Execute database metric collection if the collector is enabled. Writes - the resulting metrics to the associated output sink. Calling `collect` - when `self.enabled` is `False` performs no actions.""" - if not self.enabled: - # collectors may be disabled by monitoring changes to the - # manifest. Leave the collector but do NOT collect - logger.debug(f"{type(self).__name__} is not enabled") - return - - await self.prepare() - if not self._client: - logger.warning(f"{type(self).__name__} cannot collect") - return - - try: - # if we can't communicate w/the db, exit - if not await self._check_db(): - return - - all_metrics = await self._perform_collection() - for metrics in all_metrics: - await self._sink.save(*metrics) - except Exception as ex: - logger.warning(f"Collect failed for {type(self).__name__}", exc_info=ex) - - async def shutdown(self) -> None: - """Execute cleanup of database client connections""" - try: - if self._client: - logger.info( - f"Shutting down {self._entity.name}::{self.__class__.__name__}" - ) - await self._client.close() - self._client = None - except Exception as ex: - logger.error( - f"An error occurred during {type(self).__name__} shutdown", exc_info=ex - ) - - async def _check_db(self) -> bool: - """Check if the target database is reachable. - - :return: `True` if connection succeeds, `False` otherwise. 
- """ - try: - if self._client: - return await self._client.ping() - except redisex.ConnectionError: - logger.warning(f"Cannot ping db {self._address}") - - return False - - -class DBMemoryCollector(DBCollector): - """A `DBCollector` that collects memory consumption metrics""" - - def __init__(self, entity: JobEntity, sink: Sink) -> None: - super().__init__(entity, sink) - self._columns = ["used_memory", "used_memory_peak", "total_system_memory"] - - async def _post_prepare(self) -> None: - """Write column headers for a CSV formatted output sink after - the database connection is established""" - await self._sink.save("timestamp", *self._columns) - - async def _perform_collection( - self, - ) -> t.Sequence[t.Tuple[int, float, float, float]]: - """Perform memory metric collection and return the results - - :return: an iterable containing individual metric collection results - in the format `(timestamp,used_memory,used_memory_peak,total_system_memory)` - """ - if self._client is None: - return [] - - db_info = await self._client.info("memory") - - used = float(db_info["used_memory"]) - peak = float(db_info["used_memory_peak"]) - total = float(db_info["total_system_memory"]) - - value = (get_ts_ms(), used, peak, total) - - # return a list containing a single record to simplify the parent - # class code to save multiple records from a single collection - return [value] - - -class DBConnectionCollector(DBCollector): - """A `DBCollector` that collects database client-connection metrics""" - - def __init__(self, entity: JobEntity, sink: Sink) -> None: - super().__init__(entity, sink) - self._columns = ["client_id", "address"] - - async def _post_prepare(self) -> None: - """Write column headers for a CSV formatted output sink after - the database connection is established""" - await self._sink.save("timestamp", *self._columns) - - async def _perform_collection( - self, - ) -> t.Sequence[t.Tuple[t.Union[int, str, str], ...]]: - """Perform connection metric collection and return the results - - :return: an iterable containing individual metric collection results - in the format `(timestamp,client_id,address)` - """ - if self._client is None: - return [] - - now_ts = get_ts_ms() - clients = await self._client.client_list() - - values: t.List[t.Tuple[int, str, str]] = [] - - # content-filter the metrics and return them all together - for client in clients: - # all records for the request will have the same timestamp - value = now_ts, client["id"], client["addr"] - values.append(value) - - return values - - -class DBConnectionCountCollector(DBCollector): - """A DBCollector that collects aggregated client-connection count metrics""" - - def __init__(self, entity: JobEntity, sink: Sink) -> None: - super().__init__(entity, sink) - self._columns = ["num_clients"] - - async def _post_prepare(self) -> None: - """Write column headers for a CSV formatted output sink after - the database connection is established""" - await self._sink.save("timestamp", *self._columns) - - async def _perform_collection( - self, - ) -> t.Sequence[t.Tuple[int, int]]: - """Perform connection-count metric collection and return the results - - :return: an iterable containing individual metric collection results - in the format `(timestamp,num_clients)` - """ - if self._client is None: - return [] - - client_list = await self._client.client_list() - - addresses = {item["addr"] for item in client_list} - - # return a list containing a single record to simplify the parent - # class code to save multiple records from a single collection 
- value = (get_ts_ms(), len(addresses)) - return [value] - - -class CollectorManager: - """The `CollectorManager` manages the set of all collectors required to retrieve - metrics for an experiment. It provides the ability to add and remove collectors - with unique configuration per entity. The `CollectorManager` is primarily used - to perform bulk actions on 1-to-many collectors (e.g. prepare all collectors, - request metrics for all collectors, close all collector connections)""" - - def __init__(self, timeout_ms: int = 1000) -> None: - """Initialize the `CollectorManager` without collectors - :param timeout_ms: maximum time (in ms) allowed for `Collector.collect` - """ - # A lookup table to hold a list of registered collectors per entity - self._collectors: t.Dict[str, t.List[Collector]] = collections.defaultdict(list) - # Max time to allow a collector to work before cancelling requests - self._timeout_ms = timeout_ms - - def clear(self) -> None: - """Remove all collectors from the monitored set""" - self._collectors = collections.defaultdict(list) - - def add(self, collector: Collector) -> None: - """Add a collector to the monitored set - - :param collector: `Collector` instance to monitor - """ - entity_name = collector.entity.name - - registered_collectors = self._collectors[entity_name] - - # Exit if the collector is already registered to the entity - if any(c for c in registered_collectors if type(c) is type(collector)): - return - - logger.debug(f"Adding collector: {entity_name}::{type(collector).__name__}") - registered_collectors.append(collector) - - def add_all(self, collectors: t.Sequence[Collector]) -> None: - """Add multiple collectors to the monitored set - - :param collectors: a collection of `Collectors` to monitor - """ - for collector in collectors: - self.add(collector) - - async def remove_all(self, entities: t.Sequence[JobEntity]) -> None: - """Remove all collectors registered to the supplied entities - - :param entities: a collection of `JobEntity` instances that will - no longer have registered collectors - """ - if not entities: - return - - tasks = (self.remove(entity) for entity in entities) - await asyncio.gather(*tasks) - - async def remove(self, entity: JobEntity) -> None: - """Remove all collectors registered to the supplied entity - - :param entities: `JobEntity` that will no longer have registered collectors - """ - registered = self._collectors.pop(entity.name, []) - if not registered: - return - - logger.debug(f"Removing collectors registered for {entity.name}") - asyncio.gather(*(collector.shutdown() for collector in registered)) - - async def prepare(self) -> None: - """Prepare registered collectors to perform collection""" - tasks = (collector.prepare() for collector in self.all_collectors) - # use gather so all collectors are prepared before collection - await asyncio.gather(*tasks) - - async def collect(self) -> None: - """Perform collection for all registered collectors""" - if collectors := self.all_collectors: - tasks = [asyncio.create_task(item.collect()) for item in collectors] - - _, pending = await asyncio.wait(tasks, timeout=self._timeout_ms / 1000.0) - - # any tasks still pending has exceeded the timeout - if pending: - # manually cancel tasks since asyncio.wait will not - for remaining_task in pending: - remaining_task.cancel() - logger.debug(f"Execution of {len(pending)} collectors timed out.") - - async def shutdown(self) -> None: - """Release resources for all registered collectors""" - logger.debug(f"{type(self).__name__} shutting down 
collectors...") - if list(self.all_collectors): - shutdown_tasks = [] - # create an async tasks to execute all shutdowns in parallel - for item in self.all_collectors: - shutdown_tasks.append(asyncio.create_task(item.shutdown())) - # await until all shutdowns are complete - await asyncio.wait(shutdown_tasks) - logger.debug("Collector shutdown complete...") - - @property - def all_collectors(self) -> t.Sequence[Collector]: - """Get a list of all registered collectors - - :return: a collection of registered collectors for all entities - """ - # flatten and return all the lists-of-collectors that are registered - collectors = itertools.chain.from_iterable(self._collectors.values()) - return [collector for collector in collectors if collector.enabled] - - @property - def dead_collectors(self) -> t.Sequence[Collector]: - """Get a list of all disabled collectors - - :return: a collection of disabled collectors for all entities - """ - collectors = itertools.chain.from_iterable(self._collectors.values()) - return [collector for collector in collectors if not collector.enabled] - - def register_collectors(self, entity: JobEntity) -> None: - """Find all configured collectors for the entity and register them - - :param entity: a `JobEntity` instance that will have all configured collectors - registered for collection. Configuration is found in the `RuntimeManifest` - """ - collectors: t.List[Collector] = [] - - # ONLY db telemetry is implemented at this time. This resolver must - # be updated when non-database or always-on collectors are introduced - if entity.is_db and entity.telemetry_on: - if mem_out := entity.collectors.get("memory", None): - collectors.append(DBMemoryCollector(entity, FileSink(mem_out))) - - if con_out := entity.collectors.get("client", None): - collectors.append(DBConnectionCollector(entity, FileSink(con_out))) - - if num_out := entity.collectors.get("client_count", None): - collectors.append(DBConnectionCountCollector(entity, FileSink(num_out))) - else: - logger.debug(f"Collectors disabled for db {entity.name}") - - self.add_all(collectors) - - def register_all_collectors(self, entities: t.Sequence[JobEntity]) -> None: - """Find all configured collectors for the entity and register them - - :param entities: entities to call `register_collectors` for - """ - for entity in entities: - self.register_collectors(entity) diff --git a/smartsim/_core/utils/telemetry/manifest.py b/smartsim/_core/utils/telemetry/manifest.py deleted file mode 100644 index 66442f8ca5..0000000000 --- a/smartsim/_core/utils/telemetry/manifest.py +++ /dev/null @@ -1,242 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2025 Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import json -import logging -import pathlib -import time -import typing as t -from dataclasses import dataclass, field - -from smartsim._core.control.job import JobEntity - -logger = logging.getLogger("TelemetryMonitor") - - -@dataclass -class Run: - """ - A Run contains the collection of entities created when a `SmartSim` - driver script executes `Experiment.start`""" - - timestamp: int - """the timestamp at the time the `Experiment.start` is called""" - models: t.List[JobEntity] - """models started in this run""" - orchestrators: t.List[JobEntity] - """orchestrators started in this run""" - ensembles: t.List[JobEntity] - """ensembles started in this run""" - - def flatten( - self, filter_fn: t.Optional[t.Callable[[JobEntity], bool]] = None - ) -> t.Sequence[JobEntity]: - """Flatten all `JobEntity`'s in the `Run` into a 1-dimensional list - - :param filter_fn: optional boolean filter that returns - True for entities to include in the result - """ - entities = self.models + self.orchestrators + self.ensembles - if filter_fn: - entities = [entity for entity in entities if filter_fn(entity)] - return entities - - @staticmethod - def load_entity( - entity_type: str, - entity_dict: t.Dict[str, t.Any], - exp_dir: pathlib.Path, - raw_experiment: t.Dict[str, t.Any], - ) -> t.List[JobEntity]: - """Map entity data persisted in a manifest file to an object - - :param entity_type: type of the associated `SmartSimEntity` - :param entity_dict: raw dictionary deserialized from entity in manifest JSON - :param exp_dir: root path to experiment outputs - :param raw_experiment: raw experiment deserialized from manifest JSON - :return: list of loaded `JobEntity` instances - """ - entities = [] - - # an entity w/parent keys must create entities for the items that it - # comprises. 
traverse the children and create each entity - parent_keys = {"shards", "models"} - parent_keys = parent_keys.intersection(entity_dict.keys()) - if parent_keys: - container = "shards" if "shards" in parent_keys else "models" - child_type = "orchestrator" if container == "shards" else "model" - for child_entity in entity_dict[container]: - entity = JobEntity.from_manifest( - child_type, child_entity, str(exp_dir), raw_experiment - ) - entities.append(entity) - - return entities - - # not a parent type, just create the entity w/the entity_type passed in - entity = JobEntity.from_manifest( - entity_type, entity_dict, str(exp_dir), raw_experiment - ) - entities.append(entity) - return entities - - @staticmethod - def load_entities( - entity_type: str, - run: t.Dict[str, t.Any], - exp_dir: pathlib.Path, - raw_experiment: t.Dict[str, t.Any], - ) -> t.Dict[str, t.List[JobEntity]]: - """Map a collection of entity data persisted in a manifest file to an object - - :param entity_type: type of the associated `SmartSimEntity` - :param run: raw dictionary containing `Run` data deserialized from JSON - :param exp_dir: root path to experiment outputs - :param raw_experiment: raw experiment deserialized from manifest JSON - :return: list of loaded `JobEntity` instances - """ - persisted: t.Dict[str, t.List[JobEntity]] = { - "model": [], - "orchestrator": [], - } - for item in run[entity_type]: - entities = Run.load_entity(entity_type, item, exp_dir, raw_experiment) - for new_entity in entities: - persisted[new_entity.type].append(new_entity) - - return persisted - - @staticmethod - def load_run( - raw_run: t.Dict[str, t.Any], - exp_dir: pathlib.Path, - raw_experiment: t.Dict[str, t.Any], - ) -> "Run": - """Map run data persisted in a manifest file to an object - - :param raw_run: raw dictionary containing `Run` data deserialized from JSON - :param exp_dir: root path to experiment outputs - :param raw_experiment: raw experiment deserialized from manifest JSON - :return: populated `Run` instance - """ - - # create an output mapping to hold the deserialized entities - run_entities: t.Dict[str, t.List[JobEntity]] = { - "model": [], - "orchestrator": [], - "ensemble": [], - } - - # use the output mapping keys to load all the target - # entities from the deserialized JSON - for entity_type in run_entities: - _entities = Run.load_entities(entity_type, raw_run, exp_dir, raw_experiment) - - # load_entities may return a mapping containing types different from - # entity_type IF it was a parent entity. Iterate through the keys in - # the output dictionary and put them in the right place - for entity_type, new_entities in _entities.items(): - if not new_entities: - continue - run_entities[entity_type].extend(new_entities) - - loaded_run = Run( - raw_run["timestamp"], - run_entities["model"], - run_entities["orchestrator"], - run_entities["ensemble"], - ) - return loaded_run - - -@dataclass -class RuntimeManifest: - """The runtime manifest holds information about the entities created - at runtime during a SmartSim Experiment. The runtime manifest differs - from a standard manifest - it may contain multiple experiment - executions in a `runs` collection and holds information that is unknown - at design-time, such as IP addresses of host machines. 
- """ - - name: str - """The name of the `Experiment` associated to the `RuntimeManifest`""" - path: pathlib.Path - """The path to the `Experiment` working directory""" - launcher: str - """The launcher type used by the `Experiment`""" - runs: t.List[Run] = field(default_factory=list) - """A `List` of 0 to many `Run` instances""" - - @staticmethod - def load_manifest(file_path: str) -> t.Optional["RuntimeManifest"]: - """Load a persisted manifest and return the content - - :param file_path: path to the manifest file to load - :return: deserialized `RuntimeManifest` if the manifest file is found, - otherwise None - """ - manifest_dict: t.Optional[t.Dict[str, t.Any]] = None - try_count, max_attempts = 1, 5 - - # allow multiple read attempts in case the manifest is being - # written at the time load_manifest is called - while manifest_dict is None and try_count <= max_attempts: - source = pathlib.Path(file_path) - source = source.resolve() - time.sleep(0.01) # a tiny sleep avoids reading partially written json - - try: - if text := source.read_text(encoding="utf-8").strip(): - manifest_dict = json.loads(text) - except json.JSONDecodeError as ex: - print(f"Error loading manifest: {ex}") - # hack/fix: handle issues reading file before it is fully written - time.sleep(0.1 * try_count) - finally: - try_count += 1 - - if not manifest_dict: - return None - - # if we don't have an experiment, the manifest is malformed - exp = manifest_dict.get("experiment", None) - if not exp: - raise ValueError("Manifest missing required experiment") - - # if we don't have runs, the manifest is malformed - runs = manifest_dict.get("runs", None) - if runs is None: - raise ValueError("Manifest missing required runs") - - exp_dir = pathlib.Path(exp["path"]) - runs = [Run.load_run(raw_run, exp_dir, exp) for raw_run in runs] - - manifest = RuntimeManifest( - name=exp["name"], - path=exp_dir, - launcher=exp["launcher"], - runs=runs, - ) - return manifest diff --git a/smartsim/_core/utils/telemetry/sink.py b/smartsim/_core/utils/telemetry/sink.py deleted file mode 100644 index 72f501b32d..0000000000 --- a/smartsim/_core/utils/telemetry/sink.py +++ /dev/null @@ -1,81 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2025 Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import abc -import logging -import pathlib -import typing as t - -logger = logging.getLogger("TelemetryMonitor") - - -class Sink(abc.ABC): - """Base class for output sinks. Represents a durable, read-only - storage mechanism""" - - @abc.abstractmethod - async def save(self, *args: t.Any) -> None: - """Save the args passed to this method to the underlying sink - - :param args: variadic list of values to save - """ - - -class FileSink(Sink): - """Telemetry sink that writes to a file""" - - def __init__(self, path: str) -> None: - """Initialize the FileSink - - :param filename: path to a file backing this `Sink` - """ - super().__init__() - self._check_init(path) - self._path = pathlib.Path(path) - - @staticmethod - def _check_init(filename: str) -> None: - """Validate initialization arguments and raise a ValueError - if an invalid filename is passed - - :param filename: path to a file backing this `Sink` - """ - if not filename: - raise ValueError("No filename provided to FileSink") - - @property - def path(self) -> pathlib.Path: - """The path to the file this FileSink writes - - :return: path to a file backing this `Sink` - """ - return self._path - - async def save(self, *args: t.Any) -> None: - self._path.parent.mkdir(parents=True, exist_ok=True) - - with open(self._path, "a+", encoding="utf-8") as sink_fp: - values = ",".join(map(str, args)) + "\n" - sink_fp.write(values) diff --git a/smartsim/_core/utils/telemetry/telemetry.py b/smartsim/_core/utils/telemetry/telemetry.py deleted file mode 100644 index a741ac627b..0000000000 --- a/smartsim/_core/utils/telemetry/telemetry.py +++ /dev/null @@ -1,590 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2025 Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import asyncio -import json -import logging -import os -import pathlib -import threading -import typing as t - -from watchdog.events import ( - FileSystemEvent, - LoggingEventHandler, - PatternMatchingEventHandler, -) -from watchdog.observers import Observer -from watchdog.observers.api import BaseObserver - -from smartsim._core.config import CONFIG -from smartsim._core.control.job import JobEntity, _JobKey -from smartsim._core.control.jobmanager import JobManager -from smartsim._core.launcher.dragon.dragonLauncher import DragonLauncher -from smartsim._core.launcher.launcher import Launcher -from smartsim._core.launcher.local.local import LocalLauncher -from smartsim._core.launcher.pbs.pbsLauncher import PBSLauncher -from smartsim._core.launcher.slurm.slurmLauncher import SlurmLauncher -from smartsim._core.launcher.stepInfo import StepInfo -from smartsim._core.utils.helpers import get_ts_ms -from smartsim._core.utils.serialize import MANIFEST_FILENAME -from smartsim._core.utils.telemetry.collector import CollectorManager -from smartsim._core.utils.telemetry.manifest import Run, RuntimeManifest -from smartsim._core.utils.telemetry.util import map_return_code, write_event -from smartsim.error.errors import SmartSimError -from smartsim.status import TERMINAL_STATUSES - -logger = logging.getLogger("TelemetryMonitor") - - -class ManifestEventHandler(PatternMatchingEventHandler): - """The ManifestEventHandler monitors an experiment and updates a - datastore as needed. This event handler is triggered by changes to - the experiment manifest written to physical disk by a driver. - - It also contains an event loop. 
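# A minimal sketch of the file-watching mechanism the handler above builds on: a
# watchdog Observer dispatches manifest create/modify events to a
# PatternMatchingEventHandler. The watched path below is hypothetical.
import time

from watchdog.events import FileSystemEvent, PatternMatchingEventHandler
from watchdog.observers import Observer


class ManifestWatcher(PatternMatchingEventHandler):
    def __init__(self) -> None:
        super().__init__(patterns=["manifest.json"], ignore_patterns=["*.out", "*.err"])

    def on_created(self, event: FileSystemEvent) -> None:
        print(f"manifest created: {event.src_path}")

    def on_modified(self, event: FileSystemEvent) -> None:
        print(f"manifest modified: {event.src_path}")


if __name__ == "__main__":
    observer = Observer()
    observer.schedule(ManifestWatcher(), "/tmp/exp/.smartsim/telemetry")  # hypothetical
    observer.start()
    try:
        time.sleep(10)  # watch briefly; the real monitor drives an asyncio loop instead
    finally:
        observer.stop()
        observer.join()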
The loop checks experiment entities for updates - at each timestep and executes a configurable set of metrics collectors.""" - - def __init__( - self, - pattern: str, - ignore_patterns: t.Optional[t.List[str]] = None, - ignore_directories: bool = True, - case_sensitive: bool = False, - timeout_ms: int = 1000, - ) -> None: - """Initialize the manifest event handler - - :param pattern: a pattern that identifies the files whose - events are of interest by matching their name - :param ignore_patterns: a pattern that identifies the files whose - events should be ignored - :param ignore_directories: set to `True` to avoid directory events - :param case_sensitive: set to `True` to require case sensitivity in - resource names in order to match input patterns - :param timeout_ms: maximum duration (in ms) of a call to the event - loop prior to cancelling tasks - """ - super().__init__( - [pattern], ignore_patterns, ignore_directories, case_sensitive - ) # type: ignore - self._tracked_runs: t.Dict[int, Run] = {} - self._tracked_jobs: t.Dict[_JobKey, JobEntity] = {} - self._completed_jobs: t.Dict[_JobKey, JobEntity] = {} - self._launcher: t.Optional[Launcher] = None - self.job_manager: JobManager = JobManager(threading.RLock()) - self._launcher_map: t.Dict[str, t.Type[Launcher]] = { - "slurm": SlurmLauncher, - "pbs": PBSLauncher, - "local": LocalLauncher, - "dragon": DragonLauncher, - } - self._collector_mgr = CollectorManager(timeout_ms) - - @property - def tracked_jobs(self) -> t.Sequence[JobEntity]: - """The collection of `JobEntity` that are actively being monitored - - :return: the collection - """ - return list(self._tracked_jobs.values()) - - def init_launcher(self, launcher: str) -> None: - """Initialize the controller with a specific type of launcher. - SmartSim currently supports Slurm, PBS(Pro), Dragon - and local launching - - :param launcher: the name of the workload manager used by the experiment - :raises ValueError: if a string is passed that is not - a supported launcher - :raises TypeError: if no launcher argument is provided. - """ - if not launcher: - raise TypeError("Must provide a 'launcher' argument") - - if launcher_type := self._launcher_map.get(launcher.lower(), None): - self._launcher = launcher_type() - return - - raise ValueError("Launcher type not supported: " + launcher) - - def init_job_manager(self) -> None: - """Initialize the job manager instance""" - if not self._launcher: - raise TypeError("self._launcher must be initialized") - - self.job_manager.set_launcher(self._launcher) - self.job_manager.start() - - def set_launcher(self, launcher_type: str) -> None: - """Set the launcher for the experiment - :param launcher_type: the name of the workload manager used by the experiment - """ - self.init_launcher(launcher_type) - - if self._launcher is None: - raise SmartSimError("Launcher init failed") - - self.job_manager.set_launcher(self._launcher) - self.job_manager.start() - - def process_manifest(self, manifest_path: str) -> None: - """Read the manifest for the experiment. Process the - `RuntimeManifest` by updating the set of tracked jobs - and registered collectors - - :param manifest_path: full path to the manifest file - """ - try: - # it is possible to read the manifest prior to a completed - # write due to no access locking mechanism. log the issue - # and continue. 
it will retry on the next event loop iteration - manifest = RuntimeManifest.load_manifest(manifest_path) - if not manifest: - logger.debug("No manifest file exists") - return - except json.JSONDecodeError: - logger.error(f"Malformed manifest encountered: {manifest_path}") - return - except ValueError: - logger.error("Manifest content error", exc_info=True) - return - - if self._launcher is None: - self.set_launcher(manifest.launcher) - - if not self._launcher: - raise SmartSimError(f"Unable to set launcher from {manifest_path}") - - # filter out previously added items - runs = [run for run in manifest.runs if run.timestamp not in self._tracked_runs] - - # manifest is stored at /.smartsim/telemetry/manifest.json - exp_dir = pathlib.Path(manifest_path).parent.parent.parent - - for run in runs: - for entity in run.flatten( - filter_fn=lambda e: e.key not in self._tracked_jobs - ): - entity.path = str(exp_dir) - - # track everything coming in (managed and unmanaged) - self._tracked_jobs[entity.key] = entity - - # register collectors for new entities as needed - if entity.telemetry_on: - self._collector_mgr.register_collectors(entity) - - # persist a `start` event for each new entity in the manifest - write_event( - run.timestamp, - entity.task_id, - entity.step_id, - entity.type, - "start", - pathlib.Path(entity.status_dir), - ) - - if entity.is_managed: - # Tell JobManager the task is unmanaged. This collects - # status updates but does not try to start a new copy - self.job_manager.add_job( - entity.name, - entity.step_id, - entity, - False, - ) - # Tell the launcher it's managed so it doesn't attempt - # to look for a PID that may no longer exist - self._launcher.step_mapping.add( - entity.name, entity.step_id, "", True - ) - self._tracked_runs[run.timestamp] = run - - def on_modified(self, event: FileSystemEvent) -> None: - """Event handler for when a file or directory is modified. - - :param event: event representing file/directory modification. - """ - super().on_modified(event) - logger.debug(f"Processing manifest modified @ {event.src_path}") - self.process_manifest(event.src_path) - - def on_created(self, event: FileSystemEvent) -> None: - """Event handler for when a file or directory is created. - - :param event: event representing file/directory creation. - """ - super().on_created(event) - logger.debug(f"processing manifest created @ {event.src_path}") - self.process_manifest(event.src_path) - - async def _to_completed( - self, - timestamp: int, - entity: JobEntity, - step_info: StepInfo, - ) -> None: - """Move a monitored entity from the active to completed collection to - stop monitoring for updates during timesteps. 
- - :param timestamp: current timestamp for event logging - :param entity: running SmartSim Job - :param step_info: `StepInfo` received when requesting a Job status update - """ - # remember completed entities to ignore them after manifest updates - inactive_entity = self._tracked_jobs.pop(entity.key) - if entity.key not in self._completed_jobs: - self._completed_jobs[entity.key] = inactive_entity - - # remove all the registered collectors for the completed entity - await self._collector_mgr.remove(entity) - - job = self.job_manager[entity.name] - self.job_manager.move_to_completed(job) - - status_clause = f"status: {step_info.status}" - error_clause = f", error: {step_info.error}" if step_info.error else "" - - write_path = pathlib.Path(entity.status_dir) - - # persist a `stop` event for an entity that has completed - write_event( - timestamp, - entity.task_id, - entity.step_id, - entity.type, - "stop", - write_path, - detail=f"{status_clause}{error_clause}", - return_code=map_return_code(step_info), - ) - - async def on_timestep(self, timestamp: int) -> None: - """Called at polling frequency to request status updates on - monitored entities - - :param timestamp: current timestamp for event logging - """ - if not self._launcher: - return - - await self._collector_mgr.collect() - - # ensure unmanaged jobs move out of tracked jobs list - u_jobs = [job for job in self._tracked_jobs.values() if not job.is_managed] - for job in u_jobs: - job.check_completion_status() - if job.is_complete: - completed_entity = self._tracked_jobs.pop(job.key) - self._completed_jobs[job.key] = completed_entity - - # consider not using name to avoid collisions - m_jobs = [job for job in self._tracked_jobs.values() if job.is_managed] - if names := {entity.name: entity for entity in m_jobs}: - step_updates: t.List[t.Tuple[str, t.Optional[StepInfo]]] = [] - - try: - task_names = list(names.keys()) - updates = self._launcher.get_step_update(task_names) - step_updates.extend(updates) - logger.debug(f"Retrieved updates for: {task_names}") - except Exception: - logger.warning(f"Telemetry step updates failed for {names.keys()}") - - try: - for step_name, step_info in step_updates: - if step_info and step_info.status in TERMINAL_STATUSES: - completed_entity = names[step_name] - await self._to_completed(timestamp, completed_entity, step_info) - except Exception as ex: - msg = f"An error occurred getting step updates on {names}" - logger.error(msg, exc_info=ex) - - async def shutdown(self) -> None: - """Release all resources owned by the `ManifestEventHandler`""" - logger.debug(f"{type(self).__name__} shutting down...") - await self._collector_mgr.shutdown() - logger.debug(f"{type(self).__name__} shutdown complete...") - - -class TelemetryMonitorArgs: - """Strongly typed entity to house logic for validating - configuration passed to the telemetry monitor""" - - def __init__( - self, - exp_dir: str, - frequency: int, - cooldown: int, - log_level: int = logging.DEBUG, - ) -> None: - """Initialize the instance with inputs and defaults - - :param exp_dir: root path to experiment outputs - :param frequency: desired frequency of metric & status updates (in seconds) - :param frequency: cooldown period (in seconds) before automatic shutdown - :param log_level: log level to apply to python logging - """ - self.exp_dir: str = exp_dir - self.frequency: int = frequency # freq in seconds - self.cooldown: int = cooldown # cooldown in seconds - self.log_level: int = log_level - self._validate() - - @property - def min_frequency(self) -> 
int: - """The minimum duration (in seconds) for the monitoring loop to wait - between executions of the monitoring loop. Shorter frequencies may - not allow the monitoring loop to complete. Adjusting the minimum frequency - can result in inconsistent or missing outputs due to the telemetry - monitor cancelling processes that exceed the allotted frequency.""" - return 1 - - @property - def max_frequency(self) -> int: - """The maximum duration (in seconds) for the monitoring loop to wait - between executions of the monitoring loop. Longer frequencies potentially - keep the telemetry monitor alive unnecessarily.""" - return 600 - - @property - def min_cooldown(self) -> int: - """The minimum allowed cooldown period that can be configured. Ensures - the cooldown does not cause the telemetry monitor to shutdown prior to - completing a single pass through the monitoring loop""" - return min(self.frequency + 1, self.cooldown) - - @property - def max_cooldown(self) -> int: - """The maximum allowed cooldown period that can be configured. Ensures the - telemetry monitor can automatically shutdown if not needed""" - return self.max_frequency - - @property - def cooldown_ms(self) -> int: - """The duration of the time period (in ms) the telemetry monitor will - wait for new resources to monitor before shutting down""" - return self.cooldown * 1000 - - @property - def frequency_ms(self) -> int: - """The desired frequency (in ms) of the telemetry monitor attempts - to retrieve status updates and metrics""" - return self.frequency * 1000 - - def _check_exp_dir(self) -> None: - """Validate the existence of the experiment directory""" - if not pathlib.Path(self.exp_dir).exists(): - raise ValueError(f"Experiment directory cannot be found: {self.exp_dir}") - - def _check_frequency(self) -> None: - """Validate the frequency input is in the range - [`min_frequency`, `max_frequency`]""" - if self.max_frequency >= self.frequency >= self.min_frequency: - return - - freq_tpl = "Telemetry collection frequency must be in the range [{0}, {1}]" - raise ValueError(freq_tpl.format(self.min_frequency, self.max_frequency)) - - def _check_log_level(self) -> None: - """Validate the frequency log level input. Uses standard python log levels""" - if self.log_level not in [ - logging.DEBUG, - logging.INFO, - logging.WARNING, - logging.ERROR, - ]: - raise ValueError(f"Invalid log_level supplied: {self.log_level}") - - def _validate(self) -> None: - """Execute all validation functions""" - self._check_exp_dir() - self._check_frequency() - self._check_log_level() - - -class TelemetryMonitor: - """The telemetry monitor is a standalone process managed by SmartSim to perform - long-term retrieval of experiment status updates and resource usage - metrics. Note that a non-blocking driver script is likely to complete before - the SmartSim entities complete. Also, the JobManager performs status updates - only as long as the driver is running. This telemetry monitor entrypoint is - started automatically when a SmartSim experiment calls the `start` method - on resources. 
The entrypoint runs until it has no resources to monitor.""" - - def __init__(self, telemetry_monitor_args: TelemetryMonitorArgs): - """Initialize the telemetry monitor instance - - :param telemetry_monitor_args: configuration for the telemetry monitor - """ - self._observer: BaseObserver = Observer() - """an observer object that triggers the action handler""" - self._args = telemetry_monitor_args - """user-supplied arguments configuring telemetry monitor behavior""" - self._experiment_dir = pathlib.Path(self._args.exp_dir) - """path to the root directory where experiment outputs are written""" - self._telemetry_path = self._experiment_dir / CONFIG.telemetry_subdir - """path to the root directory where telemetry outputs are written""" - self._manifest_path = self._telemetry_path / MANIFEST_FILENAME - """path to the runtime manifest file""" - self._action_handler: t.Optional[ManifestEventHandler] = None - """an event listener holding action handlers for manifest on-change events""" - - def _can_shutdown(self) -> bool: - """Determines if the telemetry monitor can perform shutdown. An - automatic shutdown will occur if there are no active jobs being monitored. - Managed jobs and databases are considered separately due to the way they - are stored in the job manager - - :return: return True if capable of automatically shutting down - """ - managed_jobs = ( - list(self._action_handler.job_manager.jobs.values()) - if self._action_handler - else [] - ) - unmanaged_jobs = ( - list(self._action_handler.tracked_jobs) if self._action_handler else [] - ) - # get an individual count of databases for logging - n_dbs: int = len( - [ - job - for job in managed_jobs + unmanaged_jobs - if isinstance(job, JobEntity) and job.is_db - ] - ) - - # if we have no jobs currently being monitored we can shutdown - n_jobs = len(managed_jobs) + len(unmanaged_jobs) - n_dbs - shutdown_ok = n_jobs + n_dbs == 0 - - logger.debug(f"{n_jobs} active job(s), {n_dbs} active db(s)") - return shutdown_ok - - async def monitor(self) -> None: - """The main monitoring loop. Executes a busy wait and triggers - telemetry collectors using frequency from constructor arguments. - Continue monitoring until it satisfies automatic shutdown criteria.""" - elapsed: int = 0 - last_ts: int = get_ts_ms() - shutdown_in_progress = False - - if self._action_handler is None: - raise ValueError("The action handler must be initialized to monitor") - - # Event loop runs until the observer shuts down or - # an automatic shutdown is started. - while self._observer.is_alive() and not shutdown_in_progress: - duration_ms = 0 - start_ts = get_ts_ms() - await self._action_handler.on_timestep(start_ts) - - elapsed += start_ts - last_ts - last_ts = start_ts - - # check if there are no jobs being monitored - if self._can_shutdown(): - # cooldown period begins accumulating when no entities are monitored - if elapsed >= self._args.cooldown_ms: - shutdown_in_progress = True - logger.info("Cooldown complete. 
Beginning shutdown") - await self._action_handler.shutdown() - logger.debug("Beginning file monitor shutdown") - self._observer.stop() # type: ignore - logger.debug("Event loop shutdown complete") - break - else: - # reset cooldown any time jobs are running - elapsed = 0 - - # track time elapsed to execute metric collection - duration_ms = get_ts_ms() - start_ts - wait_ms = max(self._args.frequency_ms - duration_ms, 0) - - # delay next loop if collection time didn't exceed loop frequency - wait_sec = wait_ms / 1000 # convert to seconds for sleep - if elapsed > 0: - completion_pct = elapsed / self._args.cooldown_ms * 100 - logger.info(f"Cooldown {completion_pct:.2f}% complete") - logger.debug(f"Collection in {wait_sec:.2f}s") - await asyncio.sleep(wait_sec) - - logger.info("Exiting telemetry monitor event loop") - - async def run(self) -> int: - """Setup the monitoring entities and start the timer-based loop that - will poll for telemetry data - - :return: return code for the process - """ - logger.info("Executing telemetry monitor") - logger.info(f"Polling frequency: {self._args.frequency}s") - logger.info(f"Experiment directory: {self._experiment_dir}") - logger.info(f"Telemetry output: {self._telemetry_path}") - - # Convert second-based inputs to milliseconds - frequency_ms = int(self._args.frequency * 1000) - - # Create event handlers to trigger when target files are changed - log_handler = LoggingEventHandler(logger) - self._action_handler = ManifestEventHandler( - str(MANIFEST_FILENAME), - timeout_ms=frequency_ms, - ignore_patterns=["*.out", "*.err"], - ) - - try: - # The manifest may not exist when the telemetry monitor starts - if self._manifest_path.exists(): - self._action_handler.process_manifest(str(self._manifest_path)) - - # Add a handler to log file-system events - self._observer.schedule(log_handler, self._telemetry_path) # type:ignore - # Add a handler to perform actions on file-system events - self._observer.schedule( - self._action_handler, self._telemetry_path - ) # type:ignore - self._observer.start() # type: ignore - - # kick off the 'infinite' monitoring loop - await self.monitor() - return os.EX_OK - except Exception as ex: - logger.error(ex) - finally: - await self._action_handler.shutdown() - self.cleanup() - logger.info("Telemetry monitor shutdown complete") - - return os.EX_SOFTWARE - - def cleanup(self) -> None: - """Perform cleanup for all allocated resources""" - if self._observer is not None and self._observer.is_alive(): - logger.debug("Cleaning up manifest observer") - self._observer.stop() # type: ignore - self._observer.join() diff --git a/smartsim/_core/utils/telemetry/util.py b/smartsim/_core/utils/telemetry/util.py deleted file mode 100644 index 86a824bd6b..0000000000 --- a/smartsim/_core/utils/telemetry/util.py +++ /dev/null @@ -1,113 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2025 Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# import asyncio -import json -import logging -import os -import pathlib -import typing as t - -from smartsim._core.launcher.stepInfo import StepInfo -from smartsim.status import TERMINAL_STATUSES, SmartSimStatus - -_EventClass = t.Literal["start", "stop", "timestep"] - -logger = logging.getLogger("TelemetryMonitor") - - -def write_event( - timestamp: int, - task_id: t.Union[int, str], - step_id: str, - entity_type: str, - event_type: _EventClass, - status_dir: pathlib.Path, - detail: str = "", - return_code: t.Optional[int] = None, -) -> None: - """Write a record to durable storage for a SmartSimEntity lifecycle event. - Does not overwrite existing records. - - :param timestamp: when the event occurred - :param task_id: the task_id of a managed task - :param step_id: the step_id of an unmanaged task - :param entity_type: the SmartSimEntity subtype - (e.g. `orchestrator`, `ensemble`, `model`, `dbnode`, ...) - :param event_type: the event subtype - :param status_dir: path where the SmartSimEntity outputs are written - :param detail: (optional) additional information to write with the event - :param return_code: (optional) the return code of a completed task - """ - tgt_path = status_dir / f"{event_type}.json" - tgt_path.parent.mkdir(parents=True, exist_ok=True) - - try: - if task_id: - task_id = int(task_id) - except ValueError: - if not isinstance(task_id, str): - logger.exception(f"Unable to parse task_id: {task_id}") - - entity_dict = { - "timestamp": timestamp, - "job_id": task_id, - "step_id": step_id, - "type": entity_type, - "action": event_type, - } - - if detail is not None: - entity_dict["detail"] = detail - - if return_code is not None: - entity_dict["return_code"] = return_code - - try: - if not tgt_path.exists(): - # Don't overwrite existing tracking files - bytes_written = tgt_path.write_text(json.dumps(entity_dict, indent=2)) - if bytes_written < 1: - logger.warning("event tracking failed to write tracking file.") - except Exception: - logger.error("Unable to write tracking file.", exc_info=True) - - -def map_return_code(step_info: StepInfo) -> t.Optional[int]: - """Converts a return code from a workload manager into a SmartSim status. - - A non-terminal status is converted to null. This indicates - that the process referenced in the `StepInfo` is running - and does not yet have a return code. 
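# For reference, a minimal standalone illustration of the write-once record
# produced by `write_event` above; the directory and field values are invented
# purely to show the shape of a `stop.json` file.
import json
import pathlib

status_dir = pathlib.Path("/tmp/exp/.smartsim/telemetry/model_0")  # hypothetical
record = {
    "timestamp": 1718286458123,
    "job_id": 12345,
    "step_id": "4121050.30",
    "type": "model",
    "action": "stop",
    "detail": "status: Completed",
    "return_code": 0,
}
tgt_path = status_dir / f"{record['action']}.json"
tgt_path.parent.mkdir(parents=True, exist_ok=True)
if not tgt_path.exists():  # never overwrite an existing tracking file
    tgt_path.write_text(json.dumps(record, indent=2), encoding="utf-8")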
- - :param step_info: step information produced by job manager status update queries - :return: a return code if the step is finished, otherwise None - """ - rc_map = {s: 1 for s in TERMINAL_STATUSES} # return `1` for all terminal statuses - rc_map.update( - {SmartSimStatus.STATUS_COMPLETED: os.EX_OK} - ) # return `0` for full success - - return rc_map.get(step_info.status, None) # return `None` when in-progress diff --git a/smartsim/database/orchestrator.py b/smartsim/database/orchestrator.py index 56ca160dcb..25ec48f4e0 100644 --- a/smartsim/database/orchestrator.py +++ b/smartsim/database/orchestrator.py @@ -43,7 +43,7 @@ from .._core.utils.helpers import is_valid_cmd, unpack_db_identifier from .._core.utils.network import get_ip_from_host from .._core.utils.shell import execute_cmd -from ..entity import DBNode, EntityList, TelemetryConfiguration +from ..entity import DBNode, EntityList from ..error import ( SmartSimError, SSConfigError, @@ -68,7 +68,7 @@ logger = get_logger(__name__) -by_launcher: t.Dict[str, t.List[str]] = { +by_launcher: dict[str, list[str]] = { "dragon": [""], "slurm": ["srun", "mpirun", "mpiexec"], "pbs": ["aprun", "mpirun", "mpiexec"], @@ -93,7 +93,7 @@ def _detect_command(launcher: str) -> str: raise SmartSimError(msg) -def _autodetect(launcher: str, run_command: str) -> t.Tuple[str, str]: +def _autodetect(launcher: str, run_command: str) -> tuple[str, str]: """Automatically detect the launcher and run command to use""" if launcher == "auto": launcher = detect_launcher() @@ -163,22 +163,22 @@ class Orchestrator(EntityList[DBNode]): def __init__( self, - path: t.Optional[str] = getcwd(), + path: str | None = getcwd(), port: int = 6379, - interface: t.Union[str, t.List[str]] = "lo", + interface: str | list[str] = "lo", launcher: str = "local", run_command: str = "auto", db_nodes: int = 1, batch: bool = False, - hosts: t.Optional[t.Union[t.List[str], str]] = None, - account: t.Optional[str] = None, - time: t.Optional[str] = None, - alloc: t.Optional[str] = None, + hosts: list[str] | str | None = None, + account: str | None = None, + time: str | None = None, + alloc: str | None = None, single_cmd: bool = False, *, - threads_per_queue: t.Optional[int] = None, - inter_op_threads: t.Optional[int] = None, - intra_op_threads: t.Optional[int] = None, + threads_per_queue: int | None = None, + inter_op_threads: int | None = None, + intra_op_threads: int | None = None, db_identifier: str = "orchestrator", **kwargs: t.Any, ) -> None: @@ -213,9 +213,9 @@ def __init__( single_cmd = _get_single_command( self.run_command, self.launcher, batch, single_cmd ) - self.ports: t.List[int] = [] - self._hosts: t.List[str] = [] - self._user_hostlist: t.List[str] = [] + self.ports: list[int] = [] + self._hosts: list[str] = [] + self._user_hostlist: list[str] = [] if isinstance(interface, str): interface = [interface] self._interfaces = interface @@ -223,10 +223,9 @@ def __init__( self.queue_threads = threads_per_queue self.inter_threads = inter_op_threads self.intra_threads = intra_op_threads - self._telemetry_cfg = TelemetryConfiguration() - gpus_per_shard: t.Optional[int] = None - cpus_per_shard: t.Optional[int] = None + gpus_per_shard: int | None = None + cpus_per_shard: int | None = None super().__init__( name=db_identifier, @@ -285,8 +284,8 @@ def __init__( "Orchestrator with mpirun", ) ) - self._reserved_run_args: t.Dict[t.Type[RunSettings], t.List[str]] = {} - self._reserved_batch_args: t.Dict[t.Type[BatchSettings], t.List[str]] = {} + self._reserved_run_args: dict[type[RunSettings], 
list[str]] = {} + self._reserved_batch_args: dict[type[BatchSettings], list[str]] = {} self._fill_reserved() def _mpi_has_sge_support(self) -> bool: @@ -335,7 +334,7 @@ def db_nodes(self) -> int: return self.num_shards @property - def hosts(self) -> t.List[str]: + def hosts(self) -> list[str]: """Return the hostnames of Orchestrator instance hosts Note that this will only be populated after the orchestrator @@ -347,14 +346,6 @@ def hosts(self) -> t.List[str]: self._hosts = self._get_db_hosts() return self._hosts - @property - def telemetry(self) -> TelemetryConfiguration: - """Return the telemetry configuration for this entity. - - :returns: configuration of telemetry for this entity - """ - return self._telemetry_cfg - def reset_hosts(self) -> None: """Clear hosts or reset them to last user choice""" for node in self.entities: @@ -369,7 +360,7 @@ def remove_stale_files(self) -> None: for db in self.entities: db.remove_stale_dbnode_files() - def get_address(self) -> t.List[str]: + def get_address(self) -> list[str]: """Return database addresses :return: addresses @@ -382,7 +373,7 @@ def get_address(self) -> t.List[str]: raise SmartSimError("Database is not active") return self._get_address() - def _get_address(self) -> t.List[str]: + def _get_address(self) -> list[str]: return [ f"{host}:{port}" for host, port in itertools.product(self._hosts, self.ports) @@ -400,7 +391,7 @@ def is_active(self) -> bool: return db_is_active(hosts, self.ports, self.num_shards) @property - def _rai_module(self) -> t.Tuple[str, ...]: + def _rai_module(self) -> tuple[str, ...]: """Get the RedisAI module from third-party installations :return: Tuple of args to pass to the orchestrator exe @@ -469,7 +460,7 @@ def set_walltime(self, walltime: str) -> None: if hasattr(self, "batch_settings") and self.batch_settings: self.batch_settings.set_walltime(walltime) - def set_hosts(self, host_list: t.Union[t.List[str], str]) -> None: + def set_hosts(self, host_list: list[str] | str) -> None: """Specify the hosts for the ``Orchestrator`` to launch on :param host_list: list of host (compute node names) @@ -505,7 +496,7 @@ def set_hosts(self, host_list: t.Union[t.List[str], str]) -> None: for i, mpmd_runsettings in enumerate(db.run_settings.mpmd, 1): mpmd_runsettings.set_hostlist(host_list[i]) - def set_batch_arg(self, arg: str, value: t.Optional[str] = None) -> None: + def set_batch_arg(self, arg: str, value: str | None = None) -> None: """Set a batch argument the orchestrator should launch with Some commonly used arguments such as --job-name are used @@ -526,7 +517,7 @@ def set_batch_arg(self, arg: str, value: t.Optional[str] = None) -> None: else: self.batch_settings.batch_args[arg] = value - def set_run_arg(self, arg: str, value: t.Optional[str] = None) -> None: + def set_run_arg(self, arg: str, value: str | None = None) -> None: """Set a run argument the orchestrator should launch each node with (it will be passed to `jrun`) @@ -663,9 +654,9 @@ def _build_batch_settings( account: str, time: str, *, - launcher: t.Optional[str] = None, + launcher: str | None = None, **kwargs: t.Any, - ) -> t.Optional[BatchSettings]: + ) -> BatchSettings | None: batch_settings = None if launcher is None: @@ -683,9 +674,9 @@ def _build_batch_settings( def _build_run_settings( self, exe: str, - exe_args: t.List[t.List[str]], + exe_args: list[list[str]], *, - run_args: t.Optional[t.Dict[str, t.Any]] = None, + run_args: dict[str, t.Any] | None = None, db_nodes: int = 1, single_cmd: bool = True, **kwargs: t.Any, @@ -778,7 +769,7 @@ def 
_initialize_entities_mpmd( ) -> None: cluster = db_nodes >= 3 mpmd_node_name = self.name + "_0" - exe_args_mpmd: t.List[t.List[str]] = [] + exe_args_mpmd: list[list[str]] = [] for db_id in range(db_nodes): db_shard_name = "_".join((self.name, str(db_id))) @@ -789,7 +780,7 @@ def _initialize_entities_mpmd( ) exe_args = " ".join(start_script_args) exe_args_mpmd.append(sh_split(exe_args)) - run_settings: t.Optional[RunSettings] = None + run_settings: RunSettings | None = None run_settings = self._build_run_settings( sys.executable, exe_args_mpmd, db_nodes=db_nodes, port=port, **kwargs @@ -808,9 +799,7 @@ def _initialize_entities_mpmd( self.entities.append(node) self.ports = [port] - def _get_start_script_args( - self, name: str, port: int, cluster: bool - ) -> t.List[str]: + def _get_start_script_args(self, name: str, port: int, cluster: bool) -> list[str]: cmd = [ "-m", "smartsim._core.entrypoints.redis", # entrypoint @@ -827,7 +816,7 @@ def _get_start_script_args( return cmd - def _get_db_hosts(self) -> t.List[str]: + def _get_db_hosts(self) -> list[str]: hosts = [] for db in self.entities: if not db.is_mpmd: diff --git a/smartsim/entity/__init__.py b/smartsim/entity/__init__.py index 823623c76a..e1a0205335 100644 --- a/smartsim/entity/__init__.py +++ b/smartsim/entity/__init__.py @@ -27,7 +27,7 @@ from .dbnode import DBNode from .dbobject import * from .ensemble import Ensemble -from .entity import SmartSimEntity, TelemetryConfiguration +from .entity import SmartSimEntity from .entityList import EntityList, EntitySequence from .files import TaggedFilesHierarchy from .model import Model diff --git a/smartsim/entity/dbnode.py b/smartsim/entity/dbnode.py index 98f7baed69..9dd32d7649 100644 --- a/smartsim/entity/dbnode.py +++ b/smartsim/entity/dbnode.py @@ -31,6 +31,7 @@ import os.path as osp import time import typing as t +from collections.abc import Iterable from dataclasses import dataclass from .._core.config import CONFIG @@ -56,14 +57,14 @@ def __init__( name: str, path: str, run_settings: RunSettings, - ports: t.List[int], - output_files: t.List[str], + ports: list[int], + output_files: list[str], db_identifier: str = "", ) -> None: """Initialize a database node within an orchestrator.""" super().__init__(name, path, run_settings) self.ports = ports - self._hosts: t.Optional[t.List[str]] = None + self._hosts: list[str] | None = None if not output_files: raise ValueError("output_files cannot be empty") @@ -93,7 +94,7 @@ def host(self) -> str: return host @property - def hosts(self) -> t.List[str]: + def hosts(self) -> list[str]: if not self._hosts: self._hosts = self._parse_db_hosts() return self._hosts @@ -109,7 +110,7 @@ def is_mpmd(self) -> bool: return bool(self.run_settings.mpmd) - def set_hosts(self, hosts: t.List[str]) -> None: + def set_hosts(self, hosts: list[str]) -> None: self._hosts = [str(host) for host in hosts] def remove_stale_dbnode_files(self) -> None: @@ -140,7 +141,7 @@ def remove_stale_dbnode_files(self) -> None: if osp.exists(file_name): os.remove(file_name) - def _get_cluster_conf_filenames(self, port: int) -> t.List[str]: + def _get_cluster_conf_filenames(self, port: int) -> list[str]: """Returns the .conf file name for the given port number This function should bu used if and only if ``_mpmd==True`` @@ -157,8 +158,8 @@ def _get_cluster_conf_filenames(self, port: int) -> t.List[str]: @staticmethod def _parse_launched_shard_info_from_iterable( - stream: t.Iterable[str], num_shards: t.Optional[int] = None - ) -> "t.List[LaunchedShardData]": + stream: Iterable[str], 
num_shards: int | None = None + ) -> "list[LaunchedShardData]": lines = (line.strip() for line in stream) lines = (line for line in lines if line) tokenized = (line.split(maxsplit=1) for line in lines) @@ -167,7 +168,7 @@ def _parse_launched_shard_info_from_iterable( kwjson for first, kwjson in tokenized if "SMARTSIM_ORC_SHARD_INFO" in first ) shard_data_kwargs = (json.loads(kwjson) for kwjson in shard_data_jsons) - shard_data: "t.Iterable[LaunchedShardData]" = ( + shard_data: "Iterable[LaunchedShardData]" = ( LaunchedShardData(**kwargs) for kwargs in shard_data_kwargs ) if num_shards: @@ -176,18 +177,18 @@ def _parse_launched_shard_info_from_iterable( @classmethod def _parse_launched_shard_info_from_files( - cls, file_paths: t.List[str], num_shards: t.Optional[int] = None - ) -> "t.List[LaunchedShardData]": + cls, file_paths: list[str], num_shards: int | None = None + ) -> "list[LaunchedShardData]": with fileinput.FileInput(file_paths) as ifstream: return cls._parse_launched_shard_info_from_iterable(ifstream, num_shards) - def get_launched_shard_info(self) -> "t.List[LaunchedShardData]": + def get_launched_shard_info(self) -> "list[LaunchedShardData]": """Parse the launched database shard info from the output files :raises SSDBFilesNotParseable: if all shard info could not be found :return: The found launched shard info """ - ips: "t.List[LaunchedShardData]" = [] + ips: "list[LaunchedShardData]" = [] trials = CONFIG.database_file_parse_trials interval = CONFIG.database_file_parse_interval output_files = [osp.join(self.path, file) for file in self._output_files] @@ -214,7 +215,7 @@ def get_launched_shard_info(self) -> "t.List[LaunchedShardData]": raise SSDBFilesNotParseable(msg) return ips - def _parse_db_hosts(self) -> t.List[str]: + def _parse_db_hosts(self) -> list[str]: """Parse the database hosts/IPs from the output files The IP address is preferred, but if hostname is only present @@ -236,8 +237,8 @@ class LaunchedShardData: cluster: bool @property - def cluster_conf_file(self) -> t.Optional[str]: + def cluster_conf_file(self) -> str | None: return f"nodes-{self.name}-{self.port}.conf" if self.cluster else None - def to_dict(self) -> t.Dict[str, t.Any]: + def to_dict(self) -> dict[str, t.Any]: return dict(self.__dict__) diff --git a/smartsim/entity/dbobject.py b/smartsim/entity/dbobject.py index 3c0e216b4b..e0239c7df0 100644 --- a/smartsim/entity/dbobject.py +++ b/smartsim/entity/dbobject.py @@ -45,17 +45,15 @@ class DBObject(t.Generic[_DBObjectFuncT]): def __init__( self, name: str, - func: t.Optional[_DBObjectFuncT], - file_path: t.Optional[str], + func: _DBObjectFuncT | None, + file_path: str | None, device: str, devices_per_node: int, first_device: int, ) -> None: self.name = name - self.func: t.Optional[_DBObjectFuncT] = func - self.file: t.Optional[Path] = ( - None # Need to have this explicitly to check on it - ) + self.func: _DBObjectFuncT | None = func + self.file: Path | None = None # Need to have this explicitly to check on it if file_path: self.file = self._check_filepath(file_path) self.device = self._check_device(device) @@ -64,7 +62,7 @@ def __init__( self._check_devices(device, devices_per_node, first_device) @property - def devices(self) -> t.List[str]: + def devices(self) -> list[str]: return self._enumerate_devices() @property @@ -73,9 +71,9 @@ def is_file(self) -> bool: @staticmethod def _check_tensor_args( - inputs: t.Union[str, t.Optional[t.List[str]]], - outputs: t.Union[str, t.Optional[t.List[str]]], - ) -> t.Tuple[t.List[str], t.List[str]]: + inputs: str | 
list[str] | None, + outputs: str | list[str] | None, + ) -> tuple[list[str], list[str]]: if isinstance(inputs, str): inputs = [inputs] if isinstance(outputs, str): @@ -107,7 +105,7 @@ def _check_device(device: str) -> str: raise ValueError("Device argument must start with either CPU or GPU") return device - def _enumerate_devices(self) -> t.List[str]: + def _enumerate_devices(self) -> list[str]: """Enumerate devices for a DBObject :param dbobject: DBObject to enumerate @@ -154,8 +152,8 @@ class DBScript(DBObject[str]): def __init__( self, name: str, - script: t.Optional[str] = None, - script_path: t.Optional[str] = None, + script: str | None = None, + script_path: str | None = None, device: str = Device.CPU.value.upper(), devices_per_node: int = 1, first_device: int = 0, @@ -187,7 +185,7 @@ def __init__( raise ValueError("Either script or script_path must be provided") @property - def script(self) -> t.Optional[t.Union[bytes, str]]: + def script(self) -> bytes | str | None: return self.func def __str__(self) -> str: @@ -210,8 +208,8 @@ def __init__( self, name: str, backend: str, - model: t.Optional[bytes] = None, - model_file: t.Optional[str] = None, + model: bytes | None = None, + model_file: str | None = None, device: str = Device.CPU.value.upper(), devices_per_node: int = 1, first_device: int = 0, @@ -219,8 +217,8 @@ def __init__( min_batch_size: int = 0, min_batch_timeout: int = 0, tag: str = "", - inputs: t.Optional[t.List[str]] = None, - outputs: t.Optional[t.List[str]] = None, + inputs: list[str] | None = None, + outputs: list[str] | None = None, ) -> None: """A TF, TF-lite, PT, or ONNX model to load into the DB at runtime @@ -254,7 +252,7 @@ def __init__( self.inputs, self.outputs = self._check_tensor_args(inputs, outputs) @property - def model(self) -> t.Optional[bytes]: + def model(self) -> bytes | None: return self.func def __str__(self) -> str: diff --git a/smartsim/entity/ensemble.py b/smartsim/entity/ensemble.py index cbf36c4313..8ec9a0c0aa 100644 --- a/smartsim/entity/ensemble.py +++ b/smartsim/entity/ensemble.py @@ -26,6 +26,7 @@ import os.path as osp import typing as t +from collections.abc import Callable, Collection from copy import deepcopy from os import getcwd @@ -49,9 +50,7 @@ logger = get_logger(__name__) -StrategyFunction = t.Callable[ - [t.List[str], t.List[t.List[str]], int], t.List[t.Dict[str, str]] -] +StrategyFunction = Callable[[list[str], list[list[str]], int], list[dict[str, str]]] class Ensemble(EntityList[Model]): @@ -62,11 +61,11 @@ class Ensemble(EntityList[Model]): def __init__( self, name: str, - params: t.Dict[str, t.Any], - path: t.Optional[str] = getcwd(), - params_as_args: t.Optional[t.List[str]] = None, - batch_settings: t.Optional[BatchSettings] = None, - run_settings: t.Optional[RunSettings] = None, + params: dict[str, t.Any], + path: str | None = getcwd(), + params_as_args: list[str] | None = None, + batch_settings: BatchSettings | None = None, + run_settings: RunSettings | None = None, perm_strat: str = "all_perm", **kwargs: t.Any, ) -> None: @@ -100,7 +99,7 @@ def __init__( super().__init__(name, str(path), perm_strat=perm_strat, **kwargs) @property - def models(self) -> t.Collection[Model]: + def models(self) -> Collection[Model]: """An alias for a shallow copy of the ``entities`` attribute""" return list(self.entities) @@ -235,9 +234,9 @@ def query_key_prefixing(self) -> bool: def attach_generator_files( self, - to_copy: t.Optional[t.List[str]] = None, - to_symlink: t.Optional[t.List[str]] = None, - to_configure: t.Optional[t.List[str]] = 
None, + to_copy: list[str] | None = None, + to_symlink: list[str] | None = None, + to_configure: list[str] | None = None, ) -> None: """Attach files to each model within the ensemble for generation @@ -307,7 +306,7 @@ def _set_strategy(strategy: str) -> StrategyFunction: f"Permutation strategy given is not supported: {strategy}" ) - def _read_model_parameters(self) -> t.Tuple[t.List[str], t.List[t.List[str]]]: + def _read_model_parameters(self) -> tuple[list[str], list[list[str]]]: """Take in the parameters given to the ensemble and prepare to create models for the ensemble @@ -320,8 +319,8 @@ def _read_model_parameters(self) -> t.Tuple[t.List[str], t.List[t.List[str]]]: "Ensemble initialization argument 'params' must be of type dict" ) - param_names: t.List[str] = [] - parameters: t.List[t.List[str]] = [] + param_names: list[str] = [] + parameters: list[list[str]] = [] for name, val in self.params.items(): param_names.append(name) @@ -341,8 +340,8 @@ def add_ml_model( self, name: str, backend: str, - model: t.Optional[bytes] = None, - model_path: t.Optional[str] = None, + model: bytes | None = None, + model_path: str | None = None, device: str = Device.CPU.value.upper(), devices_per_node: int = 1, first_device: int = 0, @@ -350,8 +349,8 @@ def add_ml_model( min_batch_size: int = 0, min_batch_timeout: int = 0, tag: str = "", - inputs: t.Optional[t.List[str]] = None, - outputs: t.Optional[t.List[str]] = None, + inputs: list[str] | None = None, + outputs: list[str] | None = None, ) -> None: """A TF, TF-lite, PT, or ONNX model to load into the DB at runtime @@ -411,8 +410,8 @@ def add_ml_model( def add_script( self, name: str, - script: t.Optional[str] = None, - script_path: t.Optional[str] = None, + script: str | None = None, + script_path: str | None = None, device: str = Device.CPU.value.upper(), devices_per_node: int = 1, first_device: int = 0, @@ -466,7 +465,7 @@ def add_script( def add_function( self, name: str, - function: t.Optional[str] = None, + function: str | None = None, device: str = Device.CPU.value.upper(), devices_per_node: int = 1, first_device: int = 0, @@ -517,7 +516,7 @@ def add_function( self._extend_entity_db_scripts(entity, [db_script]) @staticmethod - def _extend_entity_db_models(model: Model, db_models: t.List[DBModel]) -> None: + def _extend_entity_db_models(model: Model, db_models: list[DBModel]) -> None: """ Ensures that the Machine Learning model names being added to the Ensemble are unique. @@ -545,7 +544,7 @@ def _extend_entity_db_models(model: Model, db_models: t.List[DBModel]) -> None: model.add_ml_model_object(add_ml_model) @staticmethod - def _extend_entity_db_scripts(model: Model, db_scripts: t.List[DBScript]) -> None: + def _extend_entity_db_scripts(model: Model, db_scripts: list[DBScript]) -> None: """ Ensures that the script/function names being added to the Ensemble are unique. diff --git a/smartsim/entity/entity.py b/smartsim/entity/entity.py index 3e40004cbf..1f33c52b05 100644 --- a/smartsim/entity/entity.py +++ b/smartsim/entity/entity.py @@ -31,64 +31,6 @@ import smartsim.settings.base -class TelemetryConfiguration: - """A base class for configuraing telemetry production behavior on - existing `SmartSimEntity` subclasses. Any class that will have - optional telemetry collection must expose access to an instance - of `TelemetryConfiguration` such as: - - ``` - @property - def telemetry(self) -> TelemetryConfiguration: - # Return the telemetry configuration for this entity. 
- # :returns: Configuration object indicating the configuration - # status of telemetry for this entity - return self._telemetry_producer - ``` - - An instance will be used by to conditionally serialize - values to the `RuntimeManifest` - """ - - def __init__(self, enabled: bool = False) -> None: - """Initialize the telemetry producer and immediately call the `_on_enable` hook. - - :param enabled: flag indicating the initial state of telemetry - """ - self._is_on = enabled - - if self._is_on: - self._on_enable() - else: - self._on_disable() - - @property - def is_enabled(self) -> bool: - """Boolean flag indicating if telemetry is currently enabled - - :returns: `True` if enabled, `False` otherwise - """ - return self._is_on - - def enable(self) -> None: - """Enable telemetry for this producer""" - self._is_on = True - self._on_enable() - - def disable(self) -> None: - """Disable telemetry for this producer""" - self._is_on = False - self._on_disable() - - def _on_enable(self) -> None: - """Overridable hook called after telemetry is `enabled`. Allows subclasses - to perform actions when attempts to change configuration are made""" - - def _on_disable(self) -> None: - """Overridable hook called after telemetry is `disabled`. Allows subclasses - to perform actions when attempts to change configuration are made""" - - class SmartSimEntity: def __init__( self, name: str, path: str, run_settings: "smartsim.settings.base.RunSettings" diff --git a/smartsim/entity/entityList.py b/smartsim/entity/entityList.py index c5eb7571cc..1eccc470cd 100644 --- a/smartsim/entity/entityList.py +++ b/smartsim/entity/entityList.py @@ -25,6 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import typing as t +from collections.abc import Iterable, Sequence from .entity import SmartSimEntity @@ -67,9 +68,9 @@ def __init__(self, name: str, path: str, **kwargs: t.Any) -> None: # object construction into the class' constructor. 
# --------------------------------------------------------------------- # - self.entities: t.Sequence[_T_co] = [] - self._db_models: t.Sequence["smartsim.entity.DBModel"] = [] - self._db_scripts: t.Sequence["smartsim.entity.DBScript"] = [] + self.entities: Sequence[_T_co] = [] + self._db_models: Sequence["smartsim.entity.DBModel"] = [] + self._db_scripts: Sequence["smartsim.entity.DBScript"] = [] # # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @@ -80,12 +81,12 @@ def _initialize_entities(self, **kwargs: t.Any) -> None: raise NotImplementedError @property - def db_models(self) -> t.Iterable["smartsim.entity.DBModel"]: + def db_models(self) -> Iterable["smartsim.entity.DBModel"]: """Return an immutable collection of attached models""" return (model for model in self._db_models) @property - def db_scripts(self) -> t.Iterable["smartsim.entity.DBScript"]: + def db_scripts(self) -> Iterable["smartsim.entity.DBScript"]: """Return an immutable collection of attached scripts""" return (script for script in self._db_scripts) @@ -110,7 +111,7 @@ def set_path(self, new_path: str) -> None: for entity in self.entities: entity.path = new_path - def __getitem__(self, name: str) -> t.Optional[_T_co]: + def __getitem__(self, name: str) -> _T_co | None: for entity in self.entities: if entity.name == name: return entity @@ -129,9 +130,9 @@ class EntityList(EntitySequence[_T]): def __init__(self, name: str, path: str, **kwargs: t.Any) -> None: super().__init__(name, path, **kwargs) # Change container types to be invariant ``list``s - self.entities: t.List[_T] = list(self.entities) - self._db_models: t.List["smartsim.entity.DBModel"] = list(self._db_models) - self._db_scripts: t.List["smartsim.entity.DBScript"] = list(self._db_scripts) + self.entities: list[_T] = list(self.entities) + self._db_models: list["smartsim.entity.DBModel"] = list(self._db_models) + self._db_scripts: list["smartsim.entity.DBScript"] = list(self._db_scripts) def _initialize_entities(self, **kwargs: t.Any) -> None: """Initialize the SmartSimEntity objects in the container""" diff --git a/smartsim/entity/files.py b/smartsim/entity/files.py index 5eaca8c655..35868098fc 100644 --- a/smartsim/entity/files.py +++ b/smartsim/entity/files.py @@ -51,9 +51,9 @@ class EntityFiles: def __init__( self, - tagged: t.Optional[t.List[str]] = None, - copy: t.Optional[t.List[str]] = None, - symlink: t.Optional[t.List[str]] = None, + tagged: list[str] | None = None, + copy: list[str] | None = None, + symlink: list[str] | None = None, ) -> None: """Initialize an EntityFiles instance @@ -93,9 +93,7 @@ def _check_files(self) -> None: self.link[i] = self._check_path(value) @staticmethod - def _type_check_files( - file_list: t.Union[t.List[str], None], file_type: str - ) -> t.List[str]: + def _type_check_files(file_list: list[str] | None, file_type: str) -> list[str]: """Check the type of the files provided by the user. 
:param file_list: either tagged, copy, or symlink files @@ -169,7 +167,7 @@ class TaggedFilesHierarchy: tagged file directory structure can be replicated """ - def __init__(self, parent: t.Optional[t.Any] = None, subdir_name: str = "") -> None: + def __init__(self, parent: t.Any | None = None, subdir_name: str = "") -> None: """Initialize a TaggedFilesHierarchy :param parent: The parent hierarchy of the new hierarchy, @@ -203,8 +201,8 @@ def __init__(self, parent: t.Optional[t.Any] = None, subdir_name: str = "") -> N self._base: str = path.join(parent.base, subdir_name) if parent else "" self.parent: t.Any = parent - self.files: t.Set[str] = set() - self.dirs: t.Set[TaggedFilesHierarchy] = set() + self.files: set[str] = set() + self.dirs: set[TaggedFilesHierarchy] = set() @property def base(self) -> str: @@ -213,7 +211,7 @@ def base(self) -> str: @classmethod def from_list_paths( - cls, path_list: t.List[str], dir_contents_to_base: bool = False + cls, path_list: list[str], dir_contents_to_base: bool = False ) -> t.Any: """Given a list of absolute paths to files and dirs, create and return a TaggedFilesHierarchy instance representing the file hierarchy of @@ -264,7 +262,7 @@ def _add_dir(self, dir_path: str) -> None: [path.join(dir_path, file) for file in os.listdir(dir_path)] ) - def _add_paths(self, paths: t.List[str]) -> None: + def _add_paths(self, paths: list[str]) -> None: """Takes a list of paths and iterates over it, determining if each path is to a file or a dir and then appropriatly adding it to the TaggedFilesHierarchy. diff --git a/smartsim/entity/model.py b/smartsim/entity/model.py index 70bc6c34c0..76c60ad1d0 100644 --- a/smartsim/entity/model.py +++ b/smartsim/entity/model.py @@ -32,6 +32,7 @@ import sys import typing as t import warnings +from collections.abc import Iterable, Mapping from os import getcwd from os import path as osp @@ -48,13 +49,13 @@ logger = get_logger(__name__) -def _parse_model_parameters(params_dict: t.Dict[str, t.Any]) -> t.Dict[str, str]: +def _parse_model_parameters(params_dict: dict[str, t.Any]) -> dict[str, str]: """Convert the values in a params dict to strings :raises TypeError: if params are of the wrong type :return: param dictionary with values and keys cast as strings """ - param_names: t.List[str] = [] - parameters: t.List[str] = [] + param_names: list[str] = [] + parameters: list[str] = [] for name, val in params_dict.items(): param_names.append(name) if isinstance(val, (str, numbers.Number)): @@ -71,11 +72,11 @@ class Model(SmartSimEntity): def __init__( self, name: str, - params: t.Dict[str, str], + params: dict[str, str], run_settings: RunSettings, - path: t.Optional[str] = getcwd(), - params_as_args: t.Optional[t.List[str]] = None, - batch_settings: t.Optional[BatchSettings] = None, + path: str | None = getcwd(), + params_as_args: list[str] | None = None, + batch_settings: BatchSettings | None = None, ): """Initialize a ``Model`` @@ -93,15 +94,15 @@ def __init__( super().__init__(name, str(path), run_settings) self.params = _parse_model_parameters(params) self.params_as_args = params_as_args - self.incoming_entities: t.List[SmartSimEntity] = [] + self.incoming_entities: list[SmartSimEntity] = [] self._key_prefixing_enabled = False self.batch_settings = batch_settings - self._db_models: t.List[DBModel] = [] - self._db_scripts: t.List[DBScript] = [] - self.files: t.Optional[EntityFiles] = None + self._db_models: list[DBModel] = [] + self._db_scripts: list[DBScript] = [] + self.files: EntityFiles | None = None @property - def 
db_models(self) -> t.Iterable[DBModel]: + def db_models(self) -> Iterable[DBModel]: """Retrieve an immutable collection of attached models :return: Return an immutable collection of attached models @@ -109,7 +110,7 @@ def db_models(self) -> t.Iterable[DBModel]: return (model for model in self._db_models) @property - def db_scripts(self) -> t.Iterable[DBScript]: + def db_scripts(self) -> Iterable[DBScript]: """Retrieve an immutable collection attached of scripts :return: Return an immutable collection of attached scripts @@ -161,9 +162,9 @@ def query_key_prefixing(self) -> bool: def attach_generator_files( self, - to_copy: t.Optional[t.List[str]] = None, - to_symlink: t.Optional[t.List[str]] = None, - to_configure: t.Optional[t.List[str]] = None, + to_copy: list[str] | None = None, + to_symlink: list[str] | None = None, + to_configure: list[str] | None = None, ) -> None: """Attach files to an entity for generation @@ -235,7 +236,7 @@ def colocate_db_uds( unix_socket: str = "/tmp/redis.socket", socket_permissions: int = 755, db_cpus: int = 1, - custom_pinning: t.Optional[t.Iterable[t.Union[int, t.Iterable[int]]]] = None, + custom_pinning: Iterable[int | Iterable[int]] | None = None, debug: bool = False, db_identifier: str = "", **kwargs: t.Any, @@ -276,7 +277,7 @@ def colocate_db_uds( f"Invalid name for unix socket: {unix_socket}. Must only " "contain alphanumeric characters or . : _ - /" ) - uds_options: t.Dict[str, t.Union[int, str]] = { + uds_options: dict[str, int | str] = { "unix_socket": unix_socket, "socket_permissions": socket_permissions, # This is hardcoded to 0 as recommended by redis for UDS @@ -294,9 +295,9 @@ def colocate_db_uds( def colocate_db_tcp( self, port: int = 6379, - ifname: t.Union[str, list[str]] = "lo", + ifname: str | list[str] = "lo", db_cpus: int = 1, - custom_pinning: t.Optional[t.Iterable[t.Union[int, t.Iterable[int]]]] = None, + custom_pinning: Iterable[int | Iterable[int]] | None = None, debug: bool = False, db_identifier: str = "", **kwargs: t.Any, @@ -343,18 +344,12 @@ def colocate_db_tcp( def _set_colocated_db_settings( self, - connection_options: t.Mapping[str, t.Union[int, t.List[str], str]], - common_options: t.Dict[ + connection_options: Mapping[str, int | list[str] | str], + common_options: dict[ str, - t.Union[ - t.Union[t.Iterable[t.Union[int, t.Iterable[int]]], None], - bool, - int, - str, - None, - ], + Iterable[int | Iterable[int]] | None | bool | int | str | None, ], - **kwargs: t.Union[int, None], + **kwargs: int | None, ) -> None: """ Ingest the connection-specific options (UDS/TCP) and set the final settings @@ -378,7 +373,7 @@ def _set_colocated_db_settings( # TODO list which db settings can be extras custom_pinning_ = t.cast( - t.Optional[t.Iterable[t.Union[int, t.Iterable[int]]]], + Iterable[int | Iterable[int]] | None, common_options.get("custom_pinning"), ) cpus_ = t.cast(int, common_options.get("cpus")) @@ -386,20 +381,20 @@ def _set_colocated_db_settings( custom_pinning_, cpus_ ) - colo_db_config: t.Dict[ + colo_db_config: dict[ str, - t.Union[ - bool, - int, - str, - None, - t.List[str], - t.Iterable[t.Union[int, t.Iterable[int]]], - t.List[DBModel], - t.List[DBScript], - t.Dict[str, t.Union[int, None]], - t.Dict[str, str], - ], + ( + bool + | int + | str + | None + | list[str] + | Iterable[int | Iterable[int]] + | list[DBModel] + | list[DBScript] + | dict[str, int | None] + | dict[str, str] + ), ] = {} colo_db_config.update(connection_options) colo_db_config.update(common_options) @@ -423,8 +418,8 @@ def _set_colocated_db_settings( 
@staticmethod def _create_pinning_string( - pin_ids: t.Optional[t.Iterable[t.Union[int, t.Iterable[int]]]], cpus: int - ) -> t.Optional[str]: + pin_ids: Iterable[int | Iterable[int]] | None, cpus: int + ) -> str | None: """Create a comma-separated string of CPU ids. By default, ``None`` returns 0,1,...,cpus-1; an empty iterable will disable pinning altogether, and an iterable constructs a comma separated string of @@ -432,7 +427,7 @@ def _create_pinning_string( """ def _stringify_id(_id: int) -> str: - """Return the cPU id as a string if an int, otherwise raise a ValueError""" + """Return the CPU id as a string if an int, otherwise raise a ValueError""" if isinstance(_id, int): if _id < 0: raise ValueError("CPU id must be a nonnegative number") @@ -491,8 +486,8 @@ def add_ml_model( self, name: str, backend: str, - model: t.Optional[bytes] = None, - model_path: t.Optional[str] = None, + model: bytes | None = None, + model_path: str | None = None, device: str = Device.CPU.value.upper(), devices_per_node: int = 1, first_device: int = 0, @@ -500,8 +495,8 @@ def add_ml_model( min_batch_size: int = 0, min_batch_timeout: int = 0, tag: str = "", - inputs: t.Optional[t.List[str]] = None, - outputs: t.Optional[t.List[str]] = None, + inputs: list[str] | None = None, + outputs: list[str] | None = None, ) -> None: """A TF, TF-lite, PT, or ONNX model to load into the DB at runtime @@ -550,8 +545,8 @@ def add_ml_model( def add_script( self, name: str, - script: t.Optional[str] = None, - script_path: t.Optional[str] = None, + script: str | None = None, + script_path: str | None = None, device: str = Device.CPU.value.upper(), devices_per_node: int = 1, first_device: int = 0, @@ -597,7 +592,7 @@ def add_script( def add_function( self, name: str, - function: t.Optional[str] = None, + function: str | None = None, device: str = Device.CPU.value.upper(), devices_per_node: int = 1, first_device: int = 0, diff --git a/smartsim/entity/strategies.py b/smartsim/entity/strategies.py index 5d0c48a46c..923db4113e 100644 --- a/smartsim/entity/strategies.py +++ b/smartsim/entity/strategies.py @@ -26,15 +26,14 @@ # Generation Strategies import random -import typing as t from itertools import product # create permutations of all parameters # single model if parameters only have one value def create_all_permutations( - param_names: t.List[str], param_values: t.List[t.List[str]], _n_models: int = 0 -) -> t.List[t.Dict[str, str]]: + param_names: list[str], param_values: list[list[str]], _n_models: int = 0 +) -> list[dict[str, str]]: perms = list(product(*param_values)) all_permutations = [] for permutation in perms: @@ -44,8 +43,8 @@ def create_all_permutations( def step_values( - param_names: t.List[str], param_values: t.List[t.List[str]], _n_models: int = 0 -) -> t.List[t.Dict[str, str]]: + param_names: list[str], param_values: list[list[str]], _n_models: int = 0 +) -> list[dict[str, str]]: permutations = [] for param_value in zip(*param_values): permutations.append(dict(zip(param_names, param_value))) @@ -53,8 +52,8 @@ def step_values( def random_permutations( - param_names: t.List[str], param_values: t.List[t.List[str]], n_models: int = 0 -) -> t.List[t.Dict[str, str]]: + param_names: list[str], param_values: list[list[str]], n_models: int = 0 +) -> list[dict[str, str]]: permutations = create_all_permutations(param_names, param_values) # sample from available permutations if n_models is specified diff --git a/smartsim/error/errors.py b/smartsim/error/errors.py index e36f24dda4..dd0519dec9 100644 --- 
a/smartsim/error/errors.py +++ b/smartsim/error/errors.py @@ -24,7 +24,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import typing as t # Exceptions @@ -124,8 +123,8 @@ class ShellError(LauncherError): def __init__( self, message: str, - command_list: t.Union[str, t.List[str]], - details: t.Optional[t.Union[Exception, str]] = None, + command_list: str | list[str], + details: Exception | str | None = None, ) -> None: msg = self.create_message(message, command_list, details=details) super().__init__(msg) @@ -133,8 +132,8 @@ def __init__( @staticmethod def create_message( message: str, - command_list: t.Union[str, t.List[str]], - details: t.Optional[t.Union[Exception, str]], + command_list: str | list[str], + details: Exception | str | None, ) -> str: if isinstance(command_list, list): command_list = " ".join(command_list) @@ -145,18 +144,6 @@ def create_message( return msg -class TelemetryError(SSInternalError): - """Raised when SmartSim runs into trouble establishing or communicating - telemetry information - """ - - -class UnproxyableStepError(TelemetryError): - """Raised when a user attempts to proxy a managed ``Step`` through the - unmanaged step proxy entry point - """ - - class SmartSimCLIActionCancelled(SmartSimError): """Raised when a `smart` CLI command is terminated""" diff --git a/smartsim/experiment.py b/smartsim/experiment.py index 401187b02f..e04ff5fe78 100644 --- a/smartsim/experiment.py +++ b/smartsim/experiment.py @@ -39,13 +39,7 @@ from ._core import Controller, Generator, Manifest, previewrenderer from .database import Orchestrator -from .entity import ( - Ensemble, - EntitySequence, - Model, - SmartSimEntity, - TelemetryConfiguration, -) +from .entity import Ensemble, EntitySequence, Model, SmartSimEntity from .error import SmartSimError from .log import ctx_exp_path, get_logger, method_contextualizer from .settings import Container, base, settings @@ -63,23 +57,6 @@ def _exp_path_map(exp: "Experiment") -> str: _contextualize = method_contextualizer(ctx_exp_path, _exp_path_map) -class ExperimentTelemetryConfiguration(TelemetryConfiguration): - """Customized telemetry configuration for an `Experiment`. Ensures - backwards compatible behavior with drivers using environment variables - to enable experiment telemetry""" - - def __init__(self) -> None: - super().__init__(enabled=CONFIG.telemetry_enabled) - - def _on_enable(self) -> None: - """Modify the environment variable to enable telemetry.""" - environ["SMARTSIM_FLAG_TELEMETRY"] = "1" - - def _on_disable(self) -> None: - """Modify the environment variable to disable telemetry.""" - environ["SMARTSIM_FLAG_TELEMETRY"] = "0" - - # pylint: disable=no-self-use class Experiment: """Experiment is a factory class that creates stages of a workflow @@ -101,7 +78,7 @@ class Experiment: def __init__( self, name: str, - exp_path: t.Optional[str] = None, + exp_path: str | None = None, launcher: str = "local", ): """Initialize an Experiment instance. 
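For context, a minimal usage sketch of the constructor modernized in the hunk above; the experiment name is illustrative, and `exp_path=None` / `launcher="local"` are simply the defaults shown in the signature:

    from smartsim import Experiment

    # exp_path: str | None defaults to None; launcher defaults to "local"
    exp = Experiment("typing-demo", launcher="local")
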
@@ -172,8 +149,7 @@ def __init__( self._control = Controller(launcher=self._launcher) - self.db_identifiers: t.Set[str] = set() - self._telemetry_cfg = ExperimentTelemetryConfiguration() + self.db_identifiers: set[str] = set() def _set_dragon_server_path(self) -> None: """Set path for dragon server through environment varialbes""" @@ -185,7 +161,7 @@ def _set_dragon_server_path(self) -> None: @_contextualize def start( self, - *args: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]], + *args: SmartSimEntity | EntitySequence[SmartSimEntity], block: bool = True, summary: bool = False, kill_on_interrupt: bool = True, @@ -252,9 +228,7 @@ def start( raise @_contextualize - def stop( - self, *args: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]] - ) -> None: + def stop(self, *args: SmartSimEntity | EntitySequence[SmartSimEntity]) -> None: """Stop specific instances launched by this ``Experiment`` Instances of ``Model``, ``Ensemble`` and ``Orchestrator`` @@ -294,8 +268,8 @@ def stop( @_contextualize def generate( self, - *args: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]], - tag: t.Optional[str] = None, + *args: SmartSimEntity | EntitySequence[SmartSimEntity], + tag: str | None = None, overwrite: bool = False, verbose: bool = False, ) -> None: @@ -389,8 +363,8 @@ def finished(self, entity: SmartSimEntity) -> bool: @_contextualize def get_status( - self, *args: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]] - ) -> t.List[SmartSimStatus]: + self, *args: SmartSimEntity | EntitySequence[SmartSimEntity] + ) -> list[SmartSimStatus]: """Query the status of launched entity instances Return a smartsim.status string representing @@ -417,7 +391,7 @@ def get_status( """ try: manifest = Manifest(*args) - statuses: t.List[SmartSimStatus] = [] + statuses: list[SmartSimStatus] = [] for entity in manifest.models: statuses.append(self._control.get_entity_status(entity)) for entity_list in manifest.all_entity_lists: @@ -431,12 +405,12 @@ def get_status( def create_ensemble( self, name: str, - params: t.Optional[t.Dict[str, t.Any]] = None, - batch_settings: t.Optional[base.BatchSettings] = None, - run_settings: t.Optional[base.RunSettings] = None, - replicas: t.Optional[int] = None, + params: dict[str, t.Any] | None = None, + batch_settings: base.BatchSettings | None = None, + run_settings: base.RunSettings | None = None, + replicas: int | None = None, perm_strategy: str = "all_perm", - path: t.Optional[str] = None, + path: str | None = None, **kwargs: t.Any, ) -> Ensemble: """Create an ``Ensemble`` of ``Model`` instances @@ -507,10 +481,10 @@ def create_model( self, name: str, run_settings: base.RunSettings, - params: t.Optional[t.Dict[str, t.Any]] = None, - path: t.Optional[str] = None, + params: dict[str, t.Any] | None = None, + path: str | None = None, enable_key_prefixing: bool = False, - batch_settings: t.Optional[base.BatchSettings] = None, + batch_settings: base.BatchSettings | None = None, ) -> Model: """Create a general purpose ``Model`` @@ -615,11 +589,11 @@ def create_model( def create_run_settings( self, exe: str, - exe_args: t.Optional[t.List[str]] = None, + exe_args: list[str] | None = None, run_command: str = "auto", - run_args: t.Optional[t.Dict[str, t.Union[int, str, float, None]]] = None, - env_vars: t.Optional[t.Dict[str, t.Optional[str]]] = None, - container: t.Optional[Container] = None, + run_args: dict[str, int | str | float | None] | None = None, + env_vars: dict[str, str | None] | None = None, + container: Container | None = None, **kwargs: t.Any, ) -> 
settings.RunSettings: """Create a ``RunSettings`` instance. @@ -675,7 +649,7 @@ def create_batch_settings( time: str = "", queue: str = "", account: str = "", - batch_args: t.Optional[t.Dict[str, str]] = None, + batch_args: dict[str, str] | None = None, **kwargs: t.Any, ) -> base.BatchSettings: """Create a ``BatchSettings`` instance @@ -727,15 +701,15 @@ def create_batch_settings( def create_database( self, port: int = 6379, - path: t.Optional[str] = None, + path: str | None = None, db_nodes: int = 1, batch: bool = False, - hosts: t.Optional[t.Union[t.List[str], str]] = None, + hosts: list[str] | str | None = None, run_command: str = "auto", - interface: t.Union[str, t.List[str]] = "ipogif0", - account: t.Optional[str] = None, - time: t.Optional[str] = None, - queue: t.Optional[str] = None, + interface: str | list[str] = "ipogif0", + account: str | None = None, + time: str | None = None, + queue: str | None = None, single_cmd: bool = True, db_identifier: str = "orchestrator", **kwargs: t.Any, @@ -822,7 +796,7 @@ def preview( *args: t.Any, verbosity_level: previewrenderer.Verbosity = previewrenderer.Verbosity.INFO, output_format: previewrenderer.Format = previewrenderer.Format.PLAINTEXT, - output_filename: t.Optional[str] = None, + output_filename: str | None = None, ) -> None: """Preview entity information prior to launch. This method aggregates multiple pieces of information to give users insight @@ -908,14 +882,6 @@ def summary(self, style: str = "github") -> str: disable_numparse=True, ) - @property - def telemetry(self) -> TelemetryConfiguration: - """Return the telemetry configuration for this entity. - - :returns: configuration of telemetry for this entity - """ - return self._telemetry_cfg - def _launch_summary(self, manifest: Manifest) -> None: """Experiment pre-launch summary of entities that will be launched @@ -941,7 +907,7 @@ def _launch_summary(self, manifest: Manifest) -> None: logger.info(summary) def _create_entity_dir(self, start_manifest: Manifest) -> None: - def create_entity_dir(entity: t.Union[Orchestrator, Model, Ensemble]) -> None: + def create_entity_dir(entity: Orchestrator | Model | Ensemble) -> None: if not os.path.isdir(entity.path): os.makedirs(entity.path) diff --git a/smartsim/log.py b/smartsim/log.py index d96229c8c3..9437adb2d4 100644 --- a/smartsim/log.py +++ b/smartsim/log.py @@ -31,6 +31,7 @@ import sys import threading import typing as t +from collections.abc import Callable from contextvars import ContextVar, copy_context import coloredlogs @@ -89,7 +90,7 @@ def _translate_log_level(user_log_level: str = "info") -> str: return "info" -def get_exp_log_paths() -> t.Tuple[t.Optional[pathlib.Path], t.Optional[pathlib.Path]]: +def get_exp_log_paths() -> tuple[pathlib.Path | None, pathlib.Path | None]: """Returns the output and error file paths to experiment logs. Returns None for both paths if experiment context is unavailable. @@ -98,8 +99,8 @@ def get_exp_log_paths() -> t.Tuple[t.Optional[pathlib.Path], t.Optional[pathlib. 
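A short usage sketch of the `Experiment` factory methods whose signatures are rewritten in the hunks above; the executable and entity names are illustrative, while the keyword names come straight from the signatures shown:

    # assumes `exp` is the Experiment created in the earlier sketch
    rs = exp.create_run_settings(exe="echo", exe_args=["hello"], run_command="auto")
    model = exp.create_model("hello-model", run_settings=rs)
    exp.start(model, block=True)
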
default_paths = None, None if _path := ctx_exp_path.get(): - file_out = pathlib.Path(_path) / CONFIG.telemetry_subdir / "logs/smartsim.out" - file_err = pathlib.Path(_path) / CONFIG.telemetry_subdir / "logs/smartsim.err" + file_out = pathlib.Path(_path) / "logs/smartsim.out" + file_err = pathlib.Path(_path) / "logs/smartsim.err" return file_out, file_err return default_paths @@ -154,7 +155,7 @@ class ContextAwareLogger(logging.Logger): """A logger customized to automatically write experiment logs to a dynamic target directory by inspecting the value of a context var""" - def __init__(self, name: str, level: t.Union[int, str] = 0) -> None: + def __init__(self, name: str, level: int | str = 0) -> None: super().__init__(name, level) self.addFilter(ContextInjectingLogFilter(name="exp-ctx-log-filter")) @@ -163,8 +164,8 @@ def _log( level: int, msg: object, args: t.Any, - exc_info: t.Optional[t.Any] = None, - extra: t.Optional[t.Any] = None, + exc_info: t.Any | None = None, + extra: t.Any | None = None, stack_info: bool = False, stacklevel: int = 1, ) -> None: @@ -189,7 +190,7 @@ def _log( def get_logger( - name: str, log_level: t.Optional[str] = None, fmt: t.Optional[str] = None + name: str, log_level: str | None = None, fmt: str | None = None ) -> logging.Logger: """Return a logger instance @@ -272,8 +273,8 @@ def log_to_exp_file( filename: str, logger: logging.Logger, log_level: str = "warn", - fmt: t.Optional[str] = EXPERIMENT_LOG_FORMAT, - log_filter: t.Optional[logging.Filter] = None, + fmt: str | None = EXPERIMENT_LOG_FORMAT, + log_filter: logging.Filter | None = None, ) -> logging.Handler: """Installs a second filestream handler to the root logger, allowing subsequent logging calls to be sent to filename. @@ -308,10 +309,10 @@ def log_to_exp_file( def method_contextualizer( ctx_var: ContextVar[_ContextT], - ctx_map: t.Callable[[_T], _ContextT], -) -> """t.Callable[ - [t.Callable[Concatenate[_T, _PR], _RT]], - t.Callable[Concatenate[_T, _PR], _RT], + ctx_map: Callable[[_T], _ContextT], +) -> """Callable[ + [Callable[Concatenate[_T, _PR], _RT]], + Callable[Concatenate[_T, _PR], _RT], ]""": """Parameterized-decorator factory that enables a target value to be placed into global context prior to execution of the @@ -325,8 +326,8 @@ def method_contextualizer( """ def _contextualize( - fn: "t.Callable[Concatenate[_T, _PR], _RT]", / - ) -> "t.Callable[Concatenate[_T, _PR], _RT]": + fn: "Callable[Concatenate[_T, _PR], _RT]", / + ) -> "Callable[Concatenate[_T, _PR], _RT]": """Executes the decorated method in a cloned context and ensures `ctx_var` is updated to the value returned by `ctx_map` prior to calling the decorated method""" diff --git a/smartsim/ml/data.py b/smartsim/ml/data.py index 332966bbe5..bd49024ff4 100644 --- a/smartsim/ml/data.py +++ b/smartsim/ml/data.py @@ -69,7 +69,7 @@ def __init__( list_name: str, sample_name: str = "samples", target_name: str = "targets", - num_classes: t.Optional[int] = None, + num_classes: int | None = None, ) -> None: self.list_name = list_name self.sample_name = sample_name @@ -160,10 +160,10 @@ def __init__( list_name: str = "training_data", sample_name: str = "samples", target_name: str = "targets", - num_classes: t.Optional[int] = None, + num_classes: int | None = None, cluster: bool = True, - address: t.Optional[str] = None, - rank: t.Optional[int] = None, + address: str | None = None, + rank: int | None = None, verbose: bool = False, ) -> None: if not list_name: @@ -190,7 +190,7 @@ def target_name(self) -> str: return self._info.target_name 
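The `method_contextualizer` hunk in `smartsim/log.py` above keeps its `Callable[Concatenate[...]]` annotations quoted because they rely on PEP 612. A generic, self-contained sketch of that decorator-typing pattern (not SmartSim's implementation; names are illustrative):

    from __future__ import annotations

    from collections.abc import Callable
    from contextvars import ContextVar, copy_context
    from typing import Concatenate, ParamSpec, TypeVar

    P = ParamSpec("P")
    R = TypeVar("R")
    T = TypeVar("T")

    def contextualizer(ctx_var: ContextVar[str], ctx_map: Callable[[T], str]):
        """Return a decorator that seeds ctx_var from the bound instance before each call."""
        def decorate(fn: Callable[Concatenate[T, P], R]) -> Callable[Concatenate[T, P], R]:
            def wrapper(self: T, *args: P.args, **kwargs: P.kwargs) -> R:
                ctx = copy_context()                 # clone the current context
                ctx.run(ctx_var.set, ctx_map(self))  # set the value only in the clone
                return ctx.run(fn, self, *args, **kwargs)
            return wrapper
        return decorate
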
@property - def num_classes(self) -> t.Optional[int]: + def num_classes(self) -> int | None: return self._info.num_classes def publish_info(self) -> None: @@ -199,7 +199,7 @@ def publish_info(self) -> None: def put_batch( self, samples: np.ndarray, # type: ignore[type-arg] - targets: t.Optional[np.ndarray] = None, # type: ignore[type-arg] + targets: np.ndarray | None = None, # type: ignore[type-arg] ) -> None: batch_ds_name = form_name("training_samples", self.rank, self.batch_idx) batch_ds = Dataset(batch_ds_name) @@ -276,12 +276,12 @@ class DataDownloader: def __init__( self, - data_info_or_list_name: t.Union[str, DataInfo], + data_info_or_list_name: str | DataInfo, batch_size: int = 32, dynamic: bool = True, shuffle: bool = True, cluster: bool = True, - address: t.Optional[str] = None, + address: str | None = None, replica_rank: int = 0, num_replicas: int = 1, verbose: bool = False, @@ -292,8 +292,8 @@ def __init__( self.address = address self.cluster = cluster self.verbose = verbose - self.samples: t.Optional["npt.NDArray[t.Any]"] = None - self.targets: t.Optional["npt.NDArray[t.Any]"] = None + self.samples: "npt.NDArray[t.Any] | None" = None + self.targets: "npt.NDArray[t.Any] | None" = None self.num_samples = 0 self.indices = np.arange(0) self.shuffle = shuffle @@ -307,7 +307,7 @@ def __init__( self._info.download(client) else: raise TypeError("data_info_or_list_name must be either DataInfo or str") - self._client: t.Optional[Client] = None + self._client: Client | None = None sskeyin = environ.get("SSKEYIN", "") self.uploader_keys = sskeyin.split(",") @@ -348,7 +348,7 @@ def target_name(self) -> str: return self._info.target_name @property - def num_classes(self) -> t.Optional[int]: + def num_classes(self) -> int | None: return self._info.num_classes @property @@ -368,7 +368,7 @@ def _calc_indices(self, index: int) -> np.ndarray: # type: ignore[type-arg] def __iter__( self, - ) -> t.Iterator[t.Tuple[np.ndarray, np.ndarray]]: # type: ignore[type-arg] + ) -> t.Iterator[tuple[np.ndarray, np.ndarray]]: # type: ignore[type-arg] self.update_data() # Generate data if len(self) < 1: @@ -416,8 +416,8 @@ def _data_exists(self, batch_name: str, target_name: str) -> bool: return bool(self.client.tensor_exists(batch_name)) - def _add_samples(self, indices: t.List[int]) -> None: - datasets: t.List[Dataset] = [] + def _add_samples(self, indices: list[int]) -> None: + datasets: list[Dataset] = [] if self.num_replicas == 1: datasets = self.client.get_dataset_list_range( @@ -483,7 +483,7 @@ def update_data(self) -> None: def _data_generation( self, indices: "npt.NDArray[t.Any]" - ) -> t.Tuple["npt.NDArray[t.Any]", "npt.NDArray[t.Any]"]: + ) -> tuple["npt.NDArray[t.Any]", "npt.NDArray[t.Any]"]: # Initialization if self.samples is None: raise ValueError("Samples have not been initialized") diff --git a/smartsim/ml/tf/data.py b/smartsim/ml/tf/data.py index 23885d5050..d582833450 100644 --- a/smartsim/ml/tf/data.py +++ b/smartsim/ml/tf/data.py @@ -38,7 +38,7 @@ class _TFDataGenerationCommon(DataDownloader, keras.utils.Sequence): def __getitem__( self, index: int - ) -> t.Tuple[np.ndarray, np.ndarray]: # type: ignore[type-arg] + ) -> tuple[np.ndarray, np.ndarray]: # type: ignore[type-arg] if len(self) < 1: raise ValueError( "Not enough samples in generator for one batch. 
Please " @@ -65,7 +65,7 @@ def on_epoch_end(self) -> None: def _data_generation( self, indices: "npt.NDArray[t.Any]" - ) -> t.Tuple["npt.NDArray[t.Any]", "npt.NDArray[t.Any]"]: + ) -> tuple["npt.NDArray[t.Any]", "npt.NDArray[t.Any]"]: # Initialization if self.samples is None: raise ValueError("No samples loaded for data generation") diff --git a/smartsim/ml/tf/utils.py b/smartsim/ml/tf/utils.py index 2de6a0bcf6..f334784bce 100644 --- a/smartsim/ml/tf/utils.py +++ b/smartsim/ml/tf/utils.py @@ -36,7 +36,7 @@ def freeze_model( model: keras.Model, output_dir: str, file_name: str -) -> t.Tuple[str, t.List[str], t.List[str]]: +) -> tuple[str, list[str], list[str]]: """Freeze a Keras or TensorFlow Graph to use a Keras or TensorFlow model in SmartSim, the model @@ -78,7 +78,7 @@ def freeze_model( return model_file_path, input_names, output_names -def serialize_model(model: keras.Model) -> t.Tuple[str, t.List[str], t.List[str]]: +def serialize_model(model: keras.Model) -> tuple[str, list[str], list[str]]: """Serialize a Keras or TensorFlow Graph to use a Keras or TensorFlow model in SmartSim, the model diff --git a/smartsim/ml/torch/data.py b/smartsim/ml/torch/data.py index 04e508d345..bd8582bbd7 100644 --- a/smartsim/ml/torch/data.py +++ b/smartsim/ml/torch/data.py @@ -44,13 +44,13 @@ def __init__(self, **kwargs: t.Any) -> None: "init_samples=False. Setting it to False automatically." ) - def _add_samples(self, indices: t.List[int]) -> None: + def _add_samples(self, indices: list[int]) -> None: if self.client is None: client = Client(self.cluster, self.address) else: client = self.client - datasets: t.List[Dataset] = [] + datasets: list[Dataset] = [] if self.num_replicas == 1: datasets = client.get_dataset_list_range( self.list_name, start_index=indices[0], end_index=indices[-1] diff --git a/smartsim/settings/alpsSettings.py b/smartsim/settings/alpsSettings.py index 51d99f02aa..6059cc1936 100644 --- a/smartsim/settings/alpsSettings.py +++ b/smartsim/settings/alpsSettings.py @@ -36,9 +36,9 @@ class AprunSettings(RunSettings): def __init__( self, exe: str, - exe_args: t.Optional[t.Union[str, t.List[str]]] = None, - run_args: t.Optional[t.Dict[str, t.Union[int, str, float, None]]] = None, - env_vars: t.Optional[t.Dict[str, t.Optional[str]]] = None, + exe_args: t.Optional[str | list[str]] = None, + run_args: dict[str, int | str | float | None] | None = None, + env_vars: dict[str, str | None] | None = None, **kwargs: t.Any, ): """Settings to run job with ``aprun`` command @@ -58,7 +58,7 @@ def __init__( env_vars=env_vars, **kwargs, ) - self.mpmd: t.List[RunSettings] = [] + self.mpmd: list[RunSettings] = [] def make_mpmd(self, settings: RunSettings) -> None: """Make job an MPMD job @@ -105,7 +105,7 @@ def set_tasks_per_node(self, tasks_per_node: int) -> None: """ self.run_args["pes-per-node"] = int(tasks_per_node) - def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: + def set_hostlist(self, host_list: str | list[str]) -> None: """Specify the hostlist for this job :param host_list: hosts to launch on @@ -128,7 +128,7 @@ def set_hostlist_from_file(self, file_path: str) -> None: """ self.run_args["node-list-file"] = file_path - def set_excluded_hosts(self, host_list: t.Union[str, t.List[str]]) -> None: + def set_excluded_hosts(self, host_list: str | list[str]) -> None: """Specify a list of hosts to exclude for launching this job :param host_list: hosts to exclude @@ -142,7 +142,7 @@ def set_excluded_hosts(self, host_list: t.Union[str, t.List[str]]) -> None: raise TypeError("host_list 
argument must be list of strings") self.run_args["exclude-node-list"] = ",".join(host_list) - def set_cpu_bindings(self, bindings: t.Union[int, t.List[int]]) -> None: + def set_cpu_bindings(self, bindings: int | list[int]) -> None: """Specifies the cores to which MPI processes are bound This sets ``--cpu-binding`` @@ -186,7 +186,7 @@ def set_quiet_launch(self, quiet: bool) -> None: else: self.run_args.pop("quiet", None) - def format_run_args(self) -> t.List[str]: + def format_run_args(self) -> list[str]: """Return a list of ALPS formatted run arguments :return: list of ALPS arguments for these settings @@ -208,7 +208,7 @@ def format_run_args(self) -> t.List[str]: args += ["=".join((prefix + opt, str(value)))] return args - def format_env_vars(self) -> t.List[str]: + def format_env_vars(self) -> list[str]: """Format the environment variables for aprun :return: list of env vars diff --git a/smartsim/settings/base.py b/smartsim/settings/base.py index 03ea0cadfc..039d5844e2 100644 --- a/smartsim/settings/base.py +++ b/smartsim/settings/base.py @@ -26,6 +26,7 @@ import copy import typing as t +from collections.abc import Iterable from smartsim.settings.containers import Container @@ -48,11 +49,11 @@ class RunSettings(SettingsBase): def __init__( self, exe: str, - exe_args: t.Optional[t.Union[str, t.List[str]]] = None, + exe_args: str | list[str] | None = None, run_command: str = "", - run_args: t.Optional[t.Dict[str, t.Union[int, str, float, None]]] = None, - env_vars: t.Optional[t.Dict[str, t.Optional[str]]] = None, - container: t.Optional[Container] = None, + run_args: dict[str, int | str | float | None] | None = None, + env_vars: dict[str, str | None] | None = None, + container: Container | None = None, **_kwargs: t.Any, ) -> None: """Run parameters for a ``Model`` @@ -89,26 +90,27 @@ def __init__( self.container = container self._run_command = run_command self.in_batch = False - self.colocated_db_settings: t.Optional[ - t.Dict[ + self.colocated_db_settings: ( + dict[ str, - t.Union[ - bool, - int, - str, - None, - t.List[str], - t.Iterable[t.Union[int, t.Iterable[int]]], - t.List[DBModel], - t.List[DBScript], - t.Dict[str, t.Union[int, None]], - t.Dict[str, str], - ], + ( + bool + | int + | str + | None + | list[str] + | Iterable[int | Iterable[int]] + | list[DBModel] + | list[DBScript] + | dict[str, int | None] + | dict[str, str] + ), ] - ] = None + | None + ) = None @property - def exe_args(self) -> t.Union[str, t.List[str]]: + def exe_args(self) -> str | list[str]: """Return an immutable list of attached executable arguments. :returns: attached executable arguments @@ -116,7 +118,7 @@ def exe_args(self) -> t.Union[str, t.List[str]]: return self._exe_args @exe_args.setter - def exe_args(self, value: t.Union[str, t.List[str], None]) -> None: + def exe_args(self, value: str | list[str] | None) -> None: """Set the executable arguments. :param value: executable arguments @@ -124,7 +126,7 @@ def exe_args(self, value: t.Union[str, t.List[str], None]) -> None: self._exe_args = self._build_exe_args(value) @property - def run_args(self) -> t.Dict[str, t.Union[int, str, float, None]]: + def run_args(self) -> dict[str, int | str | float | None]: """Return an immutable list of attached run arguments. 
:returns: attached run arguments @@ -132,7 +134,7 @@ def run_args(self) -> t.Dict[str, t.Union[int, str, float, None]]: return self._run_args @run_args.setter - def run_args(self, value: t.Dict[str, t.Union[int, str, float, None]]) -> None: + def run_args(self, value: dict[str, int | str | float | None]) -> None: """Set the run arguments. :param value: run arguments @@ -140,7 +142,7 @@ def run_args(self, value: t.Dict[str, t.Union[int, str, float, None]]) -> None: self._run_args = copy.deepcopy(value) @property - def env_vars(self) -> t.Dict[str, t.Optional[str]]: + def env_vars(self) -> dict[str, str | None]: """Return an immutable list of attached environment variables. :returns: attached environment variables @@ -148,7 +150,7 @@ def env_vars(self) -> t.Dict[str, t.Optional[str]]: return self._env_vars @env_vars.setter - def env_vars(self, value: t.Dict[str, t.Optional[str]]) -> None: + def env_vars(self, value: dict[str, str | None]) -> None: """Set the environment variables. :param value: environment variables @@ -218,7 +220,7 @@ def set_cpus_per_task(self, cpus_per_task: int) -> None: ) ) - def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: + def set_hostlist(self, host_list: str | list[str]) -> None: """Specify the hostlist for this job :param host_list: hosts to launch on @@ -242,7 +244,7 @@ def set_hostlist_from_file(self, file_path: str) -> None: ) ) - def set_excluded_hosts(self, host_list: t.Union[str, t.List[str]]) -> None: + def set_excluded_hosts(self, host_list: str | list[str]) -> None: """Specify a list of hosts to exclude for launching this job :param host_list: hosts to exclude @@ -254,7 +256,7 @@ def set_excluded_hosts(self, host_list: t.Union[str, t.List[str]]) -> None: ) ) - def set_cpu_bindings(self, bindings: t.Union[int, t.List[int]]) -> None: + def set_cpu_bindings(self, bindings: int | list[int]) -> None: """Set the cores to which MPI processes are bound :param bindings: List specifing the cores to which MPI processes are bound @@ -302,7 +304,7 @@ def set_quiet_launch(self, quiet: bool) -> None: ) ) - def set_broadcast(self, dest_path: t.Optional[str] = None) -> None: + def set_broadcast(self, dest_path: str | None = None) -> None: """Copy executable file to allocated compute nodes :param dest_path: Path to copy an executable file @@ -325,7 +327,7 @@ def set_time(self, hours: int = 0, minutes: int = 0, seconds: int = 0) -> None: self._fmt_walltime(int(hours), int(minutes), int(seconds)) ) - def set_node_feature(self, feature_list: t.Union[str, t.List[str]]) -> None: + def set_node_feature(self, feature_list: str | list[str]) -> None: """Specify the node feature for this job :param feature_list: node feature to launch on @@ -377,7 +379,7 @@ def set_binding(self, binding: str) -> None: ) ) - def set_mpmd_preamble(self, preamble_lines: t.List[str]) -> None: + def set_mpmd_preamble(self, preamble_lines: list[str]) -> None: """Set preamble to a file to make a job MPMD :param preamble_lines: lines to put at the beginning of a file. 
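Several setters above now take `str | list[str]`; a standalone sketch of the normalization and validation pattern the visible hunks apply before writing the run argument (a lone string is promoted to a one-element list, then joined with commas):

    def normalize_hostlist(host_list: str | list[str]) -> str:
        """Return the comma-separated hostlist form that WLM flags such as --nodelist expect."""
        if isinstance(host_list, str):
            host_list = [host_list.strip()]
        if not isinstance(host_list, list) or not all(
            isinstance(host, str) for host in host_list
        ):
            raise TypeError("host_list argument must be a list of strings")
        return ",".join(host_list)
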
@@ -402,7 +404,7 @@ def make_mpmd(self, settings: RunSettings) -> None: ) @property - def run_command(self) -> t.Optional[str]: + def run_command(self) -> str | None: """Return the launch binary used to launch the executable Attempt to expand the path to the executable if possible @@ -421,7 +423,7 @@ def run_command(self) -> t.Optional[str]: # run without run command return None - def update_env(self, env_vars: t.Dict[str, t.Union[str, int, float, bool]]) -> None: + def update_env(self, env_vars: dict[str, str | int | float | bool]) -> None: """Update the job environment variables To fully inherit the current user environment, add the @@ -443,7 +445,7 @@ def update_env(self, env_vars: t.Dict[str, t.Union[str, int, float, bool]]) -> N self.env_vars[env] = str(val) - def add_exe_args(self, args: t.Union[str, t.List[str]]) -> None: + def add_exe_args(self, args: str | list[str]) -> None: """Add executable arguments to executable :param args: executable arguments @@ -451,9 +453,7 @@ def add_exe_args(self, args: t.Union[str, t.List[str]]) -> None: args = self._build_exe_args(args) self._exe_args.extend(args) - def set( - self, arg: str, value: t.Optional[str] = None, condition: bool = True - ) -> None: + def set(self, arg: str, value: str | None = None, condition: bool = True) -> None: """Allows users to set individual run arguments. A method that allows users to set run arguments after object @@ -523,7 +523,7 @@ def set( self.run_args[arg] = value @staticmethod - def _build_exe_args(exe_args: t.Optional[t.Union[str, t.List[str]]]) -> t.List[str]: + def _build_exe_args(exe_args: str | list[str] | None) -> list[str]: """Check and convert exe_args input to a desired collection format""" if not exe_args: return [] @@ -545,7 +545,7 @@ def _build_exe_args(exe_args: t.Optional[t.Union[str, t.List[str]]]) -> t.List[s return exe_args - def format_run_args(self) -> t.List[str]: + def format_run_args(self) -> list[str]: """Return formatted run arguments For ``RunSettings``, the run arguments are passed @@ -559,7 +559,7 @@ def format_run_args(self) -> t.List[str]: formatted.append(str(value)) return formatted - def format_env_vars(self) -> t.List[str]: + def format_env_vars(self) -> list[str]: """Build environment variable string :returns: formatted list of strings to export variables @@ -588,12 +588,12 @@ class BatchSettings(SettingsBase): def __init__( self, batch_cmd: str, - batch_args: t.Optional[t.Dict[str, t.Optional[str]]] = None, + batch_args: dict[str, str | None] | None = None, **kwargs: t.Any, ) -> None: self._batch_cmd = batch_cmd self.batch_args = batch_args or {} - self._preamble: t.List[str] = [] + self._preamble: list[str] = [] nodes = kwargs.get("nodes", None) if nodes: self.set_nodes(nodes) @@ -623,7 +623,7 @@ def batch_cmd(self) -> str: return self._batch_cmd @property - def batch_args(self) -> t.Dict[str, t.Optional[str]]: + def batch_args(self) -> dict[str, str | None]: """Retrieve attached batch arguments :returns: attached batch arguments @@ -631,7 +631,7 @@ def batch_args(self) -> t.Dict[str, t.Optional[str]]: return self._batch_args @batch_args.setter - def batch_args(self, value: t.Dict[str, t.Optional[str]]) -> None: + def batch_args(self, value: dict[str, str | None]) -> None: """Attach batch arguments :param value: dictionary of batch arguments @@ -641,7 +641,7 @@ def batch_args(self, value: t.Dict[str, t.Optional[str]]) -> None: def set_nodes(self, num_nodes: int) -> None: raise NotImplementedError - def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: + def 
set_hostlist(self, host_list: str | list[str]) -> None: raise NotImplementedError def set_queue(self, queue: str) -> None: @@ -653,7 +653,7 @@ def set_walltime(self, walltime: str) -> None: def set_account(self, account: str) -> None: raise NotImplementedError - def format_batch_args(self) -> t.List[str]: + def format_batch_args(self) -> list[str]: raise NotImplementedError def set_batch_command(self, command: str) -> None: @@ -663,7 +663,7 @@ def set_batch_command(self, command: str) -> None: """ self._batch_cmd = command - def add_preamble(self, lines: t.List[str]) -> None: + def add_preamble(self, lines: list[str]) -> None: """Add lines to the batch file preamble. The lines are just written (unmodified) at the beginning of the batch file (after the WLM directives) and can be used to e.g. @@ -679,7 +679,7 @@ def add_preamble(self, lines: t.List[str]) -> None: raise TypeError("Expected str or List[str] for lines argument") @property - def preamble(self) -> t.Iterable[str]: + def preamble(self) -> Iterable[str]: """Return an iterable of preamble clauses to be prepended to the batch file :return: attached preamble clauses diff --git a/smartsim/settings/containers.py b/smartsim/settings/containers.py index f187bbb48c..05f7f6ac8b 100644 --- a/smartsim/settings/containers.py +++ b/smartsim/settings/containers.py @@ -101,7 +101,7 @@ class Singularity(Container): def __init__(self, *args: t.Any, **kwargs: t.Any) -> None: super().__init__(*args, **kwargs) - def _container_cmds(self, default_working_directory: str = "") -> t.List[str]: + def _container_cmds(self, default_working_directory: str = "") -> list[str]: """Return list of container commands to be inserted before exe. Container members are validated during this call. diff --git a/smartsim/settings/dragonRunSettings.py b/smartsim/settings/dragonRunSettings.py index 666f490a0b..76939e7083 100644 --- a/smartsim/settings/dragonRunSettings.py +++ b/smartsim/settings/dragonRunSettings.py @@ -40,8 +40,8 @@ class DragonRunSettings(RunSettings): def __init__( self, exe: str, - exe_args: t.Optional[t.Union[str, t.List[str]]] = None, - env_vars: t.Optional[t.Dict[str, t.Optional[str]]] = None, + exe_args: t.Optional[str | list[str]] = None, + env_vars: dict[str, str | None] | None = None, **kwargs: t.Any, ) -> None: """Initialize run parameters for a Dragon process @@ -82,7 +82,7 @@ def set_tasks_per_node(self, tasks_per_node: int) -> None: self.run_args["tasks-per-node"] = tasks_per_node @override - def set_node_feature(self, feature_list: t.Union[str, t.List[str]]) -> None: + def set_node_feature(self, feature_list: str | list[str]) -> None: """Specify the node feature for this job :param feature_list: a collection of strings representing the required @@ -95,14 +95,14 @@ def set_node_feature(self, feature_list: t.Union[str, t.List[str]]) -> None: self.run_args["node-feature"] = ",".join(feature_list) - def set_cpu_affinity(self, devices: t.List[int]) -> None: + def set_cpu_affinity(self, devices: list[int]) -> None: """Set the CPU affinity for this job :param devices: list of CPU indices to execute on """ self.run_args["cpu-affinity"] = ",".join(str(device) for device in devices) - def set_gpu_affinity(self, devices: t.List[int]) -> None: + def set_gpu_affinity(self, devices: list[int]) -> None: """Set the GPU affinity for this job :param devices: list of GPU indices to execute on. 
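The Dragon affinity setters above collapse a list of device indices into the comma-separated string stored in `run_args`; a tiny standalone illustration of that join:

    def affinity_arg(devices: list[int]) -> str:
        """Mirror the ",".join used by set_cpu_affinity and set_gpu_affinity above."""
        return ",".join(str(device) for device in devices)

    # affinity_arg([0, 1, 4]) -> "0,1,4"
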
diff --git a/smartsim/settings/mpiSettings.py b/smartsim/settings/mpiSettings.py index ff698a9fb5..d356c8879d 100644 --- a/smartsim/settings/mpiSettings.py +++ b/smartsim/settings/mpiSettings.py @@ -43,10 +43,10 @@ class _BaseMPISettings(RunSettings): def __init__( self, exe: str, - exe_args: t.Optional[t.Union[str, t.List[str]]] = None, + exe_args: t.Optional[str | list[str]] = None, run_command: str = "mpiexec", - run_args: t.Optional[t.Dict[str, t.Union[int, str, float, None]]] = None, - env_vars: t.Optional[t.Dict[str, t.Optional[str]]] = None, + run_args: dict[str, int | str | float | None] | None = None, + env_vars: dict[str, str | None] | None = None, fail_if_missing_exec: bool = True, **kwargs: t.Any, ) -> None: @@ -75,8 +75,8 @@ def __init__( env_vars=env_vars, **kwargs, ) - self.mpmd: t.List[RunSettings] = [] - self.affinity_script: t.List[str] = [] + self.mpmd: list[RunSettings] = [] + self.affinity_script: list[str] = [] if not shutil.which(self._run_command): msg = ( @@ -151,7 +151,7 @@ def set_tasks(self, tasks: int) -> None: """ self.run_args["n"] = int(tasks) - def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: + def set_hostlist(self, host_list: str | list[str]) -> None: """Set the hostlist for the ``mpirun`` command This sets ``--host`` @@ -200,7 +200,7 @@ def set_quiet_launch(self, quiet: bool) -> None: else: self.run_args.pop("quiet", None) - def set_broadcast(self, dest_path: t.Optional[str] = None) -> None: + def set_broadcast(self, dest_path: str | None = None) -> None: """Copy the specified executable(s) to remote machines This sets ``--preload-binary`` @@ -225,7 +225,7 @@ def set_walltime(self, walltime: str) -> None: """ self.run_args["timeout"] = walltime - def format_run_args(self) -> t.List[str]: + def format_run_args(self) -> list[str]: """Return a list of MPI-standard formatted run arguments :return: list of MPI-standard arguments for these settings @@ -243,7 +243,7 @@ def format_run_args(self) -> t.List[str]: args += [prefix + opt, str(value)] return args - def format_env_vars(self) -> t.List[str]: + def format_env_vars(self) -> list[str]: """Format the environment variables for mpirun :return: list of env vars @@ -264,9 +264,9 @@ class MpirunSettings(_BaseMPISettings): def __init__( self, exe: str, - exe_args: t.Optional[t.Union[str, t.List[str]]] = None, - run_args: t.Optional[t.Dict[str, t.Union[int, str, float, None]]] = None, - env_vars: t.Optional[t.Dict[str, t.Optional[str]]] = None, + exe_args: t.Optional[str | list[str]] = None, + run_args: dict[str, int | str | float | None] | None = None, + env_vars: dict[str, str | None] | None = None, **kwargs: t.Any, ) -> None: """Settings to run job with ``mpirun`` command (MPI-standard) @@ -291,9 +291,9 @@ class MpiexecSettings(_BaseMPISettings): def __init__( self, exe: str, - exe_args: t.Optional[t.Union[str, t.List[str]]] = None, - run_args: t.Optional[t.Dict[str, t.Union[int, str, float, None]]] = None, - env_vars: t.Optional[t.Dict[str, t.Optional[str]]] = None, + exe_args: t.Optional[str | list[str]] = None, + run_args: dict[str, int | str | float | None] | None = None, + env_vars: dict[str, str | None] | None = None, **kwargs: t.Any, ) -> None: """Settings to run job with ``mpiexec`` command (MPI-standard) @@ -327,9 +327,9 @@ class OrterunSettings(_BaseMPISettings): def __init__( self, exe: str, - exe_args: t.Optional[t.Union[str, t.List[str]]] = None, - run_args: t.Optional[t.Dict[str, t.Union[int, str, float, None]]] = None, - env_vars: t.Optional[t.Dict[str, t.Optional[str]]] = 
None, + exe_args: t.Optional[str | list[str]] = None, + run_args: dict[str, int | str | float | None] | None = None, + env_vars: dict[str, str | None] | None = None, **kwargs: t.Any, ) -> None: """Settings to run job with ``orterun`` command (MPI-standard) diff --git a/smartsim/settings/palsSettings.py b/smartsim/settings/palsSettings.py index 1d6e9bedfb..e619bc9910 100644 --- a/smartsim/settings/palsSettings.py +++ b/smartsim/settings/palsSettings.py @@ -53,9 +53,9 @@ class PalsMpiexecSettings(_BaseMPISettings): def __init__( self, exe: str, - exe_args: t.Optional[t.Union[str, t.List[str]]] = None, - run_args: t.Optional[t.Dict[str, t.Union[int, str, float, None]]] = None, - env_vars: t.Optional[t.Dict[str, t.Optional[str]]] = None, + exe_args: t.Optional[str | list[str]] = None, + run_args: dict[str, int | str | float | None] | None = None, + env_vars: dict[str, str | None] | None = None, fail_if_missing_exec: bool = True, **kwargs: t.Any, ) -> None: @@ -142,7 +142,7 @@ def set_quiet_launch(self, quiet: bool) -> None: logger.warning("set_quiet_launch not supported under PALS") - def set_broadcast(self, dest_path: t.Optional[str] = None) -> None: + def set_broadcast(self, dest_path: str | None = None) -> None: """Copy the specified executable(s) to remote machines This sets ``--preload-binary`` @@ -174,7 +174,7 @@ def set_gpu_affinity_script(self, affinity: str, *args: t.Any) -> None: for arg in args: self.affinity_script.append(str(arg)) - def format_run_args(self) -> t.List[str]: + def format_run_args(self) -> list[str]: """Return a list of MPI-standard formatted run arguments :return: list of MPI-standard arguments for these settings @@ -196,7 +196,7 @@ def format_run_args(self) -> t.List[str]: return args - def format_env_vars(self) -> t.List[str]: + def format_env_vars(self) -> list[str]: """Format the environment variables for mpirun :return: list of env vars @@ -216,7 +216,7 @@ def format_env_vars(self) -> t.List[str]: return formatted - def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: + def set_hostlist(self, host_list: str | list[str]) -> None: """Set the hostlist for the PALS ``mpiexec`` command This sets ``--hosts`` diff --git a/smartsim/settings/pbsSettings.py b/smartsim/settings/pbsSettings.py index 8869c2529d..2ec952f622 100644 --- a/smartsim/settings/pbsSettings.py +++ b/smartsim/settings/pbsSettings.py @@ -36,13 +36,13 @@ class QsubBatchSettings(BatchSettings): def __init__( self, - nodes: t.Optional[int] = None, - ncpus: t.Optional[int] = None, - time: t.Optional[str] = None, - queue: t.Optional[str] = None, - account: t.Optional[str] = None, - resources: t.Optional[t.Dict[str, t.Union[str, int]]] = None, - batch_args: t.Optional[t.Dict[str, t.Optional[str]]] = None, + nodes: int | None = None, + ncpus: int | None = None, + time: str | None = None, + queue: str | None = None, + account: str | None = None, + resources: dict[str, str | int] | None = None, + batch_args: dict[str, str | None] | None = None, **kwargs: t.Any, ): """Specify ``qsub`` batch parameters for a job @@ -84,14 +84,14 @@ def __init__( **kwargs, ) - self._hosts: t.List[str] = [] + self._hosts: list[str] = [] @property - def resources(self) -> t.Dict[str, t.Union[str, int]]: + def resources(self) -> dict[str, str | int]: return self._resources.copy() @resources.setter - def resources(self, resources: t.Dict[str, t.Union[str, int]]) -> None: + def resources(self, resources: dict[str, str | int]) -> None: self._sanity_check_resources(resources) self._resources = resources.copy() @@ 
-110,7 +110,7 @@ def set_nodes(self, num_nodes: int) -> None: if num_nodes: self.set_resource("nodes", num_nodes) - def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: + def set_hostlist(self, host_list: str | list[str]) -> None: """Specify the hostlist for this job :param host_list: hosts to launch on @@ -146,7 +146,7 @@ def set_queue(self, queue: str) -> None: if queue: self.batch_args["q"] = str(queue) - def set_ncpus(self, num_cpus: t.Union[int, str]) -> None: + def set_ncpus(self, num_cpus: int | str) -> None: """Set the number of cpus obtained in each node. If a select argument is provided in @@ -165,7 +165,7 @@ def set_account(self, account: str) -> None: if account: self.batch_args["A"] = str(account) - def set_resource(self, resource_name: str, value: t.Union[str, int]) -> None: + def set_resource(self, resource_name: str, value: str | int) -> None: """Set a resource value for the Qsub batch If a select statement is provided, the nodes and ncpus @@ -181,7 +181,7 @@ def set_resource(self, resource_name: str, value: t.Union[str, int]) -> None: self._sanity_check_resources(updated_dict) self.resources = updated_dict - def format_batch_args(self) -> t.List[str]: + def format_batch_args(self) -> list[str]: """Get the formatted batch arguments for a preview :return: batch arguments for Qsub @@ -196,7 +196,7 @@ def format_batch_args(self) -> t.List[str]: return opts def _sanity_check_resources( - self, resources: t.Optional[t.Dict[str, t.Union[str, int]]] = None + self, resources: dict[str, str | int] | None = None ) -> None: """Check that only select or nodes was specified in resources @@ -233,7 +233,7 @@ def _sanity_check_resources( "and str are allowed." ) - def _create_resource_list(self) -> t.List[str]: + def _create_resource_list(self) -> list[str]: self._sanity_check_resources() res = [] diff --git a/smartsim/settings/settings.py b/smartsim/settings/settings.py index 03c37a6851..ecd32f3db0 100644 --- a/smartsim/settings/settings.py +++ b/smartsim/settings/settings.py @@ -25,6 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
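A hedged sketch of the resource validation described in the `QsubBatchSettings` hunks above ("only select or nodes"); the function name and exception types here are illustrative, not SmartSim's exact internals:

    def sanity_check_resources(resources: dict[str, str | int]) -> None:
        """Reject resource dicts that set both 'select' and 'nodes', and enforce key/value types."""
        if "select" in resources and "nodes" in resources:
            raise ValueError("Only one of 'select' and 'nodes' may be specified")
        for key, value in resources.items():
            if not isinstance(key, str):
                raise TypeError("Resource names must be strings")
            if not isinstance(value, (str, int)):
                raise TypeError("Resource values must be int or str")
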
import typing as t +from collections.abc import Callable from .._core.utils.helpers import is_valid_cmd from ..error import SmartSimError @@ -45,16 +46,16 @@ ) from ..wlm import detect_launcher -_TRunSettingsSelector = t.Callable[[str], t.Callable[..., RunSettings]] +_TRunSettingsSelector = Callable[[str], Callable[..., RunSettings]] def create_batch_settings( launcher: str, - nodes: t.Optional[int] = None, + nodes: int | None = None, time: str = "", - queue: t.Optional[str] = None, - account: t.Optional[str] = None, - batch_args: t.Optional[t.Dict[str, str]] = None, + queue: str | None = None, + account: str | None = None, + batch_args: dict[str, str] | None = None, **kwargs: t.Any, ) -> base.BatchSettings: """Create a ``BatchSettings`` instance @@ -72,7 +73,7 @@ def create_batch_settings( :raises SmartSimError: if batch creation fails """ # all supported batch class implementations - by_launcher: t.Dict[str, t.Callable[..., base.BatchSettings]] = { + by_launcher: dict[str, Callable[..., base.BatchSettings]] = { "pbs": QsubBatchSettings, "slurm": SbatchSettings, "pals": QsubBatchSettings, @@ -110,11 +111,11 @@ def create_batch_settings( def create_run_settings( launcher: str, exe: str, - exe_args: t.Optional[t.List[str]] = None, + exe_args: list[str] | None = None, run_command: str = "auto", - run_args: t.Optional[t.Dict[str, t.Union[int, str, float, None]]] = None, - env_vars: t.Optional[t.Dict[str, t.Optional[str]]] = None, - container: t.Optional[Container] = None, + run_args: dict[str, int | str | float | None] | None = None, + env_vars: dict[str, str | None] | None = None, + container: Container | None = None, **kwargs: t.Any, ) -> RunSettings: """Create a ``RunSettings`` instance. @@ -133,7 +134,7 @@ def create_run_settings( :raises SmartSimError: if run_command=="auto" and detection fails """ # all supported RunSettings child classes - supported: t.Dict[str, _TRunSettingsSelector] = { + supported: dict[str, _TRunSettingsSelector] = { "aprun": lambda launcher: AprunSettings, "srun": lambda launcher: SrunSettings, "mpirun": lambda launcher: MpirunSettings, diff --git a/smartsim/settings/sgeSettings.py b/smartsim/settings/sgeSettings.py index 5a46c9f1bd..0bbae9218d 100644 --- a/smartsim/settings/sgeSettings.py +++ b/smartsim/settings/sgeSettings.py @@ -36,13 +36,13 @@ class SgeQsubBatchSettings(BatchSettings): def __init__( self, - time: t.Optional[str] = None, - ncpus: t.Optional[int] = None, - pe_type: t.Optional[str] = None, - account: t.Optional[str] = None, + time: str | None = None, + ncpus: int | None = None, + pe_type: str | None = None, + account: str | None = None, shebang: str = "#!/bin/bash -l", - resources: t.Optional[t.Dict[str, t.Union[str, int]]] = None, - batch_args: t.Optional[t.Dict[str, t.Optional[str]]] = None, + resources: dict[str, str | int] | None = None, + batch_args: dict[str, str | None] | None = None, **kwargs: t.Any, ): """Specify SGE batch parameters for a job @@ -75,19 +75,19 @@ def __init__( **kwargs, ) - self._context_variables: t.List[str] = [] - self._env_vars: t.Dict[str, str] = {} + self._context_variables: list[str] = [] + self._env_vars: dict[str, str] = {} @property - def resources(self) -> t.Dict[str, t.Union[str, int]]: + def resources(self) -> dict[str, str | int]: return self._resources.copy() @resources.setter - def resources(self, resources: t.Dict[str, t.Union[str, int]]) -> None: + def resources(self, resources: dict[str, str | int]) -> None: self._sanity_check_resources(resources) self._resources = resources.copy() - def 
set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: + def set_hostlist(self, host_list: str | list[str]) -> None: raise LauncherUnsupportedFeature( "SGE does not support requesting specific hosts in batch jobs" ) @@ -117,7 +117,7 @@ def set_walltime(self, walltime: str) -> None: if walltime: self.set_resource("h_rt", walltime) - def set_nodes(self, num_nodes: t.Optional[int]) -> None: + def set_nodes(self, num_nodes: int | None) -> None: """Set the number of nodes, invalid for SGE :param nodes: Number of nodes, any integer other than 0 is invalid @@ -127,14 +127,14 @@ def set_nodes(self, num_nodes: t.Optional[int]) -> None: "SGE does not support setting the number of nodes" ) - def set_ncpus(self, num_cpus: t.Union[int, str]) -> None: + def set_ncpus(self, num_cpus: int | str) -> None: """Set the number of cpus obtained in each node. :param num_cpus: number of cpus per node in select """ self.set_resource("ncpus", int(num_cpus)) - def set_ngpus(self, num_gpus: t.Union[int, str]) -> None: + def set_ngpus(self, num_gpus: int | str) -> None: """Set the number of GPUs obtained in each node. :param num_gpus: number of GPUs per node in select @@ -161,7 +161,7 @@ def update_context_variables( self, action: t.Literal["ac", "sc", "dc"], var_name: str, - value: t.Optional[t.Union[int, str]] = None, + value: int | str | None = None, ) -> None: """ Add, set, or delete context variables @@ -214,7 +214,7 @@ def set_threads_per_pe(self, threads_per_core: int) -> None: self._env_vars["OMP_NUM_THREADS"] = str(threads_per_core) - def set_resource(self, resource_name: str, value: t.Union[str, int]) -> None: + def set_resource(self, resource_name: str, value: str | int) -> None: """Set a resource value for the SGE batch If a select statement is provided, the nodes and ncpus @@ -228,7 +228,7 @@ def set_resource(self, resource_name: str, value: t.Union[str, int]) -> None: self._sanity_check_resources(updated_dict) self.resources = updated_dict - def format_batch_args(self) -> t.List[str]: + def format_batch_args(self) -> list[str]: """Get the formatted batch arguments for a preview :return: batch arguments for SGE @@ -243,7 +243,7 @@ def format_batch_args(self) -> t.List[str]: return opts def _sanity_check_resources( - self, resources: t.Optional[t.Dict[str, t.Union[str, int]]] = None + self, resources: dict[str, str | int] | None = None ) -> None: """Check that resources are correctly formatted""" # Note: isinstance check here to avoid collision with default @@ -261,7 +261,7 @@ def _sanity_check_resources( "and str are allowed." 
) - def _create_resource_list(self) -> t.List[str]: + def _create_resource_list(self) -> list[str]: self._sanity_check_resources() res = [] diff --git a/smartsim/settings/slurmSettings.py b/smartsim/settings/slurmSettings.py index faffc7837a..af30ec8a49 100644 --- a/smartsim/settings/slurmSettings.py +++ b/smartsim/settings/slurmSettings.py @@ -29,6 +29,7 @@ import datetime import os import typing as t +from collections.abc import Iterable from ..error import SSUnsupportedError from ..log import get_logger @@ -41,10 +42,10 @@ class SrunSettings(RunSettings): def __init__( self, exe: str, - exe_args: t.Optional[t.Union[str, t.List[str]]] = None, - run_args: t.Optional[t.Dict[str, t.Union[int, str, float, None]]] = None, - env_vars: t.Optional[t.Dict[str, t.Optional[str]]] = None, - alloc: t.Optional[str] = None, + exe_args: t.Optional[str | list[str]] = None, + run_args: dict[str, int | str | float | None] | None = None, + env_vars: dict[str, str | None] | None = None, + alloc: str | None = None, **kwargs: t.Any, ) -> None: """Initialize run parameters for a slurm job with ``srun`` @@ -69,7 +70,7 @@ def __init__( **kwargs, ) self.alloc = alloc - self.mpmd: t.List[RunSettings] = [] + self.mpmd: list[RunSettings] = [] reserved_run_args = frozenset({"chdir", "D"}) @@ -104,7 +105,7 @@ def make_mpmd(self, settings: RunSettings) -> None: ) self.mpmd.append(settings) - def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: + def set_hostlist(self, host_list: str | list[str]) -> None: """Specify the hostlist for this job This sets ``--nodelist`` @@ -129,7 +130,7 @@ def set_hostlist_from_file(self, file_path: str) -> None: """ self.run_args["nodefile"] = file_path - def set_excluded_hosts(self, host_list: t.Union[str, t.List[str]]) -> None: + def set_excluded_hosts(self, host_list: str | list[str]) -> None: """Specify a list of hosts to exclude for launching this job :param host_list: hosts to exclude @@ -170,7 +171,7 @@ def set_tasks_per_node(self, tasks_per_node: int) -> None: """ self.run_args["ntasks-per-node"] = int(tasks_per_node) - def set_cpu_bindings(self, bindings: t.Union[int, t.List[int]]) -> None: + def set_cpu_bindings(self, bindings: int | list[int]) -> None: """Bind by setting CPU masks on tasks This sets ``--cpu-bind`` using the ``map_cpu:`` option @@ -216,7 +217,7 @@ def set_quiet_launch(self, quiet: bool) -> None: else: self.run_args.pop("quiet", None) - def set_broadcast(self, dest_path: t.Optional[str] = None) -> None: + def set_broadcast(self, dest_path: str | None = None) -> None: """Copy executable file to allocated compute nodes This sets ``--bcast`` @@ -225,7 +226,7 @@ def set_broadcast(self, dest_path: t.Optional[str] = None) -> None: """ self.run_args["bcast"] = dest_path - def set_node_feature(self, feature_list: t.Union[str, t.List[str]]) -> None: + def set_node_feature(self, feature_list: str | list[str]) -> None: """Specify the node feature for this job This sets ``-C`` @@ -261,7 +262,7 @@ def set_walltime(self, walltime: str) -> None: """ self.run_args["time"] = str(walltime) - def set_het_group(self, het_group: t.Iterable[int]) -> None: + def set_het_group(self, het_group: Iterable[int]) -> None: """Set the heterogeneous group for this job this sets `--het-group` @@ -291,7 +292,7 @@ def set_het_group(self, het_group: t.Iterable[int]) -> None: logger.warning(msg) self.run_args["het-group"] = ",".join(str(group) for group in het_group) - def format_run_args(self) -> t.List[str]: + def format_run_args(self) -> list[str]: """Return a list of slurm 
formatted run arguments :return: list of slurm arguments for these settings @@ -331,7 +332,7 @@ def check_env_vars(self) -> None: ) logger.warning(msg) - def format_env_vars(self) -> t.List[str]: + def format_env_vars(self) -> list[str]: """Build bash compatible environment variable string for Slurm :returns: the formatted string of environment variables @@ -339,7 +340,7 @@ def format_env_vars(self) -> t.List[str]: self.check_env_vars() return [f"{k}={v}" for k, v in self.env_vars.items() if "," not in str(v)] - def format_comma_sep_env_vars(self) -> t.Tuple[str, t.List[str]]: + def format_comma_sep_env_vars(self) -> tuple[str, list[str]]: """Build environment variable string for Slurm Slurm takes exports in comma separated lists @@ -393,10 +394,10 @@ def fmt_walltime(hours: int, minutes: int, seconds: int) -> str: class SbatchSettings(BatchSettings): def __init__( self, - nodes: t.Optional[int] = None, + nodes: int | None = None, time: str = "", - account: t.Optional[str] = None, - batch_args: t.Optional[t.Dict[str, t.Optional[str]]] = None, + account: str | None = None, + batch_args: dict[str, str | None] | None = None, **kwargs: t.Any, ) -> None: """Specify run parameters for a Slurm batch job @@ -477,7 +478,7 @@ def set_cpus_per_task(self, cpus_per_task: int) -> None: """ self.batch_args["cpus-per-task"] = str(int(cpus_per_task)) - def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: + def set_hostlist(self, host_list: str | list[str]) -> None: """Specify the hostlist for this job :param host_list: hosts to launch on @@ -491,7 +492,7 @@ def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: raise TypeError("host_list argument must be list of strings") self.batch_args["nodelist"] = ",".join(host_list) - def format_batch_args(self) -> t.List[str]: + def format_batch_args(self) -> list[str]: """Get the formatted batch arguments for a preview :return: batch arguments for Sbatch diff --git a/smartsim/wlm/__init__.py b/smartsim/wlm/__init__.py index 1f70dcf3f6..b870de74a7 100644 --- a/smartsim/wlm/__init__.py +++ b/smartsim/wlm/__init__.py @@ -66,7 +66,7 @@ def detect_launcher() -> str: return "local" -def get_hosts(launcher: t.Optional[str] = None) -> t.List[str]: +def get_hosts(launcher: str | None = None) -> list[str]: """Get the name of the hosts used in an allocation. :param launcher: Name of the WLM to use to collect allocation info. If no launcher @@ -83,7 +83,7 @@ def get_hosts(launcher: t.Optional[str] = None) -> t.List[str]: raise SSUnsupportedError(f"SmartSim cannot get hosts for launcher `{launcher}`") -def get_queue(launcher: t.Optional[str] = None) -> str: +def get_queue(launcher: str | None = None) -> str: """Get the name of the queue used in an allocation. :param launcher: Name of the WLM to use to collect allocation info. If no launcher @@ -100,7 +100,7 @@ def get_queue(launcher: t.Optional[str] = None) -> str: raise SSUnsupportedError(f"SmartSim cannot get queue for launcher `{launcher}`") -def get_tasks(launcher: t.Optional[str] = None) -> int: +def get_tasks(launcher: str | None = None) -> int: """Get the number of tasks in an allocation. :param launcher: Name of the WLM to use to collect allocation info. 
If no launcher @@ -117,7 +117,7 @@ def get_tasks(launcher: t.Optional[str] = None) -> int: raise SSUnsupportedError(f"SmartSim cannot get tasks for launcher `{launcher}`") -def get_tasks_per_node(launcher: t.Optional[str] = None) -> t.Dict[str, int]: +def get_tasks_per_node(launcher: str | None = None) -> dict[str, int]: """Get a map of nodes in an allocation to the number of tasks on each node. :param launcher: Name of the WLM to use to collect allocation info. If no launcher diff --git a/smartsim/wlm/pbs.py b/smartsim/wlm/pbs.py index a7e1dae87c..0f7133072c 100644 --- a/smartsim/wlm/pbs.py +++ b/smartsim/wlm/pbs.py @@ -26,7 +26,6 @@ import json import os -import typing as t from shutil import which from smartsim.error.errors import LauncherError, SmartSimError @@ -34,7 +33,7 @@ from .._core.launcher.pbs.pbsCommands import qstat -def get_hosts() -> t.List[str]: +def get_hosts() -> list[str]: """Get the name of the hosts used in a PBS allocation. :returns: Names of the host nodes @@ -92,7 +91,7 @@ def get_tasks() -> int: ) -def get_tasks_per_node() -> t.Dict[str, int]: +def get_tasks_per_node() -> dict[str, int]: """Get the number of processes on each chunk in a PBS allocation. .. note:: diff --git a/smartsim/wlm/slurm.py b/smartsim/wlm/slurm.py index 490e46b218..f4fd579735 100644 --- a/smartsim/wlm/slurm.py +++ b/smartsim/wlm/slurm.py @@ -25,7 +25,6 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import os -import typing as t from shutil import which from .._core.launcher.slurm.slurmCommands import salloc, scancel, scontrol, sinfo @@ -45,9 +44,9 @@ def get_allocation( nodes: int = 1, - time: t.Optional[str] = None, - account: t.Optional[str] = None, - options: t.Optional[t.Dict[str, str]] = None, + time: str | None = None, + account: str | None = None, + options: dict[str, str] | None = None, ) -> str: """Request an allocation @@ -125,7 +124,7 @@ def release_allocation(alloc_id: str) -> None: logger.info(f"Successfully freed allocation {alloc_id}") -def validate(nodes: int = 1, ppn: int = 1, partition: t.Optional[str] = None) -> bool: +def validate(nodes: int = 1, ppn: int = 1, partition: str | None = None) -> bool: """Check that there are sufficient resources in the provided Slurm partitions. if no partition is provided, the default partition is found and used. @@ -191,14 +190,14 @@ def get_default_partition() -> str: return default -def _get_system_partition_info() -> t.Dict[str, Partition]: +def _get_system_partition_info() -> dict[str, Partition]: """Build a dictionary of slurm partitions :returns: dict of Partition objects """ sinfo_output, _ = sinfo(["--noheader", "--format", "%R %n %c"]) - partitions: t.Dict[str, Partition] = {} + partitions: dict[str, Partition] = {} for line in sinfo_output.split("\n"): line = line.strip() if line == "": @@ -220,10 +219,10 @@ def _get_system_partition_info() -> t.Dict[str, Partition]: def _get_alloc_cmd( nodes: int, - time: t.Optional[str] = None, - account: t.Optional[str] = None, - options: t.Optional[t.Dict[str, str]] = None, -) -> t.List[str]: + time: str | None = None, + account: str | None = None, + options: dict[str, str] | None = None, +) -> list[str]: """Return the command to request an allocation from Slurm with the class variables as the slurm options. """ @@ -278,7 +277,7 @@ def _validate_time_format(time: str) -> str: return fmt_walltime(hours, minutes, seconds) -def get_hosts() -> t.List[str]: +def get_hosts() -> list[str]: """Get the name of the nodes used in a slurm allocation. .. 
note:: @@ -327,7 +326,7 @@ def get_tasks() -> int: raise SmartSimError("Could not parse number of requested tasks from SLURM_NTASKS") -def get_tasks_per_node() -> t.Dict[str, int]: +def get_tasks_per_node() -> dict[str, int]: """Get the number of tasks per each node in a slurm allocation. .. note:: diff --git a/tests/on_wlm/test_dragon_entrypoint.py b/tests/on_wlm/test_dragon_entrypoint.py index 287088a7fb..c0ae04d1f1 100644 --- a/tests/on_wlm/test_dragon_entrypoint.py +++ b/tests/on_wlm/test_dragon_entrypoint.py @@ -40,7 +40,7 @@ @pytest.fixture -def mock_argv() -> t.List[str]: +def mock_argv() -> list[str]: """Fixture for returning valid arguments to the entrypoint""" return ["+launching_address", "mock-addr", "+interface", "mock-interface"] @@ -83,7 +83,7 @@ def test_file_removal_on_bad_path(test_dir: str, monkeypatch: pytest.MonkeyPatch def test_dragon_failure( - mock_argv: t.List[str], test_dir: str, monkeypatch: pytest.MonkeyPatch + mock_argv: list[str], test_dir: str, monkeypatch: pytest.MonkeyPatch ): """Verify that the expected cleanup actions are taken when the dragon entrypoint exits""" @@ -110,7 +110,7 @@ def raiser(args_) -> int: def test_dragon_main( - mock_argv: t.List[str], test_dir: str, monkeypatch: pytest.MonkeyPatch + mock_argv: list[str], test_dir: str, monkeypatch: pytest.MonkeyPatch ): """Verify that the expected startup & cleanup actions are taken when the dragon entrypoint exits""" @@ -228,7 +228,7 @@ def increment_counter(*args, **kwargs): def test_signal_handler_registration(test_dir: str, monkeypatch: pytest.MonkeyPatch): """Verify that signal handlers are registered for all expected signals""" - sig_nums: t.List[int] = [] + sig_nums: list[int] = [] def track_args(*args, **kwargs): nonlocal sig_nums diff --git a/tests/on_wlm/test_preview_wlm.py b/tests/on_wlm/test_preview_wlm.py index 78da30c9af..277356b000 100644 --- a/tests/on_wlm/test_preview_wlm.py +++ b/tests/on_wlm/test_preview_wlm.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2025, Hewlett Packard Enterprise # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without diff --git a/tests/test_cli.py b/tests/test_cli.py index 7abf490811..a6db1169d6 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -51,13 +51,6 @@ _TEST_LOGGER = logging.getLogger(__name__) -try: - import smartdashboard -except: - test_dash_plugin = False -else: - test_dash_plugin = True - def mock_execute_custom(msg: str = None, good: bool = True) -> int: retval = 0 if good else 1 @@ -66,20 +59,20 @@ def mock_execute_custom(msg: str = None, good: bool = True) -> int: def mock_execute_good( - _ns: argparse.Namespace, _unparsed: t.Optional[t.List[str]] = None + _ns: argparse.Namespace, _unparsed: list[str] | None = None ) -> int: return mock_execute_custom("GOOD THINGS", good=True) def mock_execute_fail( - _ns: argparse.Namespace, _unparsed: t.Optional[t.List[str]] = None + _ns: argparse.Namespace, _unparsed: list[str] | None = None ) -> int: return mock_execute_custom("BAD THINGS", good=False) def test_cli_default_args_parsing(capsys): """Test default parser behaviors with no subparsers""" - menu: t.List[cli.MenuItemConfig] = [] + menu: list[cli.MenuItemConfig] = [] smart_cli = cli.SmartCli(menu) captured = capsys.readouterr() # throw away existing output @@ -118,7 +111,7 @@ def test_cli_invalid_command(capsys): def test_cli_bad_default_args_parsing_bad_help(capsys): """Test passing an argument name that is incorrect""" - menu: t.List[cli.MenuItemConfig] = [] + menu: list[cli.MenuItemConfig] = [] smart_cli = cli.SmartCli(menu) captured = capsys.readouterr() # throw away existing output @@ -134,7 +127,7 @@ def test_cli_bad_default_args_parsing_bad_help(capsys): def test_cli_bad_default_args_parsing_good_help(capsys): """Test passing an argument name that is correct""" - menu: t.List[cli.MenuItemConfig] = [] + menu: list[cli.MenuItemConfig] = [] smart_cli = cli.SmartCli(menu) captured = capsys.readouterr() # throw away existing output @@ -342,25 +335,6 @@ def test_cli_default_cli(capsys): assert ret_val == os.EX_USAGE -@pytest.mark.skipif(not test_dash_plugin, reason="plugin not found") -def test_cli_plugin_dashboard(capfd): - """Ensure expected dashboard CLI plugin commands are supported""" - smart_cli = cli.default_cli() - capfd.readouterr() # throw away existing output - - # execute with `dashboard` argument, expect dashboard-specific help text - build_args = ["smart", "dashboard", "-h"] - rc = smart_cli.execute(build_args) - - captured = capfd.readouterr() # capture new output - - assert "[-d DIRECTORY]" in captured.out - assert "[-p PORT]" in captured.out - - assert "optional arguments:" in captured.out - assert rc == 0 - - def test_cli_plugin_invalid( monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture ): @@ -371,9 +345,9 @@ def test_cli_plugin_invalid( plugin_module = "notinstalled.Experiment_Overview" bad_plugins = [ lambda: MenuItemConfig( - "dashboard", - "Start the SmartSim dashboard", - plugin.dynamic_execute(plugin_module, "Dashboard!"), + "testplugin", + "Test plugin for invalid plugin test", + plugin.dynamic_execute(plugin_module, "TestPlugin!"), is_plugin=True, ) ] @@ -387,8 +361,8 @@ def test_cli_plugin_invalid( smart_cli = cli.default_cli() - # execute with `dashboard` argument, expect failure to find dashboard plugin - build_args = ["smart", "dashboard", "-h"] + # execute with invalid plugin argument, expect failure to find plugin + build_args = ["smart", "testplugin", "-h"] rc = smart_cli.execute(build_args) @@ -414,7 +388,7 @@ def test_cli_plugin_invalid( def 
test_cli_action(capsys, monkeypatch, command, mock_location, exp_output): """Ensure the default CLI executes the build action""" - def mock_execute(ns: argparse.Namespace, _unparsed: t.Optional[t.List[str]] = None): + def mock_execute(ns: argparse.Namespace, _unparsed: list[str] | None = None): print(exp_output) return 0 @@ -470,7 +444,7 @@ def test_cli_optional_args( ): """Ensure the parser for a command handles expected optional arguments""" - def mock_execute(ns: argparse.Namespace, _unparsed: t.Optional[t.List[str]] = None): + def mock_execute(ns: argparse.Namespace, _unparsed: list[str] | None = None): print(exp_output) return 0 @@ -521,7 +495,7 @@ def test_cli_help_support( ): """Ensure the parser supports help optional for commands as expected""" - def mock_execute(ns: argparse.Namespace, unparsed: t.Optional[t.List[str]] = None): + def mock_execute(ns: argparse.Namespace, unparsed: list[str] | None = None): print(mock_output) return 0 @@ -560,7 +534,7 @@ def test_cli_invalid_optional_args( ): """Ensure the parser throws expected error for an invalid argument""" - def mock_execute(ns: argparse.Namespace, unparsed: t.Optional[t.List[str]] = None): + def mock_execute(ns: argparse.Namespace, unparsed: list[str] | None = None): print(exp_output) return 0 diff --git a/tests/test_collector_manager.py b/tests/test_collector_manager.py deleted file mode 100644 index f4f0d0397e..0000000000 --- a/tests/test_collector_manager.py +++ /dev/null @@ -1,481 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2025, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import asyncio -import datetime - -import pytest - -from conftest import MockCollectorEntityFunc -from smartsim._core.utils.telemetry.collector import ( - CollectorManager, - DBConnectionCollector, - DBConnectionCountCollector, - DBMemoryCollector, - FileSink, - redisa, -) -from smartsim._core.utils.telemetry.telemetry import JobEntity - -# The tests in this file belong to the group_a group -pytestmark = pytest.mark.group_a - - -def test_collector_manager_add(mock_entity: MockCollectorEntityFunc, mock_sink) -> None: - """Ensure that collector manager add & clear work as expected""" - entity1 = mock_entity(telemetry_on=True) - - con_col = DBConnectionCollector(entity1, mock_sink()) - mem_col = DBMemoryCollector(entity1, mock_sink()) - - manager = CollectorManager() - - # ensure manager starts empty - assert len(list(manager.all_collectors)) == 0 - - # ensure added item is in the collector list - manager.add(con_col) - assert len(list(manager.all_collectors)) == 1 - - # ensure a duplicate isn't added - manager.add(con_col) - assert len(list(manager.all_collectors)) == 1 - - # ensure another collector for the same entity is added - manager.add(mem_col) - assert len(list(manager.all_collectors)) == 2 - - # create a collector for another entity - entity2 = mock_entity(telemetry_on=True) - con_col2 = DBConnectionCollector(entity2, mock_sink()) - - # ensure collectors w/same type for new entities are not treated as dupes - manager.add(con_col2) - assert len(list(manager.all_collectors)) == 3 - - # verify no dupe on second entity - manager.add(con_col2) - assert len(list(manager.all_collectors)) == 3 - - manager.clear() - assert len(list(manager.all_collectors)) == 0 - - # ensure post-clear adding still works - manager.add(con_col2) - assert len(list(manager.all_collectors)) == 1 - - -def test_collector_manager_add_multi( - mock_entity: MockCollectorEntityFunc, mock_sink -) -> None: - """Ensure that collector manager multi-add works as expected""" - entity = mock_entity(telemetry_on=True) - - con_col = DBConnectionCollector(entity, mock_sink()) - mem_col = DBMemoryCollector(entity, mock_sink()) - manager = CollectorManager() - - # add multiple items at once - manager.add_all([con_col, mem_col]) - - assert len(list(manager.all_collectors)) == 2 - - # ensure multi-add does not produce dupes - con_col2 = DBConnectionCollector(entity, mock_sink()) - mem_col2 = DBMemoryCollector(entity, mock_sink()) - - manager.add_all([con_col2, mem_col2]) - assert len(list(manager.all_collectors)) == 2 - - -@pytest.mark.asyncio -async def test_collector_manager_remove( - mock_entity: MockCollectorEntityFunc, mock_sink -) -> None: - """Ensure that collector manager solo remove works as expected""" - entity1 = mock_entity(telemetry_on=True) - entity2 = mock_entity(telemetry_on=True) - - con_col1 = DBConnectionCollector(entity1, mock_sink()) - mem_col1 = DBMemoryCollector(entity1, mock_sink()) - manager = CollectorManager() - - # ensure multi-add does not produce dupes - con_col2 = DBConnectionCollector(entity2, mock_sink()) - mem_col2 = DBMemoryCollector(entity2, mock_sink()) - - manager.add_all([con_col1, mem_col1, con_col2, mem_col2]) - assert len(manager.all_collectors) == 4 - - await manager.remove(entity1) - assert len(manager.all_collectors) == 2 - - await manager.remove(entity1) - assert len(manager.all_collectors) == 2 - - await manager.remove(entity2) - assert len(manager.all_collectors) == 0 - - -@pytest.mark.asyncio -async def test_collector_manager_remove_all( - mock_entity: MockCollectorEntityFunc, 
mock_sink -) -> None: - """Ensure that collector manager multi-remove works as expected""" - entity1 = mock_entity(telemetry_on=True) - entity2 = mock_entity(telemetry_on=True) - - con_col1 = DBConnectionCollector(entity1, mock_sink()) - mem_col1 = DBMemoryCollector(entity1, mock_sink()) - manager = CollectorManager() - - # ensure multi-add does not produce dupes - con_col2 = DBConnectionCollector(entity2, mock_sink()) - mem_col2 = DBMemoryCollector(entity2, mock_sink()) - - manager.add_all([con_col1, mem_col1, con_col2, mem_col2]) - assert len(manager.all_collectors) == 4 - - await manager.remove_all([entity1, entity2]) - assert len(manager.all_collectors) == 0 - - -@pytest.mark.asyncio -async def test_collector_manager_collect( - mock_entity: MockCollectorEntityFunc, - mock_redis, - monkeypatch: pytest.MonkeyPatch, - mock_con, - mock_mem, - mock_sink, -) -> None: - """Ensure that all collectors are executed and some metric is retrieved - NOTE: responses & producer are mocked""" - entity1 = mock_entity(port=1234, name="entity1", telemetry_on=True) - entity2 = mock_entity(port=2345, name="entity2", telemetry_on=True) - - sinks = [mock_sink(), mock_sink(), mock_sink()] - con_col1 = DBConnectionCollector(entity1, sinks[0]) - mem_col1 = DBMemoryCollector(entity1, sinks[1]) - mem_col2 = DBMemoryCollector(entity2, sinks[2]) - - manager = CollectorManager() - manager.add_all([con_col1, mem_col1, mem_col2]) - - # Execute collection - with monkeypatch.context() as ctx: - ctx.setattr( - redisa, - "Redis", - mock_redis(client_stats=mock_con(1, 10), mem_stats=mock_mem(1, 10)), - ) - await manager.collect() - - # verify each collector retrieved some metric & sent it to the sink - for sink in sinks: - value = sink.args - assert value - - -@pytest.mark.asyncio -async def test_collector_manager_collect_filesink( - mock_entity: MockCollectorEntityFunc, - mock_redis, - monkeypatch, - mock_mem, - mock_con, -) -> None: - """Ensure that all collectors are executed and some metric is retrieved - and the FileSink is written to as expected""" - entity1 = mock_entity(port=1234, name="entity1", telemetry_on=True) - entity2 = mock_entity(port=2345, name="entity2", telemetry_on=True) - - sinks = [ - FileSink(entity1.status_dir + "/1_con.csv"), - FileSink(entity1.status_dir + "/1_mem.csv"), - FileSink(entity2.status_dir + "/2_mem.csv"), - ] - con_col1 = DBConnectionCollector(entity1, sinks[0]) - mem_col1 = DBMemoryCollector(entity1, sinks[1]) - mem_col2 = DBMemoryCollector(entity2, sinks[2]) - - manager = CollectorManager() - manager.add_all([con_col1, mem_col1, mem_col2]) - - # Execute collection - with monkeypatch.context() as ctx: - ctx.setattr( - redisa, - "Redis", - mock_redis(client_stats=mock_con(1, 10), mem_stats=mock_mem(1, 10)), - ) - await manager.collect() - - # verify each collector retrieved some metric & sent it to the sink - for sink in sinks: - save_to = sink.path - assert save_to.exists() - if "con" in str(save_to): - assert "127.0.0." 
in save_to.read_text() - else: - # look for something multiplied by 1000 - assert "000" in save_to.read_text() - - -@pytest.mark.asyncio -async def test_collector_manager_collect_integration( - test_dir: str, mock_entity: MockCollectorEntityFunc, prepare_db, local_db, mock_sink -) -> None: - """Ensure that all collectors are executed and some metric is retrieved""" - - db = prepare_db(local_db).orchestrator - entity1 = mock_entity(port=db.ports[0], name="e1", telemetry_on=True) - entity2 = mock_entity(port=db.ports[0], name="e2", telemetry_on=True) - - # todo: consider a MockSink so i don't have to save the last value in the collector - sinks = [mock_sink(), mock_sink(), mock_sink()] - con_col1 = DBConnectionCollector(entity1, sinks[0]) - mem_col1 = DBMemoryCollector(entity1, sinks[1]) - mem_col2 = DBMemoryCollector(entity2, sinks[2]) - - manager = CollectorManager() - manager.add_all([con_col1, mem_col1, mem_col2]) - - # Execute collection - await manager.collect() - - # verify each collector retrieved some metric & sent it to the sink - for sink in sinks: - value = sink.args - assert value - - -@pytest.mark.parametrize( - "timeout_at,delay_for,expect_fail", - [ - pytest.param(1000, 5000, True, id="1s timeout"), - pytest.param(2000, 5000, True, id="2s timeout"), - pytest.param(3000, 5000, True, id="3s timeout"), - pytest.param(4000, 5000, True, id="4s timeout"), - pytest.param(2000, 1000, False, id="under timeout"), - ], -) -@pytest.mark.asyncio -async def test_collector_manager_timeout_db( - mock_entity: MockCollectorEntityFunc, - mock_redis, - monkeypatch: pytest.MonkeyPatch, - mock_mem, - mock_con, - timeout_at: int, - delay_for: int, - expect_fail: bool, - mock_sink, -) -> None: - """Ensure that the collector timeout is honored""" - entity1 = mock_entity(port=1234, name="e1", telemetry_on=True) - entity2 = mock_entity(port=2345, name="e2", telemetry_on=True) - - sinks = [mock_sink(), mock_sink(), mock_sink()] - con_col1 = DBConnectionCollector(entity1, sinks[0]) - mem_col1 = DBMemoryCollector(entity1, sinks[1]) - mem_col2 = DBMemoryCollector(entity2, sinks[2]) - - manager = CollectorManager(timeout_ms=timeout_at) - manager.add_all([con_col1, mem_col1, mem_col2]) - - async def snooze() -> None: - await asyncio.sleep(delay_for / 1000) - - # Execute collection - with monkeypatch.context() as ctx: - ctx.setattr( - redisa, - "Redis", - mock_redis( - client_stats=mock_con(1, 10), - mem_stats=mock_mem(1, 10), - coll_side_effect=snooze, - ), - ) - - ts0 = datetime.datetime.utcnow() - await manager.collect() - ts1 = datetime.datetime.utcnow() - - t_diff = ts1 - ts0 - actual_delay = 1000 * t_diff.seconds - - if expect_fail: - assert timeout_at <= actual_delay < delay_for - else: - assert delay_for <= actual_delay < timeout_at - - -@pytest.mark.parametrize( - "e_type,telemetry_on", - [ - pytest.param("model", False, id="models"), - pytest.param("model", True, id="models, telemetry enabled"), - pytest.param("ensemble", False, id="ensemble"), - pytest.param("ensemble", True, id="ensemble, telemetry enabled"), - pytest.param("orchestrator", False, id="orchestrator"), - pytest.param("orchestrator", True, id="orchestrator, telemetry enabled"), - pytest.param("dbnode", False, id="dbnode"), - pytest.param("dbnode", True, id="dbnode, telemetry enabled"), - ], -) -@pytest.mark.asyncio -async def test_collector_manager_find_nondb( - mock_entity: MockCollectorEntityFunc, - e_type: str, - telemetry_on: bool, -) -> None: - """Ensure that the number of collectors returned for entity types match expectations - 
NOTE: even orchestrator returns 0 mapped collectors because no collector output - paths are set on the entity""" - entity = mock_entity(port=1234, name="e1", type=e_type, telemetry_on=telemetry_on) - manager = CollectorManager(timeout_ms=10000) - - # Ask manager to produce appliable collectors - manager.register_collectors(entity) - collectors = manager.all_collectors - - # Verify collector counts, assuming no per-collector config - assert 0 == len(collectors) - - -@pytest.mark.asyncio -async def test_collector_manager_find_db(mock_entity: MockCollectorEntityFunc) -> None: - """Ensure that the manifest allows individually enabling a given collector""" - entity: JobEntity = mock_entity( - port=1234, name="entity1", type="model", telemetry_on=True - ) - manager = CollectorManager() - - # 0. popping all should result in no collectors mapping to the entity - manager.register_collectors(entity) - collectors = manager.all_collectors - - assert len(collectors) == 0 - - # 1. ensure DBConnectionCountCollector is mapped - entity = mock_entity( - port=1234, name="entity1", type="orchestrator", telemetry_on=True - ) - entity.collectors["client"] = "mock/path.csv" - manager = CollectorManager() - - # 2. client count collector should be mapped - manager.register_collectors(entity) - collectors = manager.all_collectors - - assert len(collectors) == 1 - assert isinstance(collectors[0], DBConnectionCollector) - - # 3. ensure DBConnectionCountCollector is mapped - entity = mock_entity( - port=1234, name="entity1", type="orchestrator", telemetry_on=True - ) - entity.collectors["client_count"] = "mock/path.csv" - manager = CollectorManager() - - # 4. client count collector should be mapped - manager.register_collectors(entity) - collectors = manager.all_collectors - - assert len(collectors) == 1 - assert isinstance(collectors[0], DBConnectionCountCollector) - - # ensure DbMemoryCollector is mapped - entity = mock_entity( - port=1234, name="entity1", type="orchestrator", telemetry_on=True - ) - entity.collectors["memory"] = "mock/path.csv" - manager = CollectorManager() - - # 5. 
memory collector should be mapped - manager.register_collectors(entity) - collectors = manager.all_collectors - - assert len(collectors) == 1 - assert isinstance(collectors[0], DBMemoryCollector) - - -@pytest.mark.asyncio -async def test_collector_manager_find_entity_disabled( - mock_entity: MockCollectorEntityFunc, -) -> None: - """Ensure that disabling telemetry on the entity results in no collectors""" - entity: JobEntity = mock_entity(port=1234, name="entity1", type="orchestrator") - - # set paths for all known collectors - entity.collectors["client"] = "mock/path.csv" - entity.collectors["client_count"] = "mock/path.csv" - entity.collectors["memory"] = "mock/path.csv" - - manager = CollectorManager() - - # ON behavior should locate multiple collectors - entity.telemetry_on = True - manager.register_collectors(entity) - collectors = manager.all_collectors - assert len(collectors) > 0 - - # OFF behavior should locate ZERO collectors - entity.telemetry_on = False - manager.register_collectors(entity) - collectors = manager.all_collectors - assert len(collectors) == 0 - - -@pytest.mark.asyncio -async def test_collector_manager_find_entity_unmapped( - mock_entity: MockCollectorEntityFunc, -) -> None: - """Ensure that an entity type that is not mapped results in no collectors""" - entity: JobEntity = mock_entity( - port=1234, name="entity1", type="model", telemetry_on=True - ) - manager = CollectorManager() - - # set paths for all known collectors - entity.collectors["client"] = "mock/path.csv" - entity.collectors["client_count"] = "mock/path.csv" - entity.collectors["memory"] = "mock/path.csv" - - manager = CollectorManager() - - # ON behavior should locate ZERO collectors - entity.telemetry_on = True - manager.register_collectors(entity) - collectors = manager.all_collectors - assert len(collectors) == 0 - - # OFF behavior should locate ZERO collectors - entity.telemetry_on = False - manager.register_collectors(entity) - collectors = manager.all_collectors - assert len(collectors) == 0 diff --git a/tests/test_collector_sink.py b/tests/test_collector_sink.py deleted file mode 100644 index f36a905272..0000000000 --- a/tests/test_collector_sink.py +++ /dev/null @@ -1,107 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2025, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import uuid - -import pytest - -from conftest import MockCollectorEntityFunc -from smartsim._core.utils.telemetry.collector import FileSink - -# The tests in this file belong to the group_a group -pytestmark = pytest.mark.group_a - - -@pytest.mark.asyncio -async def test_sink_null_filename(mock_entity: MockCollectorEntityFunc) -> None: - """Ensure the filesink handles a null filename as expected""" - with pytest.raises(ValueError): - # pass null file path - sink = FileSink(None) # type: ignore - - -@pytest.mark.asyncio -async def test_sink_write(mock_entity: MockCollectorEntityFunc) -> None: - """Ensure the FileSink writes values to the output file as expected""" - entity = mock_entity(port=1234, name="e1") - sink = FileSink(entity.status_dir + "/test.csv") - - # all values are converted to strings before saving - v1, v2, v3 = str(uuid.uuid4()), str(uuid.uuid4()), str(uuid.uuid4()) - await sink.save(v1, v2, v3) - - # show file was written - path = sink.path - assert path.exists() - - # show each value is found in the file - content = path.read_text() - for value in [v1, v2, v3]: - assert str(value) in content - - -@pytest.mark.asyncio -async def test_sink_write_nonstring_input(mock_entity: MockCollectorEntityFunc) -> None: - """Ensure the FileSink writes values to the output file as expected - when inputs are non-strings""" - entity = mock_entity(port=1234, name="e1") - sink = FileSink(entity.status_dir + "/test.csv") - - # v1, v2 are not converted to strings - v1, v2 = 1, uuid.uuid4() - await sink.save(v1, v2) - - # show file was written - path = sink.path - assert path.exists() - - # split down to individual elements to ensure expected default format - content = path.read_text() - lines = content.splitlines() - line = lines[0].split(",") - - # show each value can be found - assert [str(v1), str(v2)] == line - - -@pytest.mark.asyncio -async def test_sink_write_no_inputs(mock_entity: MockCollectorEntityFunc) -> None: - """Ensure the FileSink writes to an output file without error if no - values are supplied""" - entity = mock_entity(port=1234, name="e1") - sink = FileSink(entity.status_dir + "/test.csv") - - num_saves = 5 - for _ in range(num_saves): - await sink.save() - - path = sink.path - assert path.exists() - - # show file was written - content = path.read_text() - - # show a line was written for each call to save - assert len(content.splitlines()) == num_saves diff --git a/tests/test_collectors.py b/tests/test_collectors.py deleted file mode 100644 index 3bd5ce625c..0000000000 --- a/tests/test_collectors.py +++ /dev/null @@ -1,305 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2025, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. 
Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# import pathlib - -import typing as t - -import pytest - -import smartsim._core.entrypoints.telemetrymonitor -import smartsim._core.utils.telemetry.collector -from conftest import MockCollectorEntityFunc, MockSink -from smartsim._core.utils.telemetry.collector import ( - DBConnectionCollector, - DBConnectionCountCollector, - DBMemoryCollector, - redisa, -) - -# The tests in this file belong to the group_a group -pytestmark = pytest.mark.group_a - -PrepareDB = t.Callable[[dict], smartsim.experiment.Orchestrator] - - -@pytest.mark.asyncio -async def test_dbmemcollector_prepare( - mock_entity: MockCollectorEntityFunc, mock_sink -) -> None: - """Ensure that collector preparation succeeds when expected""" - entity = mock_entity(telemetry_on=True) - - collector = DBMemoryCollector(entity, mock_sink()) - await collector.prepare() - assert collector._client - - -@pytest.mark.asyncio -async def test_dbmemcollector_prepare_fail( - mock_entity: MockCollectorEntityFunc, - mock_sink: MockSink, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """Ensure that collector preparation reports a failure to connect - when the redis client cannot be created""" - entity = mock_entity(telemetry_on=True) - - with monkeypatch.context() as ctx: - # mock up a redis constructor that returns None - ctx.setattr(redisa, "Redis", lambda host, port: None) - - sink = mock_sink() - collector = DBMemoryCollector(entity, sink) - assert sink.num_saves == 0 - - await collector.prepare() - - # Attempt to save header when preparing... 
- assert not collector._client - assert sink.num_saves == 1 - - -@pytest.mark.asyncio -async def test_dbcollector_config( - mock_entity: MockCollectorEntityFunc, - mock_sink, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """Ensure that missing required db collector config causes an exception""" - - # Check that a bad host causes exception - entity = mock_entity(host="", telemetry_on=True) - with pytest.raises(ValueError): - DBMemoryCollector(entity, mock_sink()) - - entity = mock_entity(host=" ", telemetry_on=True) - with pytest.raises(ValueError): - DBMemoryCollector(entity, mock_sink()) - - # Check that a bad port causes exception - entity = mock_entity(port="", telemetry_on=True) # type: ignore - with pytest.raises(ValueError): - DBMemoryCollector(entity, mock_sink()) - - -@pytest.mark.asyncio -async def test_dbmemcollector_prepare_fail_dep( - mock_entity: MockCollectorEntityFunc, - mock_sink, - monkeypatch: pytest.MonkeyPatch, - capsys: pytest.CaptureFixture[t.Any], -) -> None: - """Ensure that collector preparation attempts to connect, ensure it - reports a failure if the db conn bombs""" - entity = mock_entity(telemetry_on=True) - - def raiser(*args: t.Any, **kwargs: t.Any) -> None: - # mock raising exception on connect attempts to test err handling - raise redisa.ConnectionError("mock connection failure") - - sink = mock_sink() - collector = DBMemoryCollector(entity, sink) - with monkeypatch.context() as ctx: - ctx.setattr(redisa, "Redis", raiser) - - assert sink.num_saves == 0 - await collector.prepare() - - assert sink.num_saves == 1 - assert not collector._client - - -@pytest.mark.asyncio -async def test_dbmemcollector_collect( - mock_entity: MockCollectorEntityFunc, - mock_redis, - mock_mem, - mock_sink, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """Ensure that a valid response is returned as expected""" - entity = mock_entity(telemetry_on=True) - - sink = mock_sink() - collector = DBMemoryCollector(entity, sink) - with monkeypatch.context() as ctx: - ctx.setattr(redisa, "Redis", mock_redis(mem_stats=mock_mem(1, 2))) - ctx.setattr( - smartsim._core.utils.telemetry.collector, - "get_ts_ms", - lambda: 12131415, - ) - - await collector.prepare() - await collector.collect() - - reqd_items = { - "timestamp", - "total_system_memory", - "used_memory", - "used_memory_peak", - } - actual_items = set(sink.args) - - reqd_values = {12131415, 1000.0, 1111.0, 1234.0} - actual_values = set(sink.args) - assert actual_values == reqd_values - - -@pytest.mark.asyncio -async def test_dbmemcollector_integration( - mock_entity: MockCollectorEntityFunc, - mock_sink: MockSink, - prepare_db: PrepareDB, - local_db: dict, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """Integration test with a real orchestrator instance to ensure - output data matches expectations and proper db client API uage""" - - db = prepare_db(local_db).orchestrator - entity = mock_entity(port=db.ports[0], telemetry_on=True) - - sink = mock_sink() - collector = DBMemoryCollector(entity, sink) - - with monkeypatch.context() as ctx: - ctx.setattr( - smartsim._core.utils.telemetry.collector, - "get_ts_ms", - lambda: 12131415, - ) - assert sink.num_saves == 0 - await collector.prepare() - assert sink.num_saves == 1 - await collector.collect() - assert sink.num_saves == 2 - - stats = sink.args - assert len(stats) == 4 # show we have the expected amount of data points - ts = 12131415 - - assert ts in stats - - -@pytest.mark.asyncio -async def test_dbconncollector_collect( - mock_entity: MockCollectorEntityFunc, - mock_sink, - 
mock_redis, - mock_con, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """Ensure that a valid response is returned as expected""" - entity = mock_entity(telemetry_on=True) - - sink = mock_sink() - collector = DBConnectionCollector(entity, sink) - with monkeypatch.context() as ctx: - ctx.setattr(redisa, "Redis", mock_redis(client_stats=mock_con(1, 2))) - - assert sink.num_saves == 0 - await collector.prepare() - assert sink.num_saves == 1 - await collector.collect() - assert sink.num_saves == 3 # save twice w/two datapoints - - stats = sink.args - - idx = 1 - id0, ip0 = f"ABC{idx}", f"127.0.0.{idx}:1234" - id1, ip1 = f"XYZ{idx}", f"127.0.0.{idx}:2345" - exp_clients = [{"id": id0, "addr": ip0}, {"id": id1, "addr": ip1}] - - assert len(exp_clients) + 1 == len(stats) # output includes timestamp - assert id0 in set(client["id"] for client in exp_clients) - assert id1 in set(client["id"] for client in exp_clients) - assert ip0 in set(client["addr"] for client in exp_clients) - assert ip1 in set(client["addr"] for client in exp_clients) - - -@pytest.mark.asyncio -async def test_dbconn_count_collector_collect( - mock_entity: MockCollectorEntityFunc, - mock_sink, - mock_redis, - mock_con, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """Ensure that a valid response is returned as expected""" - entity = mock_entity(telemetry_on=True) - - sink = mock_sink() - collector = DBConnectionCountCollector(entity, sink) - with monkeypatch.context() as ctx: - ctx.setattr(redisa, "Redis", mock_redis(client_stats=mock_con(1, 2))) - - assert sink.num_saves == 0 - await collector.prepare() - assert sink.num_saves == 1 - await collector.collect() - assert sink.num_saves == 2 - - stats = sink.args - exp_counts = 2 - - assert exp_counts == len(stats) # output includes timestamp - - -@pytest.mark.asyncio -async def test_dbconncollector_integration( - mock_entity: MockCollectorEntityFunc, - mock_sink: MockSink, - prepare_db: PrepareDB, - local_db: dict, - monkeypatch: pytest.MonkeyPatch, -) -> None: - """Integration test with a real orchestrator instance to ensure - output data matches expectations and proper db client API uage""" - - db = prepare_db(local_db).orchestrator - entity = mock_entity(port=db.ports[0], telemetry_on=True) - - sink = mock_sink() - collector = DBConnectionCollector(entity, sink) - - with monkeypatch.context() as ctx: - ctx.setattr( - smartsim._core.utils.telemetry.collector, - "get_ts_ms", - lambda: 12131415, - ) - await collector.prepare() - await collector.collect() - stats = sink.args - - ip = "127.0.0.1:" - num_conns = int(stats[1]) - ts = 12131415 - - assert ts in stats - assert num_conns > 0 - assert ip in stats[2] diff --git a/tests/test_config.py b/tests/test_config.py index 458a6df601..16277e8349 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -55,9 +55,7 @@ def test_all_config_defaults(): config.test_device -def get_redisai_env( - rai_path: t.Optional[str], lib_path: t.Optional[str] -) -> t.Dict[str, str]: +def get_redisai_env(rai_path: str | None, lib_path: str | None) -> dict[str, str]: """Convenience method to create a set of environment variables that include RedisAI-specific variables :param rai_path: The path to the RedisAI library @@ -149,7 +147,7 @@ def test_redisai_valid_lib_path(test_dir, monkeypatch): def test_redisai_valid_lib_path_null_rai(test_dir, monkeypatch): """Missing RAI_PATH and valid SMARTSIM_DEP_INSTALL_PATH should succeed""" - rai_file_path: t.Optional[str] = None + rai_file_path: str | None = None lib_file_path = os.path.join(test_dir, 
"lib", "redisai.so") make_file(lib_file_path) env = get_redisai_env(rai_file_path, test_dir) @@ -197,64 +195,6 @@ def test_redis_cli(): os.environ.pop("REDIS_CLI_PATH") -@pytest.mark.parametrize( - "value, exp_result", - [ - pytest.param("0", False, id="letter zero"), - pytest.param("1", True, id="letter one"), - pytest.param("-1", False, id="letter negative one"), - pytest.param(None, True, id="not in env"), - ], -) -def test_telemetry_flag( - monkeypatch: pytest.MonkeyPatch, value: t.Optional[str], exp_result: bool -): - if value is not None: - monkeypatch.setenv("SMARTSIM_FLAG_TELEMETRY", value) - else: - monkeypatch.delenv("SMARTSIM_FLAG_TELEMETRY", raising=False) - config = Config() - assert config.telemetry_enabled == exp_result - - -@pytest.mark.parametrize( - "value, exp_result", - [ - pytest.param("1", 1, id="1"), - pytest.param("123", 123, id="123"), - pytest.param(None, 5, id="not in env"), - ], -) -def test_telemetry_frequency( - monkeypatch: pytest.MonkeyPatch, value: t.Optional[str], exp_result: int -): - if value is not None: - monkeypatch.setenv("SMARTSIM_TELEMETRY_FREQUENCY", value) - else: - monkeypatch.delenv("SMARTSIM_TELEMETRY_FREQUENCY", raising=False) - config = Config() - assert config.telemetry_frequency == exp_result - - -@pytest.mark.parametrize( - "value, exp_result", - [ - pytest.param("30", 30, id="30"), - pytest.param("123", 123, id="123"), - pytest.param(None, 90, id="not in env"), - ], -) -def test_telemetry_cooldown( - monkeypatch: pytest.MonkeyPatch, value: t.Optional[str], exp_result: bool -): - if value is not None: - monkeypatch.setenv("SMARTSIM_TELEMETRY_COOLDOWN", value) - else: - monkeypatch.delenv("SMARTSIM_TELEMETRY_COOLDOWN", raising=False) - config = Config() - assert config.telemetry_cooldown == exp_result - - def test_key_path_unset(monkeypatch: pytest.MonkeyPatch): """Ensure that the default value of the key path meets expectations""" monkeypatch.delenv("SMARTSIM_KEY_PATH", raising=False) @@ -281,3 +221,10 @@ def test_key_path_non_default(monkeypatch: pytest.MonkeyPatch): monkeypatch.setenv("SMARTSIM_KEY_PATH", key_path2) actual_value = config.smartsim_key_path assert key_path2 == actual_value, "Key path 2 didn't match overridden value" + + +def test_metadata_subdir(): + """Test that metadata_subdir returns the expected path""" + config = Config() + expected_path = Path(".smartsim/metadata") + assert config.metadata_subdir == expected_path diff --git a/tests/test_configs/telemetry/colocatedmodel.json b/tests/test_configs/telemetry/colocatedmodel.json deleted file mode 100644 index f3e93ac762..0000000000 --- a/tests/test_configs/telemetry/colocatedmodel.json +++ /dev/null @@ -1,69 +0,0 @@ -{ - "schema info": { - "schema_name": "entity manifest", - "version": "0.0.1" - }, - "experiment": { - "name": "my-exp", - "path": "/tmp/my-exp", - "launcher": "Slurm" - }, - "runs": [ - { - "run_id": "002816b", - "timestamp": 1699037041106269774, - "model": [ - { - "name": "colocated_model", - "path": "/tmp/my-exp/colocated_model", - "exe_args": [ - "/path/to/my/script.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": {} - }, - "batch_settings": {}, - "params": {}, - "files": { - "Symlink": [], - "Configure": [], - "Copy": [] - }, - "colocated_db": { - "settings": { - "unix_socket": "/tmp/redis.socket", - "socket_permissions": 755, - "port": 0, - "cpus": 1, - "custom_pinning": "0", - "debug": false, - "db_identifier": "", - "rai_args": { - "threads_per_queue": null, - 
"inter_op_parallelism": null, - "intra_op_parallelism": null - }, - "extra_db_args": {} - }, - "scripts": [], - "models": [] - }, - "telemetry_metadata": { - "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_ensemble/002816b/model/colocated_model", - "step_id": "4139111.21", - "task_id": "21529", - "managed": true - }, - "out_file": "/tmp/my-exp/colocated_model/colocated_model.out", - "err_file": "/tmp/my-exp/colocated_model/colocated_model.err" - } - ], - "orchestrator": [], - "ensemble": [] - } - ] -} diff --git a/tests/test_configs/telemetry/db_and_model.json b/tests/test_configs/telemetry/db_and_model.json deleted file mode 100644 index 36edc74868..0000000000 --- a/tests/test_configs/telemetry/db_and_model.json +++ /dev/null @@ -1,89 +0,0 @@ -{ - "schema info": { - "schema_name": "entity manifest", - "version": "0.0.1" - }, - "experiment": { - "name": "my-exp", - "path": "/tmp/my-exp", - "launcher": "Slurm" - }, - "runs": [ - { - "run_id": "2ca19ad", - "timestamp": 1699038647234488933, - "model": [], - "orchestrator": [ - { - "name": "orchestrator", - "type": "redis", - "interface": [ - "ipogif0" - ], - "shards": [ - { - "name": "orchestrator_0", - "hostname": "10.128.0.4", - "port": 6780, - "cluster": false, - "conf_file": null, - "out_file": "/path/to/some/file.out", - "err_file": "/path/to/some/file.err", - "client_file": "/path/to/some/client.log", - "client_count_file": null, - "memory_file": "/path/to/some/mem.log", - "telemetry_metadata": { - "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_db_and_model/2ca19ad/database/orchestrator/orchestrator_0", - "step_id": "4139111.27", - "task_id": "1452", - "managed": true - } - } - ] - } - ], - "ensemble": [] - }, - { - "run_id": "4b5507a", - "timestamp": 1699038661491043211, - "model": [ - { - "name": "perroquet", - "path": "/tmp/my-exp/perroquet", - "exe_args": [ - "/path/to/my/script.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks-per-node": 1 - } - }, - "batch_settings": {}, - "params": {}, - "files": { - "Symlink": [], - "Configure": [], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_db_and_model/4b5507a/model/perroquet", - "step_id": "4139111.28", - "task_id": "2929", - "managed": true - }, - "out_file": "/tmp/my-exp/perroquet/perroquet.out", - "err_file": "/tmp/my-exp/perroquet/perroquet.err" - } - ], - "orchestrator": [], - "ensemble": [] - } - ] -} diff --git a/tests/test_configs/telemetry/db_and_model_1run.json b/tests/test_configs/telemetry/db_and_model_1run.json deleted file mode 100644 index 44e32bfe40..0000000000 --- a/tests/test_configs/telemetry/db_and_model_1run.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema info": { - "schema_name": "entity manifest", - "version": "0.0.1" - }, - "experiment": { - "name": "my-exp", - "path": "/tmp/my-exp", - "launcher": "Slurm" - }, - "runs": [ - { - "run_id": "4b5507a", - "timestamp": 1699038661491043211, - "model": [ - { - "name": "perroquet", - "path": "/tmp/my-exp/perroquet", - "exe_args": [ - "/path/to/my/script.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks-per-node": 1 - } - }, - "batch_settings": {}, - "params": {}, - "files": { - "Symlink": [], - "Configure": [], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": 
"/tmp/my-exp/.smartsim/telemetry/telemetry_db_and_model/4b5507a/model/perroquet", - "step_id": "4139111.28", - "task_id": "2929", - "managed": true - }, - "out_file": "/tmp/my-exp/perroquet/perroquet.out", - "err_file": "/tmp/my-exp/perroquet/perroquet.err" - } - ], - "orchestrator": [ - { - "name": "orchestrator", - "type": "redis", - "interface": [ - "ipogif0" - ], - "shards": [ - { - "name": "orchestrator_0", - "hostname": "10.128.0.4", - "port": 6780, - "cluster": false, - "conf_file": null, - "out_file": "/path/to/some/file.out", - "err_file": "/path/to/some/file.err", - "telemetry_metadata": { - "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_db_and_model/2ca19ad/database/orchestrator/orchestrator_0", - "step_id": "4139111.27", - "task_id": "1452", - "managed": true - } - } - ] - } - ], - "ensemble": [] - } - ] -} diff --git a/tests/test_configs/telemetry/ensembles.json b/tests/test_configs/telemetry/ensembles.json deleted file mode 100644 index 632bf84068..0000000000 --- a/tests/test_configs/telemetry/ensembles.json +++ /dev/null @@ -1,329 +0,0 @@ -{ - "schema info": { - "schema_name": "entity manifest", - "version": "0.0.1" - }, - "experiment": { - "name": "my-exp", - "path": "/home/someuser/code/ss/my-exp", - "launcher": "Local" - }, - "runs": [ - { - "run_id": "d041b90", - "timestamp": 1698679830384608928, - "model": [], - "orchestrator": [], - "ensemble": [ - { - "name": "my-ens", - "params": { - "START": [ - "spam", - "foo" - ], - "MID": [ - "eggs", - "bar" - ], - "END": [ - "ham", - "baz" - ] - }, - "batch_settings": {}, - "models": [ - { - "name": "my-ens_0", - "path": "/home/someuser/code/ss", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/home/someuser/.pyenv/versions/3.10.16/envs/ss/bin/python" - ], - "run_command": null, - "run_args": {} - }, - "batch_settings": {}, - "params": { - "START": "spam", - "MID": "eggs", - "END": "ham" - }, - "files": { - "Symlink": [], - "Configure": [ - "/home/someuser/code/ss/manifest/demo/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_0", - "step_id": null, - "task_id": "88118", - "managed": false - }, - "out_file": "/home/someuser/code/ss/my-ens_0.out", - "err_file": "/home/someuser/code/ss/my-ens_0.err" - }, - { - "name": "my-ens_1", - "path": "/home/someuser/code/ss", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/home/someuser/.pyenv/versions/3.10.16/envs/ss/bin/python" - ], - "run_command": null, - "run_args": {} - }, - "batch_settings": {}, - "params": { - "START": "spam", - "MID": "eggs", - "END": "baz" - }, - "files": { - "Symlink": [], - "Configure": [ - "/home/someuser/code/ss/manifest/demo/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_1", - "step_id": null, - "task_id": "88131", - "managed": false - }, - "out_file": "/home/someuser/code/ss/my-ens_1.out", - "err_file": "/home/someuser/code/ss/my-ens_1.err" - }, - { - "name": "my-ens_2", - "path": "/home/someuser/code/ss", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/home/someuser/.pyenv/versions/3.10.16/envs/ss/bin/python" - ], - "run_command": null, - "run_args": {} - }, - "batch_settings": {}, - "params": { - "START": "spam", - "MID": "bar", - "END": "ham" - }, - "files": { - "Symlink": [], - "Configure": [ - 
"/home/someuser/code/ss/manifest/demo/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_2", - "step_id": null, - "task_id": "88146", - "managed": false - }, - "out_file": "/home/someuser/code/ss/my-ens_2.out", - "err_file": "/home/someuser/code/ss/my-ens_2.err" - }, - { - "name": "my-ens_3", - "path": "/home/someuser/code/ss", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/home/someuser/.pyenv/versions/3.10.16/envs/ss/bin/python" - ], - "run_command": null, - "run_args": {} - }, - "batch_settings": {}, - "params": { - "START": "spam", - "MID": "bar", - "END": "baz" - }, - "files": { - "Symlink": [], - "Configure": [ - "/home/someuser/code/ss/manifest/demo/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_3", - "step_id": null, - "task_id": "88170", - "managed": false - }, - "out_file": "/home/someuser/code/ss/my-ens_3.out", - "err_file": "/home/someuser/code/ss/my-ens_3.err" - }, - { - "name": "my-ens_4", - "path": "/home/someuser/code/ss", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/home/someuser/.pyenv/versions/3.10.16/envs/ss/bin/python" - ], - "run_command": null, - "run_args": {} - }, - "batch_settings": {}, - "params": { - "START": "foo", - "MID": "eggs", - "END": "ham" - }, - "files": { - "Symlink": [], - "Configure": [ - "/home/someuser/code/ss/manifest/demo/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_4", - "step_id": null, - "task_id": "88178", - "managed": false - }, - "out_file": "/home/someuser/code/ss/my-ens_4.out", - "err_file": "/home/someuser/code/ss/my-ens_4.err" - }, - { - "name": "my-ens_5", - "path": "/home/someuser/code/ss", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/home/someuser/.pyenv/versions/3.10.16/envs/ss/bin/python" - ], - "run_command": null, - "run_args": {} - }, - "batch_settings": {}, - "params": { - "START": "foo", - "MID": "eggs", - "END": "baz" - }, - "files": { - "Symlink": [], - "Configure": [ - "/home/someuser/code/ss/manifest/demo/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_5", - "step_id": null, - "task_id": "88193", - "managed": false - }, - "out_file": "/home/someuser/code/ss/my-ens_5.out", - "err_file": "/home/someuser/code/ss/my-ens_5.err" - }, - { - "name": "my-ens_6", - "path": "/home/someuser/code/ss", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/home/someuser/.pyenv/versions/3.10.16/envs/ss/bin/python" - ], - "run_command": null, - "run_args": {} - }, - "batch_settings": {}, - "params": { - "START": "foo", - "MID": "bar", - "END": "ham" - }, - "files": { - "Symlink": [], - "Configure": [ - "/home/someuser/code/ss/manifest/demo/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_6", - "step_id": null, - "task_id": "88221", - "managed": false - }, - "out_file": "/home/someuser/code/ss/my-ens_6.out", - "err_file": "/home/someuser/code/ss/my-ens_6.err" - }, - { - "name": "my-ens_7", - "path": 
"/home/someuser/code/ss", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/home/someuser/.pyenv/versions/3.10.16/envs/ss/bin/python" - ], - "run_command": null, - "run_args": {} - }, - "batch_settings": {}, - "params": { - "START": "foo", - "MID": "bar", - "END": "baz" - }, - "files": { - "Symlink": [], - "Configure": [ - "/home/someuser/code/ss/manifest/demo/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_7", - "step_id": null, - "task_id": "88241", - "managed": false - }, - "out_file": "/home/someuser/code/ss/my-ens_7.out", - "err_file": "/home/someuser/code/ss/my-ens_7.err" - } - ] - } - ] - } - ] -} diff --git a/tests/test_configs/telemetry/serialmodels.json b/tests/test_configs/telemetry/serialmodels.json deleted file mode 100644 index 40337ecebe..0000000000 --- a/tests/test_configs/telemetry/serialmodels.json +++ /dev/null @@ -1,186 +0,0 @@ -{ - "schema info": { - "schema_name": "entity manifest", - "version": "0.0.1" - }, - "experiment": { - "name": "my-exp", - "path": "/tmp/my-exp", - "launcher": "Slurm" - }, - "runs": [ - { - "run_id": "8c0fbb1", - "timestamp": 1699037881502730708, - "model": [ - { - "name": "perroquet_0", - "path": "/tmp/my-exp/perroquet_0", - "exe_args": [ - "/tmp/echo.py" - ], - "run_settings": { - "exe": [ - "/path/to/some/python" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks-per-node": 1 - } - }, - "batch_settings": {}, - "params": {}, - "files": { - "Symlink": [], - "Configure": [], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_serial_models/8c0fbb1/model/perroquet_0", - "step_id": "4139111.22", - "task_id": "17966", - "managed": true - }, - "out_file": "/tmp/my-exp/perroquet_0/perroquet_0.out", - "err_file": "/tmp/my-exp/perroquet_0/perroquet_0.err" - }, - { - "name": "perroquet_1", - "path": "/tmp/my-exp/perroquet_1", - "exe_args": [ - "/tmp/echo.py" - ], - "run_settings": { - "exe": [ - "/path/to/some/python" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks-per-node": 1 - } - }, - "batch_settings": {}, - "params": {}, - "files": { - "Symlink": [], - "Configure": [], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_serial_models/8c0fbb1/model/perroquet_1", - "step_id": "4139111.23", - "task_id": "18100", - "managed": true - }, - "out_file": "/tmp/my-exp/perroquet_1/perroquet_1.out", - "err_file": "/tmp/my-exp/perroquet_1/perroquet_1.err" - }, - { - "name": "perroquet_2", - "path": "/tmp/my-exp/perroquet_2", - "exe_args": [ - "/tmp/echo.py" - ], - "run_settings": { - "exe": [ - "/path/to/some/python" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks-per-node": 1 - } - }, - "batch_settings": {}, - "params": {}, - "files": { - "Symlink": [], - "Configure": [], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_serial_models/8c0fbb1/model/perroquet_2", - "step_id": "4139111.24", - "task_id": "18159", - "managed": true - }, - "out_file": "/tmp/my-exp/perroquet_2/perroquet_2.out", - "err_file": "/tmp/my-exp/perroquet_2/perroquet_2.err" - }, - { - "name": "perroquet_3", - "path": "/tmp/my-exp/perroquet_3", - "exe_args": [ - "/tmp/echo.py" - ], - 
"run_settings": { - "exe": [ - "/path/to/some/python" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks-per-node": 1 - } - }, - "batch_settings": {}, - "params": {}, - "files": { - "Symlink": [], - "Configure": [], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_serial_models/8c0fbb1/model/perroquet_3", - "step_id": "4139111.25", - "task_id": "18499", - "managed": true - }, - "out_file": "/tmp/my-exp/perroquet_3/perroquet_3.out", - "err_file": "/tmp/my-exp/perroquet_3/perroquet_3.err" - }, - { - "name": "perroquet_4", - "path": "/tmp/my-exp/perroquet_4", - "exe_args": [ - "/tmp/echo.py" - ], - "run_settings": { - "exe": [ - "/path/to/some/python" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks-per-node": 1 - } - }, - "batch_settings": {}, - "params": {}, - "files": { - "Symlink": [], - "Configure": [], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_serial_models/8c0fbb1/model/perroquet_4", - "step_id": "4139111.26", - "task_id": "18832", - "managed": true - }, - "out_file": "/tmp/my-exp/perroquet_4/perroquet_4.out", - "err_file": "/tmp/my-exp/perroquet_4/perroquet_4.err" - } - ], - "orchestrator": [], - "ensemble": [] - } - ] -} diff --git a/tests/test_configs/telemetry/telemetry.json b/tests/test_configs/telemetry/telemetry.json deleted file mode 100644 index 916f5922b4..0000000000 --- a/tests/test_configs/telemetry/telemetry.json +++ /dev/null @@ -1,945 +0,0 @@ -{ - "experiment": { - "name": "my-exp", - "path": "/path/to/my-exp", - "launcher": "Slurm" - }, - "runs": [ - { - "run_id": "d999ad89-020f-4e6a-b834-dbd88658ce84", - "timestamp": 1697824072792854287, - "model": [ - { - "name": "my-model", - "path": "/path/to/my-exp/my-model", - "exe_args": [ - "hello", - "world" - ], - "run_settings": { - "exe": [ - "/usr/bin/echo" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - "batch_settings": {}, - "params": {}, - "files": { - "Symlink": [], - "Configure": [], - "Copy": [] - }, - "colocated_db": { - "settings": { - "port": 5757, - "ifname": "lo", - "cpus": 1, - "custom_pinning": "0", - "debug": false, - "db_identifier": "COLO", - "rai_args": { - "threads_per_queue": null, - "inter_op_parallelism": null, - "intra_op_parallelism": null - }, - "extra_db_args": {} - }, - "scripts": [], - "models": [ - { - "cnn": { - "backend": "TORCH", - "device": "CPU" - } - } - ] - }, - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d999ad89-020f-4e6a-b834-dbd88658ce84/model/my-model", - "step_id": "4121050.30", - "task_id": "25230", - "managed": true - }, - "out_file": "/path/to/my-exp/my-model/my-model.out", - "err_file": "/path/to/my-exp/my-model/my-model.err" - } - ], - "orchestrator": [], - "ensemble": [] - }, - { - "run_id": "fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa", - "timestamp": 1697824102122439975, - "model": [], - "orchestrator": [ - { - "name": "orchestrator", - "type": "redis", - "interface": [ - "ipogif0" - ], - "shards": [ - { - "name": "orchestrator_1", - "hostname": "10.128.0.70", - "port": 2424, - "cluster": true, - "conf_file": "nodes-orchestrator_1-2424.conf", - "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", - "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", - "telemetry_metadata": { - "status_dir": 
"/path/to/my-exp/.smartsim/telemetry/my-exp/fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa/database/orchestrator/orchestrator", - "step_id": "4121050.31+2", - "task_id": "25241", - "managed": true - } - }, - { - "name": "orchestrator_2", - "hostname": "10.128.0.71", - "port": 2424, - "cluster": true, - "conf_file": "nodes-orchestrator_2-2424.conf", - "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", - "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa/database/orchestrator/orchestrator", - "step_id": "4121050.31+2", - "task_id": "25241", - "managed": true - } - }, - { - "name": "orchestrator_0", - "hostname": "10.128.0.69", - "port": 2424, - "cluster": true, - "conf_file": "nodes-orchestrator_0-2424.conf", - "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", - "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa/database/orchestrator/orchestrator", - "step_id": "4121050.31+2", - "task_id": "25241", - "managed": true - } - } - ] - } - ], - "ensemble": [] - }, - { - "run_id": "d65ae1df-cb5e-45d9-ab09-6fa641755997", - "timestamp": 1697824127962219505, - "model": [], - "orchestrator": [], - "ensemble": [ - { - "name": "my-ens", - "params": { - "START": [ - "spam", - "foo" - ], - "MID": [ - "eggs", - "bar" - ], - "END": [ - "ham", - "baz" - ] - }, - "batch_settings": {}, - "models": [ - { - "name": "my-ens_0", - "path": "/path/to/my-exp/my-ens/my-ens_0", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python3" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - "batch_settings": {}, - "params": { - "START": "spam", - "MID": "eggs", - "END": "ham" - }, - "files": { - "Symlink": [], - "Configure": [ - "/path/to/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_0", - "step_id": "4121050.32", - "task_id": "25639", - "managed": true - }, - "out_file": "/path/to/my-exp/my-ens/my-ens_0/my-ens_0.out", - "err_file": "/path/to/my-exp/my-ens/my-ens_0/my-ens_0.err" - }, - { - "name": "my-ens_1", - "path": "/path/to/my-exp/my-ens/my-ens_1", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python3" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - "batch_settings": {}, - "params": { - "START": "spam", - "MID": "eggs", - "END": "baz" - }, - "files": { - "Symlink": [], - "Configure": [ - "/path/to/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_1", - "step_id": "4121050.33", - "task_id": "25768", - "managed": true - }, - "out_file": "/path/to/my-exp/my-ens/my-ens_1/my-ens_1.out", - "err_file": "/path/to/my-exp/my-ens/my-ens_1/my-ens_1.err" - }, - { - "name": "my-ens_2", - "path": "/path/to/my-exp/my-ens/my-ens_2", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python3" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - "batch_settings": {}, - "params": { - "START": "spam", - "MID": "bar", - 
"END": "ham" - }, - "files": { - "Symlink": [], - "Configure": [ - "/path/to/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_2", - "step_id": "4121050.34", - "task_id": "25817", - "managed": true - }, - "out_file": "/path/to/my-exp/my-ens/my-ens_2/my-ens_2.out", - "err_file": "/path/to/my-exp/my-ens/my-ens_2/my-ens_2.err" - }, - { - "name": "my-ens_3", - "path": "/path/to/my-exp/my-ens/my-ens_3", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python3" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - "batch_settings": {}, - "params": { - "START": "spam", - "MID": "bar", - "END": "baz" - }, - "files": { - "Symlink": [], - "Configure": [ - "/path/to/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_3", - "step_id": "4121050.35", - "task_id": "25837", - "managed": true - }, - "out_file": "/path/to/my-exp/my-ens/my-ens_3/my-ens_3.out", - "err_file": "/path/to/my-exp/my-ens/my-ens_3/my-ens_3.err" - }, - { - "name": "my-ens_4", - "path": "/path/to/my-exp/my-ens/my-ens_4", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python3" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - "batch_settings": {}, - "params": { - "START": "foo", - "MID": "eggs", - "END": "ham" - }, - "files": { - "Symlink": [], - "Configure": [ - "/path/to/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_4", - "step_id": "4121050.36", - "task_id": "25872", - "managed": true - }, - "out_file": "/path/to/my-exp/my-ens/my-ens_4/my-ens_4.out", - "err_file": "/path/to/my-exp/my-ens/my-ens_4/my-ens_4.err" - }, - { - "name": "my-ens_5", - "path": "/path/to/my-exp/my-ens/my-ens_5", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python3" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - "batch_settings": {}, - "params": { - "START": "foo", - "MID": "eggs", - "END": "baz" - }, - "files": { - "Symlink": [], - "Configure": [ - "/path/to/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_5", - "step_id": "4121050.37", - "task_id": "25930", - "managed": true - }, - "out_file": "/path/to/my-exp/my-ens/my-ens_5/my-ens_5.out", - "err_file": "/path/to/my-exp/my-ens/my-ens_5/my-ens_5.err" - }, - { - "name": "my-ens_6", - "path": "/path/to/my-exp/my-ens/my-ens_6", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python3" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - "batch_settings": {}, - "params": { - "START": "foo", - "MID": "bar", - "END": "ham" - }, - "files": { - "Symlink": [], - "Configure": [ - "/path/to/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_6", - 
"step_id": "4121050.38", - "task_id": "25945", - "managed": true - }, - "out_file": "/path/to/my-exp/my-ens/my-ens_6/my-ens_6.out", - "err_file": "/path/to/my-exp/my-ens/my-ens_6/my-ens_6.err" - }, - { - "name": "my-ens_7", - "path": "/path/to/my-exp/my-ens/my-ens_7", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python3" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - "batch_settings": {}, - "params": { - "START": "foo", - "MID": "bar", - "END": "baz" - }, - "files": { - "Symlink": [], - "Configure": [ - "/path/to/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_7", - "step_id": "4121050.39", - "task_id": "25967", - "managed": true - }, - "out_file": "/path/to/my-exp/my-ens/my-ens_7/my-ens_7.out", - "err_file": "/path/to/my-exp/my-ens/my-ens_7/my-ens_7.err" - } - ] - } - ] - }, - { - "run_id": "e41f8e17-c4b2-441d-adf9-707443ee2c72", - "timestamp": 1697835227560376025, - "model": [ - { - "name": "my-model", - "path": "/path/to/my-exp/my-model", - "exe_args": [ - "hello", - "world" - ], - "run_settings": { - "exe": [ - "/usr/bin/echo" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - "batch_settings": {}, - "params": {}, - "files": { - "Symlink": [], - "Configure": [], - "Copy": [] - }, - "colocated_db": { - "settings": { - "port": 5757, - "ifname": "lo", - "cpus": 1, - "custom_pinning": "0", - "debug": false, - "db_identifier": "COLO", - "rai_args": { - "threads_per_queue": null, - "inter_op_parallelism": null, - "intra_op_parallelism": null - }, - "extra_db_args": {} - }, - "scripts": [], - "models": [ - { - "cnn": { - "backend": "TORCH", - "device": "CPU" - } - } - ] - }, - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/e41f8e17-c4b2-441d-adf9-707443ee2c72/model/my-model", - "step_id": "4121904.0", - "task_id": "28277", - "managed": true - }, - "out_file": "/path/to/my-exp/my-model/my-model.out", - "err_file": "/path/to/my-exp/my-model/my-model.err" - } - ], - "orchestrator": [], - "ensemble": [] - }, - { - "run_id": "b33a5d27-6822-4795-8e0e-cfea18551fa4", - "timestamp": 1697835261956135240, - "model": [], - "orchestrator": [ - { - "name": "orchestrator", - "type": "redis", - "interface": [ - "ipogif0" - ], - "shards": [ - { - "name": "orchestrator_0", - "hostname": "10.128.0.2", - "port": 2424, - "cluster": true, - "conf_file": "nodes-orchestrator_0-2424.conf", - "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", - "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/b33a5d27-6822-4795-8e0e-cfea18551fa4/database/orchestrator/orchestrator", - "step_id": "4121904.1+2", - "task_id": "28289", - "managed": true - } - }, - { - "name": "orchestrator_2", - "hostname": "10.128.0.4", - "port": 2424, - "cluster": true, - "conf_file": "nodes-orchestrator_2-2424.conf", - "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", - "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/b33a5d27-6822-4795-8e0e-cfea18551fa4/database/orchestrator/orchestrator", - "step_id": "4121904.1+2", - "task_id": "28289", - "managed": true - } - }, - { - "name": "orchestrator_1", - 
"hostname": "10.128.0.3", - "port": 2424, - "cluster": true, - "conf_file": "nodes-orchestrator_1-2424.conf", - "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", - "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/b33a5d27-6822-4795-8e0e-cfea18551fa4/database/orchestrator/orchestrator", - "step_id": "4121904.1+2", - "task_id": "28289", - "managed": true - } - } - ] - } - ], - "ensemble": [] - }, - { - "run_id": "45772df2-fd80-43fd-adf0-d5e319870182", - "timestamp": 1697835287798613875, - "model": [], - "orchestrator": [], - "ensemble": [ - { - "name": "my-ens", - "params": { - "START": [ - "spam", - "foo" - ], - "MID": [ - "eggs", - "bar" - ], - "END": [ - "ham", - "baz" - ] - }, - "batch_settings": {}, - "models": [ - { - "name": "my-ens_0", - "path": "/path/to/my-exp/my-ens/my-ens_0", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python3" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - "batch_settings": {}, - "params": { - "START": "spam", - "MID": "eggs", - "END": "ham" - }, - "files": { - "Symlink": [], - "Configure": [ - "/path/to/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_0", - "step_id": "4121904.2", - "task_id": "28333", - "managed": true - }, - "out_file": "/path/to/my-exp/my-ens/my-ens_0/my-ens_0.out", - "err_file": "/path/to/my-exp/my-ens/my-ens_0/my-ens_0.err" - }, - { - "name": "my-ens_1", - "path": "/path/to/my-exp/my-ens/my-ens_1", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python3" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - "batch_settings": {}, - "params": { - "START": "spam", - "MID": "eggs", - "END": "baz" - }, - "files": { - "Symlink": [], - "Configure": [ - "/path/to/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_1", - "step_id": "4121904.3", - "task_id": "28342", - "managed": true - }, - "out_file": "/path/to/my-exp/my-ens/my-ens_1/my-ens_1.out", - "err_file": "/path/to/my-exp/my-ens/my-ens_1/my-ens_1.err" - }, - { - "name": "my-ens_2", - "path": "/path/to/my-exp/my-ens/my-ens_2", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python3" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - "batch_settings": {}, - "params": { - "START": "spam", - "MID": "bar", - "END": "ham" - }, - "files": { - "Symlink": [], - "Configure": [ - "/path/to/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_2", - "step_id": "4121904.4", - "task_id": "28353", - "managed": true - }, - "out_file": "/path/to/my-exp/my-ens/my-ens_2/my-ens_2.out", - "err_file": "/path/to/my-exp/my-ens/my-ens_2/my-ens_2.err" - }, - { - "name": "my-ens_3", - "path": "/path/to/my-exp/my-ens/my-ens_3", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python3" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - 
"batch_settings": {}, - "params": { - "START": "spam", - "MID": "bar", - "END": "baz" - }, - "files": { - "Symlink": [], - "Configure": [ - "/path/to/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_3", - "step_id": "4121904.5", - "task_id": "28362", - "managed": true - }, - "out_file": "/path/to/my-exp/my-ens/my-ens_3/my-ens_3.out", - "err_file": "/path/to/my-exp/my-ens/my-ens_3/my-ens_3.err" - }, - { - "name": "my-ens_4", - "path": "/path/to/my-exp/my-ens/my-ens_4", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python3" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - "batch_settings": {}, - "params": { - "START": "foo", - "MID": "eggs", - "END": "ham" - }, - "files": { - "Symlink": [], - "Configure": [ - "/path/to/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_4", - "step_id": "4121904.6", - "task_id": "28371", - "managed": true - }, - "out_file": "/path/to/my-exp/my-ens/my-ens_4/my-ens_4.out", - "err_file": "/path/to/my-exp/my-ens/my-ens_4/my-ens_4.err" - }, - { - "name": "my-ens_5", - "path": "/path/to/my-exp/my-ens/my-ens_5", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python3" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - "batch_settings": {}, - "params": { - "START": "foo", - "MID": "eggs", - "END": "baz" - }, - "files": { - "Symlink": [], - "Configure": [ - "/path/to/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_5", - "step_id": "4121904.7", - "task_id": "28380", - "managed": true - }, - "out_file": "/path/to/my-exp/my-ens/my-ens_5/my-ens_5.out", - "err_file": "/path/to/my-exp/my-ens/my-ens_5/my-ens_5.err" - }, - { - "name": "my-ens_6", - "path": "/path/to/my-exp/my-ens/my-ens_6", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python3" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - "batch_settings": {}, - "params": { - "START": "foo", - "MID": "bar", - "END": "ham" - }, - "files": { - "Symlink": [], - "Configure": [ - "/path/to/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_6", - "step_id": "4121904.8", - "task_id": "28389", - "managed": true - }, - "out_file": "/path/to/my-exp/my-ens/my-ens_6/my-ens_6.out", - "err_file": "/path/to/my-exp/my-ens/my-ens_6/my-ens_6.err" - }, - { - "name": "my-ens_7", - "path": "/path/to/my-exp/my-ens/my-ens_7", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python3" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - "batch_settings": {}, - "params": { - "START": "foo", - "MID": "bar", - "END": "baz" - }, - "files": { - "Symlink": [], - "Configure": [ - "/path/to/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": 
"/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_7", - "step_id": "4121904.9", - "task_id": "28398", - "managed": true - }, - "out_file": "/path/to/my-exp/my-ens/my-ens_7/my-ens_7.out", - "err_file": "/path/to/my-exp/my-ens/my-ens_7/my-ens_7.err" - } - ] - } - ] - } - ] -} diff --git a/tests/test_controller.py b/tests/test_controller.py index 5a91b77888..2086b94b42 100644 --- a/tests/test_controller.py +++ b/tests/test_controller.py @@ -28,6 +28,7 @@ import pytest +from smartsim._core.config import CONFIG from smartsim._core.control.controller import Controller from smartsim._core.launcher.step import Step from smartsim.database.orchestrator import Orchestrator @@ -69,7 +70,7 @@ def test_controller_batch_step_creation_preserves_entity_order(collection, monke ) entity_names = [x.name for x in collection.entities] assert len(entity_names) == len(set(entity_names)) - _, steps = controller._create_batch_job_step( - collection, pathlib.Path("mock/exp/path") - ) + # Create a metadata directory for the test + metadata_dir = pathlib.Path("/tmp") / CONFIG.metadata_subdir + _, steps = controller._create_batch_job_step(collection, metadata_dir) assert entity_names == [step.name for step in steps] diff --git a/tests/test_controller_errors.py b/tests/test_controller_errors.py index d468cdb886..20a98e188f 100644 --- a/tests/test_controller_errors.py +++ b/tests/test_controller_errors.py @@ -163,7 +163,7 @@ def test_restarting_entity(test_dir, wlmutils, entity): step_settings = RunSettings("echo") test_launcher = wlmutils.get_test_launcher() step = MockStep("mock-step", test_dir, step_settings) - step.meta["status_dir"] = test_dir + step.meta["metadata_dir"] = test_dir entity.path = test_dir controller = Controller(test_launcher) controller._jobs.add_job(entity.name, job_id="1234", entity=entity) @@ -176,7 +176,7 @@ def test_restarting_orch(test_dir, wlmutils): step_settings = RunSettings("echo") test_launcher = wlmutils.get_test_launcher() step = MockStep("mock-step", test_dir, step_settings) - step.meta["status_dir"] = test_dir + step.meta["metadata_dir"] = test_dir orc.path = test_dir controller = Controller(test_launcher) controller._jobs.add_job(orc.name, job_id="1234", entity=orc) diff --git a/tests/test_controller_metadata_usage.py b/tests/test_controller_metadata_usage.py new file mode 100644 index 0000000000..3f50196b58 --- /dev/null +++ b/tests/test_controller_metadata_usage.py @@ -0,0 +1,173 @@ +"""Test the controller's metadata directory usage patterns""" + +# NOTE: This entire test file has been commented out because it tests +# LaunchedManifestBuilder functionality which has been removed. +# The tests are no longer relevant since LaunchedManifest, +# LaunchedManifestBuilder, and _LaunchedManifestMetadata classes +# have been deleted from the codebase. 
+ +# import pathlib +# import shutil +# import tempfile +# from unittest.mock import MagicMock, patch +# +# import pytest +# +# from smartsim._core.control.controller import Controller +# from smartsim._core.control.manifest import LaunchedManifestBuilder, Manifest +# from smartsim.database import Orchestrator +# from smartsim.entity import Ensemble, Model +# from smartsim.settings import RunSettings + +# +# class TestControllerMetadataDirectoryUsage: +# """Test that the Controller properly uses metadata directories""" +# +# def setup_method(self): +# """Set up test fixtures""" +# self.temp_dir = tempfile.mkdtemp() +# self.controller = Controller("local") +# +# def teardown_method(self): +# """Clean up test fixtures""" +# shutil.rmtree(self.temp_dir, ignore_errors=True) +# +# def test_controller_creates_model_metadata_directory_only_when_models_present(self): +# """Test that model metadata directory is created only when models are present""" +# # Create manifest with model +# model = Model("test_model", {}, RunSettings("echo", ["hello"])) +# manifest = Manifest(model) +# +# with ( +# patch.object(self.controller, "_jobs") as mock_jobs, +# patch.object(self.controller, "_launch_step") as mock_launch_step, +# patch.object(self.controller, "symlink_output_files") as mock_symlink, +# ): +# +# mock_jobs.get_db_host_addresses.return_value = {} +# mock_jobs.actively_monitoring = False +# +# # Track LaunchedManifestBuilder method calls +# with patch.object( +# LaunchedManifestBuilder, "get_entity_metadata_subdirectory" +# ) as mock_get_dir: +# mock_metadata_dir = MagicMock() +# mock_get_dir.return_value = mock_metadata_dir +# +# launched_manifest = self.controller._launch( +# "test_exp", self.temp_dir, manifest +# ) +# +# # Verify that get_entity_metadata_subdirectory was called for "model" +# model_calls = [ +# call +# for call in mock_get_dir.call_args_list +# if call[0][0] == "model" +# ] +# assert len(model_calls) == 1 # Should be called once for model +# +# def test_controller_creates_ensemble_metadata_directory_only_when_ensembles_present( +# self, +# ): +# """Test that ensemble metadata directory is created only when ensembles are present""" +# # Create manifest with ensemble +# run_settings = RunSettings("echo", ["world"]) +# ensemble = Ensemble("test_ensemble", {}, run_settings=run_settings, replicas=2) +# manifest = Manifest(ensemble) +# +# with ( +# patch.object(self.controller, "_jobs") as mock_jobs, +# patch.object(self.controller, "_launch_step") as mock_launch_step, +# patch.object(self.controller, "symlink_output_files") as mock_symlink, +# ): +# +# mock_jobs.get_db_host_addresses.return_value = {} +# mock_jobs.actively_monitoring = False +# +# # Track LaunchedManifestBuilder method calls +# with patch.object( +# LaunchedManifestBuilder, "get_entity_metadata_subdirectory" +# ) as mock_get_dir: +# mock_metadata_dir = MagicMock() +# mock_get_dir.return_value = mock_metadata_dir +# +# launched_manifest = self.controller._launch( +# "test_exp", self.temp_dir, manifest +# ) +# +# # Verify that get_entity_metadata_subdirectory was called for "ensemble" +# ensemble_calls = [ +# call +# for call in mock_get_dir.call_args_list +# if call[0][0] == "ensemble" +# ] +# assert len(ensemble_calls) == 1 # Should be called once for ensemble +# +# def test_controller_does_not_create_entity_dirs_for_missing_entity_types(self): +# """Test that entity metadata directories are not created for missing entity types""" +# # Create manifest with only a model (no ensemble, no database) +# model = 
Model("test_model", {}, RunSettings("echo", ["hello"])) +# manifest = Manifest(model) +# +# with ( +# patch.object(self.controller, "_jobs") as mock_jobs, +# patch.object(self.controller, "_launch_step") as mock_launch_step, +# patch.object(self.controller, "symlink_output_files") as mock_symlink, +# ): +# +# mock_jobs.get_db_host_addresses.return_value = {} +# mock_jobs.actively_monitoring = False +# +# # Track LaunchedManifestBuilder method calls +# with patch.object( +# LaunchedManifestBuilder, "get_entity_metadata_subdirectory" +# ) as mock_get_dir: +# mock_metadata_dir = MagicMock() +# mock_get_dir.return_value = mock_metadata_dir +# +# launched_manifest = self.controller._launch( +# "test_exp", self.temp_dir, manifest +# ) +# +# # Only "model" should be requested, not "ensemble" or "database" +# requested_types = [call[0][0] for call in mock_get_dir.call_args_list] +# assert "model" in requested_types +# assert "ensemble" not in requested_types +# # Note: database might be requested by _launch_orchestrator even with empty dbs +# +# def test_controller_metadata_directory_lazy_creation_pattern(self): +# """Test that metadata directories follow lazy creation pattern""" +# # Create manifest with both model and ensemble +# model = Model("test_model", {}, RunSettings("echo", ["hello"])) +# run_settings = RunSettings("echo", ["world"]) +# ensemble = Ensemble("test_ensemble", {}, run_settings=run_settings, replicas=2) +# manifest = Manifest(model, ensemble) +# +# with ( +# patch.object(self.controller, "_jobs") as mock_jobs, +# patch.object(self.controller, "_launch_step") as mock_launch_step, +# patch.object(self.controller, "symlink_output_files") as mock_symlink, +# ): +# +# mock_jobs.get_db_host_addresses.return_value = {} +# mock_jobs.actively_monitoring = False +# +# # Track the order of calls to get_entity_metadata_subdirectory +# call_order = [] +# original_get_dir = LaunchedManifestBuilder.get_entity_metadata_subdirectory +# +# def track_calls(self, entity_type): +# call_order.append(entity_type) +# return original_get_dir(self, entity_type) +# +# with patch.object( +# LaunchedManifestBuilder, "get_entity_metadata_subdirectory", track_calls +# ): +# launched_manifest = self.controller._launch( +# "test_exp", self.temp_dir, manifest +# ) +# +# # Verify that directories are requested in the expected order +# # This tests that directories are created lazily as they're needed +# assert "model" in call_order +# assert "ensemble" in call_order diff --git a/tests/test_dragon_client.py b/tests/test_dragon_client.py index c4c75aa6b9..ba2a15ec29 100644 --- a/tests/test_dragon_client.py +++ b/tests/test_dragon_client.py @@ -30,6 +30,7 @@ import pytest +from smartsim._core.config import CONFIG from smartsim._core.launcher.step.dragonStep import DragonBatchStep, DragonStep from smartsim.settings import DragonRunSettings from smartsim.settings.slurmSettings import SbatchSettings @@ -53,9 +54,9 @@ def dragon_batch_step(test_dir: str) -> "DragonBatchStep": batch_settings = SbatchSettings(nodes=num_nodes) batch_step = DragonBatchStep(batch_step_name, test_dir, batch_settings) - # ensure the status_dir is set - status_dir = (test_path / ".smartsim" / "logs").as_posix() - batch_step.meta["status_dir"] = status_dir + # ensure the metadata_dir is set + metadata_dir = (test_path / CONFIG.dragon_logs_subdir).as_posix() + batch_step.meta["metadata_dir"] = metadata_dir # create some steps to verify the requests file output changes rs0 = DragonRunSettings(exe="sleep", exe_args=["1"]) @@ -84,14 +85,14 @@ def 
dragon_batch_step(test_dir: str) -> "DragonBatchStep": for index, step in enumerate(steps): # ensure meta is configured... - step.meta["status_dir"] = status_dir + step.meta["metadata_dir"] = metadata_dir # ... and put all the steps into the batch batch_step.add_to_batch(steps[index]) return batch_step -def get_request_path_from_batch_script(launch_cmd: t.List[str]) -> pathlib.Path: +def get_request_path_from_batch_script(launch_cmd: list[str]) -> pathlib.Path: """Helper method for finding the path to a request file from the launch command""" script_path = pathlib.Path(launch_cmd[-1]) batch_script = script_path.read_text(encoding="utf-8") diff --git a/tests/test_dragon_installer.py b/tests/test_dragon_installer.py index 7e233000f1..7445d5ff2d 100644 --- a/tests/test_dragon_installer.py +++ b/tests/test_dragon_installer.py @@ -29,6 +29,7 @@ import tarfile import typing as t from collections import namedtuple +from collections.abc import Collection import pytest from github.GitReleaseAsset import GitReleaseAsset @@ -84,7 +85,7 @@ def extraction_dir(test_dir: str) -> pathlib.Path: @pytest.fixture -def test_assets(monkeypatch: pytest.MonkeyPatch) -> t.Dict[str, GitReleaseAsset]: +def test_assets(monkeypatch: pytest.MonkeyPatch) -> dict[str, GitReleaseAsset]: requester = Requester( auth=None, base_url="https://github.com", @@ -99,7 +100,7 @@ def test_assets(monkeypatch: pytest.MonkeyPatch) -> t.Dict[str, GitReleaseAsset] attributes = {"mock-attr": "mock-attr-value"} completed = True - assets: t.List[GitReleaseAsset] = [] + assets: list[GitReleaseAsset] = [] mock_archive_name_tpl = "{}-{}.4.1-{}ac132fe95.tar.gz" for python_version in ["py3.10", "py3.11"]: @@ -205,7 +206,7 @@ def test_retrieve_cached( ], ) def test_retrieve_asset_info( - test_assets: t.Collection[GitReleaseAsset], + test_assets: Collection[GitReleaseAsset], monkeypatch: pytest.MonkeyPatch, dragon_pin: str, pyv: str, diff --git a/tests/test_dragon_launcher.py b/tests/test_dragon_launcher.py index f2196e4eed..9147296d1b 100644 --- a/tests/test_dragon_launcher.py +++ b/tests/test_dragon_launcher.py @@ -38,6 +38,7 @@ import smartsim._core.config from smartsim._core._cli.scripts.dragon_install import create_dotenv +from smartsim._core.config import CONFIG from smartsim._core.config.config import get_config from smartsim._core.launcher.dragon.dragonLauncher import ( DragonConnector, @@ -70,9 +71,9 @@ def dragon_batch_step(test_dir: str) -> DragonBatchStep: batch_settings = SbatchSettings(nodes=num_nodes) batch_step = DragonBatchStep(batch_step_name, test_dir, batch_settings) - # ensure the status_dir is set - status_dir = (test_path / ".smartsim" / "logs").as_posix() - batch_step.meta["status_dir"] = status_dir + # ensure the metadata_dir is set + status_dir = (test_path / CONFIG.dragon_logs_subdir).as_posix() + batch_step.meta["metadata_dir"] = status_dir # create some steps to verify the requests file output changes rs0 = DragonRunSettings(exe="sleep", exe_args=["1"]) @@ -101,7 +102,7 @@ def dragon_batch_step(test_dir: str) -> DragonBatchStep: for index, step in enumerate(steps): # ensure meta is configured... - step.meta["status_dir"] = status_dir + step.meta["metadata_dir"] = status_dir # ... 
and put all the steps into the batch batch_step.add_to_batch(steps[index]) @@ -587,11 +588,11 @@ def test_run_step_fail(test_dir: str) -> None: """Verify that the dragon launcher still returns the step id when the running step fails""" test_path = pathlib.Path(test_dir) - status_dir = (test_path / ".smartsim" / "logs").as_posix() + status_dir = (test_path / CONFIG.dragon_logs_subdir).as_posix() rs = DragonRunSettings(exe="sleep", exe_args=["1"]) step0 = DragonStep("step0", test_dir, rs) - step0.meta["status_dir"] = status_dir + step0.meta["metadata_dir"] = status_dir mock_connector = MagicMock(spec=DragonConnector) mock_connector.is_connected = True @@ -673,11 +674,11 @@ def test_run_step_batch_failure(dragon_batch_step: DragonBatchStep) -> None: def test_run_step_success(test_dir: str) -> None: """Verify that the dragon launcher sends the correctly formatted request for a step""" test_path = pathlib.Path(test_dir) - status_dir = (test_path / ".smartsim" / "logs").as_posix() + status_dir = (test_path / CONFIG.dragon_logs_subdir).as_posix() rs = DragonRunSettings(exe="sleep", exe_args=["1"]) step0 = DragonStep("step0", test_dir, rs) - step0.meta["status_dir"] = status_dir + step0.meta["metadata_dir"] = status_dir mock_connector = MagicMock(spec=DragonConnector) mock_connector.is_connected = True @@ -700,7 +701,7 @@ def test_run_step_success(test_dir: str) -> None: send_invocation = mock_connector.send_request send_invocation.assert_called_once() - args = send_invocation.call_args[0] # call_args == t.Tuple[args, kwargs] + args = send_invocation.call_args[0] # call_args == tuple[args, kwargs] dragon_run_request = args[0] req_name = dragon_run_request.name # name sent to dragon env diff --git a/tests/test_dragon_run_policy.py b/tests/test_dragon_run_policy.py index ed6e64b76d..c61123f8de 100644 --- a/tests/test_dragon_run_policy.py +++ b/tests/test_dragon_run_policy.py @@ -28,6 +28,7 @@ import pytest +from smartsim._core.config import CONFIG from smartsim._core.launcher.step.dragonStep import DragonBatchStep, DragonStep from smartsim.settings.dragonRunSettings import DragonRunSettings from smartsim.settings.slurmSettings import SbatchSettings @@ -59,9 +60,9 @@ def dragon_batch_step(test_dir: str) -> "DragonBatchStep": batch_settings = SbatchSettings(nodes=num_nodes) batch_step = DragonBatchStep(batch_step_name, test_dir, batch_settings) - # ensure the status_dir is set - status_dir = (test_path / ".smartsim" / "logs").as_posix() - batch_step.meta["status_dir"] = status_dir + # ensure the metadata_dir is set + status_dir = (test_path / CONFIG.dragon_logs_subdir).as_posix() + batch_step.meta["metadata_dir"] = status_dir # create some steps to verify the requests file output changes rs0 = DragonRunSettings(exe="sleep", exe_args=["1"]) @@ -90,7 +91,7 @@ def dragon_batch_step(test_dir: str) -> "DragonBatchStep": for index, step in enumerate(steps): # ensure meta is configured... - step.meta["status_dir"] = status_dir + step.meta["metadata_dir"] = status_dir # ... 
and put all the steps into the batch batch_step.add_to_batch(steps[index]) diff --git a/tests/test_dragon_run_request.py b/tests/test_dragon_run_request.py index bc620dbd30..c664f66de6 100644 --- a/tests/test_dragon_run_request.py +++ b/tests/test_dragon_run_request.py @@ -58,7 +58,7 @@ class NodeMock(MagicMock): def __init__( - self, name: t.Optional[str] = None, num_gpus: int = 2, num_cpus: int = 8 + self, name: str | None = None, num_gpus: int = 2, num_cpus: int = 8 ) -> None: super().__init__() self._mock_id = name @@ -82,7 +82,7 @@ def num_gpus(self) -> str: def _set_id(self, value: str) -> None: self._mock_id = value - def gpus(self, parent: t.Any = None) -> t.List[str]: + def gpus(self, parent: t.Any = None) -> list[str]: if self._num_gpus: return [f"{self.hostname}-gpu{i}" for i in range(NodeMock._num_gpus)] return [] @@ -161,7 +161,7 @@ def get_mock_backend( def set_mock_group_infos( monkeypatch: pytest.MonkeyPatch, dragon_backend: "DragonBackend" -) -> t.Dict[str, "ProcessGroupInfo"]: +) -> dict[str, "ProcessGroupInfo"]: dragon_mock = MagicMock() process_mock = MagicMock() process_mock.configure_mock(**{"returncode": 0}) @@ -445,7 +445,6 @@ def test_shutdown_request( kill_jobs: bool, frontend_shutdown: bool, ) -> None: - monkeypatch.setenv("SMARTSIM_FLAG_TELEMETRY", "0") dragon_backend = get_mock_backend(monkeypatch) monkeypatch.setattr(dragon_backend, "_cooldown_period", 1) set_mock_group_infos(monkeypatch, dragon_backend) @@ -486,22 +485,6 @@ def test_shutdown_request( assert dragon_backend._has_cooled_down == kill_jobs -@pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") -@pytest.mark.parametrize("telemetry_flag", ["0", "1"]) -def test_cooldown_is_set(monkeypatch: pytest.MonkeyPatch, telemetry_flag: str) -> None: - monkeypatch.setenv("SMARTSIM_FLAG_TELEMETRY", telemetry_flag) - dragon_backend = get_mock_backend(monkeypatch) - - expected_cooldown = ( - 2 * CONFIG.telemetry_frequency + 5 if int(telemetry_flag) > 0 else 5 - ) - - if telemetry_flag: - assert dragon_backend.cooldown_period == expected_cooldown - else: - assert dragon_backend.cooldown_period == expected_cooldown - - @pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") def test_heartbeat_and_time(monkeypatch: pytest.MonkeyPatch) -> None: dragon_backend = get_mock_backend(monkeypatch) @@ -535,7 +518,7 @@ def test_can_honor(monkeypatch: pytest.MonkeyPatch, num_nodes: int) -> None: @pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @pytest.mark.parametrize("affinity", [[0], [0, 1], list(range(8))]) def test_can_honor_cpu_affinity( - monkeypatch: pytest.MonkeyPatch, affinity: t.List[int] + monkeypatch: pytest.MonkeyPatch, affinity: list[int] ) -> None: """Verify that valid CPU affinities are accepted""" dragon_backend = get_mock_backend(monkeypatch) @@ -579,7 +562,7 @@ def test_can_honor_cpu_affinity_out_of_range(monkeypatch: pytest.MonkeyPatch) -> @pytest.mark.skipif(not dragon_loaded, reason="Test is only for Dragon WLM systems") @pytest.mark.parametrize("affinity", [[0], [0, 1]]) def test_can_honor_gpu_affinity( - monkeypatch: pytest.MonkeyPatch, affinity: t.List[int] + monkeypatch: pytest.MonkeyPatch, affinity: list[int] ) -> None: """Verify that valid GPU affinities are accepted""" dragon_backend = get_mock_backend(monkeypatch) diff --git a/tests/test_dragon_run_request_nowlm.py b/tests/test_dragon_run_request_nowlm.py index 7a1cd90a25..1674892332 100644 --- a/tests/test_dragon_run_request_nowlm.py +++ 
b/tests/test_dragon_run_request_nowlm.py @@ -81,8 +81,8 @@ def test_run_request_with_empty_policy(monkeypatch: pytest.MonkeyPatch) -> None: ) def test_run_request_with_negative_affinity( device: str, - cpu_affinity: t.List[int], - gpu_affinity: t.List[int], + cpu_affinity: list[int], + gpu_affinity: list[int], ) -> None: """Verify that invalid affinity values fail validation""" with pytest.raises(ValidationError) as ex: diff --git a/tests/test_dragon_step.py b/tests/test_dragon_step.py index bcf939c48b..10c4e05986 100644 --- a/tests/test_dragon_step.py +++ b/tests/test_dragon_step.py @@ -32,6 +32,7 @@ import pytest +from smartsim._core.config import CONFIG from smartsim._core.launcher.step.dragonStep import DragonBatchStep, DragonStep from smartsim.settings import DragonRunSettings from smartsim.settings.pbsSettings import QsubBatchSettings @@ -55,9 +56,9 @@ def dragon_batch_step(test_dir: str) -> DragonBatchStep: batch_settings = SbatchSettings(nodes=num_nodes) batch_step = DragonBatchStep(batch_step_name, test_dir, batch_settings) - # ensure the status_dir is set - status_dir = (test_path / ".smartsim" / "logs").as_posix() - batch_step.meta["status_dir"] = status_dir + # ensure the metadata_dir is set + status_dir = (test_path / CONFIG.dragon_logs_subdir).as_posix() + batch_step.meta["metadata_dir"] = status_dir # create some steps to verify the requests file output changes rs0 = DragonRunSettings(exe="sleep", exe_args=["1"]) @@ -86,14 +87,14 @@ def dragon_batch_step(test_dir: str) -> DragonBatchStep: for index, step in enumerate(steps): # ensure meta is configured... - step.meta["status_dir"] = status_dir + step.meta["metadata_dir"] = status_dir # ... and put all the steps into the batch batch_step.add_to_batch(steps[index]) return batch_step -def get_request_path_from_batch_script(launch_cmd: t.List[str]) -> pathlib.Path: +def get_request_path_from_batch_script(launch_cmd: list[str]) -> pathlib.Path: """Helper method for finding the path to a request file from the launch command""" script_path = pathlib.Path(launch_cmd[-1]) batch_script = script_path.read_text(encoding="utf-8") @@ -297,7 +298,7 @@ def test_dragon_batch_step_get_launch_command_meta_fail(test_dir: str) -> None: ) def test_dragon_batch_step_get_launch_command( test_dir: str, - batch_settings_class: t.Type, + batch_settings_class: type, batch_exe: str, batch_header: str, node_spec_tpl: str, @@ -311,9 +312,9 @@ def test_dragon_batch_step_get_launch_command( batch_settings = batch_settings_class(nodes=num_nodes) batch_step = DragonBatchStep(batch_step_name, test_dir, batch_settings) - # ensure the status_dir is set - status_dir = (test_path / ".smartsim" / "logs").as_posix() - batch_step.meta["status_dir"] = status_dir + # ensure the metadata_dir is set + status_dir = (test_path / CONFIG.dragon_logs_subdir).as_posix() + batch_step.meta["metadata_dir"] = status_dir launch_cmd = batch_step.get_launch_cmd() assert launch_cmd @@ -353,9 +354,9 @@ def test_dragon_batch_step_write_request_file_no_steps(test_dir: str) -> None: batch_settings = SbatchSettings(nodes=num_nodes) batch_step = DragonBatchStep(batch_step_name, test_dir, batch_settings) - # ensure the status_dir is set - status_dir = (test_path / ".smartsim" / "logs").as_posix() - batch_step.meta["status_dir"] = status_dir + # ensure the metadata_dir is set + status_dir = (test_path / CONFIG.dragon_logs_subdir).as_posix() + batch_step.meta["metadata_dir"] = status_dir launch_cmd = batch_step.get_launch_cmd() requests_file = get_request_path_from_batch_script(launch_cmd) @@ 
-378,7 +379,7 @@ def test_dragon_batch_step_write_request_file( requests_file = get_request_path_from_batch_script(launch_cmd) requests_text = requests_file.read_text(encoding="utf-8") - requests_json: t.List[str] = json.loads(requests_text) + requests_json: list[str] = json.loads(requests_text) # verify that there is an item in file for each step added to the batch assert len(requests_json) == len(dragon_batch_step.steps) diff --git a/tests/test_experiment.py b/tests/test_experiment.py index 3e350a2713..9e9513798c 100644 --- a/tests/test_experiment.py +++ b/tests/test_experiment.py @@ -34,7 +34,6 @@ from smartsim import Experiment from smartsim._core.config import CONFIG from smartsim._core.config.config import Config -from smartsim._core.utils import serialize from smartsim.database import Orchestrator from smartsim.entity import Model from smartsim.error import SmartSimError @@ -197,54 +196,6 @@ def test_launcher_detection( assert exp._launcher == wlmutils.get_test_launcher() -def test_enable_disable_telemetry( - monkeypatch: pytest.MonkeyPatch, test_dir: str, config: Config -) -> None: - # Global telemetry defaults to `on` and can be modified by - # setting the value of env var SMARTSIM_FLAG_TELEMETRY to 0/1 - monkeypatch.setattr(os, "environ", {}) - exp = Experiment("my-exp", exp_path=test_dir) - exp.telemetry.enable() - assert exp.telemetry.is_enabled - - exp.telemetry.disable() - assert not exp.telemetry.is_enabled - - exp.telemetry.enable() - assert exp.telemetry.is_enabled - - exp.telemetry.disable() - assert not exp.telemetry.is_enabled - - exp.start() - mani_path = ( - pathlib.Path(test_dir) / config.telemetry_subdir / serialize.MANIFEST_FILENAME - ) - assert mani_path.exists() - - -def test_telemetry_default( - monkeypatch: pytest.MonkeyPatch, test_dir: str, config: Config -) -> None: - """Ensure the default values for telemetry configuration match expectation - that experiment telemetry is on""" - - # If env var related to telemetry doesn't exist, experiment should default to True - monkeypatch.setattr(os, "environ", {}) - exp = Experiment("my-exp", exp_path=test_dir) - assert exp.telemetry.is_enabled - - # If telemetry disabled in env, should get False - monkeypatch.setenv("SMARTSIM_FLAG_TELEMETRY", "0") - exp = Experiment("my-exp", exp_path=test_dir) - assert not exp.telemetry.is_enabled - - # If telemetry enabled in env, should get True - monkeypatch.setenv("SMARTSIM_FLAG_TELEMETRY", "1") - exp = Experiment("my-exp", exp_path=test_dir) - assert exp.telemetry.is_enabled - - def test_error_on_cobalt() -> None: with pytest.raises(SSUnsupportedError): exp = Experiment("cobalt_exp", launcher="cobalt") diff --git a/tests/test_indirect.py b/tests/test_indirect.py deleted file mode 100644 index 24dcd9372b..0000000000 --- a/tests/test_indirect.py +++ /dev/null @@ -1,251 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2025, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import pathlib -import sys - -import psutil -import pytest - -import conftest -from smartsim._core.config import CONFIG -from smartsim._core.entrypoints.indirect import cleanup, get_parser, get_ts_ms, main -from smartsim._core.utils.helpers import encode_cmd - -ALL_ARGS = { - "+command", - "+entity_type", - "+telemetry_dir", - "+output_file", - "+error_file", - "+working_dir", -} - -# The tests in this file belong to the group_a group -pytestmark = pytest.mark.group_a - - -# fmt: off -@pytest.mark.parametrize( - ["cmd", "missing"], - [ - pytest.param("indirect.py", {"+name", "+command", "+entity_type", "+telemetry_dir", "+working_dir"}, id="no args"), - pytest.param("indirect.py -c echo +entity_type ttt +telemetry_dir ddd +output_file ooo +working_dir www +error_file eee", {"+command"}, id="cmd typo"), - pytest.param("indirect.py -t orchestrator +command ccc +telemetry_dir ddd +output_file ooo +working_dir www +error_file eee", {"+entity_type"}, id="etype typo"), - pytest.param("indirect.py -d /foo/bar +entity_type ttt +command ccc +output_file ooo +working_dir www +error_file eee", {"+telemetry_dir"}, id="dir typo"), - pytest.param("indirect.py +entity_type ttt +telemetry_dir ddd +output_file ooo +working_dir www +error_file eee", {"+command"}, id="no cmd"), - pytest.param("indirect.py +command ccc +telemetry_dir ddd +output_file ooo +working_dir www +error_file eee", {"+entity_type"}, id="no etype"), - pytest.param("indirect.py +command ccc +entity_type ttt +output_file ooo +working_dir www +error_file eee", {"+telemetry_dir"}, id="no dir"), - ] -) -# fmt: on -def test_parser(capsys, cmd, missing): - """Test that the parser reports any missing required arguments""" - parser = get_parser() - - args = cmd.split() - - captured = capsys.readouterr() # throw away existing output - with pytest.raises(SystemExit) as ex: - ns = parser.parse_args(args) - - captured = capsys.readouterr() - assert "the following arguments are required" in captured.err - for arg in missing: - assert arg in captured.err - - expected = ALL_ARGS - missing - msg_tuple = captured.err.split("the following arguments are required: ") - if len(msg_tuple) < 2: - assert False, "error message indicates no missing arguments" - - actual_missing = msg_tuple[1].strip() - for exp in expected: - assert f"{exp}/" not in actual_missing - - -def test_cleanup(capsys, monkeypatch): - """Ensure cleanup attempts termination of correct process""" - mock_pid = 123 - create_msg = "creating: {0}" - term_msg = "terminating: {0}" - - class MockProc: - def __init__(self, pid: int): - print(create_msg.format(pid)) - - def terminate(self): - print(term_msg.format(mock_pid)) - - captured = capsys.readouterr() # throw away existing output - - with monkeypatch.context() as 
ctx: - ctx.setattr("psutil.pid_exists", lambda pid: True) - ctx.setattr("psutil.Process", MockProc) - ctx.setattr("smartsim._core.entrypoints.indirect.STEP_PID", mock_pid) - cleanup() - - captured = capsys.readouterr() - assert create_msg.format(mock_pid) in captured.out - assert term_msg.format(mock_pid) in captured.out - - -def test_cleanup_late(capsys, monkeypatch): - """Ensure cleanup exceptions are swallowed if a process is already terminated""" - mock_pid = 123 - create_msg = "creating: {0}" - term_msg = "terminating: {0}" - - class MockMissingProc: - def __init__(self, pid: int) -> None: - print(create_msg.format(mock_pid)) - raise psutil.NoSuchProcess(pid) - - def terminate(self) -> None: - print(term_msg.format(mock_pid)) - - captured = capsys.readouterr() # throw away existing output - - with monkeypatch.context() as ctx: - ctx.setattr("psutil.pid_exists", lambda pid: True) - ctx.setattr("psutil.Process", MockMissingProc) - ctx.setattr("smartsim._core.entrypoints.indirect.STEP_PID", mock_pid) - cleanup() - - captured = capsys.readouterr() - assert create_msg.format(mock_pid) in captured.out - - -def test_ts(): - """Ensure expected output type""" - ts = get_ts_ms() - assert isinstance(ts, int) - - -def test_indirect_main_dir_check(test_dir): - """Ensure that the proxy validates the test directory exists""" - exp_dir = pathlib.Path(test_dir) - - cmd = ["echo", "unit-test"] - encoded_cmd = encode_cmd(cmd) - - status_path = exp_dir / CONFIG.telemetry_subdir - - # show that a missing status_path is created when missing - main(encoded_cmd, "application", exp_dir, status_path) - - assert status_path.exists() - - -def test_indirect_main_cmd_check(capsys, test_dir, monkeypatch): - """Ensure that the proxy validates the cmd is not empty or whitespace-only""" - exp_dir = pathlib.Path(test_dir) - - captured = capsys.readouterr() # throw away existing output - with monkeypatch.context() as ctx, pytest.raises(ValueError) as ex: - ctx.setattr("smartsim._core.entrypoints.indirect.logger.error", print) - _ = main("", "application", exp_dir, exp_dir / CONFIG.telemetry_subdir) - - captured = capsys.readouterr() - assert "Invalid cmd supplied" in ex.value.args[0] - - # test with non-emptystring cmd - with monkeypatch.context() as ctx, pytest.raises(ValueError) as ex: - ctx.setattr("smartsim._core.entrypoints.indirect.logger.error", print) - status_dir = exp_dir / CONFIG.telemetry_subdir - _ = main(" \n \t ", "application", exp_dir, status_dir) - - captured = capsys.readouterr() - assert "Invalid cmd supplied" in ex.value.args[0] - - -def test_process_failure(fileutils, test_dir: str, monkeypatch: pytest.MonkeyPatch): - """Ensure that a stop event is logged if the process unexpectedly terminates""" - mock_pid = 1122334455 - create_msg = "creating: {0}" - term_msg = "term: {0}" - wait_msg = "wait: {0}" - - class MockProc: - def __init__(self, *args, **kwargs): - print(create_msg.format(mock_pid)) - - @property - def pid(self): - return mock_pid - - def terminate(self): - print(term_msg.format(mock_pid)) - - def wait(self): - print(wait_msg.format(mock_pid)) - raise Exception("You shall not pass!") - - script = fileutils.get_test_conf_path("sleep.py") - - exp_dir = pathlib.Path(test_dir) - - raw_cmd = f"{sys.executable} {script} --time=10" - cmd = encode_cmd(raw_cmd.split()) - - mock_track = conftest.CountingCallable() - - with monkeypatch.context() as ctx: - ctx.setattr("smartsim._core.entrypoints.indirect.write_event", mock_track) - ctx.setattr("psutil.pid_exists", lambda pid: True) - 
ctx.setattr("psutil.Popen", MockProc) - ctx.setattr("psutil.Process", MockProc) # handle the proc.terminate() - ctx.setattr("smartsim._core.entrypoints.indirect.STEP_PID", mock_pid) - - rc = main(cmd, "application", exp_dir, exp_dir / CONFIG.telemetry_subdir) - assert rc == -1 - - (args1, _), (args2, kwargs2) = mock_track.details - assert "start" in args1 - assert "stop" in args2 - assert kwargs2.get("returncode", -1) - - -def test_complete_process( - fileutils: conftest.FileUtils, test_dir: str, monkeypatch: pytest.MonkeyPatch -) -> None: - """Ensure the happy-path completes and returns a success return code""" - script = fileutils.get_test_conf_path("sleep.py") - - exp_dir = pathlib.Path(test_dir) - - raw_cmd = f"{sys.executable} {script} --time=1" - cmd = encode_cmd(raw_cmd.split()) - - mock_track = conftest.CountingCallable() - with monkeypatch.context() as ctx: - ctx.setattr("smartsim._core.entrypoints.indirect.write_event", mock_track) - rc = main(cmd, "application", exp_dir, exp_dir / CONFIG.telemetry_subdir) - assert rc == 0 - - (args1, _), (args2, _) = mock_track.details - assert "start" in args1 - assert "stop" in args2 diff --git a/tests/test_logs.py b/tests/test_logs.py index 8bdbde735c..b24ef14ca9 100644 --- a/tests/test_logs.py +++ b/tests/test_logs.py @@ -35,22 +35,10 @@ import smartsim.log from smartsim import Experiment -_CFG_TM_ENABLED_ATTR = "telemetry_enabled" - # The tests in this file belong to the group_b group pytestmark = pytest.mark.group_b -@pytest.fixture -def turn_on_tm(monkeypatch): - monkeypatch.setattr( - smartsim._core.config.config.Config, - _CFG_TM_ENABLED_ATTR, - property(lambda self: True), - ) - yield - - @pytest.mark.parametrize( "level,expect_d,expect_i,expect_w,expect_e", [ @@ -112,7 +100,7 @@ def test_add_exp_loggers(test_dir): assert err_file.is_file() -def test_get_logger(test_dir: str, turn_on_tm, monkeypatch): +def test_get_logger(test_dir: str, monkeypatch): """Ensure the correct logger type is instantiated""" monkeypatch.setenv("SMARTSIM_LOG_LEVEL", "developer") logger = smartsim.log.get_logger("SmartSimTest", "INFO") @@ -132,13 +120,13 @@ def test_get_logger(test_dir: str, turn_on_tm, monkeypatch): pytest.param("developer", "debug", id="translation back, developer"), ], ) -def test_translate_log_level(input_level: str, exp_level: str, turn_on_tm): +def test_translate_log_level(input_level: str, exp_level: str): """Ensure the correct logger type is instantiated""" translated_level = smartsim.log._translate_log_level(input_level) assert exp_level == translated_level -def test_exp_logs(test_dir: str, turn_on_tm, monkeypatch): +def test_exp_logs(test_dir: str, monkeypatch): """Ensure that experiment loggers are added when context info exists""" monkeypatch.setenv("SMARTSIM_LOG_LEVEL", "developer") test_dir = pathlib.Path(test_dir) @@ -181,7 +169,7 @@ def test_exp_logs(test_dir: str, turn_on_tm, monkeypatch): smartsim.log.ctx_exp_path.reset(token) -def test_context_leak(test_dir: str, turn_on_tm, monkeypatch): +def test_context_leak(test_dir: str, monkeypatch): """Ensure that exceptions do not leave the context in an invalid state""" test_dir = pathlib.Path(test_dir) test_dir.mkdir(parents=True, exist_ok=True) diff --git a/tests/test_manifest.py b/tests/test_manifest.py index a49b4eec34..8ff9d0fb89 100644 --- a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -33,14 +33,7 @@ import pytest from smartsim import Experiment -from smartsim._core.control.manifest import ( - LaunchedManifest, - LaunchedManifestBuilder, - Manifest, -) -from 
smartsim._core.control.manifest import ( - _LaunchedManifestMetadata as LaunchedManifestMetadata, -) +from smartsim._core.control.manifest import Manifest from smartsim._core.launcher.step import Step from smartsim.database import Orchestrator from smartsim.entity import Ensemble, Model @@ -54,8 +47,8 @@ # ---- create entities for testing -------- -_EntityResult = t.Tuple[ - Experiment, t.Tuple[Model, Model], Ensemble, Orchestrator, DBModel, DBScript +_EntityResult = tuple[ + Experiment, tuple[Model, Model], Ensemble, Orchestrator, DBModel, DBScript ] @@ -163,99 +156,8 @@ def test_manifest_detects_db_objects( ) monkeypatch.setattr(*patch) - assert Manifest(model, ensemble).has_db_objects == has_db_objects - - -def test_launched_manifest_transform_data(entities: _EntityResult) -> None: - _, (model, model_2), ensemble, orc, _, _ = entities - - models = [(model, 1), (model_2, 2)] - ensembles = [(ensemble, [(m, i) for i, m in enumerate(ensemble.entities)])] - dbs = [(orc, [(n, i) for i, n in enumerate(orc.entities)])] - lmb = LaunchedManifest( - metadata=LaunchedManifestMetadata("name", "path", "launcher", "run_id"), - models=models, # type: ignore - ensembles=ensembles, # type: ignore - databases=dbs, # type: ignore - ) - transformed = lmb.map(lambda x: str(x)) - - assert transformed.models == tuple((m, str(i)) for m, i in models) - assert transformed.ensembles[0][1] == tuple((m, str(i)) for m, i in ensembles[0][1]) - assert transformed.databases[0][1] == tuple((n, str(i)) for n, i in dbs[0][1]) + assert Manifest(model, ensemble).has_db_objects == has_db_objects -def test_launched_manifest_builder_correctly_maps_data(entities: _EntityResult) -> None: - _, (model, model_2), ensemble, orc, _, _ = entities - - lmb = LaunchedManifestBuilder( - "name", "path", "launcher name", str(uuid4()) - ) # type: ignore - lmb.add_model(model, 1) - lmb.add_model(model_2, 1) - lmb.add_ensemble(ensemble, [i for i in range(len(ensemble.entities))]) - lmb.add_database(orc, [i for i in range(len(orc.entities))]) - - manifest = lmb.finalize() - assert len(manifest.models) == 2 - assert len(manifest.ensembles) == 1 - assert len(manifest.databases) == 1 - - -def test_launced_manifest_builder_raises_if_lens_do_not_match( - entities: _EntityResult, -) -> None: - _, _, ensemble, orc, _, _ = entities - - lmb = LaunchedManifestBuilder( - "name", "path", "launcher name", str(uuid4()) - ) # type: ignore - with pytest.raises(ValueError): - lmb.add_ensemble(ensemble, list(range(123))) - with pytest.raises(ValueError): - lmb.add_database(orc, list(range(123))) - - -def test_launched_manifest_builer_raises_if_attaching_data_to_empty_collection( - monkeypatch: pytest.MonkeyPatch, entities: _EntityResult -) -> None: - _, _, ensemble, _, _, _ = entities - - lmb: LaunchedManifestBuilder[t.Tuple[str, Step]] = LaunchedManifestBuilder( - "name", "path", "launcher", str(uuid4()) - ) - monkeypatch.setattr(ensemble, "entities", []) - with pytest.raises(ValueError): - lmb.add_ensemble(ensemble, []) - - -def test_lmb_and_launched_manifest_have_same_paths_for_launched_metadata() -> None: - exp_path = "/path/to/some/exp" - lmb: LaunchedManifestBuilder[t.Tuple[str, Step]] = LaunchedManifestBuilder( - "exp_name", exp_path, "launcher", str(uuid4()) - ) - manifest = lmb.finalize() - assert ( - lmb.exp_telemetry_subdirectory == manifest.metadata.exp_telemetry_subdirectory - ) - assert ( - lmb.run_telemetry_subdirectory == manifest.metadata.run_telemetry_subdirectory - ) - assert ( - os.path.commonprefix( - [ - 
manifest.metadata.run_telemetry_subdirectory, - manifest.metadata.exp_telemetry_subdirectory, - manifest.metadata.manifest_file_path, - exp_path, - ] - ) - == exp_path - ) - assert os.path.commonprefix( - [ - manifest.metadata.run_telemetry_subdirectory, - manifest.metadata.exp_telemetry_subdirectory, - manifest.metadata.manifest_file_path, - ] - ) == str(manifest.metadata.exp_telemetry_subdirectory) +# Removed tests for LaunchedManifest, LaunchedManifestBuilder, and _LaunchedManifestMetadata +# since those classes were removed per MattToast's feedback diff --git a/tests/test_manifest_metadata_directories.py b/tests/test_manifest_metadata_directories.py new file mode 100644 index 0000000000..95cc3d201d --- /dev/null +++ b/tests/test_manifest_metadata_directories.py @@ -0,0 +1,205 @@ +"""Test the metadata directory functionality added to LaunchedManifestBuilder""" + +# NOTE: This entire test file has been commented out because it tests +# LaunchedManifestBuilder functionality which has been removed. +# All LaunchedManifest-related classes have been deleted from the codebase. +# +# # import pathlib +# # import tempfile +# # import time +# # from unittest.mock import patch +# # +# # import pytest +# # +# # from smartsim._core.config import CONFIG +# # from smartsim._core.control.manifest import LaunchedManifestBuilder +# +# +# class TestLaunchedManifestBuilderMetadataDirectories: +# """Test metadata directory properties and methods of LaunchedManifestBuilder""" +# +# def test_exp_metadata_subdirectory_property(self): +# """Test that exp_metadata_subdirectory returns correct path""" +# with tempfile.TemporaryDirectory() as temp_dir: +# lmb = LaunchedManifestBuilder( +# exp_name="test_exp", +# exp_path=temp_dir, +# launcher_name="local", +# ) +# +# expected_path = pathlib.Path(temp_dir) / CONFIG.metadata_subdir +# assert lmb.exp_metadata_subdirectory == expected_path +# +# def test_run_metadata_subdirectory_property(self): +# """Test that run_metadata_subdirectory returns correct timestamped path""" +# with tempfile.TemporaryDirectory() as temp_dir: +# # Mock the timestamp to make it predictable +# mock_timestamp = "1234567890123" +# with patch.object(time, "time", return_value=1234567890.123): +# lmb = LaunchedManifestBuilder( +# exp_name="test_exp", +# exp_path=temp_dir, +# launcher_name="local", +# ) +# +# expected_path = ( +# pathlib.Path(temp_dir) +# / CONFIG.metadata_subdir +# / f"run_{mock_timestamp}" +# ) +# assert lmb.run_metadata_subdirectory == expected_path +# +# def test_run_metadata_subdirectory_uses_actual_timestamp(self): +# """Test that run_metadata_subdirectory uses actual timestamp from launch""" +# with tempfile.TemporaryDirectory() as temp_dir: +# lmb = LaunchedManifestBuilder( +# exp_name="test_exp", +# exp_path=temp_dir, +# launcher_name="local", +# ) +# +# # Check that the timestamp is reasonable (within last few seconds) +# run_dir_name = lmb.run_metadata_subdirectory.name +# assert run_dir_name.startswith("run_") +# +# # Extract timestamp and verify it's recent +# timestamp_str = run_dir_name[4:] # Remove "run_" prefix +# timestamp_ms = int(timestamp_str) +# current_time_ms = int(time.time() * 1000) +# +# # Should be within 5 seconds of current time +# assert abs(current_time_ms - timestamp_ms) < 5000 +# +# def test_get_entity_metadata_subdirectory_method(self): +# """Test that get_entity_metadata_subdirectory returns correct entity-specific paths""" +# with tempfile.TemporaryDirectory() as temp_dir: +# mock_timestamp = "1234567890123" +# with patch.object(time, 
"time", return_value=1234567890.123): +# lmb = LaunchedManifestBuilder( +# exp_name="test_exp", +# exp_path=temp_dir, +# launcher_name="local", +# ) +# +# # Test different entity types +# model_dir = lmb.get_entity_metadata_subdirectory("model") +# ensemble_dir = lmb.get_entity_metadata_subdirectory("ensemble") +# database_dir = lmb.get_entity_metadata_subdirectory("database") +# +# base_path = ( +# pathlib.Path(temp_dir) +# / CONFIG.metadata_subdir +# / f"run_{mock_timestamp}" +# ) +# +# assert model_dir == base_path / "model" +# assert ensemble_dir == base_path / "ensemble" +# assert database_dir == base_path / "database" +# +# def test_get_entity_metadata_subdirectory_custom_entity_type(self): +# """Test that get_entity_metadata_subdirectory works with custom entity types""" +# with tempfile.TemporaryDirectory() as temp_dir: +# lmb = LaunchedManifestBuilder( +# exp_name="test_exp", +# exp_path=temp_dir, +# launcher_name="local", +# ) +# +# # Test with custom entity type +# custom_dir = lmb.get_entity_metadata_subdirectory("custom_entity_type") +# +# expected_path = lmb.run_metadata_subdirectory / "custom_entity_type" +# assert custom_dir == expected_path +# +# def test_metadata_directory_hierarchy(self): +# """Test that the metadata directory hierarchy is correct""" +# with tempfile.TemporaryDirectory() as temp_dir: +# lmb = LaunchedManifestBuilder( +# exp_name="test_exp", +# exp_path=temp_dir, +# launcher_name="local", +# ) +# +# # Test that the hierarchy is: exp_path/.smartsim/metadata/run_/entity_type +# model_dir = lmb.get_entity_metadata_subdirectory("model") +# +# # Check path components +# path_parts = model_dir.parts +# # Extract the metadata subdir parts for comparison +# metadata_parts = pathlib.Path(CONFIG.metadata_subdir).parts +# if len(metadata_parts) == 2: # e.g., ".smartsim/metadata" +# assert path_parts[-4] == metadata_parts[0] # ".smartsim" +# assert path_parts[-3] == metadata_parts[1] # "metadata" +# else: # single part, e.g., "metadata" +# assert path_parts[-3] == metadata_parts[0] +# assert path_parts[-2].startswith("run_") +# assert path_parts[-1] == "model" +# +# def test_multiple_instances_have_different_timestamps(self): +# """Test that multiple LaunchedManifestBuilder instances have different timestamps""" +# with tempfile.TemporaryDirectory() as temp_dir: +# lmb1 = LaunchedManifestBuilder( +# exp_name="test_exp1", +# exp_path=temp_dir, +# launcher_name="local", +# ) +# +# # Small delay to ensure different timestamps +# time.sleep(0.001) +# +# lmb2 = LaunchedManifestBuilder( +# exp_name="test_exp2", +# exp_path=temp_dir, +# launcher_name="local", +# ) +# +# # Timestamps should be different +# assert lmb1._launch_timestamp != lmb2._launch_timestamp +# assert lmb1.run_metadata_subdirectory != lmb2.run_metadata_subdirectory +# +# def test_same_instance_consistent_timestamps(self): +# """Test that the same instance always returns consistent timestamps""" +# with tempfile.TemporaryDirectory() as temp_dir: +# lmb = LaunchedManifestBuilder( +# exp_name="test_exp", +# exp_path=temp_dir, +# launcher_name="local", +# ) +# +# # Multiple calls should return the same timestamp +# timestamp1 = lmb._launch_timestamp +# timestamp2 = lmb._launch_timestamp +# assert timestamp1 == timestamp2 +# +# # Multiple calls to run_metadata_subdirectory should be consistent +# run_dir1 = lmb.run_metadata_subdirectory +# run_dir2 = lmb.run_metadata_subdirectory +# assert run_dir1 == run_dir2 +# +# def test_exp_path_with_pathlib(self): +# """Test that metadata directories work correctly when 
exp_path is a pathlib.Path""" +# with tempfile.TemporaryDirectory() as temp_dir: +# exp_path = pathlib.Path(temp_dir) +# lmb = LaunchedManifestBuilder( +# exp_name="test_exp", +# exp_path=str(exp_path), # LaunchedManifestBuilder expects string +# launcher_name="local", +# ) +# +# expected_exp_metadata = exp_path / CONFIG.metadata_subdir +# assert lmb.exp_metadata_subdirectory == expected_exp_metadata +# +# def test_metadata_paths_are_pathlib_paths(self): +# """Test that all metadata directory methods return pathlib.Path objects""" +# with tempfile.TemporaryDirectory() as temp_dir: +# lmb = LaunchedManifestBuilder( +# exp_name="test_exp", +# exp_path=temp_dir, +# launcher_name="local", +# ) +# +# assert isinstance(lmb.exp_metadata_subdirectory, pathlib.Path) +# assert isinstance(lmb.run_metadata_subdirectory, pathlib.Path) +# assert isinstance( +# lmb.get_entity_metadata_subdirectory("model"), pathlib.Path +# ) diff --git a/tests/test_metadata_integration.py b/tests/test_metadata_integration.py new file mode 100644 index 0000000000..235286b552 --- /dev/null +++ b/tests/test_metadata_integration.py @@ -0,0 +1,348 @@ +"""Integration tests for metadata directory functionality end-to-end""" + +import pathlib +import tempfile +import time +from unittest.mock import patch + +import pytest + +from smartsim import Experiment +from smartsim._core.config import CONFIG +from smartsim.database.orchestrator import Orchestrator +from smartsim.entity import Ensemble, Model +from smartsim.settings import RunSettings + + +class TestMetadataDirectoryIntegration: + """Integration tests for metadata directory creation across the SmartSim workflow""" + + def test_experiment_creates_correct_metadata_directory_structure_model_only(self): + """Test that launching only models creates the correct directory structure""" + with tempfile.TemporaryDirectory() as temp_dir: + exp = Experiment("test_metadata_model", exp_path=temp_dir, launcher="local") + + # Create a simple model + model = exp.create_model( + "test_model", run_settings=exp.create_run_settings("echo", ["hello"]) + ) + + # Start and wait for completion + exp.start(model, block=False) + exp.poll(interval=1) + + # Verify directory structure + smartsim_dir = pathlib.Path(temp_dir) / CONFIG.smartsim_base_dir + metadata_dir = smartsim_dir / "metadata" + + assert metadata_dir.exists(), "Metadata directory should exist" + + # Check for run-specific subdirectory + run_dirs = [ + d + for d in metadata_dir.iterdir() + if d.is_dir() and d.name.startswith("run_") + ] + assert ( + len(run_dirs) == 1 + ), f"Should have exactly one run directory, found: {run_dirs}" + + run_dir = run_dirs[0] + + # Check for entity-specific subdirectories with entity names + model_dir = run_dir / "model" / "test_model" + ensemble_dir = run_dir / "ensemble" + database_dir = run_dir / "database" + + assert ( + model_dir.exists() + ), f"Model metadata directory should exist: {model_dir}" + assert ( + not ensemble_dir.exists() + ), f"Ensemble metadata directory should not exist: {ensemble_dir}" + assert ( + not database_dir.exists() + ), f"Database metadata directory should not exist: {database_dir}" + + # Clean up + exp.stop(model) + + def test_experiment_creates_correct_metadata_directory_structure_ensemble_only( + self, + ): + """Test that launching only ensembles creates the correct directory structure""" + with tempfile.TemporaryDirectory() as temp_dir: + exp = Experiment( + "test_metadata_ensemble", exp_path=temp_dir, launcher="local" + ) + + # Create an ensemble + ensemble = 
exp.create_ensemble( + "test_ensemble", + run_settings=exp.create_run_settings("echo", ["world"]), + replicas=2, + ) + + # Start and wait for completion + exp.start(ensemble, block=False) + exp.poll(interval=1) + + # Verify directory structure + smartsim_dir = pathlib.Path(temp_dir) / CONFIG.smartsim_base_dir + metadata_dir = smartsim_dir / "metadata" + + assert metadata_dir.exists(), "Metadata directory should exist" + + # Check for run-specific subdirectory + run_dirs = [ + d + for d in metadata_dir.iterdir() + if d.is_dir() and d.name.startswith("run_") + ] + assert ( + len(run_dirs) == 1 + ), f"Should have exactly one run directory, found: {run_dirs}" + + run_dir = run_dirs[0] + + # Check for entity-specific subdirectories with entity names + model_dir = run_dir / "model" + ensemble_dir = run_dir / "ensemble" / "test_ensemble" + database_dir = run_dir / "database" + + assert ( + not model_dir.exists() + ), f"Model metadata directory should not exist: {model_dir}" + assert ( + ensemble_dir.exists() + ), f"Ensemble metadata directory should exist: {ensemble_dir}" + assert ( + not database_dir.exists() + ), f"Database metadata directory should not exist: {database_dir}" + + # Clean up + exp.stop(ensemble) + + def test_experiment_creates_correct_metadata_directory_structure_all_types(self): + """Test that launching models, ensembles, and orchestrator creates all directories""" + with tempfile.TemporaryDirectory() as temp_dir: + exp = Experiment("test_metadata_all", exp_path=temp_dir, launcher="local") + + # Create model + model = exp.create_model( + "test_model", run_settings=exp.create_run_settings("echo", ["hello"]) + ) + + # Create ensemble + ensemble = exp.create_ensemble( + "test_ensemble", + run_settings=exp.create_run_settings("echo", ["world"]), + replicas=2, + ) + + # Create database + orchestrator = exp.create_database(port=6379, interface="lo") + + # Start all entities - orchestrator and compute entities may create separate run dirs + exp.start(orchestrator, block=False) + exp.start(model, ensemble, block=False) + exp.poll(interval=1) + + # Verify directory structure + smartsim_dir = pathlib.Path(temp_dir) / CONFIG.smartsim_base_dir + metadata_dir = smartsim_dir / "metadata" + + assert metadata_dir.exists(), "Metadata directory should exist" + + # Check for run-specific subdirectories (may be 1 or 2 depending on timing) + run_dirs = [ + d + for d in metadata_dir.iterdir() + if d.is_dir() and d.name.startswith("run_") + ] + assert ( + len(run_dirs) >= 1 + ), f"Should have at least one run directory, found: {run_dirs}" + + # Find directory with model/ensemble subdirs + run_dir = None + for rd in run_dirs: + if (rd / "model").exists() or (rd / "ensemble").exists(): + run_dir = rd + break + + assert run_dir is not None, "Should find run directory with entity subdirs" + + # Check for entity-specific subdirectories with entity names + model_dir = run_dir / "model" / "test_model" + ensemble_dir = run_dir / "ensemble" / "test_ensemble" + + assert ( + model_dir.exists() + ), f"Model metadata directory should exist: {model_dir}" + assert ( + ensemble_dir.exists() + ), f"Ensemble metadata directory should exist: {ensemble_dir}" + # Clean up + exp.stop(model, ensemble) + exp.stop(orchestrator) + + def test_multiple_experiment_runs_create_separate_run_directories(self): + """Test that multiple experiment runs create separate timestamped directories""" + with tempfile.TemporaryDirectory() as temp_dir: + # First experiment run + exp1 = Experiment("test_metadata_run1", exp_path=temp_dir, 
launcher="local") + model1 = exp1.create_model( + "test_model1", run_settings=exp1.create_run_settings("echo", ["run1"]) + ) + + exp1.start(model1, block=False) + exp1.poll(interval=1) + exp1.stop(model1) + + # Small delay to ensure different timestamps + time.sleep(0.01) + + # Second experiment run + exp2 = Experiment("test_metadata_run2", exp_path=temp_dir, launcher="local") + model2 = exp2.create_model( + "test_model2", run_settings=exp2.create_run_settings("echo", ["run2"]) + ) + + exp2.start(model2, block=False) + exp2.poll(interval=1) + exp2.stop(model2) + + # Verify two separate run directories exist + metadata_dir = pathlib.Path(temp_dir) / CONFIG.metadata_subdir + run_dirs = [ + d + for d in metadata_dir.iterdir() + if d.is_dir() and d.name.startswith("run_") + ] + + assert ( + len(run_dirs) == 2 + ), f"Should have exactly two run directories, found: {run_dirs}" + + # Verify both have model subdirectories with entity names + model_names = ["test_model1", "test_model2"] + found_models = [] + + for run_dir in run_dirs: + model_parent_dir = run_dir / "model" + assert ( + model_parent_dir.exists() + ), f"Model parent directory should exist in {run_dir}" + + # Find which model is in this run directory + for model_name in model_names: + model_dir = run_dir / "model" / model_name + if model_dir.exists(): + found_models.append(model_name) + break + else: + assert False, f"No model directory found in {run_dir}" + + # Verify we found both models + assert ( + len(found_models) == 2 + ), f"Should find both models, found: {found_models}" + assert set(found_models) == set( + model_names + ), f"Should find correct models: {model_names}, found: {found_models}" + + def test_metadata_directory_structure_with_batch_entities(self): + """Test metadata directory creation pattern with batch-like behavior""" + with tempfile.TemporaryDirectory() as temp_dir: + exp = Experiment("test_metadata_batch", exp_path=temp_dir, launcher="local") + + # Create model and ensemble (batch settings don't work with local launcher) + model = exp.create_model( + "batch_model", + run_settings=exp.create_run_settings("echo", ["batch_hello"]), + ) + + ensemble = exp.create_ensemble( + "batch_ensemble", + run_settings=exp.create_run_settings("echo", ["batch_world"]), + replicas=2, + ) + + # Start entities to trigger metadata directory creation + exp.start(model, ensemble, block=False) + exp.poll(interval=1) + + # Verify directory structure was created + smartsim_dir = pathlib.Path(temp_dir) / CONFIG.smartsim_base_dir + metadata_dir = smartsim_dir / "metadata" + + assert metadata_dir.exists(), "Metadata directory should exist" + + # Check for run-specific subdirectory + run_dirs = [ + d + for d in metadata_dir.iterdir() + if d.is_dir() and d.name.startswith("run_") + ] + assert ( + len(run_dirs) >= 1 + ), f"Should have at least one run directory, found: {run_dirs}" + + # Check that at least one run directory has entity subdirs with entity names + has_model_dir = any( + (rd / "model" / "batch_model").exists() for rd in run_dirs + ) + has_ensemble_dir = any( + (rd / "ensemble" / "batch_ensemble").exists() for rd in run_dirs + ) + + assert ( + has_model_dir + ), "Should have model metadata directory with entity name" + assert ( + has_ensemble_dir + ), "Should have ensemble metadata directory with entity name" + + # Stop entities to clean up + exp.stop(model, ensemble) + + def test_metadata_directory_permissions_and_structure(self): + """Test that metadata directories are created with correct permissions""" + with 
tempfile.TemporaryDirectory() as temp_dir: + exp = Experiment("test_metadata_perms", exp_path=temp_dir, launcher="local") + + model = exp.create_model( + "test_model", + run_settings=exp.create_run_settings("echo", ["permissions"]), + ) + + exp.start(model, block=False) + exp.poll(interval=1) + + # Check directory structure and permissions + smartsim_dir = pathlib.Path(temp_dir) / CONFIG.smartsim_base_dir + metadata_dir = smartsim_dir / "metadata" + + # Verify directories exist and are readable/writable + assert metadata_dir.exists() and metadata_dir.is_dir() + assert ( + metadata_dir.stat().st_mode & 0o700 + ) # Owner should have read/write/execute + + run_dirs = [ + d + for d in metadata_dir.iterdir() + if d.is_dir() and d.name.startswith("run_") + ] + if run_dirs: + run_dir = run_dirs[0] + assert run_dir.exists() and run_dir.is_dir() + + # Check for entity-specific model directory with entity name + model_dir = run_dir / "model" / "test_model" + if model_dir.exists(): + assert model_dir.is_dir() + assert model_dir.stat().st_mode & 0o700 + + exp.stop(model) diff --git a/tests/test_model.py b/tests/test_model.py index fe4a482b35..1523475bd7 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -30,7 +30,8 @@ import pytest from smartsim import Experiment -from smartsim._core.control.manifest import LaunchedManifestBuilder + +# Removed LaunchedManifestBuilder import since it was deleted from smartsim._core.launcher.step import SbatchStep, SrunStep from smartsim.entity import Ensemble, Model from smartsim.entity.model import _parse_model_parameters @@ -97,7 +98,8 @@ def start_wo_job_manager( self, exp_name, exp_path, manifest, block=True, kill_on_interrupt=True ): self._launch(exp_name, exp_path, manifest) - return LaunchedManifestBuilder("name", "path", "launcher").finalize() + # Controller start method now returns None after LaunchedManifest removal + return None def launch_step_nop(self, step, entity): entity_steps.append((step, entity)) diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index c7d8131eed..7e992f3adc 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -88,7 +88,7 @@ def test_orc_is_active_functions( def test_multiple_interfaces( - test_dir: str, wlmutils: t.Type["conftest.WLMUtils"] + test_dir: str, wlmutils: type["conftest.WLMUtils"] ) -> None: exp_name = "test_multiple_interfaces" exp = Experiment(exp_name, launcher="local", exp_path=test_dir) @@ -136,7 +136,7 @@ def test_catch_local_db_errors() -> None: ##### PBS ###### -def test_pbs_set_run_arg(wlmutils: t.Type["conftest.WLMUtils"]) -> None: +def test_pbs_set_run_arg(wlmutils: type["conftest.WLMUtils"]) -> None: orc = Orchestrator( wlmutils.get_test_port(), db_nodes=3, @@ -155,7 +155,7 @@ def test_pbs_set_run_arg(wlmutils: t.Type["conftest.WLMUtils"]) -> None: ) -def test_pbs_set_batch_arg(wlmutils: t.Type["conftest.WLMUtils"]) -> None: +def test_pbs_set_batch_arg(wlmutils: type["conftest.WLMUtils"]) -> None: orc = Orchestrator( wlmutils.get_test_port(), db_nodes=3, @@ -184,7 +184,7 @@ def test_pbs_set_batch_arg(wlmutils: t.Type["conftest.WLMUtils"]) -> None: ##### Slurm ###### -def test_slurm_set_run_arg(wlmutils: t.Type["conftest.WLMUtils"]) -> None: +def test_slurm_set_run_arg(wlmutils: type["conftest.WLMUtils"]) -> None: orc = Orchestrator( wlmutils.get_test_port(), db_nodes=3, @@ -199,7 +199,7 @@ def test_slurm_set_run_arg(wlmutils: t.Type["conftest.WLMUtils"]) -> None: ) -def test_slurm_set_batch_arg(wlmutils: t.Type["conftest.WLMUtils"]) -> None: +def 
test_slurm_set_batch_arg(wlmutils: type["conftest.WLMUtils"]) -> None: orc = Orchestrator( wlmutils.get_test_port(), db_nodes=3, @@ -250,24 +250,3 @@ def test_orc_results_in_correct_number_of_shards(single_cmd: bool) -> None: assert ( orc.num_shards == orc.db_nodes == sum(node.num_shards for node in orc.entities) ) - - -def test_orc_telemetry(test_dir: str, wlmutils: t.Type["conftest.WLMUtils"]) -> None: - """Ensure the default behavior for an orchestrator is to disable telemetry""" - db = Orchestrator(port=wlmutils.get_test_port()) - db.set_path(test_dir) - - # default is disabled - assert not db.telemetry.is_enabled - - # ensure updating value works as expected - db.telemetry.enable() - assert db.telemetry.is_enabled - - # toggle back - db.telemetry.disable() - assert not db.telemetry.is_enabled - - # toggle one more time - db.telemetry.enable() - assert db.telemetry.is_enabled diff --git a/tests/test_output_files.py b/tests/test_output_files.py index 770ec6e355..d0daf4ec58 100644 --- a/tests/test_output_files.py +++ b/tests/test_output_files.py @@ -106,10 +106,12 @@ def test_mutated_model_output(test_dir): def test_get_output_files_with_create_job_step(test_dir): """Testing output files through _create_job_step""" exp_dir = pathlib.Path(test_dir) - status_dir = exp_dir / CONFIG.telemetry_subdir / model.type - step = controller._create_job_step(model, status_dir) - expected_out_path = status_dir / model.name / (model.name + ".out") - expected_err_path = status_dir / model.name / (model.name + ".err") + model.path = test_dir + # Create metadata_dir to simulate consistent metadata structure + metadata_dir = exp_dir / CONFIG.metadata_subdir + step = controller._create_job_step(model, metadata_dir) + expected_out_path = metadata_dir / (model.name + ".out") + expected_err_path = metadata_dir / (model.name + ".err") assert step.get_output_files() == (str(expected_out_path), str(expected_err_path)) @@ -120,17 +122,13 @@ def test_get_output_files_with_create_job_step(test_dir): def test_get_output_files_with_create_batch_job_step(entity, test_dir): """Testing output files through _create_batch_job_step""" exp_dir = pathlib.Path(test_dir) - status_dir = exp_dir / CONFIG.telemetry_subdir / entity.type + status_dir = exp_dir / CONFIG.metadata_subdir / entity.type batch_step, substeps = slurm_controller._create_batch_job_step(entity, status_dir) for step in substeps: # example output path for a member of an Ensemble is - # .smartsim/telemetry/Ensemble/ens/ens_0/ens_0.out - expected_out_path = ( - status_dir / entity.name / step.entity_name / (step.entity_name + ".out") - ) - expected_err_path = ( - status_dir / entity.name / step.entity_name / (step.entity_name + ".err") - ) + # {CONFIG.metadata_subdir}/Ensemble/ens_0.out + expected_out_path = status_dir / (step.entity_name + ".out") + expected_err_path = status_dir / (step.entity_name + ".err") assert step.get_output_files() == ( str(expected_out_path), str(expected_err_path), @@ -141,9 +139,9 @@ def test_model_get_output_files(test_dir): """Testing model output files with manual step creation""" exp_dir = pathlib.Path(test_dir) step = Step(model.name, model.path, model.run_settings) - step.meta["status_dir"] = exp_dir / "output_dir" - expected_out_path = step.meta["status_dir"] / (model.name + ".out") - expected_err_path = step.meta["status_dir"] / (model.name + ".err") + step.meta["metadata_dir"] = exp_dir / "output_dir" + expected_out_path = step.meta["metadata_dir"] / (model.name + ".out") + expected_err_path = step.meta["metadata_dir"] / 
(model.name + ".err") assert step.get_output_files() == (str(expected_out_path), str(expected_err_path)) @@ -152,16 +150,16 @@ def test_ensemble_get_output_files(test_dir): exp_dir = pathlib.Path(test_dir) for member in ens.models: step = Step(member.name, member.path, member.run_settings) - step.meta["status_dir"] = exp_dir / "output_dir" - expected_out_path = step.meta["status_dir"] / (member.name + ".out") - expected_err_path = step.meta["status_dir"] / (member.name + ".err") + step.meta["metadata_dir"] = exp_dir / "output_dir" + expected_out_path = step.meta["metadata_dir"] / (member.name + ".out") + expected_err_path = step.meta["metadata_dir"] / (member.name + ".err") assert step.get_output_files() == ( str(expected_out_path), str(expected_err_path), ) -def test_get_output_files_no_status_dir(test_dir): +def test_get_output_files_no_metadata_dir(test_dir): """Test that a step not having a status directory throws a KeyError""" step_settings = RunSettings("echo") step = Step("mock-step", test_dir, step_settings) diff --git a/tests/test_pals_settings.py b/tests/test_pals_settings.py index dc297ccde1..9d6c87b3c7 100644 --- a/tests/test_pals_settings.py +++ b/tests/test_pals_settings.py @@ -45,12 +45,6 @@ default_kwargs = {"fail_if_missing_exec": False} -@pytest.fixture(autouse=True) -def turn_off_telemetry_indirect(monkeypatch): - monkeypatch.setattr(smartsim._core.config.config.Config, "telemetry_enabled", False) - yield - - # Uncomment when # @pytest.mark.parametrize( # "function_name",[ diff --git a/tests/test_preview.py b/tests/test_preview.py index a18d107281..91b26cf7a4 100644 --- a/tests/test_preview.py +++ b/tests/test_preview.py @@ -1,6 +1,6 @@ # BSD 2-Clause License # -# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# Copyright (c) 2021-2025, Hewlett Packard Enterprise # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -60,7 +60,7 @@ def _choose_host(wlmutils, index: int = 0): @pytest.fixture -def preview_object(test_dir) -> t.Dict[str, Job]: +def preview_object(test_dir) -> dict[str, Job]: """ Bare bones orch """ @@ -72,12 +72,12 @@ def preview_object(test_dir) -> t.Dict[str, Job]: s.ports = [1235] s.num_shards = 1 job = Job("faux-name", "faux-step-id", s, "slurm", True) - active_dbjobs: t.Dict[str, Job] = {"mock_job": job} + active_dbjobs: dict[str, Job] = {"mock_job": job} return active_dbjobs @pytest.fixture -def preview_object_multidb(test_dir) -> t.Dict[str, Job]: +def preview_object_multidb(test_dir) -> dict[str, Job]: """ Bare bones orch """ @@ -99,7 +99,7 @@ def preview_object_multidb(test_dir) -> t.Dict[str, Job]: s2.num_shards = 1 job2 = Job("faux-name_2", "faux-step-id_2", s2, "slurm", True) - active_dbjobs: t.Dict[str, Job] = {"mock_job": job, "mock_job2": job2} + active_dbjobs: dict[str, Job] = {"mock_job": job, "mock_job2": job2} return active_dbjobs diff --git a/tests/test_serialize.py b/tests/test_serialize.py deleted file mode 100644 index 4396bffc4d..0000000000 --- a/tests/test_serialize.py +++ /dev/null @@ -1,174 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2025, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. 
Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import json -import logging -from pathlib import Path -from uuid import uuid4 - -import pytest - -import smartsim._core.config.config -from smartsim import Experiment -from smartsim._core._cli import utils -from smartsim._core.control.manifest import LaunchedManifestBuilder -from smartsim._core.utils import serialize -from smartsim.database.orchestrator import Orchestrator - -_CFG_TM_ENABLED_ATTR = "telemetry_enabled" - -# The tests in this file belong to the group_b group -pytestmark = pytest.mark.group_b - - -@pytest.fixture(autouse=True) -def turn_on_tm(monkeypatch): - monkeypatch.setattr( - smartsim._core.config.config.Config, - _CFG_TM_ENABLED_ATTR, - property(lambda self: True), - ) - yield - - -@pytest.fixture -def manifest_json(test_dir, config) -> str: - return Path(test_dir) / config.telemetry_subdir / serialize.MANIFEST_FILENAME - - -def test_serialize_creates_a_manifest_json_file_if_dne(test_dir, manifest_json): - lmb = LaunchedManifestBuilder("exp", test_dir, "launcher", str(uuid4())) - serialize.save_launch_manifest(lmb.finalize()) - - assert manifest_json.is_file() - with open(manifest_json, "r") as f: - manifest = json.load(f) - assert manifest["experiment"]["name"] == "exp" - assert manifest["experiment"]["launcher"] == "launcher" - assert isinstance(manifest["runs"], list) - assert len(manifest["runs"]) == 1 - - -def test_serialize_does_write_manifest_json_if_telemetry_monitor_is_off( - test_dir, monkeypatch, manifest_json -): - """Ensure that the manifest is written even if telemetry is not collected""" - monkeypatch.setattr( - smartsim._core.config.config.Config, - _CFG_TM_ENABLED_ATTR, - property(lambda self: False), - ) - lmb = LaunchedManifestBuilder("exp", test_dir, "launcher", str(uuid4())) - serialize.save_launch_manifest(lmb.finalize()) - assert manifest_json.exists() - - -def test_serialize_appends_a_manifest_json_exists(test_dir, manifest_json): - serialize.save_launch_manifest( - LaunchedManifestBuilder("exp", test_dir, "launcher", str(uuid4())).finalize() - ) - serialize.save_launch_manifest( - LaunchedManifestBuilder("exp", test_dir, "launcher", str(uuid4())).finalize() - ) - serialize.save_launch_manifest( - LaunchedManifestBuilder("exp", test_dir, "launcher", str(uuid4())).finalize() - ) - - assert manifest_json.is_file() - with open(manifest_json, "r") as f: - manifest = json.load(f) - assert isinstance(manifest["runs"], list) - assert len(manifest["runs"]) == 3 - assert len({run["run_id"] for run in manifest["runs"]}) == 3 - - -def 
test_serialize_overwites_file_if_not_json(test_dir, manifest_json): - manifest_json.parent.mkdir(parents=True, exist_ok=True) - with open(manifest_json, "w") as f: - f.write("This is not a json\n") - - lmb = LaunchedManifestBuilder("exp", test_dir, "launcher", str(uuid4())) - serialize.save_launch_manifest(lmb.finalize()) - with open(manifest_json, "r") as f: - assert isinstance(json.load(f), dict) - - -def test_started_entities_are_serialized(test_dir, manifest_json): - exp_name = "test-exp" - exp = Experiment(exp_name, exp_path=str(test_dir), launcher="local") - - rs1 = exp.create_run_settings("echo", ["hello", "world"]) - rs2 = exp.create_run_settings("echo", ["spam", "eggs"]) - - hello_world_model = exp.create_model("echo-hello", run_settings=rs1) - spam_eggs_model = exp.create_model("echo-spam", run_settings=rs2) - hello_ensemble = exp.create_ensemble("echo-ensemble", run_settings=rs1, replicas=3) - - exp.generate(hello_world_model, spam_eggs_model, hello_ensemble) - exp.start(hello_world_model, spam_eggs_model, block=False) - exp.start(hello_ensemble, block=False) - - try: - with open(manifest_json, "r") as f: - manifest = json.load(f) - assert len(manifest["runs"]) == 2 - assert len(manifest["runs"][0]["model"]) == 2 - assert len(manifest["runs"][0]["ensemble"]) == 0 - assert len(manifest["runs"][1]["model"]) == 0 - assert len(manifest["runs"][1]["ensemble"]) == 1 - assert len(manifest["runs"][1]["ensemble"][0]["models"]) == 3 - finally: - exp.stop(hello_world_model, spam_eggs_model, hello_ensemble) - - -def test_serialzed_database_does_not_break_if_using_a_non_standard_install(monkeypatch): - monkeypatch.setattr(utils, "get_db_path", lambda: None) - db = Orchestrator() - dict_ = serialize._dictify_db(db, []) - assert dict_["type"] == "Unknown" - - -def test_dictify_run_settings_warns_when_attepting_to_dictify_mpmd( - monkeypatch, caplog, test_dir -): - # TODO: Eventually this test should be removed and we should be able to - # handle MPMD run settings as part of the output dict - exp_name = "test-exp" - test_dir = Path(test_dir) / exp_name - test_dir.mkdir(parents=True) - exp = Experiment(exp_name, exp_path=str(test_dir), launcher="local") - - rs1 = exp.create_run_settings("echo", ["hello", "world"]) - rs2 = exp.create_run_settings("echo", ["spam", "eggs"]) - - # Make rs "MPMD" - monkeypatch.setattr(rs1, "mpmd", [rs2], raising=False) - # Make work with colored logs - monkeypatch.setattr(serialize, "_LOGGER", logging.getLogger()) - serialize._dictify_run_settings(rs1) - (rec,) = caplog.records - assert rec.levelno == logging.WARNING - assert "MPMD run settings" in rec.msg diff --git a/tests/test_slurm_settings.py b/tests/test_slurm_settings.py index 84fcc3539d..45ecb33e3f 100644 --- a/tests/test_slurm_settings.py +++ b/tests/test_slurm_settings.py @@ -105,7 +105,7 @@ def test_mpmd_compound_env_exports(): step = SrunStep("teststep", "./", srun) - step.meta["status_dir"] = "" + step.meta["metadata_dir"] = "" launch_cmd = step.get_launch_cmd() env_cmds = [v for v in launch_cmd if v == "env"] assert "env" in launch_cmd and len(env_cmds) == 1 @@ -165,7 +165,7 @@ def test_mpmd_non_compound_env_exports(): step = SrunStep("teststep", "./", srun) - step.meta["status_dir"] = "" + step.meta["metadata_dir"] = "" launch_cmd = step.get_launch_cmd() env_cmds = [v for v in launch_cmd if v == "env"] assert "env" not in launch_cmd and len(env_cmds) == 0 @@ -225,7 +225,7 @@ def test_mpmd_non_compound_no_exports(): step = SrunStep("teststep", "./", srun) - step.meta["status_dir"] = "" + 
step.meta["metadata_dir"] = "" launch_cmd = step.get_launch_cmd() env_cmds = [v for v in launch_cmd if v == "env"] assert "env" not in launch_cmd and len(env_cmds) == 0 diff --git a/tests/test_symlinking.py b/tests/test_symlinking.py index 02a692be06..6574c628d7 100644 --- a/tests/test_symlinking.py +++ b/tests/test_symlinking.py @@ -65,7 +65,7 @@ def test_symlink(test_dir, entity): """Test symlinking historical output files""" entity.path = test_dir if entity.type == Ensemble: - for member in ens.models: + for member in entity.models: symlink_with_create_job_step(test_dir, member) else: symlink_with_create_job_step(test_dir, entity) @@ -75,16 +75,20 @@ def symlink_with_create_job_step(test_dir, entity): """Function that helps cut down on repeated testing code""" exp_dir = pathlib.Path(test_dir) entity.path = test_dir - status_dir = exp_dir / CONFIG.telemetry_subdir / entity.type - step = controller._create_job_step(entity, status_dir) + # Use consistent metadata directory structure + metadata_dir = exp_dir / CONFIG.metadata_subdir + step = controller._create_job_step(entity, metadata_dir) controller.symlink_output_files(step, entity) assert pathlib.Path(entity.path, f"{entity.name}.out").is_symlink() assert pathlib.Path(entity.path, f"{entity.name}.err").is_symlink() + # Verify symlinks point to the correct metadata directory + expected_out = metadata_dir / (entity.name + ".out") + expected_err = metadata_dir / (entity.name + ".err") assert os.readlink(pathlib.Path(entity.path, f"{entity.name}.out")) == str( - status_dir / entity.name / (entity.name + ".out") + expected_out ) assert os.readlink(pathlib.Path(entity.path, f"{entity.name}.err")) == str( - status_dir / entity.name / (entity.name + ".err") + expected_err ) @@ -100,19 +104,51 @@ def test_batch_symlink(entity, test_dir): """Test symlinking historical output files""" exp_dir = pathlib.Path(test_dir) entity.path = test_dir - status_dir = exp_dir / CONFIG.telemetry_subdir / entity.type - batch_step, substeps = slurm_controller._create_batch_job_step(entity, status_dir) - for step in substeps: - slurm_controller.symlink_output_files(step, entity) - assert pathlib.Path(entity.path, f"{entity.name}.out").is_symlink() - assert pathlib.Path(entity.path, f"{entity.name}.err").is_symlink() - assert os.readlink(pathlib.Path(entity.path, f"{entity.name}.out")) == str( - status_dir / entity.name / step.entity_name / (step.entity_name + ".out") + # For entities with sub-entities (like Orchestrator), set their paths too + if hasattr(entity, "entities"): + for sub_entity in entity.entities: + sub_entity.path = test_dir + + # Create metadata_dir to simulate consistent metadata structure + metadata_dir = exp_dir / CONFIG.metadata_subdir + batch_step, substeps = slurm_controller._create_batch_job_step(entity, metadata_dir) + + # For batch entities, we need to call symlink_output_files correctly + # Based on how the controller does it, we should pass the individual entities + if hasattr(entity, "entities") and len(substeps) > 0: + # Just test the first substep and entity pair + substep = substeps[0] + substep_entity = entity.entities[0] + slurm_controller.symlink_output_files(substep, substep_entity) + + # The symlinks should be created in the substep entity's path using its name + symlink_out = pathlib.Path(substep_entity.path, f"{substep_entity.name}.out") + symlink_err = pathlib.Path(substep_entity.path, f"{substep_entity.name}.err") + + assert symlink_out.is_symlink() + assert symlink_err.is_symlink() + + # The symlinks should point to the 
metadata_dir set for this substep + expected_out = pathlib.Path(substep.meta["metadata_dir"]) / ( + substep.entity_name + ".out" ) - assert os.readlink(pathlib.Path(entity.path, f"{entity.name}.err")) == str( - status_dir / entity.name / step.entity_name / (step.entity_name + ".err") + expected_err = pathlib.Path(substep.meta["metadata_dir"]) / ( + substep.entity_name + ".err" ) + assert os.readlink(symlink_out) == str(expected_out) + assert os.readlink(symlink_err) == str(expected_err) + else: + # For _AnonymousBatchJob (single model) + substep = substeps[0] + slurm_controller.symlink_output_files(substep, entity) + + symlink_out = pathlib.Path(entity.path, f"{entity.name}.out") + symlink_err = pathlib.Path(entity.path, f"{entity.name}.err") + + assert symlink_out.is_symlink() + assert symlink_err.is_symlink() + def test_symlink_error(test_dir): """Ensure FileNotFoundError is thrown""" @@ -122,8 +158,8 @@ def test_symlink_error(test_dir): path=pathlib.Path(test_dir, "badpath"), run_settings=RunSettings("echo"), ) - telem_dir = pathlib.Path(test_dir, "bad_model_telemetry") - bad_step = controller._create_job_step(bad_model, telem_dir) + metadata_dir = pathlib.Path(test_dir, "bad_model_metadata") + bad_step = controller._create_job_step(bad_model, metadata_dir) with pytest.raises(FileNotFoundError): controller.symlink_output_files(bad_step, bad_model) diff --git a/tests/test_telemetry_monitor.py b/tests/test_telemetry_monitor.py deleted file mode 100644 index 6a27a02153..0000000000 --- a/tests/test_telemetry_monitor.py +++ /dev/null @@ -1,1325 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2025, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- - -import logging -import multiprocessing as mp -import pathlib -import sys -import time -import typing as t -import uuid - -import pytest - -import smartsim._core.config.config as cfg -from conftest import FileUtils, WLMUtils -from smartsim import Experiment -from smartsim._core.control.job import Job, JobEntity -from smartsim._core.control.jobmanager import JobManager -from smartsim._core.entrypoints.telemetrymonitor import get_parser -from smartsim._core.launcher.launcher import WLMLauncher -from smartsim._core.launcher.slurm.slurmLauncher import SlurmLauncher -from smartsim._core.launcher.step.step import Step, proxyable_launch_cmd -from smartsim._core.launcher.stepInfo import StepInfo -from smartsim._core.utils import serialize -from smartsim._core.utils.helpers import get_ts_ms -from smartsim._core.utils.telemetry.manifest import Run, RuntimeManifest -from smartsim._core.utils.telemetry.telemetry import ( - ManifestEventHandler, - TelemetryMonitor, - TelemetryMonitorArgs, -) -from smartsim._core.utils.telemetry.util import map_return_code, write_event -from smartsim.error.errors import UnproxyableStepError -from smartsim.settings.base import RunSettings -from smartsim.status import SmartSimStatus - -ALL_ARGS = {"-exp_dir", "-frequency"} -PROXY_ENTRY_POINT = "smartsim._core.entrypoints.indirect" -CFG_TM_ENABLED_ATTR = "telemetry_enabled" - - -for_all_wlm_launchers = pytest.mark.parametrize( - "wlm_launcher", - [pytest.param(cls(), id=cls.__name__) for cls in WLMLauncher.__subclasses__()], -) - -requires_wlm = pytest.mark.skipif( - pytest.test_launcher == "local", reason="Test requires WLM" -) - -logger = logging.getLogger(__name__) - -# The tests in this file belong to the slow_tests group -pytestmark = pytest.mark.slow_tests - - -@pytest.fixture(autouse=True) -def turn_on_tm(monkeypatch): - monkeypatch.setattr(cfg.Config, CFG_TM_ENABLED_ATTR, property(lambda self: True)) - yield - - -def write_stop_file(entity: JobEntity, test_dir: pathlib.Path, duration: int): - time.sleep(duration) - write_event( - get_ts_ms(), - entity.task_id, - entity.step_id, - entity.type, - "stop", - test_dir, - "mock stop event", - 0, - ) - - -def snooze_blocking( - test_dir: pathlib.Path, max_delay: int = 20, post_data_delay: int = 2 -): - # let the non-blocking experiment complete. 
- for _ in range(max_delay): - time.sleep(1) - if test_dir.exists(): - time.sleep(post_data_delay) - break - - -@pytest.mark.parametrize( - ["cmd", "missing"], - [ - pytest.param("", {"-exp_dir", "-frequency"}, id="no args"), - pytest.param("-exp_dir /foo/bar", {"-frequency"}, id="no freq"), - pytest.param("-frequency 123", {"-exp_dir"}, id="no dir"), - ], -) -def test_parser_reqd_args(capsys, cmd, missing): - """Test that the parser reports any missing required arguments""" - parser = get_parser() - - args = cmd.split() - - captured = capsys.readouterr() # throw away existing output - with pytest.raises(SystemExit) as ex: - ns = parser.parse_args(args) - - captured = capsys.readouterr() - assert "the following arguments are required" in captured.err - err_desc = captured.err.split("the following arguments are required:")[-1] - for arg in missing: - assert arg in err_desc - - expected = ALL_ARGS - missing - for exp in expected: - assert exp not in err_desc - - -def test_parser(): - """Test that the parser succeeds when receiving expected args""" - parser = get_parser() - - test_dir = "/foo/bar" - test_freq = 123 - - cmd = f"-exp_dir {test_dir} -frequency {test_freq}" - args = cmd.split() - - ns = parser.parse_args(args) - - assert ns.exp_dir == test_dir - assert ns.frequency == test_freq - - -def test_ts(): - """Ensure expected output type""" - ts = get_ts_ms() - assert isinstance(ts, int) - - -@pytest.mark.parametrize( - ["freq"], - [ - pytest.param("1", id="1s delay"), - pytest.param("1.0", id="1s (float) freq"), - pytest.param("1.5", id="1.5s (float) freq"), - pytest.param("60", id="upper bound freq"), - pytest.param("60.0", id="upper bound (float) freq"), - ], -) -def test_valid_frequencies(freq: t.Union[int, float], test_dir: str): - """Ensure validation does not raise an exception on values in valid range""" - # check_frequency(float(freq)) - telmon_args = TelemetryMonitorArgs(test_dir, float(freq), 30, logging.DEBUG) - # telmon_args raises ValueError on bad inputs - assert telmon_args is not None - - -@pytest.mark.parametrize( - ["freq"], - [ - pytest.param("-1", id="negative freq"), - pytest.param("0", id="0s freq"), - pytest.param("0.9", id="0.9s freq"), - pytest.param("0.9999", id="lower bound"), - pytest.param("600.0001", id="just over upper"), - pytest.param("3600", id="too high"), - pytest.param("100000", id="bonkers high"), - ], -) -def test_invalid_frequencies(freq: t.Union[int, float], test_dir: str): - """Ensure validation raises an exception on values outside valid range""" - exp_err_msg = "in the range" - with pytest.raises(ValueError) as ex: - TelemetryMonitorArgs(test_dir, float(freq), 30, logging.DEBUG) - assert exp_err_msg in "".join(ex.value.args) - - -@pytest.mark.parametrize( - ["etype", "task_id", "step_id", "timestamp", "evt_type"], - [ - pytest.param("ensemble", "", "123", get_ts_ms(), "start", id="start event"), - pytest.param("ensemble", "", "123", get_ts_ms(), "stop", id="stop event"), - ], -) -def test_write_event( - etype: str, - task_id: str, - step_id: str, - timestamp: int, - evt_type: str, - test_dir: str, -): - """Ensure that track event writes a file to the expected location""" - exp_path = pathlib.Path(test_dir) - write_event(timestamp, task_id, step_id, etype, evt_type, exp_path) - - expected_output = exp_path / f"{evt_type}.json" - - assert expected_output.exists() - assert expected_output.is_file() - - -@pytest.mark.parametrize( - ["entity_type", "task_id", "step_id", "timestamp", "evt_type"], - [ - pytest.param("ensemble", "", "123", 
get_ts_ms(), "start", id="start event"), - pytest.param("ensemble", "", "123", get_ts_ms(), "stop", id="stop event"), - ], -) -def test_write_event_overwrite( - entity_type: str, - task_id: str, - step_id: str, - timestamp: int, - evt_type: str, - test_dir: str, -): - """Ensure that `write_event` does not overwrite an existing file if called more than once""" - exp_path = pathlib.Path(test_dir) - write_event(timestamp, task_id, step_id, entity_type, evt_type, exp_path) - - expected_output = exp_path / f"{evt_type}.json" - - assert expected_output.exists() - assert expected_output.is_file() - - # grab whatever is in the file now to compare against - original_content = expected_output.read_text() - - updated_timestamp = get_ts_ms() - updated_task_id = task_id + "xxx" - updated_step_id = step_id + "xxx" - updated_entity = entity_type + "xxx" - - # write to the same location - write_event( - updated_timestamp, - updated_task_id, - updated_step_id, - updated_entity, - evt_type, - exp_path, - ) - - # read in file content after attempted overwrite - with open(expected_output, "r") as validate_fp: - validate_output = validate_fp.read() - - # verify the content matches the old content - assert str(timestamp) in validate_output - assert str(updated_timestamp) not in validate_output - assert "xxx" not in validate_output - assert validate_output == original_content - - -def test_load_manifest(fileutils: FileUtils, test_dir: str, config: cfg.Config): - """Ensure that the runtime manifest loads correctly""" - sample_manifest_path = fileutils.get_test_conf_path("telemetry/telemetry.json") - sample_manifest = pathlib.Path(sample_manifest_path) - assert sample_manifest.exists() - - test_manifest_path = fileutils.make_test_file( - serialize.MANIFEST_FILENAME, - pathlib.Path(test_dir) / config.telemetry_subdir, - sample_manifest.read_text(), - ) - test_manifest = pathlib.Path(test_manifest_path) - assert test_manifest.exists() - - manifest = RuntimeManifest.load_manifest(test_manifest_path) - assert manifest.name == "my-exp" - assert str(manifest.path) == "/path/to/my-exp" - assert manifest.launcher == "Slurm" - assert len(manifest.runs) == 6 - - assert len(manifest.runs[0].models) == 1 - assert len(manifest.runs[2].models) == 8 # 8 models in ensemble - assert len(manifest.runs[0].orchestrators) == 0 - assert len(manifest.runs[1].orchestrators) == 3 # 3 shards in db - - -def test_load_manifest_colo_model(fileutils: FileUtils): - """Ensure that the runtime manifest loads correctly when containing a colocated model""" - # NOTE: for regeneration, this manifest can use `test_telemetry_colo` - sample_manifest_path = fileutils.get_test_conf_path("telemetry/colocatedmodel.json") - sample_manifest = pathlib.Path(sample_manifest_path) - assert sample_manifest.exists() - - manifest = RuntimeManifest.load_manifest(sample_manifest_path) - assert manifest.name == "my-exp" - assert str(manifest.path) == "/tmp/my-exp" - assert manifest.launcher == "Slurm" - assert len(manifest.runs) == 1 - - assert len(manifest.runs[0].models) == 1 - - -def test_load_manifest_serial_models(fileutils: FileUtils): - """Ensure that the runtime manifest loads correctly when containing multiple models""" - # NOTE: for regeneration, this manifest can use `test_telemetry_colo` - sample_manifest_path = fileutils.get_test_conf_path("telemetry/serialmodels.json") - sample_manifest = pathlib.Path(sample_manifest_path) - assert sample_manifest.exists() - - manifest = RuntimeManifest.load_manifest(sample_manifest_path) - assert manifest.name == 
"my-exp" - assert str(manifest.path) == "/tmp/my-exp" - assert manifest.launcher == "Slurm" - assert len(manifest.runs) == 1 - - assert len(manifest.runs[0].models) == 5 - - -def test_load_manifest_db_and_models(fileutils: FileUtils): - """Ensure that the runtime manifest loads correctly when containing models & - orchestrator across 2 separate runs""" - # NOTE: for regeneration, this manifest can use `test_telemetry_colo` - sample_manifest_path = fileutils.get_test_conf_path("telemetry/db_and_model.json") - sample_manifest = pathlib.Path(sample_manifest_path) - assert sample_manifest.exists() - - manifest = RuntimeManifest.load_manifest(sample_manifest_path) - assert manifest.name == "my-exp" - assert str(manifest.path) == "/tmp/my-exp" - assert manifest.launcher == "Slurm" - assert len(manifest.runs) == 2 - - assert len(manifest.runs[0].orchestrators) == 1 - assert len(manifest.runs[1].models) == 1 - - # verify collector paths from manifest are deserialized to collector config - assert manifest.runs[0].orchestrators[0].collectors["client"] - assert manifest.runs[0].orchestrators[0].collectors["memory"] - # verify collector paths missing from manifest are empty - assert not manifest.runs[0].orchestrators[0].collectors["client_count"] - - -def test_load_manifest_db_and_models_1run(fileutils: FileUtils): - """Ensure that the runtime manifest loads correctly when containing models & - orchestrator in a single run""" - # NOTE: for regeneration, this manifest can use `test_telemetry_colo` - sample_manifest_path = fileutils.get_test_conf_path( - "telemetry/db_and_model_1run.json" - ) - sample_manifest = pathlib.Path(sample_manifest_path) - assert sample_manifest.exists() - - manifest = RuntimeManifest.load_manifest(sample_manifest_path) - assert manifest.name == "my-exp" - assert str(manifest.path) == "/tmp/my-exp" - assert manifest.launcher == "Slurm" - assert len(manifest.runs) == 1 - - assert len(manifest.runs[0].orchestrators) == 1 - assert len(manifest.runs[0].models) == 1 - - -@pytest.mark.parametrize( - ["task_id", "step_id", "etype", "exp_isorch", "exp_ismanaged"], - [ - pytest.param("123", "", "model", False, False, id="unmanaged, non-orch"), - pytest.param("456", "123", "ensemble", False, True, id="managed, non-orch"), - pytest.param("789", "987", "orchestrator", True, True, id="managed, orch"), - pytest.param("987", "", "orchestrator", True, False, id="unmanaged, orch"), - ], -) -def test_persistable_computed_properties( - task_id: str, step_id: str, etype: str, exp_isorch: bool, exp_ismanaged: bool -): - name = f"test-{etype}-{uuid.uuid4()}" - timestamp = get_ts_ms() - exp_dir = pathlib.Path("/foo/bar") - stored = { - "name": name, - "run_id": timestamp, - "telemetry_metadata": { - "status_dir": str(exp_dir), - "task_id": task_id, - "step_id": step_id, - }, - } - faux_experiment = {"launcher": "local"} - persistables = Run.load_entity(etype, stored, exp_dir, faux_experiment) - persistable = persistables[0] if persistables else None - - assert persistable.is_managed == exp_ismanaged - assert persistable.is_db == exp_isorch - - -def test_deserialize_ensemble(fileutils: FileUtils): - """Ensure that the children of ensembles (models) are correctly - placed in the models collection""" - sample_manifest_path = fileutils.get_test_conf_path("telemetry/ensembles.json") - sample_manifest = pathlib.Path(sample_manifest_path) - assert sample_manifest.exists() - - manifest = RuntimeManifest.load_manifest(sample_manifest_path) - assert manifest - - assert len(manifest.runs) == 1 - - # NOTE: no 
-    # assert len(manifest.runs[0].ensembles) == 1
-    assert len(manifest.runs[0].models) == 8
-
-
-def test_shutdown_conditions__no_monitored_jobs(test_dir: str):
-    """Show that an event handler w/no monitored jobs can shutdown"""
-    job_entity1 = JobEntity()
-    job_entity1.name = "xyz"
-    job_entity1.step_id = "123"
-    job_entity1.task_id = ""
-
-    mani_handler = ManifestEventHandler("xyz")
-
-    tm_args = TelemetryMonitorArgs(test_dir, 1, 10, logging.DEBUG)
-    telmon = TelemetryMonitor(tm_args)
-    telmon._action_handler = mani_handler  # replace w/mock handler
-
-    assert telmon._can_shutdown()
-
-
-def test_shutdown_conditions__has_monitored_job(test_dir: str):
-    """Show that an event handler w/a monitored job cannot shutdown"""
-    job_entity1 = JobEntity()
-    job_entity1.name = "xyz"
-    job_entity1.step_id = "123"
-    job_entity1.task_id = ""
-
-    mani_handler = ManifestEventHandler("xyz")
-    mani_handler.job_manager.add_job(
-        job_entity1.name, job_entity1.step_id, job_entity1, False
-    )
-    tm_args = TelemetryMonitorArgs(test_dir, 1, 10, logging.DEBUG)
-    telmon = TelemetryMonitor(tm_args)
-    telmon._action_handler = mani_handler
-
-    assert not telmon._can_shutdown()
-    assert not bool(mani_handler.job_manager.db_jobs)
-    assert bool(mani_handler.job_manager.jobs)
-
-
-def test_shutdown_conditions__has_db(test_dir: str):
-    """Show that an event handler w/a monitored db cannot shutdown"""
-    job_entity1 = JobEntity()
-    job_entity1.name = "xyz"
-    job_entity1.step_id = "123"
-    job_entity1.task_id = ""
-    job_entity1.type = "orchestrator"  # <---- make entity appear as db
-
-    mani_handler = ManifestEventHandler("xyz")
-    ## TODO: see next comment and combine an add_job method on manieventhandler
-    # and _use within_ manieventhandler
-    # PROBABLY just encapsulating the body of for run in runs: for entity in run.flatten()...
-    mani_handler.job_manager.add_job(
-        job_entity1.name, job_entity1.step_id, job_entity1, False
-    )
-    ## TODO: !!!!!! shouldn't add_job (or something on mani_handler)
-    # allow me to add a job to "all the places" in one call... even a private one?
-    mani_handler._tracked_jobs[job_entity1.key] = job_entity1
-    tm_args = TelemetryMonitorArgs(test_dir, 1, 10, logging.DEBUG)
-    telmon = TelemetryMonitor(tm_args)
-    telmon._action_handler = mani_handler  # replace w/mock handler
-
-    assert not telmon._can_shutdown()
-    assert bool([j for j in mani_handler._tracked_jobs.values() if j.is_db])
-    assert not bool(mani_handler.job_manager.jobs)
-
-
-@pytest.mark.parametrize(
-    "expected_duration",
-    [
-        pytest.param(2000, id="2s cooldown"),
-        pytest.param(3000, id="3s cooldown"),
-        pytest.param(5000, id="5s cooldown"),
-        pytest.param(10000, id="10s cooldown"),
-    ],
-)
-@pytest.mark.asyncio
-async def test_auto_shutdown__no_jobs(test_dir: str, expected_duration: int):
-    """Ensure that the cooldown timer is respected"""
-
-    class FauxObserver:
-        """Mock for the watchdog file system event listener"""
-
-        def __init__(self):
-            self.stop_count = 0
-
-        def stop(self):
-            self.stop_count += 1
-
-        def is_alive(self) -> bool:
-            if self.stop_count > 0:
-                return False
-
-            return True
-
-    frequency = 1000
-
-    # monitor_pattern = f"{test_dir}/mock_mani.json"
-    # show that an event handler w/out a monitored task will automatically stop
-    mani_handler = ManifestEventHandler("xyz", logger)
-    observer = FauxObserver()
-    expected_duration = 2000
-
-    ts0 = get_ts_ms()
-    tm_args = TelemetryMonitorArgs(
-        test_dir, frequency / 1000, expected_duration / 1000, logging.DEBUG
-    )
-    telmon = TelemetryMonitor(tm_args)
-    telmon._observer = observer  # replace w/mock observer
-    telmon._action_handler = mani_handler  # replace w/mock handler
-
-    # with NO jobs registered, monitor should notice that it can
-    # shutdown immediately but wait for the cooldown period
-    await telmon.monitor()  # observer, mani_handler, frequency, duration)
-    ts1 = get_ts_ms()
-
-    test_duration = ts1 - ts0
-    assert test_duration >= expected_duration
-    assert observer.stop_count == 1
-
-
-@pytest.mark.parametrize(
-    "cooldown_ms, task_duration_ms",
-    [
-        pytest.param(2000, 2000, id="2s task + 2s cooldown"),
-        pytest.param(3000, 4000, id="3s task + 4s cooldown"),
-        pytest.param(5000, 5000, id="5s task + 5s cooldown"),
-        pytest.param(5000, 10000, id="5s task + 10s cooldown"),
-    ],
-)
-@pytest.mark.asyncio
-async def test_auto_shutdown__has_db(
-    test_dir: str, cooldown_ms: int, task_duration_ms: int
-):
-    """Ensure that the cooldown timer is respected with a running db"""
-
-    class FauxObserver:
-        """Mock for the watchdog file system event listener"""
-
-        def __init__(self):
-            self.stop_count = 0
-
-        def stop(self):
-            self.stop_count += 1
-
-        def is_alive(self) -> bool:
-            if self.stop_count > 0:
-                return False
-
-            return True
-
-    entity = JobEntity()
-    entity.name = "db_0"
-    entity.step_id = "123"
-    entity.task_id = ""
-    entity.type = "orchestrator"
-    entity.telemetry_on = True
-    entity.status_dir = test_dir
-
-    p = mp.Process(
-        target=write_stop_file,
-        args=(entity, pathlib.Path(test_dir), (task_duration_ms / 1000)),
-    )
-
-    frequency = 1000
-
-    # show that when a monitored task completes, the telmon automatically stops
-    mani_handler = ManifestEventHandler("xyz", logger)
-    observer = FauxObserver()
-    expected_duration = (cooldown_ms / 1000) + (task_duration_ms / 1000)
-
-    tm_args = TelemetryMonitorArgs(
-        test_dir, frequency / 1000, (cooldown_ms / 1000), logging.DEBUG
-    )
-    telmon = TelemetryMonitor(tm_args)
-    telmon._observer = observer  # replace w/mock observer
-    telmon._action_handler = mani_handler  # replace w/mock handler
-
-    ts0 = get_ts_ms()
-    p.start()  # another process writes the stop.json and telmon picks it up
-    await telmon.monitor()
-    ts1 = get_ts_ms()
-
-    test_duration = ts1 - ts0
-    assert test_duration >= expected_duration
-    assert observer.stop_count == 1
-
-
-def test_telemetry_single_model(fileutils, test_dir, wlmutils, config):
-    """Ensure a single model run produces exactly one start and one stop telemetry event"""
-
-    # Set experiment name
-    exp_name = "telemetry_single_model"
-
-    # Retrieve parameters from testing environment
-    test_launcher = wlmutils.get_test_launcher()
-    test_script = fileutils.get_test_conf_path("echo.py")
-
-    # Create SmartSim Experiment
-    exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir)
-
-    # create run settings
-    app_settings = exp.create_run_settings(sys.executable, test_script)
-    app_settings.set_nodes(1)
-    app_settings.set_tasks_per_node(1)
-
-    # Create the SmartSim Model
-    smartsim_model = exp.create_model("perroquet", app_settings)
-    exp.generate(smartsim_model)
-    exp.start(smartsim_model, block=True)
-    assert exp.get_status(smartsim_model)[0] == SmartSimStatus.STATUS_COMPLETED
-
-    telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir
-    start_events = list(telemetry_output_path.rglob("start.json"))
-    stop_events = list(telemetry_output_path.rglob("stop.json"))
-
-    assert len(start_events) == 1
-    assert len(stop_events) == 1
-
-
-def test_telemetry_single_model_nonblocking(
-    fileutils, test_dir, wlmutils, monkeypatch, config
-):
-    """Ensure that the telemetry monitor logs exist when the experiment
-    is non-blocking"""
-    with monkeypatch.context() as ctx:
-        ctx.setattr(cfg.Config, "telemetry_frequency", 1)
-
-        # Set experiment name
-        exp_name = "test_telemetry_single_model_nonblocking"
-
-        # Retrieve parameters from testing environment
-        test_launcher = wlmutils.get_test_launcher()
-        test_script = fileutils.get_test_conf_path("echo.py")
-
-        # Create SmartSim Experiment
-        exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir)
-
-        # create run settings
-        app_settings = exp.create_run_settings(sys.executable, test_script)
-        app_settings.set_nodes(1)
-        app_settings.set_tasks_per_node(1)
-
-        # Create the SmartSim Model
-        smartsim_model = exp.create_model("perroquet", app_settings)
-        exp.generate(smartsim_model)
-        exp.start(smartsim_model)
-
-        telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir
-        snooze_blocking(telemetry_output_path, max_delay=10, post_data_delay=1)
-
-        assert exp.get_status(smartsim_model)[0] == SmartSimStatus.STATUS_COMPLETED
-
-        start_events = list(telemetry_output_path.rglob("start.json"))
-        stop_events = list(telemetry_output_path.rglob("stop.json"))
-
-        assert len(start_events) == 1
-        assert len(stop_events) == 1
-
-
-def test_telemetry_serial_models(fileutils, test_dir, wlmutils, monkeypatch, config):
-    """
-    Test telemetry with models being run in serial (one after each other)
-    """
-    with monkeypatch.context() as ctx:
-        ctx.setattr(cfg.Config, "telemetry_frequency", 1)
-
-        # Set experiment name
-        exp_name = "telemetry_serial_models"
-
-        # Retrieve parameters from testing environment
-        test_launcher = wlmutils.get_test_launcher()
-        test_script = fileutils.get_test_conf_path("echo.py")
-
-        # Create SmartSim Experiment
-        exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir)
-
-        # create run settings
-        app_settings = exp.create_run_settings(sys.executable, test_script)
-        app_settings.set_nodes(1)
-        app_settings.set_tasks_per_node(1)
-
-        # Create the SmartSim Model
-        smartsim_models = [
exp.create_model(f"perroquet_{i}", app_settings) for i in range(5) - ] - exp.generate(*smartsim_models) - exp.start(*smartsim_models, block=True) - assert all( - [ - status == SmartSimStatus.STATUS_COMPLETED - for status in exp.get_status(*smartsim_models) - ] - ) - - telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir - start_events = list(telemetry_output_path.rglob("start.json")) - stop_events = list(telemetry_output_path.rglob("stop.json")) - - assert len(start_events) == 5 - assert len(stop_events) == 5 - - -def test_telemetry_serial_models_nonblocking( - fileutils, test_dir, wlmutils, monkeypatch, config -): - """ - Test telemetry with models being run in serial (one after each other) - in a non-blocking experiment - """ - with monkeypatch.context() as ctx: - ctx.setattr(cfg.Config, "telemetry_frequency", 1) - - # Set experiment name - exp_name = "telemetry_serial_models" - - # Retrieve parameters from testing environment - test_launcher = wlmutils.get_test_launcher() - test_script = fileutils.get_test_conf_path("echo.py") - - # Create SmartSim Experiment - exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) - - # create run settings - app_settings = exp.create_run_settings(sys.executable, test_script) - app_settings.set_nodes(1) - app_settings.set_tasks_per_node(1) - - # Create the SmartSim Model - smartsim_models = [ - exp.create_model(f"perroquet_{i}", app_settings) for i in range(5) - ] - exp.generate(*smartsim_models) - exp.start(*smartsim_models) - - telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir - snooze_blocking(telemetry_output_path, max_delay=10, post_data_delay=1) - - assert all( - [ - status == SmartSimStatus.STATUS_COMPLETED - for status in exp.get_status(*smartsim_models) - ] - ) - - start_events = list(telemetry_output_path.rglob("start.json")) - stop_events = list(telemetry_output_path.rglob("stop.json")) - - assert len(start_events) == 5 - assert len(stop_events) == 5 - - -def test_telemetry_db_only_with_generate(test_dir, wlmutils, monkeypatch, config): - """ - Test telemetry with only a database running - """ - with monkeypatch.context() as ctx: - ctx.setattr(cfg.Config, "telemetry_frequency", 1) - - # Set experiment name - exp_name = "telemetry_db_with_generate" - - # Retrieve parameters from testing environment - test_launcher = wlmutils.get_test_launcher() - test_interface = wlmutils.get_test_interface() - test_port = wlmutils.get_test_port() - - # Create SmartSim Experiment - exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) - - # create regular database - orc = exp.create_database(port=test_port, interface=test_interface) - exp.generate(orc) - - telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir - - try: - exp.start(orc, block=True) - - snooze_blocking(telemetry_output_path, max_delay=10, post_data_delay=1) - - start_events = list(telemetry_output_path.rglob("start.json")) - stop_events = list(telemetry_output_path.rglob("stop.json")) - - assert len(start_events) == 1 - assert len(stop_events) <= 1 - finally: - exp.stop(orc) - snooze_blocking(telemetry_output_path, max_delay=10, post_data_delay=1) - - assert exp.get_status(orc)[0] == SmartSimStatus.STATUS_CANCELLED - - stop_events = list(telemetry_output_path.rglob("stop.json")) - assert len(stop_events) == 1 - - -def test_telemetry_db_only_without_generate(test_dir, wlmutils, monkeypatch, config): - """ - Test telemetry with only a non-generated database running - """ - with monkeypatch.context() as ctx: 
-        ctx.setattr(cfg.Config, "telemetry_frequency", 1)
-
-        # Set experiment name
-        exp_name = "telemetry_db_only_without_generate"
-
-        # Retrieve parameters from testing environment
-        test_launcher = wlmutils.get_test_launcher()
-        test_interface = wlmutils.get_test_interface()
-        test_port = wlmutils.get_test_port()
-
-        # Create SmartSim Experiment
-        exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir)
-
-        # create regular database
-        orc = exp.create_database(port=test_port, interface=test_interface)
-        telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir
-
-        try:
-            exp.start(orc)
-
-            snooze_blocking(telemetry_output_path, max_delay=10, post_data_delay=1)
-
-            start_events = list(telemetry_output_path.rglob("start.json"))
-            stop_events = list(telemetry_output_path.rglob("stop.json"))
-
-            assert len(start_events) == 1
-            assert len(stop_events) == 0
-        finally:
-            exp.stop(orc)
-
-        snooze_blocking(telemetry_output_path, max_delay=10, post_data_delay=1)
-        assert exp.get_status(orc)[0] == SmartSimStatus.STATUS_CANCELLED
-
-        stop_events = list(telemetry_output_path.rglob("stop.json"))
-        assert len(stop_events) == 1
-
-
-def test_telemetry_db_and_model(fileutils, test_dir, wlmutils, monkeypatch, config):
-    """
-    Test telemetry with only a database and a model running
-    """
-
-    with monkeypatch.context() as ctx:
-        ctx.setattr(cfg.Config, "telemetry_frequency", 1)
-
-        # Set experiment name
-        exp_name = "telemetry_db_and_model"
-
-        # Retrieve parameters from testing environment
-        test_launcher = wlmutils.get_test_launcher()
-        test_interface = wlmutils.get_test_interface()
-        test_port = wlmutils.get_test_port()
-        test_script = fileutils.get_test_conf_path("echo.py")
-
-        # Create SmartSim Experiment
-        exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir)
-
-        # create regular database
-        orc = exp.create_database(port=test_port, interface=test_interface)
-        exp.generate(orc)
-        try:
-            exp.start(orc)
-
-            # create run settings
-            app_settings = exp.create_run_settings(sys.executable, test_script)
-            app_settings.set_nodes(1)
-            app_settings.set_tasks_per_node(1)
-
-            # Create the SmartSim Model
-            smartsim_model = exp.create_model("perroquet", app_settings)
-            exp.generate(smartsim_model)
-            exp.start(smartsim_model, block=True)
-        finally:
-            exp.stop(orc)
-
-        telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir
-        snooze_blocking(telemetry_output_path, max_delay=10, post_data_delay=1)
-
-        assert exp.get_status(orc)[0] == SmartSimStatus.STATUS_CANCELLED
-        assert exp.get_status(smartsim_model)[0] == SmartSimStatus.STATUS_COMPLETED
-
-        start_events = list(telemetry_output_path.rglob("database/**/start.json"))
-        stop_events = list(telemetry_output_path.rglob("database/**/stop.json"))
-
-        assert len(start_events) == 1
-        assert len(stop_events) == 1
-
-        start_events = list(telemetry_output_path.rglob("model/**/start.json"))
-        stop_events = list(telemetry_output_path.rglob("model/**/stop.json"))
-        assert len(start_events) == 1
-        assert len(stop_events) == 1
-
-
-def test_telemetry_ensemble(fileutils, test_dir, wlmutils, monkeypatch, config):
-    """
-    Test telemetry with only an ensemble
-    """
-
-    with monkeypatch.context() as ctx:
-        ctx.setattr(cfg.Config, "telemetry_frequency", 1)
-
-        # Set experiment name
-        exp_name = "telemetry_ensemble"
-
-        # Retrieve parameters from testing environment
-        test_launcher = wlmutils.get_test_launcher()
-        test_script = fileutils.get_test_conf_path("echo.py")
-
-        # Create SmartSim Experiment
-        exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir)
-
-        app_settings = exp.create_run_settings(sys.executable, test_script)
-        app_settings.set_nodes(1)
-        app_settings.set_tasks_per_node(1)
-
-        ens = exp.create_ensemble("troupeau", run_settings=app_settings, replicas=5)
-        exp.generate(ens)
-        exp.start(ens, block=True)
-        assert all(
-            [
-                status == SmartSimStatus.STATUS_COMPLETED
-                for status in exp.get_status(ens)
-            ]
-        )
-
-        telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir
-        snooze_blocking(telemetry_output_path, max_delay=10, post_data_delay=1)
-        start_events = list(telemetry_output_path.rglob("start.json"))
-        stop_events = list(telemetry_output_path.rglob("stop.json"))
-
-        assert len(start_events) == 5
-        assert len(stop_events) == 5
-
-
-def test_telemetry_colo(fileutils, test_dir, wlmutils, coloutils, monkeypatch, config):
-    """
-    Test telemetry with only a colocated model running
-    """
-
-    with monkeypatch.context() as ctx:
-        ctx.setattr(cfg.Config, "telemetry_frequency", 1)
-
-        # Set experiment name
-        exp_name = "telemetry_colo"
-
-        # Retrieve parameters from testing environment
-        test_launcher = wlmutils.get_test_launcher()
-
-        # Create SmartSim Experiment
-        exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir)
-
-        smartsim_model = coloutils.setup_test_colo(
-            fileutils,
-            "uds",
-            exp,
-            "echo.py",
-            {},
-        )
-
-        exp.generate(smartsim_model)
-        exp.start(smartsim_model, block=True)
-        assert all(
-            [
-                status == SmartSimStatus.STATUS_COMPLETED
-                for status in exp.get_status(smartsim_model)
-            ]
-        )
-
-        telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir
-        start_events = list(telemetry_output_path.rglob("start.json"))
-        stop_events = list(telemetry_output_path.rglob("stop.json"))
-
-        # the colodb does NOT show up as a unique entity in the telemetry
-        assert len(start_events) == 1
-        assert len(stop_events) == 1
-
-
-@pytest.mark.parametrize(
-    "frequency, cooldown",
-    [
-        pytest.param(1, 1, id="1s shutdown"),
-        pytest.param(1, 5, id="5s shutdown"),
-        pytest.param(1, 15, id="15s shutdown"),
-    ],
-)
-def test_telemetry_autoshutdown(
-    test_dir: str,
-    wlmutils,
-    monkeypatch: pytest.MonkeyPatch,
-    frequency: int,
-    cooldown: int,
-    config: cfg.Config,
-):
-    """
-    Ensure that the telemetry monitor process shuts down after the desired
-    cooldown period
-    """
-
-    with monkeypatch.context() as ctx:
-        ctx.setattr(cfg.Config, "telemetry_frequency", frequency)
-        ctx.setattr(cfg.Config, "telemetry_cooldown", cooldown)
-
-        cooldown_ms = cooldown * 1000
-
-        # Set experiment name
-        exp_name = "telemetry_ensemble"
-
-        # Retrieve parameters from testing environment
-        test_launcher = wlmutils.get_test_launcher()
-
-        # Create SmartSim Experiment
-        exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir)
-
-        rs = RunSettings("python", exe_args=["sleep.py", "1"])
-        model = exp.create_model("model", run_settings=rs)
-
-        start_time = get_ts_ms()
-        exp.start(model, block=True)
-
-        telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir
-        empty_mani = list(telemetry_output_path.rglob("manifest.json"))
-        assert len(empty_mani) == 1, "a manifest.json should be created"
-
-        popen = exp._control._telemetry_monitor
-        assert popen.pid > 0
-        assert popen.returncode is None
-
-        # give some leeway during testing for the cooldown to get hit
-        for i in range(10):
-            if popen.poll() is not None:
-                print(f"Completed polling for telemetry shutdown after {i} attempts")
-                break
-            time.sleep(2)
-
-        stop_time = get_ts_ms()
-        duration = stop_time - start_time
-
-        assert popen.returncode is not None
-        assert duration >= cooldown_ms
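The assertions above compare millisecond timestamps from `get_ts_ms` against `cooldown_ms`, so the monitor is expected to stay alive for at least one full cooldown window after it has nothing left to track. A minimal, self-contained sketch of that cooldown rule (illustrative only, not the SmartSim telemetry monitor):

import time
from typing import Callable


def wait_for_cooldown(
    can_shutdown: Callable[[], bool], cooldown_ms: int, poll_interval_s: float = 1.0
) -> None:
    """Return only after can_shutdown() has been continuously true for cooldown_ms."""
    idle_since = None
    while True:
        if can_shutdown():
            if idle_since is None:
                idle_since = time.monotonic()
            elif (time.monotonic() - idle_since) * 1000 >= cooldown_ms:
                return
        else:
            idle_since = None  # any new activity resets the cooldown window
        time.sleep(poll_interval_s)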
-
-
-class MockStep(Step):
-    """Mock step to implement any abstract methods so that it can be
-    instantiated for test purposes
-    """
-
-    def get_launch_cmd(self):
-        return ["spam", "eggs"]
-
-
-@pytest.fixture
-def mock_step_meta_dict(test_dir, config):
-    telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir
-    yield {
-        "entity_type": "mock",
-        "status_dir": telemetry_output_path,
-    }
-
-
-@pytest.fixture
-def mock_step(test_dir, mock_step_meta_dict):
-    rs = RunSettings("echo")
-    step = MockStep("mock-step", test_dir, rs)
-    step.meta = mock_step_meta_dict
-    yield step
-
-
-def test_proxy_launch_cmd_decorator_reformats_cmds(mock_step, monkeypatch):
-    monkeypatch.setattr(cfg.Config, CFG_TM_ENABLED_ATTR, True)
-    get_launch_cmd = proxyable_launch_cmd(lambda step: ["some", "cmd", "list"])
-    cmd = get_launch_cmd(mock_step)
-    assert cmd != ["some", "cmd", "list"]
-    assert sys.executable in cmd
-    assert PROXY_ENTRY_POINT in cmd
-
-
-def test_proxy_launch_cmd_decorator_does_not_reformat_cmds_if_the_tm_is_off(
-    mock_step, monkeypatch
-):
-    monkeypatch.setattr(cfg.Config, CFG_TM_ENABLED_ATTR, False)
-    get_launch_cmd = proxyable_launch_cmd(lambda step: ["some", "cmd", "list"])
-    cmd = get_launch_cmd(mock_step)
-    assert cmd == ["some", "cmd", "list"]
-
-
-def test_proxy_launch_cmd_decorator_errors_if_attempt_to_proxy_a_managed_step(
-    mock_step, monkeypatch
-):
-    monkeypatch.setattr(cfg.Config, CFG_TM_ENABLED_ATTR, True)
-    mock_step.managed = True
-    get_launch_cmd = proxyable_launch_cmd(lambda step: ["some", "cmd", "list"])
-    with pytest.raises(UnproxyableStepError):
-        get_launch_cmd(mock_step)
-
-
-@for_all_wlm_launchers
-def test_unmanaged_steps_are_proxyed_through_indirect(
-    wlm_launcher, mock_step_meta_dict, test_dir, monkeypatch
-):
-    monkeypatch.setattr(cfg.Config, CFG_TM_ENABLED_ATTR, True)
-    rs = RunSettings("echo", ["hello", "world"])
-    step = wlm_launcher.create_step("test-step", test_dir, rs)
-    step.meta = mock_step_meta_dict
-    assert isinstance(step, Step)
-    assert not step.managed
-    cmd = step.get_launch_cmd()
-    assert sys.executable in cmd
-    assert PROXY_ENTRY_POINT in cmd
-    assert "hello" not in cmd
-    assert "world" not in cmd
-
-
-@for_all_wlm_launchers
-def test_unmanaged_steps_are_not_proxyed_if_the_telemetry_monitor_is_disabled(
-    wlm_launcher, mock_step_meta_dict, test_dir, monkeypatch
-):
-    monkeypatch.setattr(cfg.Config, CFG_TM_ENABLED_ATTR, False)
-    rs = RunSettings("echo", ["hello", "world"])
-    step = wlm_launcher.create_step("test-step", test_dir, rs)
-    step.meta = mock_step_meta_dict
-    assert isinstance(step, Step)
-    assert not step.managed
-    cmd = step.get_launch_cmd()
-    assert PROXY_ENTRY_POINT not in cmd
-    assert "hello" in cmd
-    assert "world" in cmd
-
-
-@requires_wlm
-@pytest.mark.parametrize(
-    "run_command",
-    [
-        pytest.param("", id="Unmanaged"),
-        pytest.param("auto", id="Managed"),
-    ],
-)
-def test_multistart_experiment(
-    wlmutils: WLMUtils,
-    fileutils: FileUtils,
-    test_dir: str,
-    monkeypatch: pytest.MonkeyPatch,
-    run_command: str,
-    config: cfg.Config,
-):
-    """Run an experiment with multiple start calls to ensure that telemetry is
-    saved correctly for each run
-    """
-
-    exp_name = "my-exp"
-    exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir)
-    rs_e = exp.create_run_settings(
-        sys.executable, ["printing_model.py"], run_command=run_command
-    )
-    rs_e.set_nodes(1)
-    rs_e.set_tasks(1)
-    ens = exp.create_ensemble(
-        "my-ens",
-        run_settings=rs_e,
-        perm_strategy="all_perm",
-        params={
-            "START": ["spam"],
-            "MID": ["eggs"],
-            "END": ["sausage", "and spam"],
-        },
-    )
-
-    test_script_path = fileutils.get_test_conf_path("printing_model.py")
-    ens.attach_generator_files(to_configure=[test_script_path])
-
-    rs_m = exp.create_run_settings("echo", ["hello", "world"], run_command=run_command)
-    rs_m.set_nodes(1)
-    rs_m.set_tasks(1)
-    model = exp.create_model("my-model", run_settings=rs_m)
-
-    db = exp.create_database(
-        db_nodes=1,
-        port=wlmutils.get_test_port(),
-        interface=wlmutils.get_test_interface(),
-    )
-
-    exp.generate(db, ens, model, overwrite=True)
-
-    with monkeypatch.context() as ctx:
-        ctx.setattr(cfg.Config, "telemetry_frequency", 1)
-        ctx.setattr(cfg.Config, "telemetry_cooldown", 45)
-
-        exp.start(model, block=False)
-
-        # track PID to see that telmon cooldown avoids restarting process
-        tm_pid = exp._control._telemetry_monitor.pid
-
-        exp.start(db, block=False)
-        # check that same TM proc is active
-        assert tm_pid == exp._control._telemetry_monitor.pid
-        try:
-            exp.start(ens, block=True, summary=True)
-        finally:
-            exp.stop(db)
-            assert tm_pid == exp._control._telemetry_monitor.pid
-            time.sleep(3)  # time for telmon to write db stop event
-
-    telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir
-
-    db_start_events = list(telemetry_output_path.rglob("database/**/start.json"))
-    assert len(db_start_events) == 1
-
-    m_start_events = list(telemetry_output_path.rglob("model/**/start.json"))
-    assert len(m_start_events) == 1
-
-    e_start_events = list(telemetry_output_path.rglob("ensemble/**/start.json"))
-    assert len(e_start_events) == 2
-
-
-@pytest.mark.parametrize(
-    "status_in, expected_out",
-    [
-        pytest.param(SmartSimStatus.STATUS_CANCELLED, 1, id="failure on cancellation"),
-        pytest.param(SmartSimStatus.STATUS_COMPLETED, 0, id="success on completion"),
-        pytest.param(SmartSimStatus.STATUS_FAILED, 1, id="failure on failed"),
-        pytest.param(SmartSimStatus.STATUS_NEW, None, id="failure on new"),
-        pytest.param(SmartSimStatus.STATUS_PAUSED, None, id="failure on paused"),
-        pytest.param(SmartSimStatus.STATUS_RUNNING, None, id="failure on running"),
-    ],
-)
-def test_faux_rc(status_in: str, expected_out: t.Optional[int]):
-    """Ensure faux response codes match expectations."""
-    step_info = StepInfo(status=status_in)
-
-    rc = map_return_code(step_info)
-    assert rc == expected_out
-
-
-@pytest.mark.parametrize(
-    "status_in, expected_out, expected_has_jobs",
-    [
-        pytest.param(
-            SmartSimStatus.STATUS_CANCELLED, 1, False, id="failure on cancellation"
-        ),
-        pytest.param(
-            SmartSimStatus.STATUS_COMPLETED, 0, False, id="success on completion"
-        ),
-        pytest.param(SmartSimStatus.STATUS_FAILED, 1, False, id="failure on failed"),
-        pytest.param(SmartSimStatus.STATUS_NEW, None, True, id="failure on new"),
-        pytest.param(SmartSimStatus.STATUS_PAUSED, None, True, id="failure on paused"),
-        pytest.param(
-            SmartSimStatus.STATUS_RUNNING, None, True, id="failure on running"
-        ),
-    ],
-)
-@pytest.mark.asyncio
-async def test_wlm_completion_handling(
-    test_dir: str,
-    monkeypatch: pytest.MonkeyPatch,
-    status_in: str,
-    expected_out: t.Optional[int],
-    expected_has_jobs: bool,
-):
-    def get_faux_update(status: str) -> t.Callable:
-        def _faux_updates(_self: WLMLauncher, _names: t.List[str]) -> t.List[StepInfo]:
-            return [("faux-name", StepInfo(status=status))]
-
-        return _faux_updates
-
-    ts = get_ts_ms()
-    with monkeypatch.context() as ctx:
-        # don't actually start a job manager
-        ctx.setattr(JobManager, "start", lambda x: ...)
"start", lambda x: ...) - ctx.setattr(SlurmLauncher, "get_step_update", get_faux_update(status_in)) - - mani_handler = ManifestEventHandler("xyz", logger) - mani_handler.set_launcher("slurm") - - # prep a fake job to request updates for - job_entity = JobEntity() - job_entity.name = "faux-name" - job_entity.step_id = "faux-step-id" - job_entity.task_id = 1234 - job_entity.status_dir = test_dir - job_entity.type = "orchestrator" - - job = Job(job_entity.name, job_entity.step_id, job_entity, "slurm", True) - - # populate our tracking collections - mani_handler._tracked_jobs = {job_entity.key: job_entity} - mani_handler.job_manager.jobs[job.name] = job - - await mani_handler.on_timestep(ts) - - # see that the job queue was properly manipulated - has_jobs = bool(mani_handler._tracked_jobs) - assert expected_has_jobs == has_jobs - - # see that the event was properly written - stop_event_path = pathlib.Path(test_dir) / "stop.json" - - # if a status wasn't terminal, no stop event should have been written - should_have_stop_event = False if expected_out is None else True - assert should_have_stop_event == stop_event_path.exists()