From e98e2fe52a8614b1473d8f19847036afd8309445 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 25 Jun 2024 12:21:53 -0500 Subject: [PATCH 01/84] Initial FLI-based implementation --- .../_core/launcher/dragon/dragonBackend.py | 30 ++++- .../_core/mli/comm/channel/dragonchannel.py | 12 +- smartsim/_core/mli/comm/channel/dragonfli.py | 54 +++++++++ .../infrastructure/control/workermanager.py | 33 +++--- .../_core/mli/infrastructure/worker/worker.py | 106 ++++++++++++++---- smartsim/_core/mli/message_handler.py | 10 +- 6 files changed, 192 insertions(+), 53 deletions(-) create mode 100644 smartsim/_core/mli/comm/channel/dragonfli.py diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 2456606623..9ec4cc93e9 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -26,6 +26,7 @@ import collections import functools import itertools +import os import time import typing as t from dataclasses import dataclass, field @@ -38,10 +39,13 @@ # isort: off import dragon.infrastructure.connection as dragon_connection import dragon.infrastructure.policy as dragon_policy +from dragon.infrastructure.process_desc import ProcessOptions +from dragon.data.ddict.ddict import DDict import dragon.native.group_state as dragon_group_state import dragon.native.process as dragon_process import dragon.native.process_group as dragon_process_group import dragon.native.machine as dragon_machine +import multiprocessing as mp # pylint: enable=import-error # isort: on @@ -75,6 +79,9 @@ def __str__(self) -> str: return self.value +mp.set_start_method("dragon") + + @dataclass class ProcessGroupInfo: status: SmartSimStatus @@ -187,6 +194,7 @@ def __init__(self, pid: int) -> None: self._view = DragonBackendView(self) logger.debug(self._view.host_desc) + self._infra_ddict: t.Optional[DDict] = None @property def hosts(self) -> list[str]: @@ -391,6 +399,20 @@ def _stop_steps(self) -> None: self._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED self._group_infos[step_id].return_codes = [-9] + @property + def infra_ddict(self) -> str: + """Create a Dragon distributed dictionary and return its + serialized descriptor + """ + if self._infra_ddict is None: + logger.info("Creating DDict") + self._infra_ddict = DDict() # todo: parametrize + logger.info("Created DDict") + self._infra_ddict["creation"] = str(time.time()) + logger.info(self._infra_ddict["creation"]) + + return self._infra_ddict.serialize() + def _start_steps(self) -> None: self._heartbeat() with self._queue_lock: @@ -406,6 +428,7 @@ def _start_steps(self) -> None: placement=dragon_policy.Policy.Placement.HOST_NAME, host_name=hosts[0], ) + options = ProcessOptions(make_inf_channels=True) grp = dragon_process_group.ProcessGroup( restart=False, pmi_enabled=request.pmi_enabled, policy=global_policy ) @@ -421,10 +444,15 @@ def _start_steps(self) -> None: target=request.exe, args=request.exe_args, cwd=request.path, - env={**request.current_env, **request.env}, + env={ + **request.current_env, + **request.env, + "SS_DRG_DDICT": self.infra_ddict, + }, stdout=dragon_process.Popen.PIPE, stderr=dragon_process.Popen.PIPE, policy=local_policy, + options=options, ) grp.add_process(nproc=request.tasks_per_node, template=tmp_proc) diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py index 4fd26861ca..d4dbfa3ba0 100644 --- a/smartsim/_core/mli/comm/channel/dragonchannel.py +++ 
b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -24,16 +24,13 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import typing as t import smartsim._core.mli.comm.channel.channel as cch from smartsim.log import get_logger logger = get_logger(__name__) -if t.TYPE_CHECKING: - import dragon.channels as dch - import dragon.utils as du +import dragon.channels as dch class DragonCommChannel(cch.CommChannelBase): @@ -42,11 +39,10 @@ class DragonCommChannel(cch.CommChannelBase): def __init__(self, key: bytes) -> None: """Initialize the DragonCommChannel instance""" super().__init__(key) - # todo: do we need memory pool information to construct the channel correctly? - self._channel: "dch.Channel" = du.get_channel(key) + self._channel: dch.Channel = dch.Channel.attach(key) def send(self, value: bytes) -> None: """Send a message through the underlying communication channel :param value: The value to send""" - logger.debug(f"Channel {self.descriptor.decode('utf-8')} sending message") - self._channel.send_bytes(value) + with self._channel.sendh(timeout=None) as sendh: + sendh.send_bytes(value) diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py new file mode 100644 index 0000000000..f601bb2eb8 --- /dev/null +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -0,0 +1,54 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# isort: off +import dragon +from dragon import fli +import dragon.channels as dch + +# isort: on + + +import smartsim._core.mli.comm.channel.channel as cch +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +class DragonFLIChannel(cch.CommChannelBase): + """Passes messages by writing to a Dragon FLI Channel""" + + def __init__(self, fli_desc: bytes) -> None: + """Initialize the DragonFLIChannel instance""" + super().__init__(fli_desc) + # todo: do we need memory pool information to construct the channel correctly?
+ self._channel: "dch.Channel" = fli.FLInterface.attach(fli_desc) + + def send(self, value: bytes) -> None: + """Send a message through the underlying communication channel + :param value: The value to send""" + with self._channel.sendh(timeout=None) as sendh: + sendh.send_bytes(value) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index b3b79f7f30..588dc8e28d 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -24,14 +24,19 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import multiprocessing as mp +# isort: off +import dragon +from dragon import fli + +# isort: on +import time import typing as t import numpy as np from smartsim._core.entrypoints.service import Service from smartsim._core.mli.comm.channel.channel import CommChannelBase -from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore from smartsim._core.mli.infrastructure.worker.worker import ( InferenceReply, @@ -84,12 +89,6 @@ def deserialize_message( None # these will really be tensors already ) - # # client example - # msg = Message() - # t = torch.Tensor() - # msg.inputs = [custom_byte_converter(t)] - # mli_client.request_inference(msg) - # # end client input_meta: t.List[t.Any] = [] if request.input.which() == "inputKeys": @@ -163,12 +162,12 @@ class WorkerManager(Service): def __init__( self, - task_queue: "mp.Queue[bytes]", + file_like_interface: fli.FLInterface, worker: MachineLearningWorkerBase, feature_store: t.Optional[FeatureStore] = None, as_service: bool = False, cooldown: int = 0, - comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel, + comm_channel_type: t.Type[CommChannelBase] = DragonFLIChannel, ) -> None: """Initialize the WorkerManager :param task_queue: The queue to monitor for new tasks @@ -182,7 +181,7 @@ def __init__( super().__init__(as_service, cooldown) """a collection of workers the manager is controlling""" - self._task_queue: "mp.Queue[bytes]" = task_queue + self._task_queue: fli.FLInterface = file_like_interface """the queue the manager monitors for new tasks""" self._feature_store: t.Optional[FeatureStore] = feature_store """a feature store to retrieve models from""" @@ -232,7 +231,12 @@ def _on_iteration(self) -> None: return # perform default deserialization of the message envelope - request_bytes: bytes = self._task_queue.get() + # perform default deserialization of the message envelope + with self._task_queue.recvh(timeout=None) as recvh: + try: + request_bytes, _ = recvh.recv_bytes(timeout=None) + except fli.FLIEOT as exc: + return request = deserialize_message(request_bytes, self._comm_channel_type) if not self._validate_request(request): return @@ -246,17 +250,12 @@ def _on_iteration(self) -> None: fetch_input_result = self._worker.fetch_inputs(request, self._feature_store) transformed_input = self._worker.transform_input(request, fetch_input_result) - # batch: t.Collection[_Datum] = transform_result.transformed_input - # if self._batch_size: - # batch = self._worker.batch_requests(transform_result, self._batch_size) - reply = InferenceReply() try: execute_result = self._worker.execute( request, model_result, transformed_input ) -
transformed_output = self._worker.transform_output(request, execute_result) if request.output_keys: diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 99b51e178d..8992b2b6ea 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -24,12 +24,17 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import io import typing as t from abc import ABC, abstractmethod +import numpy as np +import torch + import smartsim.error as sse from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim._core.mli.mli_schemas.tensor import tensor_capnp from smartsim.log import get_logger logger = get_logger(__name__) @@ -106,9 +111,10 @@ def __init__(self, result: t.Any) -> None: class FetchInputResult: """A wrapper around fetched inputs""" - def __init__(self, result: t.List[bytes]) -> None: + def __init__(self, result: t.List[bytes], meta: t.List[t.Any]) -> None: """Initialize the object""" self.inputs = result + self.meta = meta class TransformOutputResult: @@ -122,7 +128,6 @@ def __init__( self.shape = shape self.order = order self.dtype = dtype - # todo: determine if each output must have an individual (shape, order, dtype) class CreateInputBatchResult: @@ -152,8 +157,6 @@ def fetch_model( :param request: The request that triggered the pipeline :param feature_store: The feature store used for persistence :return: Raw bytes of the model""" - if not feature_store: - raise ValueError("Feature store is required for model retrieval") if request.raw_model: # Should we cache model in the feature store? 
@@ -162,6 +165,9 @@ def fetch_model( # short-circuit and return the directly supplied model return FetchModelResult(request.raw_model) + if not feature_store: + raise ValueError("Feature store is required for model retrieval") + if not request.model_key: raise sse.SmartSimError( "Key must be provided to retrieve model from feature store" @@ -185,8 +191,12 @@ def fetch_inputs( :param request: The request that triggered the pipeline :param feature_store: The feature store used for persistence :return: the fetched input""" + + if request.raw_inputs: + return FetchInputResult(request.raw_inputs, request.input_meta) + if not feature_store: - raise ValueError("Feature store is required for input retrieval") + raise ValueError("No input and no feature store provided") if request.input_keys: data: t.List[bytes] = [] @@ -201,9 +211,6 @@ def fetch_inputs( ) from ex return FetchInputResult(data) - if request.raw_inputs: - return FetchInputResult(request.raw_inputs) - raise ValueError("No input source") @staticmethod @@ -250,14 +257,6 @@ class MachineLearningWorkerBase(MachineLearningWorkerCore, ABC): """Abstrct base class providing contract for a machine learning worker implementation.""" - # @staticmethod - # @abstractmethod - # def deserialize(request: InferenceRequest) -> InferenceRequest: - # """Given a collection of data serialized to bytes, convert the bytes - # to a proper representation used by the ML backend - # :param data_blob: inference request as a byte-serialized blob - # :return: InferenceRequest deserialized from the input""" - @staticmethod @abstractmethod def load_model( @@ -303,11 +302,70 @@ def transform_output( :param execute_result: The result of inference wrapped in an ExecuteResult :return:""" - # @staticmethod - # @abstractmethod - # def serialize_reply( - # request: InferenceRequest, results: OutputTransformResult - # ) -> bytes: - # """Given an output, serialize to bytes for transport - # :param reply: The result of the inference pipeline - # :return: a byte-serialized version of the reply""" + +class TorchWorker(MachineLearningWorkerBase): + """A worker that executes a PyTorch model.""" + + @staticmethod + def load_model( + request: InferenceRequest, fetch_result: FetchModelResult + ) -> LoadModelResult: + model_bytes = fetch_result.model_bytes or request.raw_model + if not model_bytes: + raise ValueError("Unable to load model without reference object") + + _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} + device = _device_to_torch[str(request.device)] + model: torch.nn.Module = torch.jit.load(io.BytesIO(model_bytes), map_location=device) # type: ignore[no-untyped-call] + result = LoadModelResult(model) + return result + + @staticmethod + def transform_input( + request: InferenceRequest, fetch_result: FetchInputResult + ) -> TransformInputResult: + result = [] + + _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} + device = _device_to_torch[str(request.device)] + for item, item_meta in zip(fetch_result.inputs, fetch_result.meta): + td: tensor_capnp.TensorDescriptor = item_meta + result.append( + torch.tensor( + np.frombuffer(item, dtype=str(td.dataType)).reshape(td.dimensions) + ).to(device) + ) + return TransformInputResult(result) + # return data # note: this fails copy test! 
+ + @staticmethod + def execute( + request: InferenceRequest, + load_result: LoadModelResult, + transform_result: TransformInputResult, + ) -> ExecuteResult: + if not load_result.model: + raise sse.SmartSimError("Model must be loaded to execute") + + model: torch.nn.Module = load_result.model + model.eval() + results = [model(tensor).detach() for tensor in transform_result.transformed] + + execute_result = ExecuteResult(results) + return execute_result + + @staticmethod + def transform_output( + request: InferenceRequest, + execute_result: ExecuteResult, + ) -> TransformOutputResult: + if str(request.device) != "cpu": + transformed = [ + item.to("cpu").clone() for item in execute_result.predictions + ] + # todo: need the shape from latest schemas added here. + return TransformOutputResult(transformed, None, "c", "float32") # fixme + else: + return TransformOutputResult( + execute_result.predictions, None, "c", "float32" + ) # fixme diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py index 733fa83d98..4a5725bd9e 100644 --- a/smartsim/_core/mli/message_handler.py +++ b/smartsim/_core/mli/message_handler.py @@ -391,7 +391,9 @@ def deserialize_request(request_bytes: t.ByteString) -> request_capnp.Request: :param request_bytes: Bytes to be deserialized into a Request """ - bytes_message = request_capnp.Request.from_bytes(request_bytes) + bytes_message = request_capnp.Request.from_bytes( + request_bytes, traversal_limit_in_words=2**63 + ) with bytes_message as message: return message @@ -484,7 +486,7 @@ def _assign_custom_response_attributes( response.customAttributes.tf = custom_attrs # type: ignore else: raise ValueError("""Invalid custom attribute class name. - Expected 'TensorFlowResponseAttributes' or + Expected 'TensorFlowResponseAttributes' or 'TorchResponseAttributes'.""") except Exception as e: raise ValueError("Error assigning custom attributes to response.") from e @@ -529,7 +531,9 @@ def deserialize_response(response_bytes: t.ByteString) -> response_capnp.Respons """ Deserializes a serialized response message. 
""" - bytes_message = response_capnp.Response.from_bytes(response_bytes) + bytes_message = response_capnp.Response.from_bytes( + response_bytes, traversal_limit_in_words=2**63 + ) with bytes_message as message: return message From 043f0e74e68ad07846ffce9a0013eb6cf1919c09 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 25 Jun 2024 13:42:44 -0500 Subject: [PATCH 02/84] Add inference example stub --- .../high_throughput_inference/mli_driver.py | 35 +++++ .../high_throughput_inference/mock_app.py | 129 ++++++++++++++++++ .../standalone_workermanager.py | 46 +++++++ 3 files changed, 210 insertions(+) create mode 100644 examples/high_throughput_inference/mli_driver.py create mode 100644 examples/high_throughput_inference/mock_app.py create mode 100644 examples/high_throughput_inference/standalone_workermanager.py diff --git a/examples/high_throughput_inference/mli_driver.py b/examples/high_throughput_inference/mli_driver.py new file mode 100644 index 0000000000..187a7b8214 --- /dev/null +++ b/examples/high_throughput_inference/mli_driver.py @@ -0,0 +1,35 @@ +import sys +from smartsim import Experiment +from smartsim.status import TERMINAL_STATUSES +import time + +worker_manager_script_name = "standalone_workermanager.py" +app_script_name = "mock_app.py" +device = "cpu" + + +exp = Experiment("MLI_proto", launcher="dragon") + +worker_manager_rs = exp.create_run_settings(sys.executable, [worker_manager_script_name]) +worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) +worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) + + +app_rs = exp.create_run_settings(sys.executable, exe_args = [app_script_name, "--device", device]) +app = exp.create_model("app", run_settings=app_rs) +app.attach_generator_files(to_copy=[app_script_name], to_symlink=[f"resnet50.{device.upper()}.pt"]) + + +exp.generate(worker_manager, app, overwrite=True) +exp.start(worker_manager, app, block=False) + +while True: + if exp.get_status(app)[0] in TERMINAL_STATUSES: + exp.stop(worker_manager) + break + if exp.get_status(worker_manager)[0] in TERMINAL_STATUSES: + exp.stop(app) + break + time.sleep(5) + +print("Exiting.") \ No newline at end of file diff --git a/examples/high_throughput_inference/mock_app.py b/examples/high_throughput_inference/mock_app.py new file mode 100644 index 0000000000..d6f8253b70 --- /dev/null +++ b/examples/high_throughput_inference/mock_app.py @@ -0,0 +1,129 @@ +# isort: off +import dragon +from dragon import fli +from dragon.channels import Channel +import dragon.channels +from dragon.data.ddict.ddict import DDict +from dragon.globalservices.api_setup import connect_to_infrastructure +from dragon.utils import b64decode, b64encode + +# isort: on + +import argparse +import io +import numpy +import os +import tabulate +import time +import torch +import typing as t + +from smartsim._core.mli.message_handler import MessageHandler + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser("Mock application") + parser.add_argument("--device", default="cpu") + + args = parser.parse_args() + + connect_to_infrastructure() + ddict_str = os.environ["SS_DRG_DDICT"] + + ddict = DDict.attach(ddict_str) + + to_worker_fli_str = None + + while to_worker_fli_str is None: + try: + to_worker_fli_str = ddict["to_worker_fli"] + except Exception as e: + time.sleep(1) + + to_worker_fli = fli.FLInterface.attach(b64decode(to_worker_fli_str)) + + batch_size = 32 + model = torch.jit.load(f"resnet50.{args.device.upper()}.pt") + buffer = io.BytesIO() + batch = 
torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) + scripted = torch.jit.trace(model, batch) + torch.jit.save(scripted, buffer) + + total_iterations = 10 + + headers=[ + "batch_size", + "build_tensor", + "build_request", + "serialize_request", + "send", + "receive", + "deserialize_response", + "deserialize_tensor", + ] + + print(",".join(headers)) + + for batch_size in [1, 8, 32, 64, 128]: + + timings = [] + for iteration_number in range(total_iterations + int(batch_size==1)): + + timings.append([batch_size]) + + batch = torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) + + expected_device: t.Literal["cpu", "gpu"] = args.device.lower() + + start = time.perf_counter() + interm = start + built_tensor = MessageHandler.build_tensor( + batch.numpy(), "c", "float32", list(batch.shape) + ) + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + + from_worker_ch = Channel.make_process_local() + + request = MessageHandler.build_request( + reply_channel=from_worker_ch.serialize(), + model=buffer.getvalue(), + device=expected_device, + inputs=[built_tensor], + outputs=[], + output_descriptors=[], + custom_attributes=None, + ) + + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + request_bytes = MessageHandler.serialize_request(request) + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + with to_worker_fli.sendh(timeout=None) as to_sendh: + to_sendh.send_bytes(request_bytes) + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + + with from_worker_ch.recvh(timeout=None) as from_recvh: + resp = from_recvh.recv_bytes(timeout=None) + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + response = MessageHandler.deserialize_response(resp) + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + result = torch.from_numpy( + numpy.frombuffer( + response.result.data[0].blob, + dtype=str(response.result.data[0].tensorDescriptor.dataType), + ) + ) + + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + + # duration = time.perf_counter() - start + # print(f"{duration:.3f} s") + + print(",".join(str(timing) for timing in timings[-1])) diff --git a/examples/high_throughput_inference/standalone_workermanager.py b/examples/high_throughput_inference/standalone_workermanager.py new file mode 100644 index 0000000000..7ddeff0094 --- /dev/null +++ b/examples/high_throughput_inference/standalone_workermanager.py @@ -0,0 +1,46 @@ +# isort: off +import dragon +from dragon import fli +from dragon.channels import Channel +from dragon.data.ddict.ddict import DDict +from dragon.utils import b64decode, b64encode +from dragon.globalservices.api_setup import connect_to_infrastructure +# isort: on +import logging +import multiprocessing as mp +import os +import pathlib +import shutil +import time + + +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel +from smartsim._core.mli.infrastructure.worker.worker import TorchWorker +from smartsim._core.mli.infrastructure.control.workermanager import ( + DragonCommChannel, + WorkerManager, +) + +if __name__ == "__main__": + connect_to_infrastructure() + mp.set_start_method("dragon") + ddict_str = os.environ["SS_DRG_DDICT"] + ddict = DDict.attach(ddict_str) + + to_worker_channel = Channel.make_process_local() + to_worker_manager_channel = Channel.make_process_local() + channels = [Channel.make_process_local() for _ in range(100)] + to_worker_fli = 
fli.FLInterface(main_ch=to_worker_channel, manager_ch=to_worker_manager_channel, stream_channels=channels) + ddict["to_worker_fli"] = b64encode(to_worker_fli.serialize()) + + torch_worker = TorchWorker() + + worker_manager = WorkerManager( + file_like_interface=to_worker_fli, + worker=torch_worker, + feature_store=None, + as_service=True, + cooldown=10, + comm_channel_type=DragonCommChannel, + ) + worker_manager.execute() From efc9e839d2c317a49662776b710993e43c88f75c Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 25 Jun 2024 17:09:50 -0500 Subject: [PATCH 03/84] Lint, style, black magic --- .../high_throughput_inference/mli_driver.py | 2 +- .../standalone_workermanager.py | 3 +- .../_core/launcher/dragon/dragonBackend.py | 3 +- .../_core/mli/infrastructure/worker/worker.py | 30 +++++++++++-------- 4 files changed, 21 insertions(+), 17 deletions(-) diff --git a/examples/high_throughput_inference/mli_driver.py b/examples/high_throughput_inference/mli_driver.py index 187a7b8214..833766cbef 100644 --- a/examples/high_throughput_inference/mli_driver.py +++ b/examples/high_throughput_inference/mli_driver.py @@ -5,7 +5,7 @@ worker_manager_script_name = "standalone_workermanager.py" app_script_name = "mock_app.py" -device = "cpu" +device = "gpu" exp = Experiment("MLI_proto", launcher="dragon") diff --git a/examples/high_throughput_inference/standalone_workermanager.py b/examples/high_throughput_inference/standalone_workermanager.py index 7ddeff0094..bb93c613ce 100644 --- a/examples/high_throughput_inference/standalone_workermanager.py +++ b/examples/high_throughput_inference/standalone_workermanager.py @@ -14,10 +14,9 @@ import time -from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel +from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel from smartsim._core.mli.infrastructure.worker.worker import TorchWorker from smartsim._core.mli.infrastructure.control.workermanager import ( - DragonCommChannel, WorkerManager, ) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 9ec4cc93e9..d103579115 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -26,7 +26,6 @@ import collections import functools import itertools -import os import time import typing as t from dataclasses import dataclass, field @@ -411,7 +410,7 @@ def infra_ddict(self) -> str: self._infra_ddict["creation"] = str(time.time()) logger.info(self._infra_ddict["creation"]) - return self._infra_ddict.serialize() + return str(self._infra_ddict.serialize()) def _start_steps(self) -> None: self._heartbeat() diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 8992b2b6ea..295b2573c8 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -111,7 +111,7 @@ def __init__(self, result: t.Any) -> None: class FetchInputResult: """A wrapper around fetched inputs""" - def __init__(self, result: t.List[bytes], meta: t.List[t.Any]) -> None: + def __init__(self, result: t.List[bytes], meta: t.Optional[t.List[t.Any]]) -> None: """Initialize the object""" self.inputs = result self.meta = meta @@ -121,7 +121,7 @@ class TransformOutputResult: """A wrapper around inference results transformed for transmission""" def __init__( - self, result: t.Any, shape: t.List[int], order: str, dtype: str + self, result: t.Any, shape: t.Optional[t.List[int]], order: 
str, dtype: str ) -> None: """Initialize the OutputTransformResult""" self.outputs = result @@ -209,7 +209,9 @@ def fetch_inputs( raise sse.SmartSimError( f"Model could not be retrieved with key {input_}" ) from ex - return FetchInputResult(data) + return FetchInputResult( + data, None + ) # fixme: need to get both tensor and descriptor raise ValueError("No input source") @@ -316,7 +318,9 @@ def load_model( _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} device = _device_to_torch[str(request.device)] - model: torch.nn.Module = torch.jit.load(io.BytesIO(model_bytes), map_location=device) # type: ignore[no-untyped-call] + buffer = io.BytesIO(model_bytes) + # type: ignore-next[no-untyped-call] + model = torch.jit.load(buffer, map_location=device) result = LoadModelResult(model) return result @@ -328,12 +332,14 @@ def transform_input( _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} device = _device_to_torch[str(request.device)] + if fetch_result.meta is None: + raise ValueError("Cannot reconstruct tensor without meta information") for item, item_meta in zip(fetch_result.inputs, fetch_result.meta): - td: tensor_capnp.TensorDescriptor = item_meta + tensor_desc: tensor_capnp.TensorDescriptor = item_meta result.append( - torch.tensor( - np.frombuffer(item, dtype=str(td.dataType)).reshape(td.dimensions) - ).to(device) + torch.tensor(np.frombuffer(item, dtype=str(tensor_desc.dataType))) + .to(device) + .reshape(tuple(dim for dim in tensor_desc.dimensions)) ) return TransformInputResult(result) # return data # note: this fails copy test! @@ -365,7 +371,7 @@ def transform_output( ] # todo: need the shape from latest schemas added here. return TransformOutputResult(transformed, None, "c", "float32") # fixme - else: - return TransformOutputResult( - execute_result.predictions, None, "c", "float32" - ) # fixme + + return TransformOutputResult( + execute_result.predictions, None, "c", "float32" + ) # fixme From ed3c42a10b812963e2de28c6e89918dfe0efbc07 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 25 Jun 2024 18:07:56 -0500 Subject: [PATCH 04/84] Bring up to feature branch --- .../infrastructure/control/workermanager.py | 24 +++++++++++++++---- .../_core/mli/infrastructure/worker/worker.py | 24 ++++++++++--------- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 67b1627bb5..f46ced8756 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -54,7 +54,9 @@ def deserialize_message( - data_blob: bytes, channel_type: t.Type[CommChannelBase] + data_blob: bytes, + channel_type: t.Type[CommChannelBase], + device: t.Literal["cpu", "gpu"], ) -> InferenceRequest: """Deserialize a message from a byte stream into an InferenceRequest :param data_blob: The byte stream to deserialize""" @@ -166,6 +168,7 @@ def __init__( as_service: bool = False, cooldown: int = 0, comm_channel_type: t.Type[CommChannelBase] = DragonFLIChannel, + device: t.Literal["cpu", "gpu"] = "cpu", ) -> None: """Initialize the WorkerManager :param task_queue: The queue to monitor for new tasks @@ -187,6 +190,8 @@ def __init__( """The ML Worker implementation""" self._comm_channel_type = comm_channel_type """The type of communication channel to construct for callbacks""" + self._device = device + """Device on which workers need to run""" def _validate_request(self, request: InferenceRequest) -> bool: """Ensure the request 
can be processed. @@ -236,17 +241,24 @@ def _on_iteration(self) -> None: except fli.FLIEOT as exc: return - request = deserialize_message(request_bytes, self._comm_channel_type) + request = deserialize_message( + request_bytes, self._comm_channel_type, self._device + ) if not self._validate_request(request): return + # # let the worker perform additional custom deserialization # request = self._worker.deserialize(request_bytes) fetch_model_result = self._worker.fetch_model(request, self._feature_store) - model_result = self._worker.load_model(request, fetch_model_result) + model_result = self._worker.load_model( + request, fetch_model_result, self._device + ) fetch_input_result = self._worker.fetch_inputs(request, self._feature_store) - transformed_input = self._worker.transform_input(request, fetch_input_result) + transformed_input = self._worker.transform_input( + request, fetch_input_result, self._device + ) reply = InferenceReply() @@ -254,7 +266,9 @@ def _on_iteration(self) -> None: execute_result = self._worker.execute( request, model_result, transformed_input ) - transformed_output = self._worker.transform_output(request, execute_result) + transformed_output = self._worker.transform_output( + request, execute_result, self._device + ) if request.output_keys: reply.output_keys = self._worker.place_output( diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 9b813a9e9b..08c4997554 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -260,21 +260,23 @@ class MachineLearningWorkerBase(MachineLearningWorkerCore, ABC): @staticmethod @abstractmethod def load_model( - request: InferenceRequest, fetch_result: FetchModelResult + request: InferenceRequest, fetch_result: FetchModelResult, device: str ) -> LoadModelResult: """Given a loaded MachineLearningModel, ensure it is loaded into device memory :param request: The request that triggered the pipeline + :param device: The device on which the model must be placed :return: ModelLoadResult wrapping the model loaded for the request""" @staticmethod @abstractmethod def transform_input( - request: InferenceRequest, fetch_result: FetchInputResult + request: InferenceRequest, fetch_result: FetchInputResult, device: str ) -> TransformInputResult: """Given a collection of data, perform a transformation on the data :param request: The request that triggered the pipeline :param fetch_result: Raw output from fetching inputs out of a feature store + :param device: The device on which the transformed input must be placed :return: The transformed inputs wrapped in a InputTransformResult""" @staticmethod @@ -293,13 +295,13 @@ def execute( @staticmethod @abstractmethod def transform_output( - request: InferenceRequest, - execute_result: ExecuteResult, + request: InferenceRequest, execute_result: ExecuteResult, result_device: str ) -> TransformOutputResult: """Given inference results, perform transformations required to transmit results to the requestor. 
:param request: The request that triggered the pipeline :param execute_result: The result of inference wrapped in an ExecuteResult + :param result_device: The device on which the result of inference is placed :return:""" @@ -308,28 +310,27 @@ class TorchWorker(MachineLearningWorkerBase): @staticmethod def load_model( - request: InferenceRequest, fetch_result: FetchModelResult + request: InferenceRequest, fetch_result: FetchModelResult, device: str ) -> LoadModelResult: model_bytes = fetch_result.model_bytes or request.raw_model if not model_bytes: raise ValueError("Unable to load model without reference object") _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} - device = _device_to_torch[str(request.device)] + device = _device_to_torch[device] buffer = io.BytesIO(model_bytes) - # type: ignore-next[no-untyped-call] - model = torch.jit.load(buffer, map_location=device) + model = torch.jit.load(buffer, map_location=device) # type: ignore result = LoadModelResult(model) return result @staticmethod def transform_input( - request: InferenceRequest, fetch_result: FetchInputResult + request: InferenceRequest, fetch_result: FetchInputResult, device: str ) -> TransformInputResult: result = [] _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} - device = _device_to_torch[str(request.device)] + device = _device_to_torch[device] if fetch_result.meta is None: raise ValueError("Cannot reconstruct tensor without meta information") for item, item_meta in zip(fetch_result.inputs, fetch_result.meta): @@ -362,8 +363,9 @@ def execute( def transform_output( request: InferenceRequest, execute_result: ExecuteResult, + result_device: str, ) -> TransformOutputResult: - if str(request.device) != "cpu": + if result_device != "cpu": transformed = [ item.to("cpu").clone() for item in execute_result.predictions ] From e5be26bdcd8d55e6b3b9669fa9bd5492ffd89390 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 25 Jun 2024 18:08:14 -0500 Subject: [PATCH 05/84] Update example --- examples/high_throughput_inference/mli_driver.py | 13 ++++++++----- examples/high_throughput_inference/mock_app.py | 3 --- .../standalone_workermanager.py | 11 +++++------ 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/examples/high_throughput_inference/mli_driver.py b/examples/high_throughput_inference/mli_driver.py index 833766cbef..d32d88e51b 100644 --- a/examples/high_throughput_inference/mli_driver.py +++ b/examples/high_throughput_inference/mli_driver.py @@ -1,23 +1,26 @@ +import os import sys from smartsim import Experiment from smartsim.status import TERMINAL_STATUSES import time -worker_manager_script_name = "standalone_workermanager.py" -app_script_name = "mock_app.py" device = "gpu" +filedir = os.path.dirname(__file__) +worker_manager_script_name = os.path.join(filedir, "standalone_workermanager.py") +app_script_name = os.path.join(filedir, "mock_app.py") +model_name = os.path.join(filedir, f"resnet50.{device.upper()}.pt") -exp = Experiment("MLI_proto", launcher="dragon") +exp = Experiment("MLI_proto", launcher="dragon", exp_path=os.path.join(filedir, "MLI_proto")) -worker_manager_rs = exp.create_run_settings(sys.executable, [worker_manager_script_name]) +worker_manager_rs = exp.create_run_settings(sys.executable, [worker_manager_script_name, "--device", device]) worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) app_rs = exp.create_run_settings(sys.executable, exe_args = [app_script_name, "--device", device]) app 
= exp.create_model("app", run_settings=app_rs) -app.attach_generator_files(to_copy=[app_script_name], to_symlink=[f"resnet50.{device.upper()}.pt"]) +app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) exp.generate(worker_manager, app, overwrite=True) diff --git a/examples/high_throughput_inference/mock_app.py b/examples/high_throughput_inference/mock_app.py index d6f8253b70..afc0c836b8 100644 --- a/examples/high_throughput_inference/mock_app.py +++ b/examples/high_throughput_inference/mock_app.py @@ -74,8 +74,6 @@ batch = torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) - expected_device: t.Literal["cpu", "gpu"] = args.device.lower() - start = time.perf_counter() interm = start built_tensor = MessageHandler.build_tensor( @@ -89,7 +87,6 @@ request = MessageHandler.build_request( reply_channel=from_worker_ch.serialize(), model=buffer.getvalue(), - device=expected_device, inputs=[built_tensor], outputs=[], output_descriptors=[], diff --git a/examples/high_throughput_inference/standalone_workermanager.py b/examples/high_throughput_inference/standalone_workermanager.py index bb93c613ce..32d534f360 100644 --- a/examples/high_throughput_inference/standalone_workermanager.py +++ b/examples/high_throughput_inference/standalone_workermanager.py @@ -6,12 +6,8 @@ from dragon.utils import b64decode, b64encode from dragon.globalservices.api_setup import connect_to_infrastructure # isort: on -import logging -import multiprocessing as mp +import argparse import os -import pathlib -import shutil -import time from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel @@ -21,8 +17,10 @@ ) if __name__ == "__main__": + parser = argparse.ArgumentParser("Worker Manager") + parser.add_argument("--device", default="gpu") + args = parser.parse_args() connect_to_infrastructure() - mp.set_start_method("dragon") ddict_str = os.environ["SS_DRG_DDICT"] ddict = DDict.attach(ddict_str) @@ -41,5 +39,6 @@ as_service=True, cooldown=10, comm_channel_type=DragonCommChannel, + device = args.device, ) worker_manager.execute() From a23010fb9726e4c18997bee279a0553bbaa473f0 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 25 Jun 2024 18:17:30 -0500 Subject: [PATCH 06/84] Change the changelog --- doc/changelog.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/changelog.md b/doc/changelog.md index e86c93de66..d146d1973a 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -17,7 +17,7 @@ Description - Added schemas and MessageHandler class for de/serialization of inference requests and response messages - Removed device from schemas, MessageHandler and tests - +- Add TorchWorker first implementation and mock inference app example ### Development branch From 3c20f464d512c7b3a1ead1981efb96842e7a14bb Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 25 Jun 2024 18:38:12 -0500 Subject: [PATCH 07/84] Make style --- smartsim/_core/mli/infrastructure/control/workermanager.py | 1 - 1 file changed, 1 deletion(-) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index f46ced8756..7a5f168fe4 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -247,7 +247,6 @@ def _on_iteration(self) -> None: if not self._validate_request(request): return - # # let the worker perform additional custom deserialization # request = self._worker.deserialize(request_bytes) From 
b9ed5ba8baa9fc355640f8c2461a0ce7d16cf56b Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 26 Jun 2024 09:51:07 -0500 Subject: [PATCH 08/84] Attempt to mitigate import dragon error --- .../_core/mli/infrastructure/control/workermanager.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 7a5f168fe4..607f94982d 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -24,9 +24,15 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import sys + # isort: off -import dragon -from dragon import fli +try: + import dragon + from dragon import fli +except ImportError as exc: + if not "pytest" in sys.modules: + raise exc from None # isort: on import time From 0de06f3b6c0fa4747b471989a8068e4e609829a0 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 26 Jun 2024 10:20:27 -0500 Subject: [PATCH 09/84] Import dragon optional --- smartsim/_core/mli/comm/channel/dragonchannel.py | 9 ++++++--- smartsim/_core/mli/comm/channel/dragonfli.py | 12 ++++++++---- .../mli/infrastructure/control/workermanager.py | 2 +- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py index d4dbfa3ba0..e79fd2dfcf 100644 --- a/smartsim/_core/mli/comm/channel/dragonchannel.py +++ b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -24,14 +24,17 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - +import sys import smartsim._core.mli.comm.channel.channel as cch from smartsim.log import get_logger logger = get_logger(__name__) -import dragon.channels as dch - +try: + import dragon.channels as dch +except ImportError as exc: + if not "pytest" in sys.modules: + raise exc from None class DragonCommChannel(cch.CommChannelBase): """Passes messages by writing to a Dragon channel""" diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index f601bb2eb8..3992241380 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -24,11 +24,15 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-# isort: off -import dragon -from dragon import fli -import dragon.channels as dch +import sys +# isort: off +try: + from dragon import fli + import dragon.channels as dch +except ImportError as exc: + if not "pytest" in sys.modules: + raise exc from None # isort: on diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 607f94982d..6003869e46 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -168,7 +168,7 @@ class WorkerManager(Service): def __init__( self, - file_like_interface: fli.FLInterface, + file_like_interface: "fli.FLInterface", worker: MachineLearningWorkerBase, feature_store: t.Optional[FeatureStore] = None, as_service: bool = False, From d051385a963f2c18e55792b30316cd41eb2ca357 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 26 Jun 2024 10:28:23 -0500 Subject: [PATCH 10/84] isort --- smartsim/_core/mli/comm/channel/dragonchannel.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py index e79fd2dfcf..872eb32350 100644 --- a/smartsim/_core/mli/comm/channel/dragonchannel.py +++ b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -25,6 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import sys + import smartsim._core.mli.comm.channel.channel as cch from smartsim.log import get_logger @@ -36,6 +37,7 @@ if not "pytest" in sys.modules: raise exc from None + class DragonCommChannel(cch.CommChannelBase): """Passes messages by writing to a Dragon channel""" From e77b1cd5c9c8359aa7be27b2a3d61c398eaa7d04 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 26 Jun 2024 11:33:47 -0500 Subject: [PATCH 11/84] Fix imports in dragon backend tests --- smartsim/_core/launcher/dragon/dragonBackend.py | 10 ++++------ tests/test_dragon_backend.py | 10 ++++++++++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index d103579115..f0e450a19c 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -36,15 +36,14 @@ # pylint: disable=import-error # isort: off +import dragon.data.ddict.ddict as dragon_ddict import dragon.infrastructure.connection as dragon_connection import dragon.infrastructure.policy as dragon_policy -from dragon.infrastructure.process_desc import ProcessOptions -from dragon.data.ddict.ddict import DDict +import dragon.infrastructure.process_desc as dragon_process_desc import dragon.native.group_state as dragon_group_state import dragon.native.process as dragon_process import dragon.native.process_group as dragon_process_group import dragon.native.machine as dragon_machine -import multiprocessing as mp # pylint: enable=import-error # isort: on @@ -78,7 +77,6 @@ def __str__(self) -> str: return self.value -mp.set_start_method("dragon") @dataclass @@ -405,7 +403,7 @@ def infra_ddict(self) -> str: """ if self._infra_ddict is None: logger.info("Creating DDict") - self._infra_ddict = DDict() # todo: parametrize + self._infra_ddict = dragon_ddict.DDict() # todo: parametrize logger.info("Created DDict") self._infra_ddict["creation"] = str(time.time()) logger.info(self._infra_ddict["creation"]) @@ -427,7 +425,7 @@ def _start_steps(self) -> None: placement=dragon_policy.Policy.Placement.HOST_NAME, host_name=hosts[0], 
) - options = ProcessOptions(make_inf_channels=True) + options = dragon_process_desc.ProcessOptions(make_inf_channels=True) grp = dragon_process_group.ProcessGroup( restart=False, pmi_enabled=request.pmi_enabled, policy=global_policy ) diff --git a/tests/test_dragon_backend.py b/tests/test_dragon_backend.py index a510f660a5..f284f38d99 100644 --- a/tests/test_dragon_backend.py +++ b/tests/test_dragon_backend.py @@ -103,6 +103,16 @@ def get_mock_backend(monkeypatch: pytest.MonkeyPatch) -> "DragonBackend": "dragon.infrastructure.connection", MagicMock(), ) + monkeypatch.setitem( + sys.modules, + "dragon.infrastructure.process_desc", + MagicMock(), + ) + monkeypatch.setitem( + sys.modules, + "dragon.data.ddict.ddict", + MagicMock(), + ) monkeypatch.setitem( sys.modules, "dragon.infrastructure.policy", From a90888d44d3e9ef2207a97c6b0936418daf4d06c Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 26 Jun 2024 11:36:26 -0500 Subject: [PATCH 12/84] Style --- smartsim/_core/launcher/dragon/dragonBackend.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index f0e450a19c..d91f73e3c5 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -77,8 +77,6 @@ def __str__(self) -> str: return self.value - - @dataclass class ProcessGroupInfo: status: SmartSimStatus From b4312215184478186e837ab193cc609fb53f4698 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 26 Jun 2024 11:40:14 -0500 Subject: [PATCH 13/84] Fix type --- smartsim/_core/launcher/dragon/dragonBackend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index d91f73e3c5..52f69ec41f 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -189,7 +189,7 @@ def __init__(self, pid: int) -> None: self._view = DragonBackendView(self) logger.debug(self._view.host_desc) - self._infra_ddict: t.Optional[DDict] = None + self._infra_ddict: t.Optional[dragon_ddict.DDict] = None @property def hosts(self) -> list[str]: From 23efebc25027d908703e80e059a3c431d5f7d434 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 26 Jun 2024 12:38:55 -0500 Subject: [PATCH 14/84] Rename examples dir --- ex/high_throughput_inference/mli_driver.py | 38 ++++++ ex/high_throughput_inference/mock_app.py | 126 ++++++++++++++++++ .../standalone_workermanager.py | 44 ++++++ 3 files changed, 208 insertions(+) create mode 100644 ex/high_throughput_inference/mli_driver.py create mode 100644 ex/high_throughput_inference/mock_app.py create mode 100644 ex/high_throughput_inference/standalone_workermanager.py diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py new file mode 100644 index 0000000000..7b8db5ed83 --- /dev/null +++ b/ex/high_throughput_inference/mli_driver.py @@ -0,0 +1,38 @@ +import os +import sys +from smartsim import Experiment +from smartsim.status import TERMINAL_STATUSES +import time + +device = "cpu" +filedir = os.path.dirname(__file__) +worker_manager_script_name = os.path.join(filedir, "standalone_workermanager.py") +app_script_name = os.path.join(filedir, "mock_app.py") +model_name = os.path.join(filedir, f"resnet50.{device.upper()}.pt") + + +exp = Experiment("MLI_proto", launcher="dragon", exp_path=os.path.join(filedir, "MLI_proto")) + +worker_manager_rs = 
exp.create_run_settings(sys.executable, [worker_manager_script_name, "--device", device]) +worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) +worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) + + +app_rs = exp.create_run_settings(sys.executable, exe_args = [app_script_name, "--device", device]) +app = exp.create_model("app", run_settings=app_rs) +app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) + + +exp.generate(worker_manager, app, overwrite=True) +exp.start(worker_manager, app, block=False) + +while True: + if exp.get_status(app)[0] in TERMINAL_STATUSES: + exp.stop(worker_manager) + break + if exp.get_status(worker_manager)[0] in TERMINAL_STATUSES: + exp.stop(app) + break + time.sleep(5) + +print("Exiting.") \ No newline at end of file diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py new file mode 100644 index 0000000000..afc0c836b8 --- /dev/null +++ b/ex/high_throughput_inference/mock_app.py @@ -0,0 +1,126 @@ +# isort: off +import dragon +from dragon import fli +from dragon.channels import Channel +import dragon.channels +from dragon.data.ddict.ddict import DDict +from dragon.globalservices.api_setup import connect_to_infrastructure +from dragon.utils import b64decode, b64encode + +# isort: on + +import argparse +import io +import numpy +import os +import tabulate +import time +import torch +import typing as t + +from smartsim._core.mli.message_handler import MessageHandler + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser("Mock application") + parser.add_argument("--device", default="cpu") + + args = parser.parse_args() + + connect_to_infrastructure() + ddict_str = os.environ["SS_DRG_DDICT"] + + ddict = DDict.attach(ddict_str) + + to_worker_fli_str = None + + while to_worker_fli_str is None: + try: + to_worker_fli_str = ddict["to_worker_fli"] + except Exception as e: + time.sleep(1) + + to_worker_fli = fli.FLInterface.attach(b64decode(to_worker_fli_str)) + + batch_size = 32 + model = torch.jit.load(f"resnet50.{args.device.upper()}.pt") + buffer = io.BytesIO() + batch = torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) + scripted = torch.jit.trace(model, batch) + torch.jit.save(scripted, buffer) + + total_iterations = 10 + + headers=[ + "batch_size", + "build_tensor", + "build_request", + "serialize_request", + "send", + "receive", + "deserialize_response", + "deserialize_tensor", + ] + + print(",".join(headers)) + + for batch_size in [1, 8, 32, 64, 128]: + + timings = [] + for iteration_number in range(total_iterations + int(batch_size==1)): + + timings.append([batch_size]) + + batch = torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) + + start = time.perf_counter() + interm = start + built_tensor = MessageHandler.build_tensor( + batch.numpy(), "c", "float32", list(batch.shape) + ) + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + + from_worker_ch = Channel.make_process_local() + + request = MessageHandler.build_request( + reply_channel=from_worker_ch.serialize(), + model=buffer.getvalue(), + inputs=[built_tensor], + outputs=[], + output_descriptors=[], + custom_attributes=None, + ) + + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + request_bytes = MessageHandler.serialize_request(request) + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + with to_worker_fli.sendh(timeout=None) as to_sendh: + 
to_sendh.send_bytes(request_bytes) + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + + with from_worker_ch.recvh(timeout=None) as from_recvh: + resp = from_recvh.recv_bytes(timeout=None) + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + response = MessageHandler.deserialize_response(resp) + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + result = torch.from_numpy( + numpy.frombuffer( + response.result.data[0].blob, + dtype=str(response.result.data[0].tensorDescriptor.dataType), + ) + ) + + timings[-1].append(time.perf_counter() - interm) + interm = time.perf_counter() + + # duration = time.perf_counter() - start + # print(f"{duration:.3f} s") + + print(",".join(str(timing) for timing in timings[-1])) diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py new file mode 100644 index 0000000000..32d534f360 --- /dev/null +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -0,0 +1,44 @@ +# isort: off +import dragon +from dragon import fli +from dragon.channels import Channel +from dragon.data.ddict.ddict import DDict +from dragon.utils import b64decode, b64encode +from dragon.globalservices.api_setup import connect_to_infrastructure +# isort: on +import argparse +import os + + +from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel +from smartsim._core.mli.infrastructure.worker.worker import TorchWorker +from smartsim._core.mli.infrastructure.control.workermanager import ( + WorkerManager, +) + +if __name__ == "__main__": + parser = argparse.ArgumentParser("Worker Manager") + parser.add_argument("--device", default="gpu") + args = parser.parse_args() + connect_to_infrastructure() + ddict_str = os.environ["SS_DRG_DDICT"] + ddict = DDict.attach(ddict_str) + + to_worker_channel = Channel.make_process_local() + to_worker_manager_channel = Channel.make_process_local() + channels = [Channel.make_process_local() for _ in range(100)] + to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=to_worker_manager_channel, stream_channels=channels) + ddict["to_worker_fli"] = b64encode(to_worker_fli.serialize()) + + torch_worker = TorchWorker() + + worker_manager = WorkerManager( + file_like_interface=to_worker_fli, + worker=torch_worker, + feature_store=None, + as_service=True, + cooldown=10, + comm_channel_type=DragonCommChannel, + device = args.device, + ) + worker_manager.execute() From 09b9d249c5c2147a062f95356c943c4da8e534b9 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 26 Jun 2024 12:48:11 -0500 Subject: [PATCH 15/84] Remove old dir --- .../high_throughput_inference/mli_driver.py | 38 ------ .../high_throughput_inference/mock_app.py | 126 ------------------ .../standalone_workermanager.py | 44 ------ 3 files changed, 208 deletions(-) delete mode 100644 examples/high_throughput_inference/mli_driver.py delete mode 100644 examples/high_throughput_inference/mock_app.py delete mode 100644 examples/high_throughput_inference/standalone_workermanager.py diff --git a/examples/high_throughput_inference/mli_driver.py b/examples/high_throughput_inference/mli_driver.py deleted file mode 100644 index d32d88e51b..0000000000 --- a/examples/high_throughput_inference/mli_driver.py +++ /dev/null @@ -1,38 +0,0 @@ -import os -import sys -from smartsim import Experiment -from smartsim.status import TERMINAL_STATUSES -import time - -device = "gpu" -filedir = os.path.dirname(__file__) 
-worker_manager_script_name = os.path.join(filedir, "standalone_workermanager.py") -app_script_name = os.path.join(filedir, "mock_app.py") -model_name = os.path.join(filedir, f"resnet50.{device.upper()}.pt") - - -exp = Experiment("MLI_proto", launcher="dragon", exp_path=os.path.join(filedir, "MLI_proto")) - -worker_manager_rs = exp.create_run_settings(sys.executable, [worker_manager_script_name, "--device", device]) -worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) -worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) - - -app_rs = exp.create_run_settings(sys.executable, exe_args = [app_script_name, "--device", device]) -app = exp.create_model("app", run_settings=app_rs) -app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) - - -exp.generate(worker_manager, app, overwrite=True) -exp.start(worker_manager, app, block=False) - -while True: - if exp.get_status(app)[0] in TERMINAL_STATUSES: - exp.stop(worker_manager) - break - if exp.get_status(worker_manager)[0] in TERMINAL_STATUSES: - exp.stop(app) - break - time.sleep(5) - -print("Exiting.") \ No newline at end of file diff --git a/examples/high_throughput_inference/mock_app.py b/examples/high_throughput_inference/mock_app.py deleted file mode 100644 index afc0c836b8..0000000000 --- a/examples/high_throughput_inference/mock_app.py +++ /dev/null @@ -1,126 +0,0 @@ -# isort: off -import dragon -from dragon import fli -from dragon.channels import Channel -import dragon.channels -from dragon.data.ddict.ddict import DDict -from dragon.globalservices.api_setup import connect_to_infrastructure -from dragon.utils import b64decode, b64encode - -# isort: on - -import argparse -import io -import numpy -import os -import tabulate -import time -import torch -import typing as t - -from smartsim._core.mli.message_handler import MessageHandler - - -if __name__ == "__main__": - - parser = argparse.ArgumentParser("Mock application") - parser.add_argument("--device", default="cpu") - - args = parser.parse_args() - - connect_to_infrastructure() - ddict_str = os.environ["SS_DRG_DDICT"] - - ddict = DDict.attach(ddict_str) - - to_worker_fli_str = None - - while to_worker_fli_str is None: - try: - to_worker_fli_str = ddict["to_worker_fli"] - except Exception as e: - time.sleep(1) - - to_worker_fli = fli.FLInterface.attach(b64decode(to_worker_fli_str)) - - batch_size = 32 - model = torch.jit.load(f"resnet50.{args.device.upper()}.pt") - buffer = io.BytesIO() - batch = torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) - scripted = torch.jit.trace(model, batch) - torch.jit.save(scripted, buffer) - - total_iterations = 10 - - headers=[ - "batch_size", - "build_tensor", - "build_request", - "serialize_request", - "send", - "receive", - "deserialize_response", - "deserialize_tensor", - ] - - print(",".join(headers)) - - for batch_size in [1, 8, 32, 64, 128]: - - timings = [] - for iteration_number in range(total_iterations + int(batch_size==1)): - - timings.append([batch_size]) - - batch = torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) - - start = time.perf_counter() - interm = start - built_tensor = MessageHandler.build_tensor( - batch.numpy(), "c", "float32", list(batch.shape) - ) - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - - from_worker_ch = Channel.make_process_local() - - request = MessageHandler.build_request( - reply_channel=from_worker_ch.serialize(), - model=buffer.getvalue(), - inputs=[built_tensor], - outputs=[], - 
output_descriptors=[], - custom_attributes=None, - ) - - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - request_bytes = MessageHandler.serialize_request(request) - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - with to_worker_fli.sendh(timeout=None) as to_sendh: - to_sendh.send_bytes(request_bytes) - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - - with from_worker_ch.recvh(timeout=None) as from_recvh: - resp = from_recvh.recv_bytes(timeout=None) - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - response = MessageHandler.deserialize_response(resp) - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - result = torch.from_numpy( - numpy.frombuffer( - response.result.data[0].blob, - dtype=str(response.result.data[0].tensorDescriptor.dataType), - ) - ) - - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - - # duration = time.perf_counter() - start - # print(f"{duration:.3f} s") - - print(",".join(str(timing) for timing in timings[-1])) diff --git a/examples/high_throughput_inference/standalone_workermanager.py b/examples/high_throughput_inference/standalone_workermanager.py deleted file mode 100644 index 32d534f360..0000000000 --- a/examples/high_throughput_inference/standalone_workermanager.py +++ /dev/null @@ -1,44 +0,0 @@ -# isort: off -import dragon -from dragon import fli -from dragon.channels import Channel -from dragon.data.ddict.ddict import DDict -from dragon.utils import b64decode, b64encode -from dragon.globalservices.api_setup import connect_to_infrastructure -# isort: on -import argparse -import os - - -from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel -from smartsim._core.mli.infrastructure.worker.worker import TorchWorker -from smartsim._core.mli.infrastructure.control.workermanager import ( - WorkerManager, -) - -if __name__ == "__main__": - parser = argparse.ArgumentParser("Worker Manager") - parser.add_argument("--device", default="gpu") - args = parser.parse_args() - connect_to_infrastructure() - ddict_str = os.environ["SS_DRG_DDICT"] - ddict = DDict.attach(ddict_str) - - to_worker_channel = Channel.make_process_local() - to_worker_manager_channel = Channel.make_process_local() - channels = [Channel.make_process_local() for _ in range(100)] - to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=to_worker_manager_channel, stream_channels=channels) - ddict["to_worker_fli"] = b64encode(to_worker_fli.serialize()) - - torch_worker = TorchWorker() - - worker_manager = WorkerManager( - file_like_interface=to_worker_fli, - worker=torch_worker, - feature_store=None, - as_service=True, - cooldown=10, - comm_channel_type=DragonCommChannel, - device = args.device, - ) - worker_manager.execute() From 56d8e50f4f7e9fddb9e4d79ba0b1fe556e400684 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 26 Jun 2024 18:47:40 -0500 Subject: [PATCH 16/84] Add tests for torch worker --- ex/high_throughput_inference/mock_app.py | 5 +- .../standalone_workermanager.py | 2 +- .../mli/infrastructure/worker/torch_worker.py | 118 ++++++++++++ .../_core/mli/infrastructure/worker/worker.py | 91 +-------- tests/mli/test_torch_worker.py | 173 ++++++++++++++++++ tests/mli/test_worker_manager.py | 12 +- 6 files changed, 309 insertions(+), 92 deletions(-) create mode 100644 smartsim/_core/mli/infrastructure/worker/torch_worker.py create mode 100644 
tests/mli/test_torch_worker.py diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index afc0c836b8..d22792d15b 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -5,7 +5,7 @@ import dragon.channels from dragon.data.ddict.ddict import DDict from dragon.globalservices.api_setup import connect_to_infrastructure -from dragon.utils import b64decode, b64encode +from dragon.utils import b64decode # isort: on @@ -13,11 +13,8 @@ import io import numpy import os -import tabulate import time import torch -import typing as t - from smartsim._core.mli.message_handler import MessageHandler diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index 32d534f360..40fefcc372 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -11,7 +11,7 @@ from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel -from smartsim._core.mli.infrastructure.worker.worker import TorchWorker +from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim._core.mli.infrastructure.control.workermanager import ( WorkerManager, ) diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py new file mode 100644 index 0000000000..c350499c20 --- /dev/null +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -0,0 +1,118 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import io + +import numpy as np +import torch + +from .....error import SmartSimError +from .....log import get_logger +from ...mli_schemas.tensor import tensor_capnp +from .worker import ( + ExecuteResult, + FetchInputResult, + FetchModelResult, + InferenceRequest, + LoadModelResult, + MachineLearningWorkerBase, + TransformInputResult, + TransformOutputResult, +) + +logger = get_logger(__name__) + + +class TorchWorker(MachineLearningWorkerBase): + """A worker that executes a PyTorch model.""" + + @staticmethod + def load_model( + request: InferenceRequest, fetch_result: FetchModelResult, device: str + ) -> LoadModelResult: + model_bytes = fetch_result.model_bytes or request.raw_model + if not model_bytes: + raise ValueError("Unable to load model without reference object") + + _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} + device = _device_to_torch[device] + buffer = io.BytesIO(model_bytes) + model = torch.jit.load(buffer, map_location=device) # type: ignore + result = LoadModelResult(model) + return result + + @staticmethod + def transform_input( + request: InferenceRequest, fetch_result: FetchInputResult, device: str + ) -> TransformInputResult: + result = [] + + _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} + device = _device_to_torch[device] + if fetch_result.meta is None: + raise ValueError("Cannot reconstruct tensor without meta information") + for item, item_meta in zip(fetch_result.inputs, fetch_result.meta): + tensor_desc: tensor_capnp.TensorDescriptor = item_meta + result.append( + torch.tensor(np.frombuffer(item, dtype=str(tensor_desc.dataType))) + .to(device) + .reshape(tuple(dim for dim in tensor_desc.dimensions)) + ) + return TransformInputResult(result) + # return data # note: this fails copy test! + + @staticmethod + def execute( + request: InferenceRequest, + load_result: LoadModelResult, + transform_result: TransformInputResult, + ) -> ExecuteResult: + if not load_result.model: + raise SmartSimError("Model must be loaded to execute") + + model: torch.nn.Module = load_result.model + model.eval() + results = [model(tensor).detach() for tensor in transform_result.transformed] + + execute_result = ExecuteResult(results) + return execute_result + + @staticmethod + def transform_output( + request: InferenceRequest, + execute_result: ExecuteResult, + result_device: str, + ) -> TransformOutputResult: + if result_device != "cpu": + transformed = [ + item.to("cpu").clone() for item in execute_result.predictions + ] + # todo: need the shape from latest schemas added here. + return TransformOutputResult(transformed, None, "c", "float32") # fixme + + return TransformOutputResult( + execute_result.predictions, None, "c", "float32" + ) # fixme diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 08c4997554..24dc734d00 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -24,18 +24,13 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
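
TorchWorker above breaks inference into load, input transform, execute, and output transform stages that all operate on in-memory bytes. The standalone sketch below walks the same stages with plain torch and numpy on CPU; the tiny linear model, dtype, and shapes are invented for illustration, and only the technique mirrors the worker.

# Standalone sketch of the TorchWorker stages (illustrative model and shapes).
import io

import numpy as np
import torch

# "fetch_model": a TorchScript model serialized to raw bytes, e.g. pulled from a feature store.
scripted = torch.jit.script(torch.nn.Linear(4, 2))
buf = io.BytesIO()
torch.jit.save(scripted, buf)
model_bytes = buf.getvalue()

# "load_model": rebuild the module from bytes on the requested device.
device = "cpu"  # the worker maps {"cpu": "cpu", "gpu": "cuda"}
model = torch.jit.load(io.BytesIO(model_bytes), map_location=device)

# "transform_input": reconstruct a tensor from a raw blob plus dtype/shape metadata.
raw_blob = np.random.rand(3, 4).astype("float32").tobytes()
tensor = (
    torch.tensor(np.frombuffer(raw_blob, dtype="float32"))
    .to(device)
    .reshape((3, 4))
)

# "execute": run the scripted model and detach the result from the autograd graph.
model.eval()
prediction = model(tensor).detach()

# "transform_output": results are moved back to CPU before being serialized.
print(prediction.to("cpu").shape)  # torch.Size([3, 2])
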
-import io import typing as t from abc import ABC, abstractmethod -import numpy as np -import torch - -import smartsim.error as sse -from smartsim._core.mli.comm.channel.channel import CommChannelBase -from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore -from smartsim._core.mli.mli_schemas.tensor import tensor_capnp -from smartsim.log import get_logger +from .....error import SmartSimError +from .....log import get_logger +from ...comm.channel.channel import CommChannelBase +from ...infrastructure.storage.featurestore import FeatureStore logger = get_logger(__name__) @@ -167,7 +162,7 @@ def fetch_model( raise ValueError("Feature store is required for model retrieval") if not request.model_key: - raise sse.SmartSimError( + raise SmartSimError( "Key must be provided to retrieve model from feature store" ) @@ -176,7 +171,7 @@ def fetch_model( return FetchModelResult(raw_bytes) except FileNotFoundError as ex: logger.exception(ex) - raise sse.SmartSimError( + raise SmartSimError( f"Model could not be retrieved with key {request.model_key}" ) from ex @@ -204,7 +199,7 @@ def fetch_inputs( data.append(tensor_bytes) except KeyError as ex: logger.exception(ex) - raise sse.SmartSimError( + raise SmartSimError( f"Model could not be retrieved with key {input_}" ) from ex return FetchInputResult( @@ -303,75 +298,3 @@ def transform_output( :param execute_result: The result of inference wrapped in an ExecuteResult :param result_device: The device on which the result of inference is placed :return:""" - - -class TorchWorker(MachineLearningWorkerBase): - """A worker that executes a PyTorch model.""" - - @staticmethod - def load_model( - request: InferenceRequest, fetch_result: FetchModelResult, device: str - ) -> LoadModelResult: - model_bytes = fetch_result.model_bytes or request.raw_model - if not model_bytes: - raise ValueError("Unable to load model without reference object") - - _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} - device = _device_to_torch[device] - buffer = io.BytesIO(model_bytes) - model = torch.jit.load(buffer, map_location=device) # type: ignore - result = LoadModelResult(model) - return result - - @staticmethod - def transform_input( - request: InferenceRequest, fetch_result: FetchInputResult, device: str - ) -> TransformInputResult: - result = [] - - _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} - device = _device_to_torch[device] - if fetch_result.meta is None: - raise ValueError("Cannot reconstruct tensor without meta information") - for item, item_meta in zip(fetch_result.inputs, fetch_result.meta): - tensor_desc: tensor_capnp.TensorDescriptor = item_meta - result.append( - torch.tensor(np.frombuffer(item, dtype=str(tensor_desc.dataType))) - .to(device) - .reshape(tuple(dim for dim in tensor_desc.dimensions)) - ) - return TransformInputResult(result) - # return data # note: this fails copy test! 
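
The fetch steps above resolve model and input keys against a FeatureStore and convert missing keys into SmartSim errors. A dict-backed stand-in makes that lookup contract easy to exercise without Dragon; the class and error names below are placeholders, not the SmartSim implementations.

# Illustrative in-memory feature store mirroring the lookup contract used by fetch_model.
import typing as t


class InMemoryFeatureStore:
    """Minimal mapping with the __getitem__/__setitem__/__contains__ surface the workers rely on."""

    def __init__(self) -> None:
        self._storage: t.Dict[str, bytes] = {}

    def __getitem__(self, key: str) -> bytes:
        return self._storage[key]

    def __setitem__(self, key: str, value: bytes) -> None:
        self._storage[key] = value

    def __contains__(self, key: str) -> bool:
        return key in self._storage


class MissingResourceError(RuntimeError):
    """Stand-in for SmartSimError in this sketch."""


def fetch_model_bytes(store: InMemoryFeatureStore, model_key: str) -> bytes:
    # Wrap the raw KeyError so callers see a single, descriptive error type.
    try:
        return store[model_key]
    except KeyError as ex:
        raise MissingResourceError(
            f"Model could not be retrieved with key {model_key}"
        ) from ex


store = InMemoryFeatureStore()
store["model"] = b"\x00serialized-torchscript"
assert fetch_model_bytes(store, "model").startswith(b"\x00")
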
- - @staticmethod - def execute( - request: InferenceRequest, - load_result: LoadModelResult, - transform_result: TransformInputResult, - ) -> ExecuteResult: - if not load_result.model: - raise sse.SmartSimError("Model must be loaded to execute") - - model: torch.nn.Module = load_result.model - model.eval() - results = [model(tensor).detach() for tensor in transform_result.transformed] - - execute_result = ExecuteResult(results) - return execute_result - - @staticmethod - def transform_output( - request: InferenceRequest, - execute_result: ExecuteResult, - result_device: str, - ) -> TransformOutputResult: - if result_device != "cpu": - transformed = [ - item.to("cpu").clone() for item in execute_result.predictions - ] - # todo: need the shape from latest schemas added here. - return TransformOutputResult(transformed, None, "c", "float32") # fixme - - return TransformOutputResult( - execute_result.predictions, None, "c", "float32" - ) # fixme diff --git a/tests/mli/test_torch_worker.py b/tests/mli/test_torch_worker.py new file mode 100644 index 0000000000..0b1cd4ccf3 --- /dev/null +++ b/tests/mli/test_torch_worker.py @@ -0,0 +1,173 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import io + +import numpy as np +import pytest +import torch +from torch import nn +from torch.nn import functional as F + +from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker +from smartsim._core.mli.infrastructure.worker.worker import ( + ExecuteResult, + FetchInputResult, + FetchModelResult, + InferenceRequest, + LoadModelResult, + TransformInputResult, +) +from smartsim._core.mli.message_handler import MessageHandler +from smartsim.log import get_logger + +logger = get_logger(__name__) +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + + +# simple MNIST in PyTorch +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 32, 3, 1) + self.conv2 = nn.Conv2d(32, 64, 3, 1) + self.dropout1 = nn.Dropout(0.25) + self.dropout2 = nn.Dropout(0.5) + self.fc1 = nn.Linear(9216, 128) + self.fc2 = nn.Linear(128, 10) + + def forward(self, x): + x = self.conv1(x) + x = F.relu(x) + x = self.conv2(x) + x = F.relu(x) + x = F.max_pool2d(x, 2) + x = self.dropout1(x) + x = torch.flatten(x, 1) + x = self.fc1(x) + x = F.relu(x) + x = self.dropout2(x) + x = self.fc2(x) + output = F.log_softmax(x, dim=1) + return output + + +torch_device = {"cpu": "cpu", "gpu": "cuda"} + + +def get_batch() -> torch.Tensor: + return torch.rand(20, 1, 28, 28) + + +def create_torch_model(): + n = Net() + example_forward_input = get_batch() + module = torch.jit.trace(n, example_forward_input) + model_buffer = io.BytesIO() + torch.jit.save(module, model_buffer) + return model_buffer.getvalue() + + +def get_request() -> InferenceRequest: + + tensors = [get_batch() for _ in range(2)] + serialized_tensors = [ + MessageHandler.build_tensor(tensor.numpy(), "c", "float32", list(tensor.shape)) + for tensor in tensors + ] + + return InferenceRequest( + model_key="model", + callback=None, + raw_inputs=[s_tensor.blob for s_tensor in serialized_tensors], + input_keys=None, + input_meta=[s_tensor.tensorDescriptor for s_tensor in serialized_tensors], + output_keys=None, + raw_model=create_torch_model(), + batch_size=0, + ) + + +sample_request: InferenceRequest = get_request() +worker = TorchWorker() + + +def test_load_model(mlutils) -> None: + fetch_model_result = FetchModelResult(sample_request.raw_model) + load_model_result = worker.load_model( + sample_request, fetch_model_result, mlutils.get_test_device().lower() + ) + + assert load_model_result.model( + get_batch().to(torch_device[mlutils.get_test_device().lower()]) + ).shape == torch.Size((20, 10)) + + +def test_transform_input(mlutils) -> None: + fetch_input_result = FetchInputResult( + sample_request.raw_inputs, sample_request.input_meta + ) + + transform_input_result = worker.transform_input( + sample_request, fetch_input_result, mlutils.get_test_device().lower() + ) + + assert all( + transformed.shape == get_batch().shape + for transformed in transform_input_result.transformed + ) + + +def test_execute(mlutils) -> None: + load_model_result = LoadModelResult( + Net().to(torch_device[mlutils.get_test_device().lower()]) + ) + transform_result = TransformInputResult( + [ + get_batch().to(torch_device[mlutils.get_test_device().lower()]) + for _ in range(2) + ] + ) + + execute_result = worker.execute(sample_request, load_model_result, transform_result) + + assert all( + result.shape == torch.Size((20, 10)) for result in execute_result.predictions + ) + + +def test_transform_output(mlutils): + execute_result = ExecuteResult([torch.rand((20, 10)) for _ in range(2)]) + + 
transformed_output = worker.transform_output( + sample_request, execute_result, torch_device[mlutils.get_test_device().lower()] + ) + + assert transformed_output.outputs == execute_result.predictions + assert transformed_output.shape == None + assert transformed_output.order == "c" + assert transformed_output.dtype == "float32" diff --git a/tests/mli/test_worker_manager.py b/tests/mli/test_worker_manager.py index 01502ec521..46cae5b2e4 100644 --- a/tests/mli/test_worker_manager.py +++ b/tests/mli/test_worker_manager.py @@ -29,10 +29,14 @@ import multiprocessing as mp import pathlib import time -import typing as t import pytest -import torch + +should_run = True +try: + import torch +except ImportError: + should_run = False from smartsim._core.mli.infrastructure.control.workermanager import WorkerManager from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore @@ -44,9 +48,11 @@ from .worker import IntegratedTorchWorker logger = get_logger(__name__) -# The tests in this file belong to the group_b group +# The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a +pytest.mark.skipif(not should_run, "Test needs PyTorch to run") + def mock_work(worker_manager_queue: "mp.Queue[bytes]") -> None: """Mock event producer for triggering the inference pipeline""" From 6cec83ea4697761b3d297cc8fd50cd44a568af64 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 27 Jun 2024 08:14:24 -0500 Subject: [PATCH 17/84] Switch to sender-supplied channels in app example --- ex/high_throughput_inference/mock_app.py | 6 ++++-- ex/high_throughput_inference/standalone_workermanager.py | 3 +-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index d22792d15b..8a00e8f0e4 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -62,6 +62,9 @@ print(",".join(headers)) + from_worker_ch = Channel.make_process_local() + to_worker_ch = Channel.make_process_local() + for batch_size in [1, 8, 32, 64, 128]: timings = [] @@ -79,7 +82,6 @@ timings[-1].append(time.perf_counter() - interm) interm = time.perf_counter() - from_worker_ch = Channel.make_process_local() request = MessageHandler.build_request( reply_channel=from_worker_ch.serialize(), @@ -95,7 +97,7 @@ request_bytes = MessageHandler.serialize_request(request) timings[-1].append(time.perf_counter() - interm) interm = time.perf_counter() - with to_worker_fli.sendh(timeout=None) as to_sendh: + with to_worker_fli.sendh(timeout=None, stream_channel=to_worker_ch) as to_sendh: to_sendh.send_bytes(request_bytes) timings[-1].append(time.perf_counter() - interm) interm = time.perf_counter() diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index 40fefcc372..cdc97f4c2e 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -26,8 +26,7 @@ to_worker_channel = Channel.make_process_local() to_worker_manager_channel = Channel.make_process_local() - channels = [Channel.make_process_local() for _ in range(100)] - to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=to_worker_manager_channel, stream_channels=channels) + to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=to_worker_manager_channel) ddict["to_worker_fli"] = b64encode(to_worker_fli.serialize()) torch_worker = TorchWorker() From 
3ad6d445662a611539b40cb72fcba1a0b4ea102f Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 27 Jun 2024 16:55:59 -0500 Subject: [PATCH 18/84] Add prototype client for mock app --- ex/high_throughput_inference/mli_driver.py | 2 +- ex/high_throughput_inference/mock_app.py | 206 ++++++++++++--------- 2 files changed, 116 insertions(+), 92 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index 7b8db5ed83..d32d88e51b 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -4,7 +4,7 @@ from smartsim.status import TERMINAL_STATUSES import time -device = "cpu" +device = "gpu" filedir = os.path.dirname(__file__) worker_manager_script_name = os.path.join(filedir, "standalone_workermanager.py") app_script_name = os.path.join(filedir, "mock_app.py") diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 8a00e8f0e4..aa3aaeb3ee 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -15,111 +15,135 @@ import os import time import torch +import numbers + +from collections import OrderedDict from smartsim._core.mli.message_handler import MessageHandler +class ProtoClient: + def __init__(self, timing_on: bool): + connect_to_infrastructure() + ddict_str = os.environ["SS_DRG_DDICT"] + self._ddict = DDict.attach(ddict_str) + to_worker_fli_str = None + while to_worker_fli_str is None: + try: + to_worker_fli_str = self._ddict["to_worker_fli"] + self._to_worker_fli = fli.FLInterface.attach(b64decode(to_worker_fli_str)) + except KeyError: + time.sleep(1) + self._from_worker_ch = Channel.make_process_local() + self._from_worker_ch_serialized = self._from_worker_ch.serialize() + self._to_worker_ch = Channel.make_process_local() + + self._start = None + self._interm = None + self._timings: OrderedDict[str, list[numbers.Number]] = OrderedDict() + self._timing_on = timing_on + + def _add_label_to_timings(self, label: str): + if label not in self._timings: + self._timings[label] = [] + + @staticmethod + def _format_number(number: numbers.Number): + return f"{number:0.4e}" + + def start_timings(self, batch_size: int): + if self._timing_on: + self._add_label_to_timings("batch_size") + self._timings["batch_size"].append(batch_size) + self._start = time.perf_counter() + self._interm = time.perf_counter() + + def end_timings(self): + if self._timing_on: + self._add_label_to_timings("total_time") + self._timings["total_time"].append(self._format_number(time.perf_counter()-self._start)) + + def measure_time(self, label: str): + if self._timing_on: + self._add_label_to_timings(label) + self._timings[label].append(self._format_number(time.perf_counter()-self._interm)) + self._interm = time.perf_counter() + + def print_timings(self, to_file: bool = False): + print(" ".join(self._timings.keys())) + value_array = numpy.array([value for value in self._timings.values()], dtype=float) + value_array = numpy.transpose(value_array) + for i in range(value_array.shape[0]): + print(" ".join(self._format_number(value) for value in value_array[i])) + if to_file: + numpy.save("timings.npy", value_array) + numpy.savetxt("timings.txt", value_array) + + + def run_model(self, model: bytes, batch: torch.Tensor): + self.start_timings(batch.shape[0]) + built_tensor = MessageHandler.build_tensor( + batch.numpy(), "c", "float32", list(batch.shape)) + self.measure_time("build_tensor") + request = MessageHandler.build_request( + 
reply_channel=self._from_worker_ch_serialized, + model=model, + inputs=[built_tensor], + outputs=[], + output_descriptors=[], + custom_attributes=None, + ) + self.measure_time("build_request") + request_bytes = MessageHandler.serialize_request(request) + self.measure_time("serialize_request") + with self._to_worker_fli.sendh(timeout=None, stream_channel=self._to_worker_ch) as to_sendh: + to_sendh.send_bytes(request_bytes) + + self.measure_time("send") + with self._from_worker_ch.recvh(timeout=None) as from_recvh: + resp = from_recvh.recv_bytes(timeout=None) + self.measure_time("receive") + response = MessageHandler.deserialize_response(resp) + self.measure_time("deserialize_response") + result = torch.from_numpy( + numpy.frombuffer( + response.result.data[0].blob, + dtype=str(response.result.data[0].tensorDescriptor.dataType), + ) + ) + self.measure_time("deserialize_tensor") -if __name__ == "__main__": + self.end_timings() + return result - parser = argparse.ArgumentParser("Mock application") - parser.add_argument("--device", default="cpu") - args = parser.parse_args() +class ResNetWrapper(): + def __init__(self, model: str): + self._model = torch.jit.load(model) + buffer = io.BytesIO() + scripted = torch.jit.trace(self._model, self.get_batch()) + torch.jit.save(scripted, buffer) + self._serialized_model = buffer.getvalue() - connect_to_infrastructure() - ddict_str = os.environ["SS_DRG_DDICT"] + def get_batch(self, batch_size: int=32): + return torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) - ddict = DDict.attach(ddict_str) + @property + def model(self): + return self._serialized_model - to_worker_fli_str = None +if __name__ == "__main__": - while to_worker_fli_str is None: - try: - to_worker_fli_str = ddict["to_worker_fli"] - except Exception as e: - time.sleep(1) + parser = argparse.ArgumentParser("Mock application") + parser.add_argument("--device", default="cpu") + args = parser.parse_args() - to_worker_fli = fli.FLInterface.attach(b64decode(to_worker_fli_str)) + resnet = ResNetWrapper(f"resnet50.{args.device.upper()}.pt") - batch_size = 32 - model = torch.jit.load(f"resnet50.{args.device.upper()}.pt") - buffer = io.BytesIO() - batch = torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) - scripted = torch.jit.trace(model, batch) - torch.jit.save(scripted, buffer) + client = ProtoClient(timing_on=True) total_iterations = 10 - headers=[ - "batch_size", - "build_tensor", - "build_request", - "serialize_request", - "send", - "receive", - "deserialize_response", - "deserialize_tensor", - ] - - print(",".join(headers)) - - from_worker_ch = Channel.make_process_local() - to_worker_ch = Channel.make_process_local() - for batch_size in [1, 8, 32, 64, 128]: - - timings = [] for iteration_number in range(total_iterations + int(batch_size==1)): + client.run_model(resnet.model, resnet.get_batch(batch_size)) - timings.append([batch_size]) - - batch = torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) - - start = time.perf_counter() - interm = start - built_tensor = MessageHandler.build_tensor( - batch.numpy(), "c", "float32", list(batch.shape) - ) - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - - - request = MessageHandler.build_request( - reply_channel=from_worker_ch.serialize(), - model=buffer.getvalue(), - inputs=[built_tensor], - outputs=[], - output_descriptors=[], - custom_attributes=None, - ) - - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - request_bytes = 
MessageHandler.serialize_request(request) - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - with to_worker_fli.sendh(timeout=None, stream_channel=to_worker_ch) as to_sendh: - to_sendh.send_bytes(request_bytes) - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - - with from_worker_ch.recvh(timeout=None) as from_recvh: - resp = from_recvh.recv_bytes(timeout=None) - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - response = MessageHandler.deserialize_response(resp) - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - result = torch.from_numpy( - numpy.frombuffer( - response.result.data[0].blob, - dtype=str(response.result.data[0].tensorDescriptor.dataType), - ) - ) - - timings[-1].append(time.perf_counter() - interm) - interm = time.perf_counter() - - # duration = time.perf_counter() - start - # print(f"{duration:.3f} s") - - print(",".join(str(timing) for timing in timings[-1])) + client.print_timings(to_file=True) \ No newline at end of file From bd5f13357b181ee07e2df880b519d8464c8af174 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 28 Jun 2024 14:55:18 -0500 Subject: [PATCH 19/84] Update mock app --- ex/high_throughput_inference/mli_driver.py | 5 +++-- ex/high_throughput_inference/mock_app.py | 9 +++++++-- ex/high_throughput_inference/standalone_workermanager.py | 3 +-- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index d32d88e51b..9b899f4124 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -10,8 +10,9 @@ app_script_name = os.path.join(filedir, "mock_app.py") model_name = os.path.join(filedir, f"resnet50.{device.upper()}.pt") - -exp = Experiment("MLI_proto", launcher="dragon", exp_path=os.path.join(filedir, "MLI_proto")) +exp_path = os.path.join(filedir, "MLI_proto") +os.makedirs(exp_path, exist_ok=True) +exp = Experiment("MLI_proto", launcher="dragon", exp_path=exp_path) worker_manager_rs = exp.create_run_settings(sys.executable, [worker_manager_script_name, "--device", device]) worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index aa3aaeb3ee..666d7fcc91 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -19,6 +19,9 @@ from collections import OrderedDict from smartsim._core.mli.message_handler import MessageHandler +from smartsim.log import get_logger + +logger = get_logger("App") class ProtoClient: def __init__(self, timing_on: bool): @@ -140,10 +143,12 @@ def model(self): client = ProtoClient(timing_on=True) - total_iterations = 10 + total_iterations = 100 - for batch_size in [1, 8, 32, 64, 128]: + for batch_size in [1, 2, 4, 8, 16, 32, 64, 128]: + logger.info(f"Batch size: {batch_size}") for iteration_number in range(total_iterations + int(batch_size==1)): + logger.info(f"Iteration: {iteration_number}") client.run_model(resnet.model, resnet.get_batch(batch_size)) client.print_timings(to_file=True) \ No newline at end of file diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index cdc97f4c2e..ccefcbf584 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ 
b/ex/high_throughput_inference/standalone_workermanager.py @@ -25,8 +25,7 @@ ddict = DDict.attach(ddict_str) to_worker_channel = Channel.make_process_local() - to_worker_manager_channel = Channel.make_process_local() - to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=to_worker_manager_channel) + to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) ddict["to_worker_fli"] = b64encode(to_worker_fli.serialize()) torch_worker = TorchWorker() From 3e343ee5dff7d85646a39db1b56123efa575f387 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 4 Jul 2024 05:40:59 -0500 Subject: [PATCH 20/84] Changes to feature store --- smartsim/_core/launcher/dragon/dragonBackend.py | 2 +- .../infrastructure/storage/dragonfeaturestore.py | 12 ++++-------- .../mli/infrastructure/worker/torch_worker.py | 2 +- smartsim/_core/mli/infrastructure/worker/worker.py | 14 +++++++++++++- smartsim/_core/mli/message_handler.py | 4 +++- 5 files changed, 22 insertions(+), 12 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 52f69ec41f..856de38030 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -401,7 +401,7 @@ def infra_ddict(self) -> str: """ if self._infra_ddict is None: logger.info("Creating DDict") - self._infra_ddict = dragon_ddict.DDict() # todo: parametrize + self._infra_ddict = dragon_ddict.DDict(n_nodes=len(self._hosts), total_mem=len(self._hosts)*1024**3) # todo: parametrize logger.info("Created DDict") self._infra_ddict["creation"] = str(time.time()) logger.info(self._infra_ddict["creation"]) diff --git a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py index ea8f06977d..53f2f461f8 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py +++ b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py @@ -47,24 +47,20 @@ def __init__(self, storage: "DragonDict") -> None: def __getitem__(self, key: str) -> t.Any: """Retrieve an item using key :param key: Unique key of an item to retrieve from the feature store""" - key_ = key.encode("utf-8") try: - return self._storage[key_] + return self._storage[key] except Exception as ex: # note: explicitly avoid round-trip to check for key existence raise sse.SmartSimError(f"{key} not found in feature store") from ex - def __setitem__(self, key: str, value: bytes) -> None: + def __setitem__(self, key: str, value: str) -> None: """Assign a value using key :param key: Unique key of an item to set in the feature store :param value: Value to persist in the feature store""" - key_ = key.encode("utf-8") - self._storage[key_] = value + self._storage[key] = value - def __contains__(self, key: t.Union[str, bytes]) -> bool: + def __contains__(self, key: t.Union[str]) -> bool: """Membership operator to test for a key existing within the feature store. 
Return `True` if the key is found, `False` otherwise :param key: Unique key of an item to retrieve from the feature store""" - if isinstance(key, str): - key = key.encode("utf-8") return key in self._storage diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index c350499c20..122b9ddf2f 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -108,7 +108,7 @@ def transform_output( ) -> TransformOutputResult: if result_device != "cpu": transformed = [ - item.to("cpu").clone() for item in execute_result.predictions + item.to("cpu") for item in execute_result.predictions ] # todo: need the shape from latest schemas added here. return TransformOutputResult(transformed, None, "c", "float32") # fixme diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 24dc734d00..40696ac22f 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -32,6 +32,18 @@ from ...comm.channel.channel import CommChannelBase from ...infrastructure.storage.featurestore import FeatureStore +import sys + +# isort: off +try: + import dragon + from dragon.utils import b64decode +except ImportError as exc: + if not "pytest" in sys.modules: + raise exc from None + +# isort: on + logger = get_logger(__name__) @@ -167,7 +179,7 @@ def fetch_model( ) try: - raw_bytes = feature_store[request.model_key] + raw_bytes = b64decode(feature_store[request.model_key]) return FetchModelResult(raw_bytes) except FileNotFoundError as ex: logger.exception(ex) diff --git a/smartsim/_core/mli/message_handler.py b/smartsim/_core/mli/message_handler.py index fd8f6aeed7..1928db2f7c 100644 --- a/smartsim/_core/mli/message_handler.py +++ b/smartsim/_core/mli/message_handler.py @@ -200,7 +200,9 @@ def _assign_model( if isinstance(model, bytes): request.model.modelData = model else: - request.model.modelKey = model # type: ignore + model_key = data_references_capnp.ModelKey() + model_key.key = model + request.model.modelKey = model_key # type: ignore except Exception as e: raise ValueError("Error building model portion of request.") from e From a2bed267d8dbc1af109cad6708557afb11687d0a Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 5 Jul 2024 17:45:32 +0200 Subject: [PATCH 21/84] Make style --- smartsim/_core/launcher/dragon/dragonBackend.py | 4 +++- smartsim/_core/mli/infrastructure/worker/torch_worker.py | 4 +--- smartsim/_core/mli/infrastructure/worker/worker.py | 3 +-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 856de38030..dcc5c8392b 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -401,7 +401,9 @@ def infra_ddict(self) -> str: """ if self._infra_ddict is None: logger.info("Creating DDict") - self._infra_ddict = dragon_ddict.DDict(n_nodes=len(self._hosts), total_mem=len(self._hosts)*1024**3) # todo: parametrize + self._infra_ddict = dragon_ddict.DDict( + n_nodes=len(self._hosts), total_mem=len(self._hosts) * 1024**3 + ) # todo: parametrize logger.info("Created DDict") self._infra_ddict["creation"] = str(time.time()) logger.info(self._infra_ddict["creation"]) diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py 
b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index 122b9ddf2f..28237dc422 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -107,9 +107,7 @@ def transform_output( result_device: str, ) -> TransformOutputResult: if result_device != "cpu": - transformed = [ - item.to("cpu") for item in execute_result.predictions - ] + transformed = [item.to("cpu") for item in execute_result.predictions] # todo: need the shape from latest schemas added here. return TransformOutputResult(transformed, None, "c", "float32") # fixme diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 73eff4e8ea..e368935a0d 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -24,6 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import sys import typing as t from abc import ABC, abstractmethod @@ -33,8 +34,6 @@ from ...infrastructure.storage.featurestore import FeatureStore from ...mli_schemas.model.model_capnp import Model -import sys - # isort: off try: import dragon From 36e92d9dabcdd013cdba637a2629e19c15896cb5 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 5 Jul 2024 18:07:31 +0200 Subject: [PATCH 22/84] Fix typing --- .../mli/infrastructure/storage/featurestore.py | 2 +- .../_core/mli/infrastructure/worker/torch_worker.py | 13 ++++++++----- smartsim/_core/mli/infrastructure/worker/worker.py | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/storage/featurestore.py b/smartsim/_core/mli/infrastructure/storage/featurestore.py index ec4086b732..e18643e932 100644 --- a/smartsim/_core/mli/infrastructure/storage/featurestore.py +++ b/smartsim/_core/mli/infrastructure/storage/featurestore.py @@ -37,7 +37,7 @@ def __getitem__(self, key: str) -> bytes: :param key: Unique key of an item to retrieve from the feature store""" @abstractmethod - def __setitem__(self, key: str, value: bytes) -> None: + def __setitem__(self, key: str, value: str) -> None: """Assign a value using key :param key: Unique key of an item to set in the feature store :param value: Value to persist in the feature store""" diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index 28237dc422..e21513648b 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -53,13 +53,16 @@ class TorchWorker(MachineLearningWorkerBase): def load_model( request: InferenceRequest, fetch_result: FetchModelResult, device: str ) -> LoadModelResult: - model_bytes = fetch_result.model_bytes or request.raw_model - if not model_bytes: + if fetch_result.model_bytes: + model_bytes = fetch_result.model_bytes + elif request.raw_model and request.raw_model.data: + model_bytes = request.raw_model.data + else: raise ValueError("Unable to load model without reference object") - _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} - device = _device_to_torch[device] - buffer = io.BytesIO(model_bytes) + device_to_torch = {"cpu": "cpu", "gpu": "cuda"} + device = device_to_torch[device] + buffer = io.BytesIO(initial_bytes=model_bytes) model = torch.jit.load(buffer, map_location=device) # type: ignore result = LoadModelResult(model) return result diff 
--git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index e368935a0d..fb061348ee 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -148,7 +148,7 @@ class FetchModelResult: def __init__(self, result: bytes) -> None: """Initialize the object""" - self.model_bytes = result + self.model_bytes: bytes = result class MachineLearningWorkerCore: From 59840a3be12576eedce2528d93a8b601a768973e Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 5 Jul 2024 18:17:18 +0200 Subject: [PATCH 23/84] Fix lint --- smartsim/_core/mli/infrastructure/worker/torch_worker.py | 4 ++-- smartsim/_core/mli/infrastructure/worker/worker.py | 4 +--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index e21513648b..a4e725ab99 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -73,8 +73,8 @@ def transform_input( ) -> TransformInputResult: result = [] - _device_to_torch = {"cpu": "cpu", "gpu": "cuda"} - device = _device_to_torch[device] + device_to_torch = {"cpu": "cpu", "gpu": "cuda"} + device = device_to_torch[device] if fetch_result.meta is None: raise ValueError("Cannot reconstruct tensor without meta information") for item, item_meta in zip(fetch_result.inputs, fetch_result.meta): diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index fb061348ee..fe82ea2a3e 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -36,12 +36,10 @@ # isort: off try: - import dragon from dragon.utils import b64decode except ImportError as exc: - if not "pytest" in sys.modules: + if "pytest" not in sys.modules: raise exc from None - # isort: on logger = get_logger(__name__) From b35b37dd89bf6f7fd7a93c339e79643046d48abe Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 5 Jul 2024 18:32:00 +0200 Subject: [PATCH 24/84] Remove duplicated/useless comments --- smartsim/_core/mli/infrastructure/control/workermanager.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 4e276d2507..f0cae497a0 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -240,7 +240,6 @@ def _on_iteration(self) -> None: logger.warning("No queue to check for tasks") return - # perform default deserialization of the message envelope # perform default deserialization of the message envelope with self._task_queue.recvh(timeout=None) as recvh: try: @@ -254,9 +253,6 @@ def _on_iteration(self) -> None: if not self._validate_request(request): return - # # let the worker perform additional custom deserialization - # request = self._worker.deserialize(request_bytes) - fetch_model_result = self._worker.fetch_model(request, self._feature_store) model_result = self._worker.load_model( request, fetch_model_result, self._device @@ -294,7 +290,6 @@ def _on_iteration(self) -> None: response = build_reply(reply) - # serialized = self._worker.serialize_reply(request, transformed_output) serialized_resp = MessageHandler.serialize_response(response) # type: ignore if request.callback: 
request.callback.send(serialized_resp) From 51e0b17bdbf22683759597ece523778b6d7bd953 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 9 Jul 2024 12:37:22 -0500 Subject: [PATCH 25/84] Bring up to date with new schema --- ex/high_throughput_inference/mli_driver.py | 9 ++- ex/high_throughput_inference/mock_app.py | 30 +++++++++- .../standalone_workermanager.py | 57 +++++++++++++++++-- smartsim/_core/entrypoints/service.py | 3 +- smartsim/_core/mli/comm/channel/channel.py | 7 ++- .../_core/mli/comm/channel/dragonchannel.py | 6 ++ smartsim/_core/mli/comm/channel/dragonfli.py | 29 ++++++---- .../infrastructure/control/workermanager.py | 20 ++----- .../_core/mli/infrastructure/worker/worker.py | 11 ++-- 9 files changed, 128 insertions(+), 44 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index 9b899f4124..4a3dd034e8 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -1,6 +1,11 @@ + + import os +import base64 +import cloudpickle import sys from smartsim import Experiment +from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim.status import TERMINAL_STATUSES import time @@ -14,7 +19,9 @@ os.makedirs(exp_path, exist_ok=True) exp = Experiment("MLI_proto", launcher="dragon", exp_path=exp_path) -worker_manager_rs = exp.create_run_settings(sys.executable, [worker_manager_script_name, "--device", device]) +torch_worker_str = base64.b64encode(cloudpickle.dumps(TorchWorker)).decode("ascii") + +worker_manager_rs = exp.create_run_settings(sys.executable, [worker_manager_script_name, "--device", device, "--worker_class", torch_worker_str]) worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 666d7fcc91..df0ba55c76 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -1,3 +1,29 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
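
The driver change above hands the worker class itself to the standalone worker manager as a command-line argument by cloudpickling the class object and base64-encoding the bytes so they survive as ASCII text. The round-trip sketch below uses a placeholder DemoWorker class rather than TorchWorker.

# Round-trip sketch of the "--worker_class" handoff: class -> cloudpickle -> base64 -> text -> class.
import base64

import cloudpickle


class DemoWorker:  # placeholder for TorchWorker
    def greet(self) -> str:
        return "ready"


# Driver side: serialize the class object and make it argv-safe.
worker_arg = base64.b64encode(cloudpickle.dumps(DemoWorker)).decode("ascii")

# Worker-manager side: decode the argument and instantiate the class it carries.
worker_cls = cloudpickle.loads(base64.b64decode(worker_arg.encode("ascii")))
worker = worker_cls()
print(worker.greet())  # "ready"
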
+ # isort: off import dragon from dragon import fli @@ -32,7 +58,7 @@ def __init__(self, timing_on: bool): while to_worker_fli_str is None: try: to_worker_fli_str = self._ddict["to_worker_fli"] - self._to_worker_fli = fli.FLInterface.attach(b64decode(to_worker_fli_str)) + self._to_worker_fli = fli.FLInterface.attach(to_worker_fli_str) except KeyError: time.sleep(1) self._from_worker_ch = Channel.make_process_local() @@ -88,7 +114,7 @@ def run_model(self, model: bytes, batch: torch.Tensor): self.measure_time("build_tensor") request = MessageHandler.build_request( reply_channel=self._from_worker_ch_serialized, - model=model, + model=MessageHandler.build_model(model, "resnet-50", "1.0"), inputs=[built_tensor], outputs=[], output_descriptors=[], diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index ccefcbf584..991e869581 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -1,3 +1,29 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ # isort: off import dragon from dragon import fli @@ -7,10 +33,12 @@ from dragon.globalservices.api_setup import connect_to_infrastructure # isort: on import argparse +import base64 +import cloudpickle import os - from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim._core.mli.infrastructure.control.workermanager import ( WorkerManager, @@ -18,7 +46,23 @@ if __name__ == "__main__": parser = argparse.ArgumentParser("Worker Manager") - parser.add_argument("--device", default="gpu") + parser.add_argument( + "--device", + type=str, + default="gpu", + choices="gpu cpu".split(), + help="Device on which the inference takes place", + ) + parser.add_argument( + "--worker_class", + type=str, + required=True, + help="Serialized class of worker to run", + ) + parser.add_argument( + "--num_workers", type=int, default=1, help="Number of workers to run" + ) + args = parser.parse_args() connect_to_infrastructure() ddict_str = os.environ["SS_DRG_DDICT"] @@ -26,12 +70,13 @@ to_worker_channel = Channel.make_process_local() to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) - ddict["to_worker_fli"] = b64encode(to_worker_fli.serialize()) - - torch_worker = TorchWorker() + to_worker_fli_serialized = to_worker_fli.serialize() + ddict["to_worker_fli"] = to_worker_fli_serialized + torch_worker = cloudpickle.loads(base64.b64decode(args.worker_class.encode('ascii')))() + comm_channel = DragonFLIChannel(to_worker_fli_serialized) worker_manager = WorkerManager( - file_like_interface=to_worker_fli, + task_queue=comm_channel, worker=torch_worker, feature_store=None, as_service=True, diff --git a/smartsim/_core/entrypoints/service.py b/smartsim/_core/entrypoints/service.py index e03df6bea1..6b4ef74b67 100644 --- a/smartsim/_core/entrypoints/service.py +++ b/smartsim/_core/entrypoints/service.py @@ -46,7 +46,8 @@ def __init__( :param as_service: Determines if the host will run until shutdown criteria are met or as a run-once instance :param cooldown: Period of time to allow service to run before automatic - shutdown, in seconds. A non-zero, positive integer.""" + shutdown, in seconds. A non-zero, positive integer. 
+ :param loop_delay: delay between iterations of the event loop""" self._as_service = as_service """If the service should run until shutdown function returns True""" self._cooldown = abs(cooldown) diff --git a/smartsim/_core/mli/comm/channel/channel.py b/smartsim/_core/mli/comm/channel/channel.py index 201ab9deab..2318896a9b 100644 --- a/smartsim/_core/mli/comm/channel/channel.py +++ b/smartsim/_core/mli/comm/channel/channel.py @@ -41,9 +41,14 @@ def __init__(self, descriptor: t.Union[str, bytes]) -> None: @abstractmethod def send(self, value: bytes) -> None: - """Send a message throuh the underlying communication channel + """Send a message through the underlying communication channel :param value: The value to send""" + @abstractmethod + def recv(self) -> bytes: + """Receieve a message through the underlying communication channel + :returns: the received message""" + @property def descriptor(self) -> bytes: """Return the channel descriptor for the underlying dragon channel""" diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py index 872eb32350..fb1a0c51c1 100644 --- a/smartsim/_core/mli/comm/channel/dragonchannel.py +++ b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -51,3 +51,9 @@ def send(self, value: bytes) -> None: :param value: The value to send""" with self._channel.sendh(timeout=None) as sendh: sendh.send_bytes(value) + + def recv(self) -> bytes: + """Receieve a message through the underlying communication channel + :returns: the received message""" + with self._channel.recvh(timeout=None) as recvh: + return recvh.recv_bytes(timeout=None) \ No newline at end of file diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index 3992241380..ebf824b7db 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -24,18 +24,12 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import sys - # isort: off -try: - from dragon import fli - import dragon.channels as dch -except ImportError as exc: - if not "pytest" in sys.modules: - raise exc from None +from dragon import fli +import dragon.channels as dch # isort: on - +import sys import smartsim._core.mli.comm.channel.channel as cch from smartsim.log import get_logger @@ -45,14 +39,25 @@ class DragonFLIChannel(cch.CommChannelBase): """Passes messages by writing to a Dragon FLI Channel""" - def __init__(self, fli_desc: bytes) -> None: + def __init__(self, fli_desc: str) -> None: """Initialize the DragonFLIChannel instance""" super().__init__(fli_desc) # todo: do we need memory pool information to construct the channel correctly? 
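With recv() added to CommChannelBase alongside send(), both sides of a channel can be exercised without a live Dragon runtime. A possible in-memory stand-in for unit tests, not part of this patch; the class name and queue backing are assumptions:

import queue

import smartsim._core.mli.comm.channel.channel as cch


class InMemoryCommChannel(cch.CommChannelBase):
    """Loopback channel backed by a local queue, for tests without Dragon."""

    def __init__(self, descriptor: bytes = b"in-memory") -> None:
        super().__init__(descriptor)
        self._queue: "queue.Queue[bytes]" = queue.Queue()

    def send(self, value: bytes) -> None:
        """Store the message so a later recv() can return it."""
        self._queue.put(value)

    def recv(self) -> bytes:
        """Return the oldest pending message, or empty bytes if none is waiting."""
        try:
            return self._queue.get_nowait()
        except queue.Empty:
            return b""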
- self._channel: "dch.Channel" = fli.FLInterface.attach(fli_desc) + self._channel: "fli" = fli.FLInterface.attach(fli_desc) def send(self, value: bytes) -> None: - """Send a message throuh the underlying communication channel + """Send a message through the underlying communication channel :param value: The value to send""" with self._channel.sendh(timeout=None) as sendh: sendh.send_bytes(value) + + def recv(self) -> bytes: + """Receieve a message through the underlying communication channel + :returns: the received message""" + with self._channel.recvh(timeout=None) as recvh: + try: + request_bytes: bytes + request_bytes, _ = recvh.recv_bytes(timeout=None) + return request_bytes + except fli.FLIEOT as exc: + return b'' \ No newline at end of file diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index f0cae497a0..6f31972727 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -27,14 +27,10 @@ import sys # isort: off -try: - import dragon - from dragon import fli -except ImportError as exc: - if not "pytest" in sys.modules: - raise exc from None - +import dragon +from dragon import fli # isort: on + import time import typing as t @@ -169,7 +165,7 @@ class WorkerManager(Service): def __init__( self, - file_like_interface: "fli.FLInterface", + task_queue: CommChannelBase, worker: MachineLearningWorkerBase, feature_store: t.Optional[FeatureStore] = None, as_service: bool = False, @@ -189,7 +185,7 @@ def __init__( super().__init__(as_service, cooldown) """a collection of workers the manager is controlling""" - self._task_queue: fli.FLInterface = file_like_interface + self._task_queue: CommChannelBase = task_queue """the queue the manager monitors for new tasks""" self._feature_store: t.Optional[FeatureStore] = feature_store """a feature store to retrieve models from""" @@ -241,11 +237,7 @@ def _on_iteration(self) -> None: return # perform default deserialization of the message envelope - with self._task_queue.recvh(timeout=None) as recvh: - try: - request_bytes, _ = recvh.recv_bytes(timeout=None) - except fli.FLIEOT as exc: - return + request_bytes = self._task_queue.recv() request = deserialize_message( request_bytes, self._comm_channel_type, self._device diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index fe82ea2a3e..808c9cf9bf 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -24,6 +24,10 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# isort: off +from dragon.utils import b64decode +# isort: on + import sys import typing as t from abc import ABC, abstractmethod @@ -34,13 +38,6 @@ from ...infrastructure.storage.featurestore import FeatureStore from ...mli_schemas.model.model_capnp import Model -# isort: off -try: - from dragon.utils import b64decode -except ImportError as exc: - if "pytest" not in sys.modules: - raise exc from None -# isort: on logger = get_logger(__name__) From 1fcf17d4456f99a6ad34d6360879e2e2a2b24f12 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 10 Jul 2024 11:06:08 -0500 Subject: [PATCH 26/84] Add feature store prototype caching --- ex/high_throughput_inference/mli_driver.py | 7 +- ex/high_throughput_inference/mock_app.py | 19 +++- .../standalone_workermanager.py | 10 +- smartsim/_core/entrypoints/service.py | 17 ++++ .../_core/mli/comm/channel/dragonchannel.py | 3 +- smartsim/_core/mli/comm/channel/dragonfli.py | 4 +- .../infrastructure/control/workermanager.py | 96 ++++++++++++++++--- .../storage/dragonfeaturestore.py | 15 ++- .../infrastructure/storage/featurestore.py | 5 +- .../_core/mli/infrastructure/worker/worker.py | 10 +- tests/mli/test_worker_manager.py | 8 +- 11 files changed, 147 insertions(+), 47 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index 4a3dd034e8..4e68fdfbcb 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -8,6 +8,7 @@ from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim.status import TERMINAL_STATUSES import time +import typing as t device = "gpu" filedir = os.path.dirname(__file__) @@ -15,7 +16,11 @@ app_script_name = os.path.join(filedir, "mock_app.py") model_name = os.path.join(filedir, f"resnet50.{device.upper()}.pt") -exp_path = os.path.join(filedir, "MLI_proto") +transport: t.Literal["hsta", "tcp"] = "hsta" + +os.environ["SMARTSIM_DRAGON_TRANSPORT"] = transport + +exp_path = os.path.join(filedir, f"MLI_proto_{transport.upper()}") os.makedirs(exp_path, exist_ok=True) exp = Experiment("MLI_proto", launcher="dragon", exp_path=exp_path) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index df0ba55c76..4ecce58ac7 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -31,7 +31,7 @@ import dragon.channels from dragon.data.ddict.ddict import DDict from dragon.globalservices.api_setup import connect_to_infrastructure -from dragon.utils import b64decode +from dragon.utils import b64decode, b64encode # isort: on @@ -107,7 +107,7 @@ def print_timings(self, to_file: bool = False): numpy.savetxt("timings.txt", value_array) - def run_model(self, model: bytes, batch: torch.Tensor): + def run_model(self, model: bytes | str, batch: torch.Tensor): self.start_timings(batch.shape[0]) built_tensor = MessageHandler.build_tensor( batch.numpy(), "c", "float32", list(batch.shape)) @@ -143,10 +143,14 @@ def run_model(self, model: bytes, batch: torch.Tensor): self.end_timings() return result + def set_model(self, key: str, model: bytes): + self._ddict[key] = b64encode(model) + class ResNetWrapper(): - def __init__(self, model: str): + def __init__(self, name: str, model: str): self._model = torch.jit.load(model) + self._name = name buffer = io.BytesIO() scripted = torch.jit.trace(self._model, self.get_batch()) torch.jit.save(scripted, buffer) @@ -159,15 +163,20 @@ def get_batch(self, batch_size: int=32): def model(self): return 
self._serialized_model + @property + def name(self): + return self._name + if __name__ == "__main__": parser = argparse.ArgumentParser("Mock application") parser.add_argument("--device", default="cpu") args = parser.parse_args() - resnet = ResNetWrapper(f"resnet50.{args.device.upper()}.pt") + resnet = ResNetWrapper("resnet50", f"resnet50.{args.device.upper()}.pt") client = ProtoClient(timing_on=True) + client.set_model(resnet.name, resnet.model) total_iterations = 100 @@ -175,6 +184,6 @@ def model(self): logger.info(f"Batch size: {batch_size}") for iteration_number in range(total_iterations + int(batch_size==1)): logger.info(f"Iteration: {iteration_number}") - client.run_model(resnet.model, resnet.get_batch(batch_size)) + client.run_model(resnet.name, resnet.get_batch(batch_size)) client.print_timings(to_file=True) \ No newline at end of file diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index 991e869581..f3e8e7c589 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -38,11 +38,11 @@ import os from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel +from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import DragonFeatureStore from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker -from smartsim._core.mli.infrastructure.control.workermanager import ( - WorkerManager, -) +from smartsim._core.mli.infrastructure.control.workermanager import WorkerManager + if __name__ == "__main__": parser = argparse.ArgumentParser("Worker Manager") @@ -74,11 +74,13 @@ ddict["to_worker_fli"] = to_worker_fli_serialized torch_worker = cloudpickle.loads(base64.b64decode(args.worker_class.encode('ascii')))() + + dfs = DragonFeatureStore(ddict) comm_channel = DragonFLIChannel(to_worker_fli_serialized) worker_manager = WorkerManager( task_queue=comm_channel, worker=torch_worker, - feature_store=None, + feature_store=dfs, as_service=True, cooldown=10, comm_channel_type=DragonCommChannel, diff --git a/smartsim/_core/entrypoints/service.py b/smartsim/_core/entrypoints/service.py index 6b4ef74b67..df9c2bbef6 100644 --- a/smartsim/_core/entrypoints/service.py +++ b/smartsim/_core/entrypoints/service.py @@ -103,6 +103,23 @@ def execute(self) -> None: running = True cooldown_start: t.Optional[datetime.datetime] = None + headers = [ + "batch_size", + "w_deserialize", + "w_fetch_model", + "w_load_model", + "w_fetch_input", + "w_transform_input", + "w_execute", + "w_transform_output", + "w_assign_output", + "w_build_reply", + "w_serialize_resp", + "w_send", + ] + + print(",".join(headers)) + while running: self._on_iteration() diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py index fb1a0c51c1..1409747a91 100644 --- a/smartsim/_core/mli/comm/channel/dragonchannel.py +++ b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -56,4 +56,5 @@ def recv(self) -> bytes: """Receieve a message through the underlying communication channel :returns: the received message""" with self._channel.recvh(timeout=None) as recvh: - return recvh.recv_bytes(timeout=None) \ No newline at end of file + message_bytes: bytes = recvh.recv_bytes(timeout=None) + return message_bytes diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index 
ebf824b7db..0c1aba94e3 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -27,9 +27,11 @@ # isort: off from dragon import fli import dragon.channels as dch + # isort: on import sys + import smartsim._core.mli.comm.channel.channel as cch from smartsim.log import get_logger @@ -60,4 +62,4 @@ def recv(self) -> bytes: request_bytes, _ = recvh.recv_bytes(timeout=None) return request_bytes except fli.FLIEOT as exc: - return b'' \ No newline at end of file + return b"" diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 6f31972727..d3cc2d84ae 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -29,6 +29,7 @@ # isort: off import dragon from dragon import fli + # isort: on import time @@ -36,18 +37,20 @@ import numpy as np -from smartsim._core.entrypoints.service import Service -from smartsim._core.mli.comm.channel.channel import CommChannelBase -from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel -from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore -from smartsim._core.mli.infrastructure.worker.worker import ( +from .....error import SmartSimError +from .....log import get_logger +from ....entrypoints.service import Service +from ...comm.channel.channel import CommChannelBase +from ...comm.channel.dragonfli import DragonFLIChannel +from ...infrastructure.storage.featurestore import FeatureStore +from ...infrastructure.worker.worker import ( InferenceReply, InferenceRequest, + LoadModelResult, MachineLearningWorkerBase, ) -from smartsim._core.mli.message_handler import MessageHandler -from smartsim._core.mli.mli_schemas.response.response_capnp import Response -from smartsim.log import get_logger +from ...message_handler import MessageHandler +from ...mli_schemas.response.response_capnp import Response if t.TYPE_CHECKING: from smartsim._core.mli.mli_schemas.model.model_capnp import Model @@ -195,6 +198,8 @@ def __init__( """The type of communication channel to construct for callbacks""" self._device = device """Device on which workers need to run""" + self._cached_models: dict[str, t.Any] = {} + """Dictionary of previously loaded models""" def _validate_request(self, request: InferenceRequest) -> bool: """Ensure the request can be processed. 
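The _cached_models dictionary added above supports the fetch-or-load flow implemented in the next hunk: reuse an already-loaded model when the key is known, otherwise retry the feature-store fetch until the producer has stored the bytes. Roughly, as a standalone helper (names and the attempt cap are illustrative; the patch retries indefinitely):

import time
import typing as t


def fetch_or_load(
    key: str,
    cache: t.Dict[str, t.Any],
    fetch: t.Callable[[str], bytes],
    load: t.Callable[[bytes], t.Any],
    retry_delay: float = 0.1,
    max_attempts: int = 50,
) -> t.Any:
    """Return a loaded model, reusing the cache and retrying the fetch until
    the serialized model appears in the feature store."""
    if key in cache:
        return cache[key]
    for _ in range(max_attempts):
        try:
            raw = fetch(key)     # raises KeyError until the producer stores the model
        except KeyError:
            time.sleep(retry_delay)
        else:
            cache[key] = load(raw)
            return cache[key]
    raise KeyError(f"model {key!r} never appeared in the feature store")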
@@ -236,34 +241,84 @@ def _on_iteration(self) -> None: logger.warning("No queue to check for tasks") return + timings = [] # perform default deserialization of the message envelope - request_bytes = self._task_queue.recv() + request_bytes: bytes = self._task_queue.recv() + interm = time.perf_counter() request = deserialize_message( request_bytes, self._comm_channel_type, self._device ) if not self._validate_request(request): return - fetch_model_result = self._worker.fetch_model(request, self._feature_store) - model_result = self._worker.load_model( - request, fetch_model_result, self._device - ) + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() + + if not request.raw_model: + if not request.model_key: + raise SmartSimError("Neither key, nor model provided") + + if request.model_key in self._cached_models: + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() + model_result = LoadModelResult(self._cached_models[request.model_key]) + + else: + fetch_model_result = None + while True: + try: + interm = time.perf_counter() + fetch_model_result = self._worker.fetch_model( + request, self._feature_store + ) + except KeyError: + time.sleep(0.1) + else: + break + + if fetch_model_result is None: + raise SmartSimError("Could not retrieve model from feature store") + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() + model_result = self._worker.load_model( + request, fetch_model_result, self._device + ) + self._cached_models[request.model_key] = model_result.model + else: + fetch_model_result = self._worker.fetch_model(request, None) + model_result = self._worker.load_model( + request, fetch_result=fetch_model_result, device=self._device + ) + + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() fetch_input_result = self._worker.fetch_inputs(request, self._feature_store) + + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() transformed_input = self._worker.transform_input( request, fetch_input_result, self._device ) + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() + reply = InferenceReply() try: execute_result = self._worker.execute( request, model_result, transformed_input ) + + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() transformed_output = self._worker.transform_output( request, execute_result, self._device ) + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() if request.output_keys: reply.output_keys = self._worker.place_output( request, transformed_output, self._feature_store @@ -274,6 +329,9 @@ def _on_iteration(self) -> None: logger.exception("Error executing worker") reply.failed = True + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() + if reply.failed: response = build_failure_reply("fail", "failure-occurred") else: @@ -282,10 +340,22 @@ def _on_iteration(self) -> None: response = build_reply(reply) + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() + + # serialized = self._worker.serialize_reply(request, transformed_output) serialized_resp = MessageHandler.serialize_response(response) # type: ignore + + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() if request.callback: request.callback.send(serialized_resp) + timings.append(time.perf_counter() - interm) + interm = time.perf_counter() + + print(" ".join(str(time) for time in timings)) + def _can_shutdown(self) -> bool: """Return true when the 
criteria to shut down the service are met.""" # todo: determine shutdown criteria diff --git a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py index 53f2f461f8..fbd18438f5 100644 --- a/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py +++ b/smartsim/_core/mli/infrastructure/storage/dragonfeaturestore.py @@ -44,22 +44,27 @@ def __init__(self, storage: "DragonDict") -> None: """Initialize the DragonFeatureStore instance""" self._storage = storage - def __getitem__(self, key: str) -> t.Any: + def __getitem__(self, key: str) -> t.Union[str, bytes]: """Retrieve an item using key :param key: Unique key of an item to retrieve from the feature store""" try: - return self._storage[key] + value: t.Union[str, bytes] = self._storage[key] + return value + except KeyError as ex: + raise ex except Exception as ex: # note: explicitly avoid round-trip to check for key existence - raise sse.SmartSimError(f"{key} not found in feature store") from ex + raise sse.SmartSimError( + f"Could not get value for existing key {key}, error:\n{ex}" + ) from ex - def __setitem__(self, key: str, value: str) -> None: + def __setitem__(self, key: str, value: t.Union[str, bytes]) -> None: """Assign a value using key :param key: Unique key of an item to set in the feature store :param value: Value to persist in the feature store""" self._storage[key] = value - def __contains__(self, key: t.Union[str]) -> bool: + def __contains__(self, key: str) -> bool: """Membership operator to test for a key existing within the feature store. Return `True` if the key is found, `False` otherwise :param key: Unique key of an item to retrieve from the feature store""" diff --git a/smartsim/_core/mli/infrastructure/storage/featurestore.py b/smartsim/_core/mli/infrastructure/storage/featurestore.py index e18643e932..553e13b10f 100644 --- a/smartsim/_core/mli/infrastructure/storage/featurestore.py +++ b/smartsim/_core/mli/infrastructure/storage/featurestore.py @@ -24,6 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import typing as t from abc import ABC, abstractmethod @@ -32,12 +33,12 @@ class FeatureStore(ABC): values from a feature store implementation""" @abstractmethod - def __getitem__(self, key: str) -> bytes: + def __getitem__(self, key: str) -> t.Union[str, bytes]: """Retrieve an item using key :param key: Unique key of an item to retrieve from the feature store""" @abstractmethod - def __setitem__(self, key: str, value: str) -> None: + def __setitem__(self, key: str, value: t.Union[str, bytes]) -> None: """Assign a value using key :param key: Unique key of an item to set in the feature store :param value: Value to persist in the feature store""" diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 808c9cf9bf..900a8241de 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -24,11 +24,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
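With the FeatureStore interface widened to str-or-bytes values, any mapping-like object can stand in for the Dragon-backed store. A dictionary-backed sketch that may be convenient in unit tests; the class name is an assumption, not something this patch adds:

import typing as t

from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore


class MemoryFeatureStore(FeatureStore):
    """In-memory stand-in for the Dragon-backed feature store."""

    def __init__(self) -> None:
        self._storage: t.Dict[str, t.Union[str, bytes]] = {}

    def __getitem__(self, key: str) -> t.Union[str, bytes]:
        return self._storage[key]   # missing keys raise KeyError, as callers expect

    def __setitem__(self, key: str, value: t.Union[str, bytes]) -> None:
        self._storage[key] = value

    def __contains__(self, key: str) -> bool:
        return key in self._storage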
-# isort: off -from dragon.utils import b64decode -# isort: on - -import sys import typing as t from abc import ABC, abstractmethod @@ -38,7 +33,6 @@ from ...infrastructure.storage.featurestore import FeatureStore from ...mli_schemas.model.model_capnp import Model - logger = get_logger(__name__) @@ -174,7 +168,7 @@ def fetch_model( ) try: - raw_bytes = b64decode(feature_store[request.model_key]) + raw_bytes: bytes = t.cast(bytes, feature_store[request.model_key]) return FetchModelResult(raw_bytes) except FileNotFoundError as ex: logger.exception(ex) @@ -202,7 +196,7 @@ def fetch_inputs( data: t.List[bytes] = [] for input_ in request.input_keys: try: - tensor_bytes = feature_store[input_] + tensor_bytes = t.cast(bytes, feature_store[input_]) data.append(tensor_bytes) except KeyError as ex: logger.exception(ex) diff --git a/tests/mli/test_worker_manager.py b/tests/mli/test_worker_manager.py index 46cae5b2e4..62bd711ebb 100644 --- a/tests/mli/test_worker_manager.py +++ b/tests/mli/test_worker_manager.py @@ -32,11 +32,7 @@ import pytest -should_run = True -try: - import torch -except ImportError: - should_run = False +pytest.importorskip("torch") from smartsim._core.mli.infrastructure.control.workermanager import WorkerManager from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore @@ -51,8 +47,6 @@ # The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a -pytest.mark.skipif(not should_run, "Test needs PyTorch to run") - def mock_work(worker_manager_queue: "mp.Queue[bytes]") -> None: """Mock event producer for triggering the inference pipeline""" From d76f88014cebe7a76175b06178d27ca32195841d Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 10 Jul 2024 13:10:08 -0500 Subject: [PATCH 27/84] Add redis driver, fix FLI --- ex/high_throughput_inference/mock_app.py | 10 ++- .../mock_app_redis.py | 88 +++++++++++++++++++ ex/high_throughput_inference/redis_driver.py | 65 ++++++++++++++ smartsim/_core/mli/comm/channel/dragonfli.py | 12 ++- .../infrastructure/control/workermanager.py | 2 +- 5 files changed, 170 insertions(+), 7 deletions(-) create mode 100644 ex/high_throughput_inference/mock_app_redis.py create mode 100644 ex/high_throughput_inference/redis_driver.py diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 4ecce58ac7..45246db2e5 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -112,9 +112,14 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): built_tensor = MessageHandler.build_tensor( batch.numpy(), "c", "float32", list(batch.shape)) self.measure_time("build_tensor") + built_model = None + if isinstance(model, str): + model_arg = MessageHandler.build_model_key(model) + else: + model_arg = MessageHandler.build_model(model, "resnet-50", "1.0") request = MessageHandler.build_request( reply_channel=self._from_worker_ch_serialized, - model=MessageHandler.build_model(model, "resnet-50", "1.0"), + model= model_arg, inputs=[built_tensor], outputs=[], output_descriptors=[], @@ -125,6 +130,7 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): self.measure_time("serialize_request") with self._to_worker_fli.sendh(timeout=None, stream_channel=self._to_worker_ch) as to_sendh: to_sendh.send_bytes(request_bytes) + logger.info(f"Message size: {len(request_bytes)} bytes") self.measure_time("send") with self._from_worker_ch.recvh(timeout=None) as from_recvh: @@ -144,7 +150,7 @@ def run_model(self, model: bytes | str, batch: 
torch.Tensor): return result def set_model(self, key: str, model: bytes): - self._ddict[key] = b64encode(model) + self._ddict[key] = model class ResNetWrapper(): diff --git a/ex/high_throughput_inference/mock_app_redis.py b/ex/high_throughput_inference/mock_app_redis.py new file mode 100644 index 0000000000..c56b4fb8b4 --- /dev/null +++ b/ex/high_throughput_inference/mock_app_redis.py @@ -0,0 +1,88 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
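set_model() now stores the raw serialized model under a plain key, so run_model() can reference it either by that key or by shipping the bytes inline. The branch it takes, isolated into a helper for clarity (the helper itself is illustrative; the MessageHandler calls mirror the patch):

import typing as t

from smartsim._core.mli.message_handler import MessageHandler


def model_argument(model: t.Union[str, bytes]) -> t.Any:
    """Return the schema object describing the model for build_request()."""
    if isinstance(model, str):
        # key-based: the worker manager fetches (and can cache) the bytes itself
        return MessageHandler.build_model_key(model)
    # inline: ship the serialized TorchScript model with every request
    return MessageHandler.build_model(model, "resnet-50", "1.0")

Key-based requests stay small and let the worker manager cache the loaded model; inline models avoid the feature-store lookup at the cost of resending the weights on every call.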
+ +import argparse +import io +import numpy +import time +import torch +from smartsim.log import get_logger +from smartredis import Client + +logger = get_logger("App") + +class ResNetWrapper(): + def __init__(self, name: str, model: str): + self._model = torch.jit.load(model) + self._name = name + buffer = io.BytesIO() + scripted = torch.jit.trace(self._model, self.get_batch()) + torch.jit.save(scripted, buffer) + self._serialized_model = buffer.getvalue() + + def get_batch(self, batch_size: int=32): + return torch.randn((batch_size, 3, 224, 224), dtype=torch.float32) + + @property + def model(self): + return self._serialized_model + + @property + def name(self): + return self._name + +if __name__ == "__main__": + + parser = argparse.ArgumentParser("Mock application") + parser.add_argument("--device", default="cpu") + args = parser.parse_args() + + resnet = ResNetWrapper("resnet50", f"resnet50.{args.device.upper()}.pt") + + client = Client(cluster=False, address=None) + client.set_model(resnet.name, resnet.model, backend='TORCH', device=args.device.upper()) + + total_iterations = 100 + timings=[] + for batch_size in [1, 2, 4, 8, 16, 32, 64, 128]: + logger.info(f"Batch size: {batch_size}") + for iteration_number in range(total_iterations + int(batch_size==1)): + timing = [batch_size] + logger.info(f"Iteration: {iteration_number}") + start = time.perf_counter() + client.put_tensor(name="batch", data=resnet.get_batch(batch_size).numpy()) + client.run_model(name=resnet.name, inputs=["batch"], outputs=["result"]) + result = client.get_tensor(name="result") + end = time.perf_counter() + timing.append(end-start) + timings.append(timing) + + + + timings_np = numpy.asarray(timings) + numpy.save("timings.npy", timings_np) + for timing in timings: + print(" ".join(str(t) for t in timing)) diff --git a/ex/high_throughput_inference/redis_driver.py b/ex/high_throughput_inference/redis_driver.py new file mode 100644 index 0000000000..ceddba4ef7 --- /dev/null +++ b/ex/high_throughput_inference/redis_driver.py @@ -0,0 +1,65 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
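The Redis baseline above writes one row per iteration to timings.npy, batch size in column 0 and end-to-end latency in column 1. One way the file could be summarized afterwards (only the file name and layout come from the script; the rest is illustrative):

import numpy as np

timings = np.load("timings.npy")                 # shape: (iterations, 2)
for batch_size in np.unique(timings[:, 0]):
    rows = timings[timings[:, 0] == batch_size]
    latencies_ms = rows[:, 1] * 1e3
    print(
        f"batch={int(batch_size):4d}  "
        f"mean={latencies_ms.mean():8.3f} ms  "
        f"p50={np.percentile(latencies_ms, 50):8.3f} ms"
    )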
+ +import os +import sys +from smartsim import Experiment +from smartsim.status import TERMINAL_STATUSES +import time +import typing as t + +device = "gpu" +filedir = os.path.dirname(__file__) +app_script_name = os.path.join(filedir, "mock_app_redis.py") +model_name = os.path.join(filedir, f"resnet50.{device.upper()}.pt") + + +exp_path = os.path.join(filedir, "redis_ai") +os.makedirs(exp_path, exist_ok=True) +exp = Experiment("redis_ai", launcher="slurm", exp_path=exp_path) + +db = exp.create_database(interface="hsn0") + +app_rs = exp.create_run_settings(sys.executable, exe_args = [app_script_name, "--device", device]) +app_rs.set_nodes(1) +app_rs.set_tasks(1) +app = exp.create_model("app", run_settings=app_rs) +app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) + +exp.generate(db, app, overwrite=True) + +exp.start(db, app, block=False) + +while True: + if exp.get_status(app)[0] in TERMINAL_STATUSES: + exp.stop(db) + break + if exp.get_status(db)[0] in TERMINAL_STATUSES: + exp.stop(app) + break + time.sleep(5) + +print("Exiting.") \ No newline at end of file diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index 0c1aba94e3..eb3175e445 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -31,6 +31,7 @@ # isort: on import sys +import typing as t import smartsim._core.mli.comm.channel.channel as cch from smartsim.log import get_logger @@ -41,22 +42,25 @@ class DragonFLIChannel(cch.CommChannelBase): """Passes messages by writing to a Dragon FLI Channel""" - def __init__(self, fli_desc: str) -> None: + def __init__(self, fli_desc: str, sender_supplied: bool = True) -> None: """Initialize the DragonFLIChannel instance""" super().__init__(fli_desc) # todo: do we need memory pool information to construct the channel correctly? 
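The dragonfli change below keeps the attached FLI separate from an optional process-local stream channel that each sender supplies to its send handle. The resulting send path looks roughly like this (names mirror the patch, simplified for illustration):

from dragon import fli
from dragon.channels import Channel


def make_sender(serialized_fli: bytes):
    """Attach to the FLI and bind a sender-supplied stream channel to it."""
    interface = fli.FLInterface.attach(serialized_fli)
    stream = Channel.make_process_local()        # each sender owns its stream channel

    def send(payload: bytes) -> None:
        with interface.sendh(timeout=None, stream_channel=stream) as sendh:
            sendh.send_bytes(payload)

    return send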
- self._channel: "fli" = fli.FLInterface.attach(fli_desc) + self._fli: "fli" = fli.FLInterface.attach(fli_desc) + self._channel: t.Optional["dch"] = ( + dch.Channel.make_process_local() if sender_supplied else None + ) def send(self, value: bytes) -> None: """Send a message through the underlying communication channel :param value: The value to send""" - with self._channel.sendh(timeout=None) as sendh: + with self._fli.sendh(timeout=None, stream_channel=self._channel) as sendh: sendh.send_bytes(value) def recv(self) -> bytes: """Receieve a message through the underlying communication channel :returns: the received message""" - with self._channel.recvh(timeout=None) as recvh: + with self._fli.recvh(timeout=None) as recvh: try: request_bytes: bytes request_bytes, _ = recvh.recv_bytes(timeout=None) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index d3cc2d84ae..60e263f337 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -259,7 +259,7 @@ def _on_iteration(self) -> None: if not request.model_key: raise SmartSimError("Neither key, nor model provided") - if request.model_key in self._cached_models: + if False and (request.model_key in self._cached_models): timings.append(time.perf_counter() - interm) interm = time.perf_counter() model_result = LoadModelResult(self._cached_models[request.model_key]) From 3938ec8dbe9964235e6ed4791600257b08b9f3eb Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 11 Jul 2024 12:27:34 -0500 Subject: [PATCH 28/84] Update post-merge --- ex/high_throughput_inference/mli_driver.py | 1 - .../standalone_workermanager.py | 11 ++- .../infrastructure/control/workermanager.py | 68 +++++++++---------- .../mli/infrastructure/environmentloader.py | 11 +-- 4 files changed, 49 insertions(+), 42 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index 4e68fdfbcb..6da559aa6f 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -30,7 +30,6 @@ worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) - app_rs = exp.create_run_settings(sys.executable, exe_args = [app_script_name, "--device", device]) app = exp.create_model("app", run_settings=app_rs) app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index f3e8e7c589..c56e11a7c3 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -35,6 +35,7 @@ import argparse import base64 import cloudpickle +import pickle import os from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel @@ -42,6 +43,7 @@ from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim._core.mli.infrastructure.control.workermanager import WorkerManager +from smartsim._core.mli.infrastructure.environmentloader import EnvironmentConfigLoader if __name__ == "__main__": @@ -77,10 +79,15 @@ dfs = DragonFeatureStore(ddict) comm_channel = DragonFLIChannel(to_worker_fli_serialized) + + os.environ["SSFeatureStore"] = 
base64.b64encode(pickle.dumps(dfs)).decode("utf-8") + os.environ["SSQueue"] = base64.b64encode(to_worker_fli_serialized).decode("utf-8") + + config_loader = EnvironmentConfigLoader() + worker_manager = WorkerManager( - task_queue=comm_channel, + config_loader=config_loader, worker=torch_worker, - feature_store=dfs, as_service=True, cooldown=10, comm_channel_type=DragonCommChannel, diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index eaa77bdf3e..8c06351fb5 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -41,7 +41,7 @@ from .....log import get_logger from ....entrypoints.service import Service from ...comm.channel.channel import CommChannelBase -from ...comm.channel.dragonfli import DragonFLIChannel +from ...comm.channel.dragonchannel import DragonCommChannel from ...infrastructure.environmentloader import EnvironmentConfigLoader from ...infrastructure.storage.featurestore import FeatureStore from ...infrastructure.worker.worker import ( @@ -175,7 +175,7 @@ def __init__( worker: MachineLearningWorkerBase, as_service: bool = False, cooldown: int = 0, - comm_channel_type: t.Type[CommChannelBase] = DragonFLIChannel, + comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel, device: t.Literal["cpu", "gpu"] = "cpu", ) -> None: """Initialize the WorkerManager @@ -244,34 +244,34 @@ def _on_iteration(self) -> None: logger.warning("No queue to check for tasks") return - timings = [] + timings = [] # timing # perform default deserialization of the message envelope request_bytes: bytes = self._task_queue.recv() - interm = time.perf_counter() + interm = time.perf_counter() # timing request = deserialize_message( request_bytes, self._comm_channel_type, self._device ) if not self._validate_request(request): return - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing if not request.raw_model: - if not request.model_key: - raise SmartSimError("Neither key, nor model provided") - - if False and (request.model_key in self._cached_models): - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + if request.model_key is None: + # A valid request should never get here. 
+ raise ValueError("Could not read model key") + if request.model_key in self._cached_models: + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing model_result = LoadModelResult(self._cached_models[request.model_key]) else: fetch_model_result = None while True: try: - interm = time.perf_counter() + interm = time.perf_counter() # timing fetch_model_result = self._worker.fetch_model( request, self._feature_store ) @@ -282,8 +282,8 @@ def _on_iteration(self) -> None: if fetch_model_result is None: raise SmartSimError("Could not retrieve model from feature store") - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing model_result = self._worker.load_model( request, fetch_model_result, self._device ) @@ -294,18 +294,18 @@ def _on_iteration(self) -> None: request, fetch_result=fetch_model_result, device=self._device ) - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing fetch_input_result = self._worker.fetch_inputs(request, self._feature_store) - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing transformed_input = self._worker.transform_input( request, fetch_input_result, self._device ) - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing reply = InferenceReply() @@ -314,14 +314,14 @@ def _on_iteration(self) -> None: request, model_result, transformed_input ) - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing transformed_output = self._worker.transform_output( request, execute_result, self._device ) - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing if request.output_keys: reply.output_keys = self._worker.place_output( request, transformed_output, self._feature_store @@ -332,8 +332,8 @@ def _on_iteration(self) -> None: logger.exception("Error executing worker") reply.failed = True - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing if reply.failed: response = build_failure_reply("fail", "failure-occurred") @@ -343,21 +343,21 @@ def _on_iteration(self) -> None: response = build_reply(reply) - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing # serialized = self._worker.serialize_reply(request, transformed_output) serialized_resp = MessageHandler.serialize_response(response) # type: ignore - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing if request.callback: request.callback.send(serialized_resp) - timings.append(time.perf_counter() - interm) - interm = time.perf_counter() + timings.append(time.perf_counter() - interm) # timing + interm = time.perf_counter() # timing - 
print(" ".join(str(time) for time in timings)) + print(" ".join(str(time) for time in timings)) # timing def _can_shutdown(self) -> bool: """Return true when the criteria to shut down the service are met.""" diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py index 267b668f63..f5e9532103 100644 --- a/smartsim/_core/mli/infrastructure/environmentloader.py +++ b/smartsim/_core/mli/infrastructure/environmentloader.py @@ -32,6 +32,7 @@ from dragon.fli import FLInterface # pylint: disable=all from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel class EnvironmentConfigLoader: @@ -41,10 +42,10 @@ class EnvironmentConfigLoader: """ def __init__(self) -> None: - self._feature_store_descriptor = os.getenv("SSFeatureStore", None) - self._queue_descriptor = os.getenv("SSQueue", None) + self._feature_store_descriptor: t.Optional[str] = os.getenv("SSFeatureStore", None) + self._queue_descriptor: t.Optional[str] = os.getenv("SSQueue", None) self.feature_store: t.Optional[FeatureStore] = None - self.queue: t.Optional["FLInterface"] = None + self.queue: t.Optional[DragonFLIChannel] = None def get_feature_store(self) -> t.Optional[FeatureStore]: """Loads the Feature Store previously set in SSFeatureStore""" @@ -54,8 +55,8 @@ def get_feature_store(self) -> t.Optional[FeatureStore]: ) return self.feature_store - def get_queue(self) -> t.Optional["FLInterface"]: + def get_queue(self, sender_supplied: bool = True) -> t.Optional[DragonFLIChannel]: """Returns the Queue previously set in SSQueue""" if self._queue_descriptor is not None: - self.queue = FLInterface.attach(base64.b64decode(self._queue_descriptor)) + self.queue = DragonFLIChannel(fli_desc=base64.b64decode(self._queue_descriptor), sender_supplied=sender_supplied) return self.queue From 273a7d952fdcaa89984b654ce4b46c272c1c2bbd Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 11 Jul 2024 13:15:38 -0500 Subject: [PATCH 29/84] Fix typing --- smartsim/_core/mli/comm/channel/dragonfli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index eb3175e445..75f8fb4bfc 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -42,7 +42,7 @@ class DragonFLIChannel(cch.CommChannelBase): """Passes messages by writing to a Dragon FLI Channel""" - def __init__(self, fli_desc: str, sender_supplied: bool = True) -> None: + def __init__(self, fli_desc: bytes, sender_supplied: bool = True) -> None: """Initialize the DragonFLIChannel instance""" super().__init__(fli_desc) # todo: do we need memory pool information to construct the channel correctly? 
From a12d9232914ff9c2cf8def6224a3bb08896b80d9 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 11 Jul 2024 13:50:35 -0500 Subject: [PATCH 30/84] isort --- .../_core/mli/infrastructure/environmentloader.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/environmentloader.py b/smartsim/_core/mli/infrastructure/environmentloader.py index f5e9532103..9f6770623d 100644 --- a/smartsim/_core/mli/infrastructure/environmentloader.py +++ b/smartsim/_core/mli/infrastructure/environmentloader.py @@ -31,8 +31,8 @@ from dragon.fli import FLInterface # pylint: disable=all -from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore class EnvironmentConfigLoader: @@ -42,7 +42,9 @@ class EnvironmentConfigLoader: """ def __init__(self) -> None: - self._feature_store_descriptor: t.Optional[str] = os.getenv("SSFeatureStore", None) + self._feature_store_descriptor: t.Optional[str] = os.getenv( + "SSFeatureStore", None + ) self._queue_descriptor: t.Optional[str] = os.getenv("SSQueue", None) self.feature_store: t.Optional[FeatureStore] = None self.queue: t.Optional[DragonFLIChannel] = None @@ -58,5 +60,8 @@ def get_feature_store(self) -> t.Optional[FeatureStore]: def get_queue(self, sender_supplied: bool = True) -> t.Optional[DragonFLIChannel]: """Returns the Queue previously set in SSQueue""" if self._queue_descriptor is not None: - self.queue = DragonFLIChannel(fli_desc=base64.b64decode(self._queue_descriptor), sender_supplied=sender_supplied) + self.queue = DragonFLIChannel( + fli_desc=base64.b64decode(self._queue_descriptor), + sender_supplied=sender_supplied, + ) return self.queue From 38b0de15266288b4a959bbbcb244e131407555ea Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 11 Jul 2024 14:42:16 -0500 Subject: [PATCH 31/84] Update envloader test --- tests/dragon/test_environment_loader.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/dragon/test_environment_loader.py b/tests/dragon/test_environment_loader.py index d339fec885..00db0a9d32 100644 --- a/tests/dragon/test_environment_loader.py +++ b/tests/dragon/test_environment_loader.py @@ -64,10 +64,9 @@ def test_environment_loader_attach_FLI(content, monkeypatch): config = EnvironmentConfigLoader() config_queue = config.get_queue() - new_sender = config_queue.sendh(use_main_as_stream_channel=True) - new_sender.send_bytes(content) + new_sender = config_queue.send(content) - old_recv = queue.recvh(use_main_as_stream_channel=True) + old_recv = queue.recvh() result, _ = old_recv.recv_bytes() assert result == content @@ -81,7 +80,7 @@ def test_environment_loader_serialize_FLI(monkeypatch): config = EnvironmentConfigLoader() config_queue = config.get_queue() - assert config_queue.serialize() == queue.serialize() + assert config_queue._fli.serialize() == queue.serialize() def test_environment_loader_FLI_fails(monkeypatch): From 8223f96e93e716202fa33e3e08b8fc2ecdb29da1 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Mon, 15 Jul 2024 13:16:10 -0500 Subject: [PATCH 32/84] Input not concatenated correctly --- .../_core/mli/comm/channel/dragonchannel.py | 6 +- smartsim/_core/mli/comm/channel/dragonfli.py | 3 +- .../mli/infrastructure/control/__init__.py | 0 .../infrastructure/control/devicemanager.py | 130 ++++++++++ .../control/requestdispatcher.py | 227 ++++++++++++++++++ 
.../infrastructure/control/workermanager.py | 128 ++++------ .../_core/mli/infrastructure/worker/worker.py | 2 +- .../_core/mli/mli_schemas/model/__init__.py | 0 smartsim/_core/mli/mli_schemas/model/utils.py | 41 ++++ 9 files changed, 447 insertions(+), 90 deletions(-) create mode 100644 smartsim/_core/mli/infrastructure/control/__init__.py create mode 100644 smartsim/_core/mli/infrastructure/control/devicemanager.py create mode 100644 smartsim/_core/mli/infrastructure/control/requestdispatcher.py create mode 100644 smartsim/_core/mli/mli_schemas/model/__init__.py create mode 100644 smartsim/_core/mli/mli_schemas/model/utils.py diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py index 1409747a91..526910b275 100644 --- a/smartsim/_core/mli/comm/channel/dragonchannel.py +++ b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -31,11 +31,7 @@ logger = get_logger(__name__) -try: - import dragon.channels as dch -except ImportError as exc: - if not "pytest" in sys.modules: - raise exc from None +import dragon.channels as dch class DragonCommChannel(cch.CommChannelBase): diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index 75f8fb4bfc..1c02857eab 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -30,7 +30,6 @@ # isort: on -import sys import typing as t import smartsim._core.mli.comm.channel.channel as cch @@ -65,5 +64,5 @@ def recv(self) -> bytes: request_bytes: bytes request_bytes, _ = recvh.recv_bytes(timeout=None) return request_bytes - except fli.FLIEOT as exc: + except fli.FLIEOT: return b"" diff --git a/smartsim/_core/mli/infrastructure/control/__init__.py b/smartsim/_core/mli/infrastructure/control/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/smartsim/_core/mli/infrastructure/control/devicemanager.py b/smartsim/_core/mli/infrastructure/control/devicemanager.py new file mode 100644 index 0000000000..94c2404ead --- /dev/null +++ b/smartsim/_core/mli/infrastructure/control/devicemanager.py @@ -0,0 +1,130 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
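The devicemanager module introduced below is built on a simple pattern: guard each device with an RLock, try a non-blocking acquire on every candidate, and use the first one that succeeds. Stripped of the model bookkeeping, that pattern looks like this (an illustrative stand-in, not the patch code):

import typing as t
from threading import RLock


class GuardedResource:
    """A named resource protected by a reentrant lock."""

    def __init__(self, name: str) -> None:
        self.name = name
        self._lock = RLock()

    def try_acquire(self) -> bool:
        return self._lock.acquire(blocking=False)

    def release(self) -> None:
        self._lock.release()


def first_free(
    resources: t.Sequence["GuardedResource"],
) -> t.Optional["GuardedResource"]:
    """Return the first resource whose lock could be taken, or None if all are busy."""
    for resource in resources:
        if resource.try_acquire():
            return resource      # caller is responsible for release()
    return None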
+ +import typing as t +from contextlib import contextmanager +from threading import RLock +from types import TracebackType + +from ...infrastructure.storage.featurestore import FeatureStore +from ..worker.worker import MachineLearningWorkerBase +from .requestdispatcher import InferenceWork + + +class WorkerDevice: + def __init__(self, name: str) -> None: + """Wrapper around a device to keep track of loaded Models and availability + :param name: name used by the toolkit to identify this device, e.g. ``cuda:0`` + """ + self._name = name + """The name used by the toolkit to identify this device""" + self._lock = RLock() + """Lock to ensure only one thread at the time accesses this device""" + self._models: dict[str, t.Any] = {} + + def acquire(self, blocking: bool = True, timeout: float = -1) -> t.Optional[bool]: + return self._lock.acquire(blocking=blocking, timeout=timeout) + + def release(self) -> None: + self._lock.release() + + def __enter__(self) -> None: + self.acquire() + + @property + def name(self) -> str: + return self._name + + def add_model(self, key: str, model: t.Any) -> None: + self._models[key] = model + + def remove_model(self, key: str) -> None: + self._models.pop(key) + + def get_model(self, key: str) -> t.Any: + return self._models[key] + + def __contains__(self, key: str): + return key in self._models + + def __exit__( + self, + exc_type: t.Optional[t.Type[BaseException]], + exc_val: t.Optional[BaseException], + exc_tb: t.Optional[TracebackType], + ) -> None: + self.release() + + +class DeviceManager: + def __init__(self, devices: list[WorkerDevice]): + self._devices = devices + """Dictionary of model key to devices on which it is loaded""" + + def get_free_device( + self, + worker: MachineLearningWorkerBase, + inference_work: InferenceWork, + feature_store: t.Optional[FeatureStore], + ) -> t.Generator[WorkerDevice, None, None]: + return_device = None + sample_request = inference_work.requests[0] + direct_inference = sample_request.raw_model is not None + while return_device is None: + loaded_devices = [] + if not direct_inference: + # Look up devices to see if any of them already has a copy of the model + for device in self._devices: + if inference_work.model_key in device: + loaded_devices.append(device) + + # If a pre-loaded model is found on a device, try using that device + for device in loaded_devices: + if device.acquire(blocking=False): + return_device = device + + # If the model is not loaded on a free device, load it on another device (if available) + if return_device is None: + for candidate_device in self._devices: + if ( + candidate_device not in loaded_devices + and candidate_device.acquire(blocking=False) + ): + model_bytes = worker.fetch_model(sample_request, feature_store) + loaded_model = worker.load_model( + sample_request, model_bytes, candidate_device.name + ) + candidate_device.add_model( + inference_work.model_key, loaded_model.model + ) + + return_device = candidate_device + + try: + yield return_device + finally: + return_device.remove_model(inference_work.model_key) + return_device.release() diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py new file mode 100644 index 0000000000..520605c588 --- /dev/null +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -0,0 +1,227 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import time +import typing as t +import uuid +from dataclasses import dataclass +from queue import Empty, Full, Queue +from threading import RLock +from types import TracebackType + +from packaging.version import Version + +from ...infrastructure.worker.worker import InferenceRequest +from ...mli_schemas.model.model_capnp import Model + +if t.TYPE_CHECKING: + from dragon.fli import FLInterface + + +@dataclass +class InferenceWork: + model_key: str + requests: list[InferenceRequest] + + +class WorkerDevice: + def __init__(self, name: str) -> None: + """Wrapper around a device to keep track of loaded Models and availability + :param name: name used by the toolkit to identify this device, e.g. 
``cuda:0`` + """ + self._name = name + """The name used by the toolkit to identify this device""" + self._models: dict[str, t.Any] = {} + """Dictionary of model key to model for models stored on this device""" + self._lock = RLock() + """Lock to ensure only one thread at the time accesses this device""" + + def acquire(self, blocking: bool = True, timeout: float = -1) -> t.Optional[bool]: + return self._lock.acquire(blocking=blocking, timeout=timeout) + + def release(self) -> None: + self._lock.release() + + def __enter__(self) -> None: + self.acquire() + + def __exit__( + self, + exc_type: t.Optional[t.Type[BaseException]], + exc_val: t.Optional[BaseException], + exc_tb: t.Optional[TracebackType], + ) -> None: + self.release() + + +class BatchQueue(Queue[InferenceRequest]): + def __init__(self, batch_timeout: float, batch_size: int, model_key: str) -> None: + super().__init__(maxsize=batch_size) + self._batch_timeout = batch_timeout + self._batch_size = batch_size + self._first_put: t.Optional[float] = None + self._disposable = False + self._model_key = model_key + self._flush_lock = RLock() + + def acquire(self, blocking: bool = True, timeout: float = -1) -> t.Optional[bool]: + return self._flush_lock.acquire(blocking=blocking, timeout=timeout) + + def release(self) -> None: + self._flush_lock.release() + + def __enter__(self) -> None: + self.acquire() + + def __exit__( + self, + exc_type: t.Optional[t.Type[BaseException]], + exc_val: t.Optional[BaseException], + exc_tb: t.Optional[TracebackType], + ) -> None: + self.release() + + @property + def model_key(self) -> str: + return self._model_key + + def put( + self, + item: InferenceRequest, + block: bool = False, + timeout: t.Optional[float] = 0.0, + ) -> None: + if not self.acquire(blocking=False) or self.disposable: + raise Full + if self._first_put is None: + self._first_put = time.time() + super().put(item, block=block, timeout=timeout) + + @property + def _waited_time(self) -> float: + if self._first_put is None: + return 0 + return time.time() - self._first_put + + @property + def ready(self) -> bool: + if self.empty(): + return False + + return self.full() or (self._waited_time >= self._batch_timeout) + + def make_disposable(self) -> None: + self._disposable = True + + @property + def disposable(self) -> bool: + return self.empty() and self._disposable + + def flush(self) -> list[t.Any]: + num_items = self.qsize() + self._first_put = None + items = [] + # Avoid (unlikely) race condition error + for _ in range(num_items): + try: + items.append(self.get()) + except Empty: + break + + return items + + def full(self) -> bool: + return self.qsize() >= self._batch_size + + def empty(self) -> bool: + return self.qsize() == 0 + + +class RequestDispatcher: + def __init__( + self, + batch_timeout: float, + batch_size: int, + ) -> None: + self._queues: list[BatchQueue] + self._active_queues: dict[str, BatchQueue] = {} + self._model_last_version: dict[str, Version] = {} + self._model_name_to_key: dict[str, str] = {} + self._batch_timeout = batch_timeout + self._batch_size = batch_size + self._queue_swap_lock = RLock() + + def _swap_queue(self, model_key: str) -> None: + with self._queue_swap_lock: + for queue in self._queues: + if queue.model_key == model_key and not queue.full(): + self._active_queues[model_key] = queue + return + + new_queue = BatchQueue(self._batch_timeout, self._batch_size, model_key) + self._active_queues[model_key] = new_queue + return + + def dispatch(self, request: InferenceRequest) -> None: + if request.raw_model is 
not None: + tmp_id = f"_tmp_{str(uuid.uuid4())}" + tmp_queue: BatchQueue = BatchQueue( + batch_timeout=0, batch_size=1, model_key=tmp_id + ) + self._active_queues[tmp_id] = tmp_queue + tmp_queue.put_nowait(request) + tmp_queue.make_disposable() + return + + if request.model_key: + success = False + while not success: + try: + self._active_queues[request.model_key].put_nowait(request) + success = True + except (Full, KeyError): + self._swap_queue(request.model_key) + + def _update_model_version(self, model: Model) -> None: + if not model.version: + return + if ( + model.name not in self._model_last_version + or Version(model.version) > self._model_last_version[model.name] + ): + self._model_last_version[model.name] = Version(model.version) + return + + def flush_requests(self) -> t.Optional[InferenceWork]: + result = None + for queue in self._queues: + if queue.acquire(blocking=False) and queue.ready: + result = InferenceWork( + model_key=queue.model_key, requests=queue.flush() + ) + queue.release() + break + + return result diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 8c06351fb5..9163be4cec 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -23,22 +23,13 @@ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import sys - -# isort: off -import dragon -from dragon import fli - -# isort: on - import time import typing as t import numpy as np from .....error import SmartSimError -from .....log import get_logger +from .....log import ContextThread, get_logger from ....entrypoints.service import Service from ...comm.channel.channel import CommChannelBase from ...comm.channel.dragonchannel import DragonCommChannel @@ -52,10 +43,10 @@ ) from ...message_handler import MessageHandler from ...mli_schemas.response.response_capnp import Response +from .devicemanager import DeviceManager, WorkerDevice +from .requestdispatcher import RequestDispatcher if t.TYPE_CHECKING: - from dragon.fli import FLInterface - from smartsim._core.mli.mli_schemas.model.model_capnp import Model from smartsim._core.mli.mli_schemas.response.response_capnp import StatusEnum @@ -65,7 +56,6 @@ def deserialize_message( data_blob: bytes, channel_type: t.Type[CommChannelBase], - device: t.Literal["cpu", "gpu"], ) -> InferenceRequest: """Deserialize a message from a byte stream into an InferenceRequest :param data_blob: The byte stream to deserialize""" @@ -177,6 +167,8 @@ def __init__( cooldown: int = 0, comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel, device: t.Literal["cpu", "gpu"] = "cpu", + batch_timeout: float = 0.0, + batch_size: int = 0, ) -> None: """Initialize the WorkerManager :param config_loader: Environment config loader that loads the task queue and @@ -203,6 +195,35 @@ def __init__( """Device on which workers need to run""" self._cached_models: dict[str, t.Any] = {} """Dictionary of previously loaded models""" + self._request_dispatcher: RequestDispatcher = RequestDispatcher( + batch_timeout=batch_timeout, batch_size=batch_size + ) + """Dispatcher used to batch requests""" + self._dispatcher_threads = 1 + """Number of threads which dispatch requests""" + self._device_manager: DeviceManager = DeviceManager([WorkerDevice("gpu")]) + + def 
_receive_requests(self) -> None: + if self._task_queue is None: + return + while not self._can_shutdown(): + # perform default deserialization of the message envelope + request_bytes: bytes = self._task_queue.recv() + + request = deserialize_message(request_bytes, self._comm_channel_type) + if not self._validate_request(request): + return + + self._request_dispatcher.dispatch(request) + + def _on_start(self) -> None: + for thread_idx in range(self._dispatcher_threads): + dispatcher_thread = ContextThread( + name=f"Dispatcher_{thread_idx}", + target=self._receive_requests, + daemon=True, + ) + dispatcher_thread.start() def _validate_request(self, request: InferenceRequest) -> bool: """Ensure the request can be processed. @@ -244,69 +265,29 @@ def _on_iteration(self) -> None: logger.warning("No queue to check for tasks") return - timings = [] # timing - # perform default deserialization of the message envelope - request_bytes: bytes = self._task_queue.recv() - - interm = time.perf_counter() # timing - request = deserialize_message( - request_bytes, self._comm_channel_type, self._device - ) - if not self._validate_request(request): + inference_work = self._request_dispatcher.flush_requests() + if inference_work is None or 0 == len(inference_work.requests): return - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing + request = inference_work.requests[0] - if not request.raw_model: - if request.model_key is None: - # A valid request should never get here. - raise ValueError("Could not read model key") - if request.model_key in self._cached_models: - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing - model_result = LoadModelResult(self._cached_models[request.model_key]) - - else: - fetch_model_result = None - while True: - try: - interm = time.perf_counter() # timing - fetch_model_result = self._worker.fetch_model( - request, self._feature_store - ) - except KeyError: - time.sleep(0.1) - else: - break - - if fetch_model_result is None: - raise SmartSimError("Could not retrieve model from feature store") - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing - model_result = self._worker.load_model( - request, fetch_model_result, self._device - ) - self._cached_models[request.model_key] = model_result.model - else: - fetch_model_result = self._worker.fetch_model(request, None) - model_result = self._worker.load_model( - request, fetch_result=fetch_model_result, device=self._device + device: WorkerDevice = next( + self._device_manager.get_free_device( + worker=self._worker, + inference_work=inference_work, + feature_store=self._feature_store, ) + ) + + + model_result = device.get_model(inference_work.model_key) - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing fetch_input_result = self._worker.fetch_inputs(request, self._feature_store) - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing transformed_input = self._worker.transform_input( request, fetch_input_result, self._device ) - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing - reply = InferenceReply() try: @@ -314,14 +295,10 @@ def _on_iteration(self) -> None: request, model_result, transformed_input ) - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing transformed_output = self._worker.transform_output( request, execute_result, 
self._device ) - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing if request.output_keys: reply.output_keys = self._worker.place_output( request, transformed_output, self._feature_store @@ -332,9 +309,6 @@ def _on_iteration(self) -> None: logger.exception("Error executing worker") reply.failed = True - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing - if reply.failed: response = build_failure_reply("fail", "failure-occurred") else: @@ -343,22 +317,12 @@ def _on_iteration(self) -> None: response = build_reply(reply) - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing - # serialized = self._worker.serialize_reply(request, transformed_output) serialized_resp = MessageHandler.serialize_response(response) # type: ignore - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing if request.callback: request.callback.send(serialized_resp) - timings.append(time.perf_counter() - interm) # timing - interm = time.perf_counter() # timing - - print(" ".join(str(time) for time in timings)) # timing - def _can_shutdown(self) -> bool: """Return true when the criteria to shut down the service are met.""" # todo: determine shutdown criteria diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 900a8241de..9dfa974785 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -250,7 +250,7 @@ def place_output( class MachineLearningWorkerBase(MachineLearningWorkerCore, ABC): - """Abstrct base class providing contract for a machine learning + """Abstract base class providing contract for a machine learning worker implementation.""" @staticmethod diff --git a/smartsim/_core/mli/mli_schemas/model/__init__.py b/smartsim/_core/mli/mli_schemas/model/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/smartsim/_core/mli/mli_schemas/model/utils.py b/smartsim/_core/mli/mli_schemas/model/utils.py new file mode 100644 index 0000000000..b16dc8f623 --- /dev/null +++ b/smartsim/_core/mli/mli_schemas/model/utils.py @@ -0,0 +1,41 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import typing as t +from collections import namedtuple + +from .model_capnp import Model + +ModelInfo = namedtuple("ModelInfo", ["Name", "Version"]) + + +def make_model_key(model: Model) -> str: + return f"{model.name}_{model.version}" + + +def get_model_name_and_version(key: str) -> t.NamedTuple: + split_key = key.rsplit("_", 1) + return ModelInfo(split_key[0], split_key[1]) From 4a83abe1e1ca1b033b8a224ec5a2d9e058100846 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Mon, 15 Jul 2024 13:17:49 -0500 Subject: [PATCH 33/84] Changes to entrypoint --- ex/high_throughput_inference/standalone_workermanager.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index c56e11a7c3..f91c2269c6 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -75,7 +75,9 @@ to_worker_fli_serialized = to_worker_fli.serialize() ddict["to_worker_fli"] = to_worker_fli_serialized - torch_worker = cloudpickle.loads(base64.b64decode(args.worker_class.encode('ascii')))() + torch_worker = cloudpickle.loads( + base64.b64decode(args.worker_class.encode('ascii')) + )() dfs = DragonFeatureStore(ddict) comm_channel = DragonFLIChannel(to_worker_fli_serialized) From 6ea0671e69192434732ad3e3195c019d45b21da8 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 16 Jul 2024 05:24:45 -0500 Subject: [PATCH 34/84] Use batch where needed --- .../infrastructure/control/devicemanager.py | 19 +-- .../control/requestdispatcher.py | 13 +- .../infrastructure/control/workermanager.py | 77 ++++++------ .../mli/infrastructure/worker/torch_worker.py | 71 +++++++---- .../_core/mli/infrastructure/worker/worker.py | 115 +++++++++--------- 5 files changed, 158 insertions(+), 137 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/devicemanager.py b/smartsim/_core/mli/infrastructure/control/devicemanager.py index 94c2404ead..4b3d2a8edb 100644 --- a/smartsim/_core/mli/infrastructure/control/devicemanager.py +++ b/smartsim/_core/mli/infrastructure/control/devicemanager.py @@ -31,7 +31,7 @@ from ...infrastructure.storage.featurestore import FeatureStore from ..worker.worker import MachineLearningWorkerBase -from .requestdispatcher import InferenceWork +from .requestdispatcher import InferenceBatch class WorkerDevice: @@ -67,7 +67,7 @@ def remove_model(self, key: str) -> None: def get_model(self, key: str) -> t.Any: return self._models[key] - def __contains__(self, key: str): + def __contains__(self, key: str) -> bool: return key in self._models def __exit__( @@ -87,18 +87,18 @@ def __init__(self, devices: list[WorkerDevice]): def get_free_device( self, worker: MachineLearningWorkerBase, - inference_work: InferenceWork, + batch: InferenceBatch, feature_store: t.Optional[FeatureStore], ) -> t.Generator[WorkerDevice, None, None]: return_device = None - sample_request = inference_work.requests[0] + 
sample_request = batch.requests[0] direct_inference = sample_request.raw_model is not None while return_device is None: loaded_devices = [] if not direct_inference: # Look up devices to see if any of them already has a copy of the model for device in self._devices: - if inference_work.model_key in device: + if batch.model_key in device: loaded_devices.append(device) # If a pre-loaded model is found on a device, try using that device @@ -113,12 +113,12 @@ def get_free_device( candidate_device not in loaded_devices and candidate_device.acquire(blocking=False) ): - model_bytes = worker.fetch_model(sample_request, feature_store) + model_bytes = worker.fetch_model(batch, feature_store) loaded_model = worker.load_model( - sample_request, model_bytes, candidate_device.name + batch, model_bytes, candidate_device.name ) candidate_device.add_model( - inference_work.model_key, loaded_model.model + batch.model_key, loaded_model.model ) return_device = candidate_device @@ -126,5 +126,6 @@ def get_free_device( try: yield return_device finally: - return_device.remove_model(inference_work.model_key) + if direct_inference: + return_device.remove_model(batch.model_key) return_device.release() diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index 520605c588..6592187f1f 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -26,26 +26,19 @@ import time import typing as t import uuid -from dataclasses import dataclass from queue import Empty, Full, Queue from threading import RLock from types import TracebackType from packaging.version import Version -from ...infrastructure.worker.worker import InferenceRequest +from ...infrastructure.worker.worker import InferenceBatch, InferenceRequest from ...mli_schemas.model.model_capnp import Model if t.TYPE_CHECKING: from dragon.fli import FLInterface -@dataclass -class InferenceWork: - model_key: str - requests: list[InferenceRequest] - - class WorkerDevice: def __init__(self, name: str) -> None: """Wrapper around a device to keep track of loaded Models and availability @@ -214,11 +207,11 @@ def _update_model_version(self, model: Model) -> None: self._model_last_version[model.name] = Version(model.version) return - def flush_requests(self) -> t.Optional[InferenceWork]: + def flush_requests(self) -> t.Optional[InferenceBatch]: result = None for queue in self._queues: if queue.acquire(blocking=False) and queue.ready: - result = InferenceWork( + result = InferenceBatch( model_key=queue.model_key, requests=queue.flush() ) queue.release() diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 9163be4cec..af7ceec844 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -23,12 +23,10 @@ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
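
# A minimal, self-contained sketch of the checkout pattern implemented by
# DeviceManager.get_free_device above: find a free device, load the model if
# needed, yield the device, then release it -- and, for direct-inference
# requests, evict the model again -- in a `finally` block. ToyDevice and
# checkout_device are hypothetical names used for illustration only; this is
# not the SmartSim API.
import typing as t
from threading import Lock


class ToyDevice:
    """Stand-in for WorkerDevice: a named device guarded by a lock."""

    def __init__(self, name: str) -> None:
        self.name = name
        self.models: dict[str, str] = {}
        self._lock = Lock()

    def acquire(self, blocking: bool = True) -> bool:
        return self._lock.acquire(blocking=blocking)

    def release(self) -> None:
        self._lock.release()


def checkout_device(
    devices: list[ToyDevice], model_key: str, direct_inference: bool
) -> t.Generator[ToyDevice, None, None]:
    """Yield a free device with the model loaded; clean up when the caller
    is done with the generator."""
    chosen: t.Optional[ToyDevice] = None
    while chosen is None:
        for device in devices:
            if device.acquire(blocking=False):
                # a real worker would call load_model() here
                device.models.setdefault(model_key, "<loaded model>")
                chosen = device
                break
    try:
        yield chosen
    finally:
        if direct_inference:
            chosen.models.pop(model_key, None)
        chosen.release()


if __name__ == "__main__":
    pool = [ToyDevice("gpu:0"), ToyDevice("gpu:1")]
    # the worker manager consumes a single item with next(...)
    gen = checkout_device(pool, model_key="my-model", direct_inference=True)
    device = next(gen)
    print(f"running batch on {device.name}")
    # closing the generator runs the `finally` clean-up in this sketch
    gen.close()
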
-import time import typing as t import numpy as np -from .....error import SmartSimError from .....log import ContextThread, get_logger from ....entrypoints.service import Service from ...comm.channel.channel import CommChannelBase @@ -265,63 +263,70 @@ def _on_iteration(self) -> None: logger.warning("No queue to check for tasks") return - inference_work = self._request_dispatcher.flush_requests() - if inference_work is None or 0 == len(inference_work.requests): + batch = self._request_dispatcher.flush_requests() + if batch is None or 0 == len(batch.requests): return - request = inference_work.requests[0] + # sample_request = inference_work.requests[0] device: WorkerDevice = next( self._device_manager.get_free_device( worker=self._worker, - inference_work=inference_work, + batch=batch, feature_store=self._feature_store, ) ) + model_result = LoadModelResult(device.get_model(batch.model_key)) - model_result = device.get_model(inference_work.model_key) - - fetch_input_result = self._worker.fetch_inputs(request, self._feature_store) + fetch_input_results = self._worker.fetch_inputs(batch, self._feature_store) transformed_input = self._worker.transform_input( - request, fetch_input_result, self._device + batch, fetch_input_results, self._device ) - reply = InferenceReply() + replies: list[InferenceReply] = [InferenceReply() for _ in range(len(batch.requests))] try: execute_result = self._worker.execute( - request, model_result, transformed_input + batch, model_result, transformed_input ) - - transformed_output = self._worker.transform_output( - request, execute_result, self._device + transformed_outputs = self._worker.transform_output( + batch, execute_result, self._device ) - - if request.output_keys: - reply.output_keys = self._worker.place_output( - request, transformed_output, self._feature_store - ) - else: - reply.outputs = transformed_output.outputs except Exception: logger.exception("Error executing worker") - reply.failed = True - - if reply.failed: - response = build_failure_reply("fail", "failure-occurred") + for reply in replies: + reply.failed = True else: - if reply.outputs is None or not reply.outputs: - response = build_failure_reply("fail", "no-results") - - response = build_reply(reply) - - # serialized = self._worker.serialize_reply(request, transformed_output) - serialized_resp = MessageHandler.serialize_response(response) # type: ignore - - if request.callback: - request.callback.send(serialized_resp) + for reply_idx, (request, transformed_output) in enumerate(zip( + batch.requests, transformed_outputs + )): + reply = replies[reply_idx] + try: + if request.output_keys: + reply.output_keys = self._worker.place_output( + request, transformed_output, self._feature_store + ) + else: + reply.outputs = transformed_output.outputs + except Exception: + logger.exception("Error executing worker") + reply.failed = True + + if reply.failed: + response = build_failure_reply("fail", "failure-occurred") + else: + if reply.outputs is None or not reply.outputs: + response = build_failure_reply("fail", "no-results") + + response = build_reply(reply) + + # serialized = self._worker.serialize_reply(request, transformed_output) + serialized_resp = MessageHandler.serialize_response(response) # type: ignore + + if request.callback: + request.callback.send(serialized_resp) def _can_shutdown(self) -> bool: """Return true when the criteria to shut down the service are met.""" diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py 
b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index a4e725ab99..25c762c6a2 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -36,6 +36,7 @@ ExecuteResult, FetchInputResult, FetchModelResult, + InferenceBatch, InferenceRequest, LoadModelResult, MachineLearningWorkerBase, @@ -51,8 +52,9 @@ class TorchWorker(MachineLearningWorkerBase): @staticmethod def load_model( - request: InferenceRequest, fetch_result: FetchModelResult, device: str + batch: InferenceBatch, fetch_result: FetchModelResult, device: str ) -> LoadModelResult: + request = batch.requests[0] if fetch_result.model_bytes: model_bytes = fetch_result.model_bytes elif request.raw_model and request.raw_model.data: @@ -69,27 +71,45 @@ def load_model( @staticmethod def transform_input( - request: InferenceRequest, fetch_result: FetchInputResult, device: str + batch: InferenceBatch, fetch_results: list[FetchInputResult], device: str ) -> TransformInputResult: - result = [] + results: list[list[torch.Tensor]] = [] + start = 0 + slices: list[slice] = [] device_to_torch = {"cpu": "cpu", "gpu": "cuda"} - device = device_to_torch[device] - if fetch_result.meta is None: - raise ValueError("Cannot reconstruct tensor without meta information") - for item, item_meta in zip(fetch_result.inputs, fetch_result.meta): - tensor_desc: tensor_capnp.TensorDescriptor = item_meta + for old, new in device_to_torch.items(): + device.replace(old, new) + + for fetch_result in fetch_results: + partial_result = [] + if fetch_result.meta is None: + raise ValueError("Cannot reconstruct tensor without meta information") + for item, item_meta in zip(fetch_result.inputs, fetch_result.meta): + tensor_desc: tensor_capnp.TensorDescriptor = item_meta + partial_result.append( + torch.tensor(np.frombuffer(item, dtype=str(tensor_desc.dataType))) + .to(device) + .reshape(tuple(dim for dim in tensor_desc.dimensions)) + ) + results.append(partial_result) + num_samples = fetch_result.meta[0].dimensions[0] + slices.append(slice(start, start + num_samples)) + start = start + num_samples + + result: list[torch.Tensor] = [] + for t_idx in range(len(results[0])): result.append( - torch.tensor(np.frombuffer(item, dtype=str(tensor_desc.dataType))) - .to(device) - .reshape(tuple(dim for dim in tensor_desc.dimensions)) + torch.concatenate([partial_result[t_idx] for partial_result in results]) ) - return TransformInputResult(result) + + return TransformInputResult(result, slices) # return data # note: this fails copy test! + # pylint: disable-next=unused-argument @staticmethod def execute( - request: InferenceRequest, + batch: InferenceBatch, load_result: LoadModelResult, transform_result: TransformInputResult, ) -> ExecuteResult: @@ -100,20 +120,23 @@ def execute( model.eval() results = [model(tensor).detach() for tensor in transform_result.transformed] - execute_result = ExecuteResult(results) + execute_result = ExecuteResult(results, transform_result.slices) return execute_result @staticmethod def transform_output( - request: InferenceRequest, + batch: InferenceBatch, execute_result: ExecuteResult, result_device: str, - ) -> TransformOutputResult: - if result_device != "cpu": - transformed = [item.to("cpu") for item in execute_result.predictions] - # todo: need the shape from latest schemas added here. 
- return TransformOutputResult(transformed, None, "c", "float32") # fixme - - return TransformOutputResult( - execute_result.predictions, None, "c", "float32" - ) # fixme + ) -> list[TransformOutputResult]: + transformed_list: list[TransformOutputResult] = [] + for result_slice in execute_result.slices: + if result_device != "cpu": + transformed = [item.to("cpu") for item in execute_result.predictions[result_slice]] + # todo: need the shape from latest schemas added here. + transformed_list.append(TransformOutputResult(transformed, None, "c", "float32")) # fixme + + transformed_list.append(TransformOutputResult( + execute_result.predictions[result_slice], None, "c", "float32" + )) # fixme + return transformed_list \ No newline at end of file diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 9dfa974785..2fa03b1297 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -26,6 +26,7 @@ import typing as t from abc import ABC, abstractmethod +from dataclasses import dataclass from .....error import SmartSimError from .....log import get_logger @@ -63,6 +64,12 @@ def __init__( self.batch_size = batch_size +@dataclass +class InferenceBatch: + model_key: str + requests: list[InferenceRequest] + + class InferenceReply: """Internal representation of the reply to a client request for inference""" @@ -87,19 +94,21 @@ def __init__(self, model: t.Any) -> None: class TransformInputResult: - """A wrapper around a transformed input""" + """A wrapper around a transformed batchinput""" - def __init__(self, result: t.Any) -> None: + def __init__(self, result: t.Any, slices: list[slice]) -> None: """Initialize the object""" self.transformed = result + self.slices = slices class ExecuteResult: """A wrapper around inference results""" - def __init__(self, result: t.Any) -> None: + def __init__(self, result: t.Any, slices: list[slice]) -> None: """Initialize the object""" self.predictions = result + self.slices = slices class FetchInputResult: @@ -145,82 +154,72 @@ class MachineLearningWorkerCore: @staticmethod def fetch_model( - request: InferenceRequest, feature_store: t.Optional[FeatureStore] + batch: InferenceBatch, feature_store: t.Optional[FeatureStore] ) -> FetchModelResult: """Given a resource key, retrieve the raw model from a feature store - :param request: The request that triggered the pipeline + :param batc: The batch of requests that triggered the pipeline :param feature_store: The feature store used for persistence :return: Raw bytes of the model""" - if request.raw_model: - # Should we cache model in the feature store? 
- # model_key = hash(request.raw_model) - # feature_store[model_key] = request.raw_model - # short-circuit and return the directly supplied model - return FetchModelResult(request.raw_model.data) + # All requests in the same batch share the model + sample_request = batch.requests[0] + if sample_request.raw_model: + return FetchModelResult(sample_request.raw_model.data) if not feature_store: raise ValueError("Feature store is required for model retrieval") - if not request.model_key: + if not sample_request.model_key: raise SmartSimError( "Key must be provided to retrieve model from feature store" ) try: - raw_bytes: bytes = t.cast(bytes, feature_store[request.model_key]) + raw_bytes: bytes = t.cast(bytes, feature_store[sample_request.model_key]) return FetchModelResult(raw_bytes) except FileNotFoundError as ex: logger.exception(ex) raise SmartSimError( - f"Model could not be retrieved with key {request.model_key}" + f"Model could not be retrieved with key {sample_request.model_key}" ) from ex @staticmethod def fetch_inputs( - request: InferenceRequest, feature_store: t.Optional[FeatureStore] - ) -> FetchInputResult: + batch: InferenceBatch, feature_store: t.Optional[FeatureStore] + ) -> t.List[FetchInputResult]: """Given a collection of ResourceKeys, identify the physical location and input metadata :param request: The request that triggered the pipeline :param feature_store: The feature store used for persistence :return: the fetched input""" - - if request.raw_inputs: - return FetchInputResult(request.raw_inputs, request.input_meta) - - if not feature_store: - raise ValueError("No input and no feature store provided") - - if request.input_keys: - data: t.List[bytes] = [] - for input_ in request.input_keys: - try: - tensor_bytes = t.cast(bytes, feature_store[input_]) - data.append(tensor_bytes) - except KeyError as ex: - logger.exception(ex) - raise SmartSimError( - f"Model could not be retrieved with key {input_}" - ) from ex - return FetchInputResult( - data, None - ) # fixme: need to get both tensor and descriptor - - raise ValueError("No input source") - - @staticmethod - def batch_requests( - request: InferenceRequest, transform_result: TransformInputResult - ) -> CreateInputBatchResult: - """Create a batch of requests. Return the batch when batch_size datum have been - collected or a configured batch duration has elapsed. 
- :param request: The request that triggered the pipeline - :param transform_result: Transformed inputs ready for batching - :return: `None` if batch size has not been reached and timeout not exceeded.""" - if transform_result is not None or request.batch_size: - raise NotImplementedError("Batching is not yet supported") - return CreateInputBatchResult(None) + fetch_results = [] + for request in batch.requests: + if request.raw_inputs: + fetch_results.append( + FetchInputResult(request.raw_inputs, request.input_meta) + ) + + if not feature_store: + raise ValueError("No input and no feature store provided") + + if request.input_keys: + data: t.List[bytes] = [] + for input_ in request.input_keys: + try: + tensor_bytes = t.cast(bytes, feature_store[input_]) + data.append(tensor_bytes) + except KeyError as ex: + logger.exception(ex) + raise SmartSimError( + f"Input tensor could not be retrieved with key {input_}" + ) from ex + fetch_results.append( + FetchInputResult(data, None) + ) # fixme: need to get both tensor and descriptor + + raise ValueError("No input source") + + return fetch_results @staticmethod def place_output( @@ -256,7 +255,7 @@ class MachineLearningWorkerBase(MachineLearningWorkerCore, ABC): @staticmethod @abstractmethod def load_model( - request: InferenceRequest, fetch_result: FetchModelResult, device: str + batch: InferenceBatch, fetch_result: FetchModelResult, device: str ) -> LoadModelResult: """Given a loaded MachineLearningModel, ensure it is loaded into device memory @@ -267,18 +266,18 @@ def load_model( @staticmethod @abstractmethod def transform_input( - request: InferenceRequest, fetch_result: FetchInputResult, device: str + batch: InferenceBatch, fetch_results: list[FetchInputResult], device: str ) -> TransformInputResult: """Given a collection of data, perform a transformation on the data :param request: The request that triggered the pipeline - :param fetch_result: Raw output from fetching inputs out of a feature store + :param fetch_result: Raw outputs from fetching inputs out of a feature store :param device: The device on which the transformed input must be placed :return: The transformed inputs wrapped in a InputTransformResult""" @staticmethod @abstractmethod def execute( - request: InferenceRequest, + batch: InferenceBatch, load_result: LoadModelResult, transform_result: TransformInputResult, ) -> ExecuteResult: @@ -291,8 +290,8 @@ def execute( @staticmethod @abstractmethod def transform_output( - request: InferenceRequest, execute_result: ExecuteResult, result_device: str - ) -> TransformOutputResult: + batch: InferenceBatch, execute_result: ExecuteResult, result_device: str + ) -> t.List[TransformOutputResult]: """Given inference results, perform transformations required to transmit results to the requestor. 
:param request: The request that triggered the pipeline From d26e5f0b19bea99f4263990bc69ca1f7a6fce6b5 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 16 Jul 2024 18:09:57 -0500 Subject: [PATCH 35/84] Adjustments, get back to one thread --- smartsim/_core/entrypoints/service.py | 17 --- .../infrastructure/control/devicemanager.py | 1 + .../control/requestdispatcher.py | 47 ++++++-- .../infrastructure/control/workermanager.py | 112 ++++++++++++++---- .../mli/infrastructure/worker/torch_worker.py | 14 ++- .../_core/mli/infrastructure/worker/worker.py | 2 + 6 files changed, 137 insertions(+), 56 deletions(-) diff --git a/smartsim/_core/entrypoints/service.py b/smartsim/_core/entrypoints/service.py index df9c2bbef6..6b4ef74b67 100644 --- a/smartsim/_core/entrypoints/service.py +++ b/smartsim/_core/entrypoints/service.py @@ -103,23 +103,6 @@ def execute(self) -> None: running = True cooldown_start: t.Optional[datetime.datetime] = None - headers = [ - "batch_size", - "w_deserialize", - "w_fetch_model", - "w_load_model", - "w_fetch_input", - "w_transform_input", - "w_execute", - "w_transform_output", - "w_assign_output", - "w_build_reply", - "w_serialize_resp", - "w_send", - ] - - print(",".join(headers)) - while running: self._on_iteration() diff --git a/smartsim/_core/mli/infrastructure/control/devicemanager.py b/smartsim/_core/mli/infrastructure/control/devicemanager.py index 4b3d2a8edb..8d284c1262 100644 --- a/smartsim/_core/mli/infrastructure/control/devicemanager.py +++ b/smartsim/_core/mli/infrastructure/control/devicemanager.py @@ -105,6 +105,7 @@ def get_free_device( for device in loaded_devices: if device.acquire(blocking=False): return_device = device + break # If the model is not loaded on a free device, load it on another device (if available) if return_device is None: diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index 6592187f1f..19f3256cef 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -32,12 +32,14 @@ from packaging.version import Version +from .....log import get_logger from ...infrastructure.worker.worker import InferenceBatch, InferenceRequest from ...mli_schemas.model.model_capnp import Model if t.TYPE_CHECKING: from dragon.fli import FLInterface +logger = get_logger("Request Dispatcher") class WorkerDevice: def __init__(self, name: str) -> None: @@ -78,6 +80,11 @@ def __init__(self, batch_timeout: float, batch_size: int, model_key: str) -> Non self._disposable = False self._model_key = model_key self._flush_lock = RLock() + self._id = str(uuid.uuid4()) + + @property + def id(self): + return self._id def acquire(self, blocking: bool = True, timeout: float = -1) -> t.Optional[bool]: return self._flush_lock.acquire(blocking=blocking, timeout=timeout) @@ -106,11 +113,17 @@ def put( block: bool = False, timeout: t.Optional[float] = 0.0, ) -> None: - if not self.acquire(blocking=False) or self.disposable: + if not self.acquire(blocking=False): + logger.error(f"Could not acquire queue {self._id} to put") raise Full - if self._first_put is None: - self._first_put = time.time() - super().put(item, block=block, timeout=timeout) + try: + if self.full(): + raise Full + if self._first_put is None: + self._first_put = time.time() + super().put(item, block=block, timeout=timeout) + finally: + self.release() @property def _waited_time(self) -> float: @@ -146,6 +159,10 @@ def flush(self) -> 
list[t.Any]: return items def full(self) -> bool: + if self._disposable: + return True + if self._batch_size <= 0: + return False return self.qsize() >= self._batch_size def empty(self) -> bool: @@ -158,7 +175,7 @@ def __init__( batch_timeout: float, batch_size: int, ) -> None: - self._queues: list[BatchQueue] + self._queues: list[BatchQueue] = [] self._active_queues: dict[str, BatchQueue] = {} self._model_last_version: dict[str, Version] = {} self._model_name_to_key: dict[str, str] = {} @@ -170,15 +187,19 @@ def _swap_queue(self, model_key: str) -> None: with self._queue_swap_lock: for queue in self._queues: if queue.model_key == model_key and not queue.full(): + logger.info("Found queue, swapping") self._active_queues[model_key] = queue return + logger.info("Creating new queue") new_queue = BatchQueue(self._batch_timeout, self._batch_size, model_key) + self._queues.append(new_queue) self._active_queues[model_key] = new_queue return def dispatch(self, request: InferenceRequest) -> None: if request.raw_model is not None: + logger.info("Direct inference requested, creating tmp queue") tmp_id = f"_tmp_{str(uuid.uuid4())}" tmp_queue: BatchQueue = BatchQueue( batch_timeout=0, batch_size=1, model_key=tmp_id @@ -189,12 +210,14 @@ def dispatch(self, request: InferenceRequest) -> None: return if request.model_key: + logger.info("Indirect inference requested, dispatching it to existing queue") success = False while not success: try: self._active_queues[request.model_key].put_nowait(request) success = True except (Full, KeyError): + logger.info("Could not find non-full queue, swapping") self._swap_queue(request.model_key) def _update_model_version(self, model: Model) -> None: @@ -210,11 +233,15 @@ def _update_model_version(self, model: Model) -> None: def flush_requests(self) -> t.Optional[InferenceBatch]: result = None for queue in self._queues: - if queue.acquire(blocking=False) and queue.ready: - result = InferenceBatch( - model_key=queue.model_key, requests=queue.flush() - ) - queue.release() + # logger.info("Acquiring queue to flush") + if queue.ready and queue.acquire(blocking=False): + try: + logger.info(f"Acquired queue {queue.id}") + result = InferenceBatch( + model_key=queue.model_key, requests=queue.flush() + ) + finally: + queue.release() break return result diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index af7ceec844..674bfc93a1 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -24,8 +24,11 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
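
# A simplified, self-contained sketch of the readiness rule the BatchQueue
# above implements: a non-empty queue is flushed either once it holds
# `batch_size` items or once `batch_timeout` seconds have passed since the
# first put. ToyBatchQueue is an illustrative stand-in built on the standard
# library only; it is not the dispatcher's actual queue class.
import time
import typing as t
from queue import Empty, Queue


class ToyBatchQueue(Queue):
    def __init__(self, batch_timeout: float, batch_size: int) -> None:
        super().__init__(maxsize=batch_size)
        self._batch_timeout = batch_timeout
        self._batch_size = batch_size
        self._first_put: t.Optional[float] = None

    def put(
        self, item: t.Any, block: bool = False, timeout: t.Optional[float] = None
    ) -> None:
        if self._first_put is None:
            self._first_put = time.time()
        super().put(item, block=block, timeout=timeout)

    @property
    def ready(self) -> bool:
        if self.empty():
            return False
        waited = time.time() - (self._first_put or time.time())
        return self.qsize() >= self._batch_size or waited >= self._batch_timeout

    def flush(self) -> t.List[t.Any]:
        self._first_put = None
        items = []
        while True:
            try:
                items.append(self.get_nowait())
            except Empty:
                return items


if __name__ == "__main__":
    queue = ToyBatchQueue(batch_timeout=0.05, batch_size=4)
    queue.put("request-1")
    print(queue.ready)    # False: not full and timeout not reached yet
    time.sleep(0.06)
    print(queue.ready)    # True: the batch timeout elapsed
    print(queue.flush())  # ['request-1']
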
import typing as t - +import time import numpy as np +import numbers + +from collections import OrderedDict from .....log import ContextThread, get_logger from ....entrypoints.service import Service @@ -166,7 +169,7 @@ def __init__( comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel, device: t.Literal["cpu", "gpu"] = "cpu", batch_timeout: float = 0.0, - batch_size: int = 0, + batch_size: int = 1, ) -> None: """Initialize the WorkerManager :param config_loader: Environment config loader that loads the task queue and @@ -200,28 +203,73 @@ def __init__( self._dispatcher_threads = 1 """Number of threads which dispatch requests""" self._device_manager: DeviceManager = DeviceManager([WorkerDevice("gpu")]) + self._start = None + self._interm = None + self._timings: OrderedDict[str, list[numbers.Number]] = OrderedDict() + self._timing_on = True + + def _add_label_to_timings(self, label: str): + if label not in self._timings: + self._timings[label] = [] + + @staticmethod + def _format_number(number: numbers.Number): + return f"{number:0.4e}" + + def start_timings(self): + if self._timing_on: + # self._add_label_to_timings("batch_size") + # self._timings["batch_size"].append(batch_size) + self._start = time.perf_counter() + self._interm = time.perf_counter() + + def end_timings(self): + if self._timing_on: + self._add_label_to_timings("total_time") + self._timings["total_time"].append(self._format_number(time.perf_counter()-self._start)) + + def measure_time(self, label: str): + if self._timing_on: + self._add_label_to_timings(label) + self._timings[label].append(self._format_number(time.perf_counter()-self._interm)) + self._interm = time.perf_counter() + + def print_timings(self, to_file: bool = False): + print(" ".join(self._timings.keys())) + value_array = np.array([value for value in self._timings.values()], dtype=float) + value_array = np.transpose(value_array) + for i in range(value_array.shape[0]): + print(" ".join(self._format_number(value) for value in value_array[i])) + if to_file: + np.save("timings.npy", value_array) + np.savetxt("timings.txt", value_array) + def _receive_requests(self) -> None: if self._task_queue is None: return - while not self._can_shutdown(): - # perform default deserialization of the message envelope - request_bytes: bytes = self._task_queue.recv() - - request = deserialize_message(request_bytes, self._comm_channel_type) - if not self._validate_request(request): - return + # while not self._can_shutdown(): + # perform default deserialization of the message envelope + request_bytes: bytes = self._task_queue.recv() + + self.start_timings() + request = deserialize_message(request_bytes, self._comm_channel_type) + self.measure_time("w_deserialize") + if not self._validate_request(request): + return - self._request_dispatcher.dispatch(request) + self._request_dispatcher.dispatch(request) + self.measure_time("w_dispatch") def _on_start(self) -> None: - for thread_idx in range(self._dispatcher_threads): - dispatcher_thread = ContextThread( - name=f"Dispatcher_{thread_idx}", - target=self._receive_requests, - daemon=True, - ) - dispatcher_thread.start() + # for thread_idx in range(self._dispatcher_threads): + # dispatcher_thread = ContextThread( + # name=f"Dispatcher_{thread_idx}", + # target=self._receive_requests, + # daemon=True, + # ) + # dispatcher_thread.start() + pass def _validate_request(self, request: InferenceRequest) -> bool: """Ensure the request can be processed. 
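
# The timing hooks above follow a simple pattern: one list of measured
# durations per label, appended in pipeline order and printed as columns.
# Below is a minimal, standard-library-only sketch of that bookkeeping;
# ToyTimer is illustrative and is not the PerfTimer utility introduced
# later in this series.
import time
from collections import OrderedDict


class ToyTimer:
    def __init__(self) -> None:
        self._timings: "OrderedDict[str, list[float]]" = OrderedDict()
        self._start = self._interm = time.perf_counter()

    def start(self) -> None:
        self._start = self._interm = time.perf_counter()

    def measure(self, label: str) -> None:
        now = time.perf_counter()
        self._timings.setdefault(label, []).append(now - self._interm)
        self._interm = now

    def end(self) -> None:
        self._timings.setdefault("total_time", []).append(
            time.perf_counter() - self._start
        )

    def print_timings(self) -> None:
        print(" ".join(self._timings.keys()))
        for row in zip(*self._timings.values()):
            print(" ".join(f"{value:0.4e}" for value in row))


if __name__ == "__main__":
    timer = ToyTimer()
    for _ in range(3):
        timer.start()
        time.sleep(0.01)
        timer.measure("w_fetch_input")
        time.sleep(0.02)
        timer.measure("w_execute")
        timer.end()
    timer.print_timings()
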
@@ -259,16 +307,15 @@ def _on_iteration(self) -> None: the inference pipeline""" logger.debug("executing worker manager pipeline") - if self._task_queue is None: - logger.warning("No queue to check for tasks") - return + self._receive_requests() + # logger.info("Getting request batch") batch = self._request_dispatcher.flush_requests() if batch is None or 0 == len(batch.requests): return - # sample_request = inference_work.requests[0] - + self.measure_time("w_flush_requests") + # logger.info(f"Got batch of {len(batch.requests)} requests, acquiring device") device: WorkerDevice = next( self._device_manager.get_free_device( worker=self._worker, @@ -276,24 +323,32 @@ def _on_iteration(self) -> None: feature_store=self._feature_store, ) ) + self.measure_time("w_fetch_model") + + # logger.info(f"Acquired device {device.name}") model_result = LoadModelResult(device.get_model(batch.model_key)) + self.measure_time("w_load_model") fetch_input_results = self._worker.fetch_inputs(batch, self._feature_store) + self.measure_time("w_fetch_input") transformed_input = self._worker.transform_input( batch, fetch_input_results, self._device ) + self.measure_time("w_transform_input") - replies: list[InferenceReply] = [InferenceReply() for _ in range(len(batch.requests))] + replies = [InferenceReply() for _ in range(len(batch.requests))] try: execute_result = self._worker.execute( batch, model_result, transformed_input ) + self.measure_time("w_execute") transformed_outputs = self._worker.transform_output( batch, execute_result, self._device ) + self.measure_time("w_transform_output") except Exception: logger.exception("Error executing worker") for reply in replies: @@ -310,10 +365,12 @@ def _on_iteration(self) -> None: ) else: reply.outputs = transformed_output.outputs + self.measure_time("w_assign_output") except Exception: logger.exception("Error executing worker") reply.failed = True + if reply.failed: response = build_failure_reply("fail", "failure-occurred") else: @@ -321,12 +378,21 @@ def _on_iteration(self) -> None: response = build_failure_reply("fail", "no-results") response = build_reply(reply) + self.measure_time("w_build_reply") # serialized = self._worker.serialize_reply(request, transformed_output) serialized_resp = MessageHandler.serialize_response(response) # type: ignore + self.measure_time("w_serialize_resp") + if request.callback: request.callback.send(serialized_resp) + self.measure_time("w_send") + + self.end_timings() + + if len(self._timings["w_send"]) == 801: + self.print_timings(True) def _can_shutdown(self) -> bool: """Return true when the criteria to shut down the service are met.""" diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index 25c762c6a2..3156b587a7 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -37,7 +37,6 @@ FetchInputResult, FetchModelResult, InferenceBatch, - InferenceRequest, LoadModelResult, MachineLearningWorkerBase, TransformInputResult, @@ -79,7 +78,7 @@ def transform_input( device_to_torch = {"cpu": "cpu", "gpu": "cuda"} for old, new in device_to_torch.items(): - device.replace(old, new) + device = device.replace(old, new) for fetch_result in fetch_results: partial_result = [] @@ -98,10 +97,13 @@ def transform_input( start = start + num_samples result: list[torch.Tensor] = [] - for t_idx in range(len(results[0])): - result.append( - torch.concatenate([partial_result[t_idx] for partial_result in 
results]) - ) + if len(batch.requests) > 1: + for t_idx in range(len(results[0])): + result.append( + torch.concatenate([partial_result[t_idx] for partial_result in results]) + ) + else: + result = results[0] return TransformInputResult(result, slices) # return data # note: this fails copy test! diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 2fa03b1297..adc6b6edee 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -198,6 +198,7 @@ def fetch_inputs( fetch_results.append( FetchInputResult(request.raw_inputs, request.input_meta) ) + continue if not feature_store: raise ValueError("No input and no feature store provided") @@ -216,6 +217,7 @@ def fetch_inputs( fetch_results.append( FetchInputResult(data, None) ) # fixme: need to get both tensor and descriptor + continue raise ValueError("No input source") From 293e9777e81183afc14e33f0b857f05fcdc0639a Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 17 Jul 2024 02:04:42 +0200 Subject: [PATCH 36/84] Move timing --- smartsim/_core/mli/infrastructure/control/workermanager.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 674bfc93a1..9b59e7144e 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -254,9 +254,9 @@ def _receive_requests(self) -> None: self.start_timings() request = deserialize_message(request_bytes, self._comm_channel_type) - self.measure_time("w_deserialize") if not self._validate_request(request): return + self.measure_time("w_deserialize") self._request_dispatcher.dispatch(request) self.measure_time("w_dispatch") @@ -365,10 +365,10 @@ def _on_iteration(self) -> None: ) else: reply.outputs = transformed_output.outputs - self.measure_time("w_assign_output") except Exception: logger.exception("Error executing worker") reply.failed = True + self.measure_time("w_assign_output") if reply.failed: @@ -380,7 +380,6 @@ def _on_iteration(self) -> None: response = build_reply(reply) self.measure_time("w_build_reply") - # serialized = self._worker.serialize_reply(request, transformed_output) serialized_resp = MessageHandler.serialize_response(response) # type: ignore self.measure_time("w_serialize_resp") From 40c047133897d31bf9f0d6f1f8450527fb56b62f Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 17 Jul 2024 16:00:15 -0500 Subject: [PATCH 37/84] multiprocess solution --- smartsim/_core/mli/comm/channel/dragonfli.py | 4 +- .../infrastructure/control/devicemanager.py | 4 +- .../control/requestdispatcher.py | 179 ++++++++++++++-- .../infrastructure/control/workermanager.py | 196 ++++-------------- .../mli/infrastructure/worker/torch_worker.py | 41 ++-- smartsim/_core/utils/timings.py | 89 ++++++++ 6 files changed, 322 insertions(+), 191 deletions(-) create mode 100644 smartsim/_core/utils/timings.py diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index 1c02857eab..319875db2c 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -59,10 +59,10 @@ def send(self, value: bytes) -> None: def recv(self) -> bytes: """Receieve a message through the underlying communication channel :returns: the received message""" - with self._fli.recvh(timeout=None) as 
recvh: + with self._fli.recvh() as recvh: try: request_bytes: bytes - request_bytes, _ = recvh.recv_bytes(timeout=None) + request_bytes, _ = recvh.recv_bytes() return request_bytes except fli.FLIEOT: return b"" diff --git a/smartsim/_core/mli/infrastructure/control/devicemanager.py b/smartsim/_core/mli/infrastructure/control/devicemanager.py index 8d284c1262..1a2a860aa9 100644 --- a/smartsim/_core/mli/infrastructure/control/devicemanager.py +++ b/smartsim/_core/mli/infrastructure/control/devicemanager.py @@ -118,9 +118,7 @@ def get_free_device( loaded_model = worker.load_model( batch, model_bytes, candidate_device.name ) - candidate_device.add_model( - batch.model_key, loaded_model.model - ) + candidate_device.add_model(batch.model_key, loaded_model.model) return_device = candidate_device diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index 19f3256cef..2a8ed9e39f 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -23,24 +23,93 @@ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +# isort: off +# pylint: disable-next=unused-import +import dragon +from dragon.mpbridge.queues import DragonQueue +# isort: on + +import multiprocessing as mp import time import typing as t import uuid from queue import Empty, Full, Queue -from threading import RLock +from threading import Lock from types import TracebackType from packaging.version import Version +from .....error import SmartSimError from .....log import get_logger +from ....utils.timings import PerfTimer +from ...comm.channel.channel import CommChannelBase +from ...comm.channel.dragonchannel import DragonCommChannel +from ...infrastructure.storage.featurestore import FeatureStore from ...infrastructure.worker.worker import InferenceBatch, InferenceRequest +from ...message_handler import MessageHandler from ...mli_schemas.model.model_capnp import Model -if t.TYPE_CHECKING: - from dragon.fli import FLInterface - logger = get_logger("Request Dispatcher") + +def deserialize_message( + data_blob: bytes, + channel_type: t.Type[CommChannelBase], +) -> InferenceRequest: + """Deserialize a message from a byte stream into an InferenceRequest + :param data_blob: The byte stream to deserialize""" + # todo: consider moving to XxxCore and only making + # workers implement the inputs and model conversion? + + # alternatively, consider passing the capnproto models + # to this method instead of the data_blob... + + # something is definitely wrong here... 
client shouldn't have to touch + # callback (or batch size) + + request = MessageHandler.deserialize_request(data_blob) + # return request + model_key: t.Optional[str] = None + model_bytes: t.Optional[Model] = None + + if request.model.which() == "key": + model_key = request.model.key.key + elif request.model.which() == "data": + model_bytes = request.model.data + + callback_key = request.replyChannel.reply + + # todo: shouldn't this be `CommChannel.find` instead of `DragonCommChannel` + comm_channel = channel_type(callback_key) + # comm_channel = DragonCommChannel(request.replyChannel) + + input_keys: t.Optional[t.List[str]] = None + input_bytes: t.Optional[t.List[bytes]] = ( + None # these will really be tensors already + ) + + input_meta: t.List[t.Any] = [] + + if request.input.which() == "keys": + input_keys = [input_key.key for input_key in request.input.keys] + elif request.input.which() == "data": + input_bytes = [data.blob for data in request.input.data] + input_meta = [data.tensorDescriptor for data in request.input.data] + + inference_request = InferenceRequest( + model_key=model_key, + callback=comm_channel, + raw_inputs=input_bytes, + input_meta=input_meta, + input_keys=input_keys, + raw_model=model_bytes, + batch_size=0, + ) + return inference_request + + class WorkerDevice: def __init__(self, name: str) -> None: """Wrapper around a device to keep track of loaded Models and availability @@ -50,7 +119,7 @@ def __init__(self, name: str) -> None: """The name used by the toolkit to identify this device""" self._models: dict[str, t.Any] = {} """Dictionary of model key to model for models stored on this device""" - self._lock = RLock() + self._lock = Lock() """Lock to ensure only one thread at the time accesses this device""" def acquire(self, blocking: bool = True, timeout: float = -1) -> t.Optional[bool]: @@ -79,11 +148,11 @@ def __init__(self, batch_timeout: float, batch_size: int, model_key: str) -> Non self._first_put: t.Optional[float] = None self._disposable = False self._model_key = model_key - self._flush_lock = RLock() + self._flush_lock = Lock() self._id = str(uuid.uuid4()) @property - def id(self): + def id(self) -> str: return self._id def acquire(self, blocking: bool = True, timeout: float = -1) -> t.Optional[bool]: @@ -114,7 +183,6 @@ def put( timeout: t.Optional[float] = 0.0, ) -> None: if not self.acquire(blocking=False): - logger.error(f"Could not acquire queue {self._id} to put") raise Full try: if self.full(): @@ -174,24 +242,98 @@ def __init__( self, batch_timeout: float, batch_size: int, + incoming_channel: t.Optional[CommChannelBase], + comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel, + feature_store: t.Optional[FeatureStore] = None, ) -> None: + mp.set_start_method("dragon") self._queues: list[BatchQueue] = [] self._active_queues: dict[str, BatchQueue] = {} self._model_last_version: dict[str, Version] = {} self._model_name_to_key: dict[str, str] = {} self._batch_timeout = batch_timeout self._batch_size = batch_size - self._queue_swap_lock = RLock() + self._queue_swap_lock: t.Optional[Lock] = None + self._incoming_channel = incoming_channel + self._outgoing_queue: DragonQueue = mp.Queue(maxsize=0) + self._feature_store = feature_store + self._comm_channel_type = comm_channel_type + self._perf_timer = PerfTimer(prefix="r_") + + def _validate_request(self, request: InferenceRequest) -> bool: + """Ensure the request can be processed. 
+ :param request: The request to validate + :return: True if the request is valid, False otherwise""" + if not self._feature_store: + if request.model_key: + logger.error("Unable to load model by key without feature store") + return False + + if request.input_keys: + logger.error("Unable to load inputs by key without feature store") + return False + + if request.output_keys: + logger.error("Unable to persist outputs by key without feature store") + return False + + if not request.model_key and not request.raw_model: + logger.error("Unable to continue without model bytes or feature store key") + return False + + if not request.input_keys and not request.raw_inputs: + logger.error("Unable to continue without input bytes or feature store keys") + return False + + if request.callback is None: + logger.error("No callback channel provided in request") + return False + + return True + + def run(self) -> None: + self._queue_swap_lock = Lock() + if self._incoming_channel is None: + raise SmartSimError("No incoming channel for dispatcher") + while True: + try: + request_bytes: bytes = self._incoming_channel.recv() + except Exception: + pass + else: + self._perf_timer.start_timings() + request = deserialize_message(request_bytes, self._comm_channel_type) + self._perf_timer.measure_time("deserialize_message") + if not self._validate_request(request): + return + self._perf_timer.measure_time("validate_request") + self.dispatch(request) + self._perf_timer.measure_time("dispatch") + finally: + self.flush_requests() + self._perf_timer.measure_time("flush_requests") + # TODO: implement this + # self.remove_queues() + + self._perf_timer.end_timings() + + # pylint: disable-next=protected-access + if len(self._perf_timer._timings["r_dispatch"]) == 801: + self._perf_timer.print_timings(True) + + @property + def task_queue(self) -> DragonQueue: + return self._outgoing_queue def _swap_queue(self, model_key: str) -> None: + if self._queue_swap_lock is None: + raise SmartSimError("Queue was not locked") with self._queue_swap_lock: for queue in self._queues: if queue.model_key == model_key and not queue.full(): - logger.info("Found queue, swapping") self._active_queues[model_key] = queue return - logger.info("Creating new queue") new_queue = BatchQueue(self._batch_timeout, self._batch_size, model_key) self._queues.append(new_queue) self._active_queues[model_key] = new_queue @@ -210,14 +352,12 @@ def dispatch(self, request: InferenceRequest) -> None: return if request.model_key: - logger.info("Indirect inference requested, dispatching it to existing queue") success = False while not success: try: self._active_queues[request.model_key].put_nowait(request) success = True except (Full, KeyError): - logger.info("Could not find non-full queue, swapping") self._swap_queue(request.model_key) def _update_model_version(self, model: Model) -> None: @@ -230,18 +370,15 @@ def _update_model_version(self, model: Model) -> None: self._model_last_version[model.name] = Version(model.version) return - def flush_requests(self) -> t.Optional[InferenceBatch]: - result = None + def flush_requests(self) -> None: for queue in self._queues: - # logger.info("Acquiring queue to flush") if queue.ready and queue.acquire(blocking=False): try: - logger.info(f"Acquired queue {queue.id}") - result = InferenceBatch( - model_key=queue.model_key, requests=queue.flush() + self._outgoing_queue.put( + InferenceBatch( + model_key=queue.model_key, requests=queue.flush() + ) ) finally: queue.release() break - - return result diff --git 
a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 674bfc93a1..76e9ecc659 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -23,14 +23,18 @@ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import typing as t -import time -import numpy as np -import numbers +import multiprocessing as mp +import numbers +import time +import typing as t from collections import OrderedDict -from .....log import ContextThread, get_logger +import dragon +import numpy as np + +from ....utils.timings import PerfTimer +from .....log import get_logger from ....entrypoints.service import Service from ...comm.channel.channel import CommChannelBase from ...comm.channel.dragonchannel import DragonCommChannel @@ -54,62 +58,6 @@ logger = get_logger(__name__) -def deserialize_message( - data_blob: bytes, - channel_type: t.Type[CommChannelBase], -) -> InferenceRequest: - """Deserialize a message from a byte stream into an InferenceRequest - :param data_blob: The byte stream to deserialize""" - # todo: consider moving to XxxCore and only making - # workers implement the inputs and model conversion? - - # alternatively, consider passing the capnproto models - # to this method instead of the data_blob... - - # something is definitely wrong here... client shouldn't have to touch - # callback (or batch size) - - request = MessageHandler.deserialize_request(data_blob) - # return request - model_key: t.Optional[str] = None - model_bytes: t.Optional[Model] = None - - if request.model.which() == "key": - model_key = request.model.key.key - elif request.model.which() == "data": - model_bytes = request.model.data - - callback_key = request.replyChannel.reply - - # todo: shouldn't this be `CommChannel.find` instead of `DragonCommChannel` - comm_channel = channel_type(callback_key) - # comm_channel = DragonCommChannel(request.replyChannel) - - input_keys: t.Optional[t.List[str]] = None - input_bytes: t.Optional[t.List[bytes]] = ( - None # these will really be tensors already - ) - - input_meta: t.List[t.Any] = [] - - if request.input.which() == "keys": - input_keys = [input_key.key for input_key in request.input.keys] - elif request.input.which() == "data": - input_bytes = [data.blob for data in request.input.data] - input_meta = [data.tensorDescriptor for data in request.input.data] - - inference_request = InferenceRequest( - model_key=model_key, - callback=comm_channel, - raw_inputs=input_bytes, - input_meta=input_meta, - input_keys=input_keys, - raw_model=model_bytes, - batch_size=0, - ) - return inference_request - - def build_failure_reply(status: "StatusEnum", message: str) -> Response: return MessageHandler.build_response( status=status, # todo: need to indicate correct status @@ -197,79 +145,30 @@ def __init__( self._cached_models: dict[str, t.Any] = {} """Dictionary of previously loaded models""" self._request_dispatcher: RequestDispatcher = RequestDispatcher( - batch_timeout=batch_timeout, batch_size=batch_size + batch_timeout=batch_timeout, + batch_size=batch_size, + incoming_channel=self._task_queue, + comm_channel_type=comm_channel_type, + feature_store=self._feature_store, ) """Dispatcher used to batch requests""" - self._dispatcher_threads = 1 - """Number of threads which 
dispatch requests""" self._device_manager: DeviceManager = DeviceManager([WorkerDevice("gpu")]) - self._start = None - self._interm = None - self._timings: OrderedDict[str, list[numbers.Number]] = OrderedDict() - self._timing_on = True - - def _add_label_to_timings(self, label: str): - if label not in self._timings: - self._timings[label] = [] - - @staticmethod - def _format_number(number: numbers.Number): - return f"{number:0.4e}" - - def start_timings(self): - if self._timing_on: - # self._add_label_to_timings("batch_size") - # self._timings["batch_size"].append(batch_size) - self._start = time.perf_counter() - self._interm = time.perf_counter() - - def end_timings(self): - if self._timing_on: - self._add_label_to_timings("total_time") - self._timings["total_time"].append(self._format_number(time.perf_counter()-self._start)) - - def measure_time(self, label: str): - if self._timing_on: - self._add_label_to_timings(label) - self._timings[label].append(self._format_number(time.perf_counter()-self._interm)) - self._interm = time.perf_counter() - - def print_timings(self, to_file: bool = False): - print(" ".join(self._timings.keys())) - value_array = np.array([value for value in self._timings.values()], dtype=float) - value_array = np.transpose(value_array) - for i in range(value_array.shape[0]): - print(" ".join(self._format_number(value) for value in value_array[i])) - if to_file: - np.save("timings.npy", value_array) - np.savetxt("timings.txt", value_array) - - - def _receive_requests(self) -> None: - if self._task_queue is None: - return - # while not self._can_shutdown(): - # perform default deserialization of the message envelope - request_bytes: bytes = self._task_queue.recv() - - self.start_timings() - request = deserialize_message(request_bytes, self._comm_channel_type) - self.measure_time("w_deserialize") - if not self._validate_request(request): - return - self._request_dispatcher.dispatch(request) - self.measure_time("w_dispatch") + self._perf_timer = PerfTimer(prefix="w_") + + try: + mp.set_start_method("dragon") + except RuntimeError: + pass + self._dispatcher_process = mp.Process( + target=self._request_dispatcher.run, name="Dispatcher" + ) def _on_start(self) -> None: - # for thread_idx in range(self._dispatcher_threads): - # dispatcher_thread = ContextThread( - # name=f"Dispatcher_{thread_idx}", - # target=self._receive_requests, - # daemon=True, - # ) - # dispatcher_thread.start() - pass + self._dispatcher_process.start() + + def _on_shutdown(self) -> None: + self._dispatcher_process.join() def _validate_request(self, request: InferenceRequest) -> bool: """Ensure the request can be processed. 
@@ -307,14 +206,12 @@ def _on_iteration(self) -> None: the inference pipeline""" logger.debug("executing worker manager pipeline") - self._receive_requests() - - # logger.info("Getting request batch") - batch = self._request_dispatcher.flush_requests() + batch = self._request_dispatcher.task_queue.get() + self._perf_timer.start_timings() if batch is None or 0 == len(batch.requests): return - self.measure_time("w_flush_requests") + self._perf_timer.measure_time("flush_requests") # logger.info(f"Got batch of {len(batch.requests)} requests, acquiring device") device: WorkerDevice = next( self._device_manager.get_free_device( @@ -323,20 +220,20 @@ def _on_iteration(self) -> None: feature_store=self._feature_store, ) ) - self.measure_time("w_fetch_model") + self._perf_timer.measure_time("fetch_model") # logger.info(f"Acquired device {device.name}") model_result = LoadModelResult(device.get_model(batch.model_key)) - self.measure_time("w_load_model") + self._perf_timer.measure_time("load_model") fetch_input_results = self._worker.fetch_inputs(batch, self._feature_store) - self.measure_time("w_fetch_input") + self._perf_timer.measure_time("fetch_input") transformed_input = self._worker.transform_input( batch, fetch_input_results, self._device ) - self.measure_time("w_transform_input") + self._perf_timer.measure_time("transform_input") replies = [InferenceReply() for _ in range(len(batch.requests))] @@ -344,19 +241,19 @@ def _on_iteration(self) -> None: execute_result = self._worker.execute( batch, model_result, transformed_input ) - self.measure_time("w_execute") + self._perf_timer.measure_time("execute") transformed_outputs = self._worker.transform_output( batch, execute_result, self._device ) - self.measure_time("w_transform_output") + self._perf_timer.measure_time("transform_output") except Exception: logger.exception("Error executing worker") for reply in replies: reply.failed = True else: - for reply_idx, (request, transformed_output) in enumerate(zip( - batch.requests, transformed_outputs - )): + for reply_idx, (request, transformed_output) in enumerate( + zip(batch.requests, transformed_outputs) + ): reply = replies[reply_idx] try: if request.output_keys: @@ -365,12 +262,11 @@ def _on_iteration(self) -> None: ) else: reply.outputs = transformed_output.outputs - self.measure_time("w_assign_output") + self._perf_timer.measure_time("assign_output") except Exception: logger.exception("Error executing worker") reply.failed = True - if reply.failed: response = build_failure_reply("fail", "failure-occurred") else: @@ -378,21 +274,21 @@ def _on_iteration(self) -> None: response = build_failure_reply("fail", "no-results") response = build_reply(reply) - self.measure_time("w_build_reply") + self._perf_timer.measure_time("build_reply") # serialized = self._worker.serialize_reply(request, transformed_output) serialized_resp = MessageHandler.serialize_response(response) # type: ignore - self.measure_time("w_serialize_resp") + self._perf_timer.measure_time("serialize_resp") if request.callback: request.callback.send(serialized_resp) - self.measure_time("w_send") + self._perf_timer.measure_time("send") - self.end_timings() + self._perf_timer.end_timings() - if len(self._timings["w_send"]) == 801: - self.print_timings(True) + if len(self._perf_timer._timings["w_send"]) == 801: + self._perf_timer.print_timings(True) def _can_shutdown(self) -> bool: """Return true when the criteria to shut down the service are met.""" diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py 
b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index 3156b587a7..4eedc18299 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -84,26 +84,31 @@ def transform_input( partial_result = [] if fetch_result.meta is None: raise ValueError("Cannot reconstruct tensor without meta information") - for item, item_meta in zip(fetch_result.inputs, fetch_result.meta): + for idx, (item, item_meta) in enumerate( + zip(fetch_result.inputs, fetch_result.meta) + ): tensor_desc: tensor_capnp.TensorDescriptor = item_meta partial_result.append( - torch.tensor(np.frombuffer(item, dtype=str(tensor_desc.dataType))) - .to(device) - .reshape(tuple(dim for dim in tensor_desc.dimensions)) + torch.tensor( + np.frombuffer(item, dtype=str(tensor_desc.dataType)) + ).reshape(tuple(dim for dim in tensor_desc.dimensions)) ) + if idx == 0: + num_samples = tensor_desc.dimensions[0] + slices.append(slice(start, start + num_samples)) + start = start + num_samples results.append(partial_result) - num_samples = fetch_result.meta[0].dimensions[0] - slices.append(slice(start, start + num_samples)) - start = start + num_samples result: list[torch.Tensor] = [] if len(batch.requests) > 1: for t_idx in range(len(results[0])): result.append( - torch.concatenate([partial_result[t_idx] for partial_result in results]) + torch.concatenate( + [partial_result[t_idx] for partial_result in results] + ).to(device) ) else: - result = results[0] + result = [tensor.to(device) for tensor in results[0]] return TransformInputResult(result, slices) # return data # note: this fails copy test! @@ -134,11 +139,17 @@ def transform_output( transformed_list: list[TransformOutputResult] = [] for result_slice in execute_result.slices: if result_device != "cpu": - transformed = [item.to("cpu") for item in execute_result.predictions[result_slice]] + transformed = [ + item.to("cpu") for item in execute_result.predictions[result_slice] + ] # todo: need the shape from latest schemas added here. - transformed_list.append(TransformOutputResult(transformed, None, "c", "float32")) # fixme + transformed_list.append( + TransformOutputResult(transformed, None, "c", "float32") + ) # fixme - transformed_list.append(TransformOutputResult( - execute_result.predictions[result_slice], None, "c", "float32" - )) # fixme - return transformed_list \ No newline at end of file + transformed_list.append( + TransformOutputResult( + execute_result.predictions[result_slice], None, "c", "float32" + ) + ) # fixme + return transformed_list diff --git a/smartsim/_core/utils/timings.py b/smartsim/_core/utils/timings.py new file mode 100644 index 0000000000..7fa2af04a6 --- /dev/null +++ b/smartsim/_core/utils/timings.py @@ -0,0 +1,89 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import time +import typing as t +from collections import OrderedDict + +import numpy as np + + +class PerfTimer: + def __init__(self, filename: str = "timings", prefix: str = ""): + self._start: t.Optional[float] = None + self._interm: t.Optional[float] = None + self._timings: OrderedDict[str, list[t.Union[float, int, str]]] = OrderedDict() + self._timing_on = True + self._filename = filename + self._prefix = prefix + + def _add_label_to_timings(self, label: str) -> None: + if label not in self._timings: + self._timings[label] = [] + + @staticmethod + def _format_number(number: float | int) -> str: + return f"{number:0.4e}" + + def start_timings( + self, + first_label: t.Optional[str] = None, + first_value: t.Optional[float | int] = None, + ) -> None: + if self._timing_on: + if first_label is not None and first_value is not None: + self._add_label_to_timings(self._make_label(first_label)) + self._timings[self._make_label(first_label)].append(first_value) + self._start = time.perf_counter() + self._interm = time.perf_counter() + + def end_timings(self) -> None: + if self._timing_on and self._start is not None: + self._add_label_to_timings(self._make_label("total_time")) + self._timings[self._make_label("total_time")].append( + self._format_number(time.perf_counter() - self._start) + ) + self._interm = None + + def _make_label(self, label: str) -> str: + return self._prefix + label + + def measure_time(self, label: str) -> None: + if self._timing_on and self._interm is not None: + self._add_label_to_timings(self._make_label(label)) + self._timings[self._make_label(label)].append( + self._format_number(time.perf_counter() - self._interm) + ) + self._interm = time.perf_counter() + + def print_timings(self, to_file: bool = False) -> None: + print(" ".join(self._timings.keys())) + value_array = np.array(list(self._timings.values()), dtype=float) + value_array = np.transpose(value_array) + for i in range(value_array.shape[0]): + print(" ".join(self._format_number(value) for value in value_array[i])) + if to_file: + np.save(self._prefix + self._filename + ".npy", value_array) From 0bb14879e06e622749fbf9347f6f50c1238592c4 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 17 Jul 2024 18:19:51 -0500 Subject: [PATCH 38/84] Constrain torch threads in worker --- .../_core/mli/infrastructure/control/requestdispatcher.py | 5 +++-- smartsim/_core/mli/infrastructure/worker/torch_worker.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index 2a8ed9e39f..babbd3fe56 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ 
b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -203,7 +203,6 @@ def _waited_time(self) -> float: def ready(self) -> bool: if self.empty(): return False - return self.full() or (self._waited_time >= self._batch_timeout) def make_disposable(self) -> None: @@ -311,7 +310,6 @@ def run(self) -> None: self._perf_timer.measure_time("dispatch") finally: self.flush_requests() - self._perf_timer.measure_time("flush_requests") # TODO: implement this # self.remove_queues() @@ -374,11 +372,14 @@ def flush_requests(self) -> None: for queue in self._queues: if queue.ready and queue.acquire(blocking=False): try: + + self._perf_timer.measure_time("find_queue") self._outgoing_queue.put( InferenceBatch( model_key=queue.model_key, requests=queue.flush() ) ) + self._perf_timer.measure_time("flush_requests") finally: queue.release() break diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index 4eedc18299..f55d6d13d7 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -28,7 +28,6 @@ import numpy as np import torch - from .....error import SmartSimError from .....log import get_logger from ...mli_schemas.tensor import tensor_capnp @@ -43,6 +42,7 @@ TransformOutputResult, ) +torch.set_num_threads(1) logger = get_logger(__name__) From 7b9e00ce6b0395bfa259b7c08ef4e1eaf5dbeae4 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 18 Jul 2024 09:46:48 -0500 Subject: [PATCH 39/84] Affinity and correct process --- .../_core/launcher/dragon/dragonBackend.py | 4 ++ .../infrastructure/control/workermanager.py | 56 ++++++++++++++++--- .../mli/infrastructure/worker/torch_worker.py | 1 + 3 files changed, 53 insertions(+), 8 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index dcc5c8392b..545dbfaa6b 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -424,6 +424,8 @@ def _start_steps(self) -> None: global_policy = dragon_policy.Policy( placement=dragon_policy.Policy.Placement.HOST_NAME, host_name=hosts[0], + affinity = dragon_policy.Policy.Affinity.SPECIFIC, + cpu_affinity=list(range(32))+list(range(64,64+32)), ) options = dragon_process_desc.ProcessOptions(make_inf_channels=True) grp = dragon_process_group.ProcessGroup( @@ -435,6 +437,8 @@ def _start_steps(self) -> None: local_policy = dragon_policy.Policy( placement=dragon_policy.Policy.Placement.HOST_NAME, host_name=node_name, + affinity = dragon_policy.Policy.Affinity.SPECIFIC, + cpu_affinity=list(range(32))+list(range(64,64+32)), ) policies.extend([local_policy] * request.tasks_per_node) tmp_proc = dragon_process.ProcessTemplate( diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 21460186d0..3dde086367 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -24,14 +24,20 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
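# The changes in this commit pin the worker manager and its dispatcher process to
# disjoint CPU sets (the dispatcher gets the last 8 cores). A minimal, Linux-only
# sketch of that split using only the standard library; the function name and the
# 8-core default are illustrative placeholders, not code from this patch:
import os

def split_affinity(reserve_for_dispatcher: int = 8) -> tuple[list[int], list[int]]:
    """Shrink this process's CPU affinity and return (manager_cpus, dispatcher_cpus)."""
    all_cpus = sorted(os.sched_getaffinity(0))          # CPUs this process may use now
    reserve = min(reserve_for_dispatcher, len(all_cpus) - 1)
    manager_cpus = all_cpus[:-reserve] if reserve > 0 else all_cpus
    dispatcher_cpus = all_cpus[-reserve:] if reserve > 0 else all_cpus
    os.sched_setaffinity(0, manager_cpus)               # keep the bulk for this process
    return manager_cpus, dispatcher_cpus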
+import dragon +import dragon.data.ddict.ddict as dragon_ddict +import dragon.infrastructure.connection as dragon_connection +import dragon.infrastructure.policy as dragon_policy +import dragon.infrastructure.process_desc as dragon_process_desc +import dragon.native.group_state as dragon_group_state +import dragon.native.process as dragon_process +import dragon.native.process_group as dragon_process_group +import dragon.native.machine as dragon_machine + import multiprocessing as mp -import numbers -import time +import os +import socket import typing as t -from collections import OrderedDict - -import dragon -import numpy as np from ....utils.timings import PerfTimer from .....log import get_logger @@ -160,9 +166,43 @@ def __init__( mp.set_start_method("dragon") except RuntimeError: pass - self._dispatcher_process = mp.Process( - target=self._request_dispatcher.run, name="Dispatcher" + # self._dispatcher_process = mp.Process( + # target=self._request_dispatcher.run, name="Dispatcher" + # ) + self._dispatcher_process = self._create_local_dispatcher_process() + + def _create_local_dispatcher_process(self): + self_affinity = list(os.sched_getaffinity(os.getpid())) + os.sched_setaffinity(os.getpid(), self_affinity[:-8]) + global_policy = dragon_policy.Policy( + placement=dragon_policy.Policy.Placement.HOST_NAME, + host_name=socket.gethostname(), + affinity = dragon_policy.Policy.Affinity.SPECIFIC, + cpu_affinity=self_affinity[-8:], + device=dragon_policy.Policy.Device.CPU, + distribution = dragon_policy.Policy.Distribution.BLOCK, + ) + options = dragon_process_desc.ProcessOptions(make_inf_channels=True) + grp = dragon_process_group.ProcessGroup( + restart=False, pmi_enabled=True, policy=global_policy + ) + local_policy = dragon_policy.Policy( + placement=dragon_policy.Policy.Placement.HOST_NAME, + host_name=socket.gethostname(), + affinity = dragon_policy.Policy.Affinity.SPECIFIC, + cpu_affinity=self_affinity[-8:], + device=dragon_policy.Policy.Device.CPU, + ) + tmp_proc = dragon_process.ProcessTemplate( + target=self._request_dispatcher.run, + args=[], + cwd=os.getcwd(), + policy=local_policy, + options=options, ) + grp.add_process(nproc=1, template=tmp_proc) + grp.init() + return grp def _on_start(self) -> None: self._dispatcher_process.start() diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index f55d6d13d7..84bcec0887 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -43,6 +43,7 @@ ) torch.set_num_threads(1) +torch.set_num_interop_threads(16) logger = get_logger(__name__) From 94a526336479f784aaaa87ae10771b2535211a87 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 18 Jul 2024 09:48:39 -0500 Subject: [PATCH 40/84] Fixes to example --- ex/high_throughput_inference/mli_driver.py | 8 +++++--- ex/high_throughput_inference/mock_app.py | 2 -- ex/high_throughput_inference/standalone_workermanager.py | 6 ++++++ 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index 6da559aa6f..1d4b121365 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -7,6 +7,7 @@ from smartsim import Experiment from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker from smartsim.status import TERMINAL_STATUSES +from smartsim.settings import DragonRunSettings import time import 
typing as t @@ -20,13 +21,13 @@ os.environ["SMARTSIM_DRAGON_TRANSPORT"] = transport -exp_path = os.path.join(filedir, f"MLI_proto_{transport.upper()}") +exp_path = os.path.join(filedir, f"MLI_proto_batch_{transport.upper()}") os.makedirs(exp_path, exist_ok=True) exp = Experiment("MLI_proto", launcher="dragon", exp_path=exp_path) torch_worker_str = base64.b64encode(cloudpickle.dumps(TorchWorker)).decode("ascii") -worker_manager_rs = exp.create_run_settings(sys.executable, [worker_manager_script_name, "--device", device, "--worker_class", torch_worker_str]) +worker_manager_rs: DragonRunSettings = exp.create_run_settings(sys.executable, [worker_manager_script_name, "--device", device, "--worker_class", torch_worker_str]) worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) @@ -40,11 +41,12 @@ while True: if exp.get_status(app)[0] in TERMINAL_STATUSES: + time.sleep(10) exp.stop(worker_manager) break if exp.get_status(worker_manager)[0] in TERMINAL_STATUSES: + time.sleep(10) exp.stop(app) break - time.sleep(5) print("Exiting.") \ No newline at end of file diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 45246db2e5..76969e6a4c 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -112,7 +112,6 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): built_tensor = MessageHandler.build_tensor( batch.numpy(), "c", "float32", list(batch.shape)) self.measure_time("build_tensor") - built_model = None if isinstance(model, str): model_arg = MessageHandler.build_model_key(model) else: @@ -130,7 +129,6 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): self.measure_time("serialize_request") with self._to_worker_fli.sendh(timeout=None, stream_channel=self._to_worker_ch) as to_sendh: to_sendh.send_bytes(request_bytes) - logger.info(f"Message size: {len(request_bytes)} bytes") self.measure_time("send") with self._from_worker_ch.recvh(timeout=None) as from_recvh: diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index f91c2269c6..f781444d81 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -45,6 +45,12 @@ from smartsim._core.mli.infrastructure.control.workermanager import WorkerManager from smartsim._core.mli.infrastructure.environmentloader import EnvironmentConfigLoader +import os +import socket +pid = 0 +affinity = os.sched_getaffinity(pid) +print("Entry point:", socket.gethostname(), affinity) +print("CPUS:", os.cpu_count()) if __name__ == "__main__": parser = argparse.ArgumentParser("Worker Manager") From a7b52626f5be31eeadd1c3658c440cddd6abe715 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 19 Jul 2024 01:56:52 +0200 Subject: [PATCH 41/84] Add request dispatcher post-merge changes --- .../control/requestdispatcher.py | 63 +++++++++++++++---- 1 file changed, 52 insertions(+), 11 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index babbd3fe56..8684bc7b6e 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -57,6 +57,7 @@ def deserialize_message( data_blob: bytes, channel_type: t.Type[CommChannelBase], + 
device: t.Literal["cpu", "gpu"], ) -> InferenceRequest: """Deserialize a message from a byte stream into an InferenceRequest :param data_blob: The byte stream to deserialize""" @@ -79,31 +80,34 @@ def deserialize_message( elif request.model.which() == "data": model_bytes = request.model.data - callback_key = request.replyChannel.reply + callback_key = request.replyChannel.descriptor # todo: shouldn't this be `CommChannel.find` instead of `DragonCommChannel` comm_channel = channel_type(callback_key) # comm_channel = DragonCommChannel(request.replyChannel) input_keys: t.Optional[t.List[str]] = None - input_bytes: t.Optional[t.List[bytes]] = ( - None # these will really be tensors already - ) + input_bytes: t.Optional[t.List[bytes]] = None + + output_keys: t.Optional[t.List[str]] = None - input_meta: t.List[t.Any] = [] + input_meta: t.Optional[t.List[TensorDescriptor]] = None if request.input.which() == "keys": input_keys = [input_key.key for input_key in request.input.keys] - elif request.input.which() == "data": - input_bytes = [data.blob for data in request.input.data] - input_meta = [data.tensorDescriptor for data in request.input.data] + elif request.input.which() == "descriptors": + input_meta = request.input.descriptors # type: ignore + + if request.output: + output_keys = [tensor_key.key for tensor_key in request.output] inference_request = InferenceRequest( model_key=model_key, callback=comm_channel, raw_inputs=input_bytes, - input_meta=input_meta, input_keys=input_keys, + input_meta=input_meta, + output_keys=output_keys, raw_model=model_bytes, batch_size=0, ) @@ -235,7 +239,26 @@ def full(self) -> bool: def empty(self) -> bool: return self.qsize() == 0 - +def exception_handler( + exc: Exception, reply_channel: t.Optional[CommChannelBase], failure_message: str +) -> None: + """ + Logs exceptions and sends a failure response. 
+ + :param exc: The exception to be logged + :param reply_channel: The channel used to send replies + :param failure_message: Failure message to log and send back + """ + logger.exception( + f"{failure_message}\n" + f"Exception type: {type(exc).__name__}\n" + f"Exception message: {str(exc)}" + ) + serialized_resp = MessageHandler.serialize_response( + build_failure_reply("fail", failure_message) + ) + if reply_channel: + reply_channel.send(serialized_resp) class RequestDispatcher: def __init__( self, @@ -296,10 +319,28 @@ def run(self) -> None: raise SmartSimError("No incoming channel for dispatcher") while True: try: - request_bytes: bytes = self._incoming_channel.recv() + bytes_list: t.List[bytes] = self._incoming_channel.recv() + + if not bytes_list: + exception_handler( + ValueError("No request data found"), + None, + "No request data found.", + ) + return + + except Exception: pass else: + request_bytes = bytes_list[0] + tensor_bytes_list = bytes_list[1:] + + request = deserialize_message( + request_bytes, self._comm_channel_type, self._device + ) + if request.input_meta and tensor_bytes_list: + request.raw_inputs = tensor_bytes_list self._perf_timer.start_timings() request = deserialize_message(request_bytes, self._comm_channel_type) self._perf_timer.measure_time("deserialize_message") From 717ef8866eec89705d9fd9e4ca4d42a29a9e7535 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 19 Jul 2024 02:21:14 +0200 Subject: [PATCH 42/84] Misc fixes --- .../_core/launcher/dragon/dragonBackend.py | 4 +- smartsim/_core/mli/comm/channel/channel.py | 1 - .../_core/mli/comm/channel/dragonchannel.py | 1 - .../control/requestdispatcher.py | 37 ++--------- .../infrastructure/control/workermanager.py | 62 +++++++++---------- .../mli/infrastructure/worker/torch_worker.py | 1 + .../_core/mli/infrastructure/worker/worker.py | 1 + 7 files changed, 36 insertions(+), 71 deletions(-) diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index fff62fcdde..a6a8700ab0 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -504,8 +504,8 @@ def _start_steps(self) -> None: global_policy = dragon_policy.Policy( placement=dragon_policy.Policy.Placement.HOST_NAME, host_name=hosts[0], - affinity = dragon_policy.Policy.Affinity.SPECIFIC, - cpu_affinity=list(range(32))+list(range(64,64+32)), + affinity=dragon_policy.Policy.Affinity.SPECIFIC, + cpu_affinity=list(range(32)) + list(range(64, 64 + 32)), ) options = dragon_process_desc.ProcessOptions(make_inf_channels=True) grp = dragon_process_group.ProcessGroup( diff --git a/smartsim/_core/mli/comm/channel/channel.py b/smartsim/_core/mli/comm/channel/channel.py index df4872af1a..a3cce21814 100644 --- a/smartsim/_core/mli/comm/channel/channel.py +++ b/smartsim/_core/mli/comm/channel/channel.py @@ -41,7 +41,6 @@ def __init__(self, descriptor: t.Union[str, bytes]) -> None: @abstractmethod def send(self, value: bytes) -> None: - """Send a message through the underlying communication channel """Send a message through the underlying communication channel :param value: The value to send""" diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py index ab98261409..a45adaee33 100644 --- a/smartsim/_core/mli/comm/channel/dragonchannel.py +++ b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -24,7 +24,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, 
EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import sys import sys import typing as t diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index 8684bc7b6e..10279c01d7 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -29,6 +29,7 @@ # pylint: disable-next=unused-import import dragon from dragon.mpbridge.queues import DragonQueue + # isort: on import multiprocessing as mp @@ -50,6 +51,7 @@ from ...infrastructure.worker.worker import InferenceBatch, InferenceRequest from ...message_handler import MessageHandler from ...mli_schemas.model.model_capnp import Model +from ...mli_schemas.tensor.tensor_capnp import TensorDescriptor logger = get_logger("Request Dispatcher") @@ -57,7 +59,6 @@ def deserialize_message( data_blob: bytes, channel_type: t.Type[CommChannelBase], - device: t.Literal["cpu", "gpu"], ) -> InferenceRequest: """Deserialize a message from a byte stream into an InferenceRequest :param data_blob: The byte stream to deserialize""" @@ -239,26 +240,7 @@ def full(self) -> bool: def empty(self) -> bool: return self.qsize() == 0 -def exception_handler( - exc: Exception, reply_channel: t.Optional[CommChannelBase], failure_message: str -) -> None: - """ - Logs exceptions and sends a failure response. - - :param exc: The exception to be logged - :param reply_channel: The channel used to send replies - :param failure_message: Failure message to log and send back - """ - logger.exception( - f"{failure_message}\n" - f"Exception type: {type(exc).__name__}\n" - f"Exception message: {str(exc)}" - ) - serialized_resp = MessageHandler.serialize_response( - build_failure_reply("fail", failure_message) - ) - if reply_channel: - reply_channel.send(serialized_resp) + class RequestDispatcher: def __init__( self, @@ -321,24 +303,13 @@ def run(self) -> None: try: bytes_list: t.List[bytes] = self._incoming_channel.recv() - if not bytes_list: - exception_handler( - ValueError("No request data found"), - None, - "No request data found.", - ) - return - - except Exception: pass else: request_bytes = bytes_list[0] tensor_bytes_list = bytes_list[1:] - request = deserialize_message( - request_bytes, self._comm_channel_type, self._device - ) + request = deserialize_message(request_bytes, self._comm_channel_type) if request.input_meta and tensor_bytes_list: request.raw_inputs = tensor_bytes_list self._perf_timer.start_timings() diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index d2cce15440..140ad9bc70 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -24,24 +24,25 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
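# The dispatcher hands finished batches to the worker manager through a
# multiprocessing queue (backed by Dragon once mp.set_start_method("dragon") is
# active). A standard-library sketch of that handoff under that assumption;
# SimpleBatch is a placeholder standing in for InferenceBatch:
import multiprocessing as mp
from dataclasses import dataclass, field

@dataclass
class SimpleBatch:
    model_key: str
    requests: list = field(default_factory=list)

def dispatch(outgoing: "mp.Queue[SimpleBatch]") -> None:
    # dispatcher side: publish a batch for the worker manager to consume
    outgoing.put(SimpleBatch(model_key="resnet", requests=["req-0", "req-1"]))

if __name__ == "__main__":
    queue: "mp.Queue[SimpleBatch]" = mp.Queue(maxsize=0)
    producer = mp.Process(target=dispatch, args=(queue,), name="Dispatcher")
    producer.start()
    batch = queue.get()                     # worker-manager side blocks here
    print(batch.model_key, len(batch.requests))
    producer.join()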
+import multiprocessing as mp +import os +import socket +import sys +import typing as t + import dragon import dragon.data.ddict.ddict as dragon_ddict import dragon.infrastructure.connection as dragon_connection import dragon.infrastructure.policy as dragon_policy import dragon.infrastructure.process_desc as dragon_process_desc import dragon.native.group_state as dragon_group_state +import dragon.native.machine as dragon_machine import dragon.native.process as dragon_process import dragon.native.process_group as dragon_process_group -import dragon.native.machine as dragon_machine - -import multiprocessing as mp -import os -import socket -import typing as t -from ....utils.timings import PerfTimer from .....log import get_logger from ....entrypoints.service import Service +from ....utils.timings import PerfTimer from ...comm.channel.channel import CommChannelBase from ...comm.channel.dragonchannel import DragonCommChannel from ...infrastructure.environmentloader import EnvironmentConfigLoader @@ -53,7 +54,7 @@ MachineLearningWorkerBase, ) from ...message_handler import MessageHandler -from ...mli_schemas.response.response_capnp import Response, ResponseBuilder +from ...mli_schemas.response.response_capnp import ResponseBuilder from .devicemanager import DeviceManager, WorkerDevice from .requestdispatcher import RequestDispatcher @@ -187,16 +188,18 @@ def __init__( # ) self._dispatcher_process = self._create_local_dispatcher_process() - def _create_local_dispatcher_process(self): - self_affinity = list(os.sched_getaffinity(os.getpid())) - os.sched_setaffinity(os.getpid(), self_affinity[:-8]) + def _create_local_dispatcher_process(self) -> dragon_process_group.ProcessGroup: + if sys.platform != "darwin": + self_affinity: list[int] = list(os.sched_getaffinity(os.getpid())) + os.sched_setaffinity(os.getpid(), self_affinity[:-8]) + else: + self_affinity: list[int] = [] global_policy = dragon_policy.Policy( placement=dragon_policy.Policy.Placement.HOST_NAME, host_name=socket.gethostname(), - affinity = dragon_policy.Policy.Affinity.SPECIFIC, + affinity=dragon_policy.Policy.Affinity.SPECIFIC, cpu_affinity=self_affinity[-8:], device=dragon_policy.Policy.Device.CPU, - distribution = dragon_policy.Policy.Distribution.BLOCK, ) options = dragon_process_desc.ProcessOptions(make_inf_channels=True) grp = dragon_process_group.ProcessGroup( @@ -205,7 +208,7 @@ def _create_local_dispatcher_process(self): local_policy = dragon_policy.Policy( placement=dragon_policy.Policy.Placement.HOST_NAME, host_name=socket.gethostname(), - affinity = dragon_policy.Policy.Affinity.SPECIFIC, + affinity=dragon_policy.Policy.Affinity.SPECIFIC, cpu_affinity=self_affinity[-8:], device=dragon_policy.Policy.Device.CPU, ) @@ -278,8 +281,6 @@ def _on_iteration(self) -> None: ) self._perf_timer.measure_time("fetch_model") - # logger.info(f"Acquired device {device.name}") - model_result = LoadModelResult(device.get_model(batch.model_key)) self._perf_timer.measure_time("load_model") @@ -304,8 +305,7 @@ def _on_iteration(self) -> None: self._perf_timer.measure_time("transform_output") except Exception: logger.exception("Error executing worker") - for reply in replies: - reply.failed = True + else: for reply_idx, (request, transformed_output) in enumerate( zip(batch.requests, transformed_outputs) @@ -321,33 +321,27 @@ def _on_iteration(self) -> None: self._perf_timer.measure_time("assign_output") except Exception: logger.exception("Error executing worker") - reply.failed = True - if reply.outputs is None or not reply.outputs: - response 
= build_failure_reply("fail", "Outputs not found.") - else: - reply.status_enum = "complete" - reply.message = "Success" - response = build_reply(reply) - if reply.failed: - response = build_failure_reply("fail", "failure-occurred") + if reply.outputs is None or not reply.outputs: + response = build_failure_reply("fail", "Outputs not found.") else: - if reply.outputs is None or not reply.outputs: - response = build_failure_reply("fail", "no-results") + reply.status_enum = "complete" + reply.message = "Success" + response = build_reply(reply) response = build_reply(reply) self._perf_timer.measure_time("build_reply") - serialized_resp = MessageHandler.serialize_response(response) # type: ignore + serialized_resp = MessageHandler.serialize_response(response) self._perf_timer.measure_time("serialize_resp") if request.callback: request.callback.send(serialized_resp) - if reply.outputs: - # send tensor data after response - for output in reply.outputs: - request.callback.send(output) + if reply.outputs: + # send tensor data after response + for output in reply.outputs: + request.callback.send(output) self._perf_timer.measure_time("send") self._perf_timer.end_timings() diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index 84bcec0887..2cb79767f9 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -28,6 +28,7 @@ import numpy as np import torch + from .....error import SmartSimError from .....log import get_logger from ...mli_schemas.tensor import tensor_capnp diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 31e189d1dd..7448fdfb79 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -26,6 +26,7 @@ import typing as t from abc import ABC, abstractmethod +from dataclasses import dataclass from .....error import SmartSimError from .....log import get_logger From 05b49f3bd2b9d883210c118dff7c6b15fda00fb6 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 19 Jul 2024 14:57:12 +0200 Subject: [PATCH 43/84] Correct exception_handler behavior on batch --- .../infrastructure/control/workermanager.py | 87 +++++++++++-------- 1 file changed, 50 insertions(+), 37 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 140ad9bc70..3e39b1731c 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -292,57 +292,70 @@ def _on_iteration(self) -> None: ) self._perf_timer.measure_time("transform_input") - replies = [InferenceReply() for _ in range(len(batch.requests))] try: execute_result = self._worker.execute( batch, model_result, transformed_input ) - self._perf_timer.measure_time("execute") + except Exception as e: + for request in batch.requests: + exception_handler( + e, request.callback, "Error executing worker." + ) + return + self._perf_timer.measure_time("execute") + + try: transformed_outputs = self._worker.transform_output( batch, execute_result, self._device ) - self._perf_timer.measure_time("transform_output") - except Exception: - logger.exception("Error executing worker") + except Exception as e: + for request in batch.requests: + exception_handler( + e, request.callback, "Failed while transforming the output." 
+ ) + return + self._perf_timer.measure_time("transform_output") - else: - for reply_idx, (request, transformed_output) in enumerate( - zip(batch.requests, transformed_outputs) - ): - reply = replies[reply_idx] + for request, transformed_output in zip(batch.requests, transformed_outputs): + reply = InferenceReply() + if request.output_keys: try: - if request.output_keys: - reply.output_keys = self._worker.place_output( - request, transformed_output, self._feature_store - ) - else: - reply.outputs = transformed_output.outputs - self._perf_timer.measure_time("assign_output") - except Exception: - logger.exception("Error executing worker") - - if reply.outputs is None or not reply.outputs: - response = build_failure_reply("fail", "Outputs not found.") - else: - reply.status_enum = "complete" - reply.message = "Success" - response = build_reply(reply) - + reply.output_keys = self._worker.place_output( + request, + transformed_output, + self._feature_store, + ) + except Exception as e: + exception_handler( + e, request.callback, "Failed while placing the output." + ) + continue + else: + reply.outputs = transformed_output.outputs + self._perf_timer.measure_time("assign_output") + + + if reply.outputs is None: + response = build_failure_reply("fail", "Outputs not found.") + else: + reply.status_enum = "complete" + reply.message = "Success" response = build_reply(reply) - self._perf_timer.measure_time("build_reply") - serialized_resp = MessageHandler.serialize_response(response) + self._perf_timer.measure_time("build_reply") + + serialized_resp = MessageHandler.serialize_response(response) - self._perf_timer.measure_time("serialize_resp") + self._perf_timer.measure_time("serialize_resp") - if request.callback: - request.callback.send(serialized_resp) - if reply.outputs: - # send tensor data after response - for output in reply.outputs: - request.callback.send(output) - self._perf_timer.measure_time("send") + if request.callback: + request.callback.send(serialized_resp) + if reply.outputs: + # send tensor data after response + for output in reply.outputs: + request.callback.send(output) + self._perf_timer.measure_time("send") self._perf_timer.end_timings() From 14c3e9fec155e561696f310cc42fde060c63c6f5 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 19 Jul 2024 15:20:54 +0200 Subject: [PATCH 44/84] Style --- .../infrastructure/control/devicemanager.py | 4 +-- .../control/requestdispatcher.py | 13 +++---- .../infrastructure/control/workermanager.py | 35 +++++++++---------- smartsim/_core/utils/timings.py | 6 ++++ 4 files changed, 31 insertions(+), 27 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/devicemanager.py b/smartsim/_core/mli/infrastructure/control/devicemanager.py index 1a2a860aa9..14b83a5044 100644 --- a/smartsim/_core/mli/infrastructure/control/devicemanager.py +++ b/smartsim/_core/mli/infrastructure/control/devicemanager.py @@ -25,7 +25,6 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
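# WorkerDevice and BatchQueue both guard themselves with a lock and rely on
# non-blocking acquire so callers skip busy resources instead of waiting on them.
# An illustrative, self-contained sketch of that pattern (names are placeholders):
import threading

class GuardedResource:
    def __init__(self, name: str) -> None:
        self.name = name
        self._lock = threading.RLock()

    def acquire(self, blocking: bool = True, timeout: float = -1) -> bool:
        return self._lock.acquire(blocking=blocking, timeout=timeout)

    def release(self) -> None:
        self._lock.release()

device = GuardedResource("gpu")
if device.acquire(blocking=False):          # returns False immediately if busy
    try:
        pass                                # use the resource here
    finally:
        device.release()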
import typing as t -from contextlib import contextmanager from threading import RLock from types import TracebackType @@ -107,7 +106,8 @@ def get_free_device( return_device = device break - # If the model is not loaded on a free device, load it on another device (if available) + # If the model is not loaded on a free device, + # load it on another device (if available) if return_device is None: for candidate_device in self._devices: if ( diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index 10279c01d7..b63cdcc9ee 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -24,12 +24,13 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -# isort: off +# pylint: disable=import-error # pylint: disable-next=unused-import import dragon from dragon.mpbridge.queues import DragonQueue +# pylint: enable=import-error +# isort: off # isort: on import multiprocessing as mp @@ -157,7 +158,7 @@ def __init__(self, batch_timeout: float, batch_size: int, model_key: str) -> Non self._id = str(uuid.uuid4()) @property - def id(self) -> str: + def queue_id(self) -> str: return self._id def acquire(self, blocking: bool = True, timeout: float = -1) -> t.Optional[bool]: @@ -327,9 +328,9 @@ def run(self) -> None: self._perf_timer.end_timings() - # pylint: disable-next=protected-access - if len(self._perf_timer._timings["r_dispatch"]) == 801: - self._perf_timer.print_timings(True) + + if self._perf_timer.max_length == 801: + self._perf_timer.print_timings(True) @property def task_queue(self) -> DragonQueue: diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 3e39b1731c..b5667293d6 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -24,22 +24,25 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
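# The worker manager and dispatcher record per-stage timings with the PerfTimer
# added earlier in this series. A small usage sketch, assuming the module path
# introduced by this patch; the label and the sleep are made up for illustration:
import time
from smartsim._core.utils.timings import PerfTimer

timer = PerfTimer(prefix="w_")
for _ in range(3):
    timer.start_timings()
    time.sleep(0.01)                   # stand-in for one pipeline stage
    timer.measure_time("execute")      # elapsed time since the previous mark
    timer.end_timings()                # records w_total_time for this iteration
timer.print_timings(to_file=False)     # one column per label, one row per iteration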
-import multiprocessing as mp -import os -import socket -import sys -import typing as t - +# pylint: disable=import-error +# pylint: disable-next=unused-import import dragon -import dragon.data.ddict.ddict as dragon_ddict -import dragon.infrastructure.connection as dragon_connection import dragon.infrastructure.policy as dragon_policy import dragon.infrastructure.process_desc as dragon_process_desc -import dragon.native.group_state as dragon_group_state -import dragon.native.machine as dragon_machine import dragon.native.process as dragon_process import dragon.native.process_group as dragon_process_group +# pylint: enable=import-error + +# isort: off +# isort: on + +import multiprocessing as mp +import os +import socket +import sys +import typing as t + from .....log import get_logger from ....entrypoints.service import Service from ....utils.timings import PerfTimer @@ -263,8 +266,6 @@ def _validate_request(self, request: InferenceRequest) -> bool: def _on_iteration(self) -> None: """Executes calls to the machine learning worker implementation to complete the inference pipeline""" - logger.debug("executing worker manager pipeline") - batch = self._request_dispatcher.task_queue.get() self._perf_timer.start_timings() if batch is None or 0 == len(batch.requests): @@ -292,17 +293,14 @@ def _on_iteration(self) -> None: ) self._perf_timer.measure_time("transform_input") - try: execute_result = self._worker.execute( batch, model_result, transformed_input ) except Exception as e: for request in batch.requests: - exception_handler( - e, request.callback, "Error executing worker." - ) - return + exception_handler(e, request.callback, "Error executing worker.") + return self._perf_timer.measure_time("execute") try: @@ -335,7 +333,6 @@ def _on_iteration(self) -> None: reply.outputs = transformed_output.outputs self._perf_timer.measure_time("assign_output") - if reply.outputs is None: response = build_failure_reply("fail", "Outputs not found.") else: @@ -359,7 +356,7 @@ def _on_iteration(self) -> None: self._perf_timer.end_timings() - if len(self._perf_timer._timings["w_send"]) == 801: + if self._perf_timer.max_length == 801: self._perf_timer.print_timings(True) def _can_shutdown(self) -> bool: diff --git a/smartsim/_core/utils/timings.py b/smartsim/_core/utils/timings.py index 7fa2af04a6..1d35570e65 100644 --- a/smartsim/_core/utils/timings.py +++ b/smartsim/_core/utils/timings.py @@ -79,6 +79,12 @@ def measure_time(self, label: str) -> None: ) self._interm = time.perf_counter() + @property + def max_length(self) -> int: + if len(self._timings) == 0: + return 0 + return max(len(value) for value in self._timings.values()) + def print_timings(self, to_file: bool = False) -> None: print(" ".join(self._timings.keys())) value_array = np.array(list(self._timings.values()), dtype=float) From f93522f6b83cf86e2d9502efcde0561d4b6a6a9f Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Fri, 19 Jul 2024 15:05:58 -0500 Subject: [PATCH 45/84] Working post-merge version --- ex/high_throughput_inference/mock_app.py | 3 + smartsim/_core/entrypoints/service.py | 17 ---- .../_core/launcher/dragon/dragonBackend.py | 2 - .../control/requestdispatcher.py | 81 +++++++++++++++---- .../infrastructure/control/workermanager.py | 47 ++--------- .../mli/infrastructure/worker/torch_worker.py | 42 +++++----- .../_core/mli/infrastructure/worker/worker.py | 17 ++-- smartsim/_core/utils/timings.py | 38 ++++++--- 8 files changed, 132 insertions(+), 115 deletions(-) diff --git a/ex/high_throughput_inference/mock_app.py 
b/ex/high_throughput_inference/mock_app.py index e244c93e0f..eef653791f 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -47,6 +47,9 @@ from smartsim._core.mli.message_handler import MessageHandler from smartsim.log import get_logger +torch.set_num_interop_threads(16) +torch.set_num_threads(1) + logger = get_logger("App") class ProtoClient: diff --git a/smartsim/_core/entrypoints/service.py b/smartsim/_core/entrypoints/service.py index df9c2bbef6..6b4ef74b67 100644 --- a/smartsim/_core/entrypoints/service.py +++ b/smartsim/_core/entrypoints/service.py @@ -103,23 +103,6 @@ def execute(self) -> None: running = True cooldown_start: t.Optional[datetime.datetime] = None - headers = [ - "batch_size", - "w_deserialize", - "w_fetch_model", - "w_load_model", - "w_fetch_input", - "w_transform_input", - "w_execute", - "w_transform_output", - "w_assign_output", - "w_build_reply", - "w_serialize_resp", - "w_send", - ] - - print(",".join(headers)) - while running: self._on_iteration() diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index a6a8700ab0..445538f20e 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -504,8 +504,6 @@ def _start_steps(self) -> None: global_policy = dragon_policy.Policy( placement=dragon_policy.Policy.Placement.HOST_NAME, host_name=hosts[0], - affinity=dragon_policy.Policy.Affinity.SPECIFIC, - cpu_affinity=list(range(32)) + list(range(64, 64 + 32)), ) options = dragon_process_desc.ProcessOptions(make_inf_channels=True) grp = dragon_process_group.ProcessGroup( diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index b63cdcc9ee..c930d7d42c 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -28,6 +28,7 @@ # pylint: disable-next=unused-import import dragon from dragon.mpbridge.queues import DragonQueue + # pylint: enable=import-error # isort: off @@ -49,11 +50,16 @@ from ...comm.channel.channel import CommChannelBase from ...comm.channel.dragonchannel import DragonCommChannel from ...infrastructure.storage.featurestore import FeatureStore +from ...infrastructure.worker.torch_worker import TorchWorker from ...infrastructure.worker.worker import InferenceBatch, InferenceRequest from ...message_handler import MessageHandler from ...mli_schemas.model.model_capnp import Model +from ...mli_schemas.response.response_capnp import ResponseBuilder from ...mli_schemas.tensor.tensor_capnp import TensorDescriptor +if t.TYPE_CHECKING: + from smartsim._core.mli.mli_schemas.response.response_capnp import Status + logger = get_logger("Request Dispatcher") @@ -86,7 +92,6 @@ def deserialize_message( # todo: shouldn't this be `CommChannel.find` instead of `DragonCommChannel` comm_channel = channel_type(callback_key) - # comm_channel = DragonCommChannel(request.replyChannel) input_keys: t.Optional[t.List[str]] = None input_bytes: t.Optional[t.List[bytes]] = None @@ -116,6 +121,37 @@ def deserialize_message( return inference_request +def build_failure_reply(status: "Status", message: str) -> ResponseBuilder: + return MessageHandler.build_response( + status=status, + message=message, + result=[], + custom_attributes=None, + ) + + +def exception_handler( + exc: Exception, reply_channel: t.Optional[CommChannelBase], failure_message: str +) -> 
None: + """ + Logs exceptions and sends a failure response. + + :param exc: The exception to be logged + :param reply_channel: The channel used to send replies + :param failure_message: Failure message to log and send back + """ + logger.exception( + f"{failure_message}\n" + f"Exception type: {type(exc).__name__}\n" + f"Exception message: {str(exc)}" + ) + serialized_resp = MessageHandler.serialize_response( + build_failure_reply("fail", failure_message) + ) + if reply_channel: + reply_channel.send(serialized_resp) + + class WorkerDevice: def __init__(self, name: str) -> None: """Wrapper around a device to keep track of loaded Models and availability @@ -263,7 +299,8 @@ def __init__( self._outgoing_queue: DragonQueue = mp.Queue(maxsize=0) self._feature_store = feature_store self._comm_channel_type = comm_channel_type - self._perf_timer = PerfTimer(prefix="r_") + self._perf_timer = PerfTimer(prefix="r_", debug=False) + self._worker = TorchWorker() def _validate_request(self, request: InferenceRequest) -> bool: """Ensure the request can be processed. @@ -303,21 +340,26 @@ def run(self) -> None: while True: try: bytes_list: t.List[bytes] = self._incoming_channel.recv() - except Exception: pass else: + if not bytes_list: + exception_handler( + ValueError("No request data found"), + None, + "No request data found.", + ) + request_bytes = bytes_list[0] tensor_bytes_list = bytes_list[1:] + self._perf_timer.start_timings() request = deserialize_message(request_bytes, self._comm_channel_type) if request.input_meta and tensor_bytes_list: request.raw_inputs = tensor_bytes_list - self._perf_timer.start_timings() - request = deserialize_message(request_bytes, self._comm_channel_type) self._perf_timer.measure_time("deserialize_message") if not self._validate_request(request): - return + continue self._perf_timer.measure_time("validate_request") self.dispatch(request) self._perf_timer.measure_time("dispatch") @@ -328,7 +370,6 @@ def run(self) -> None: self._perf_timer.end_timings() - if self._perf_timer.max_length == 801: self._perf_timer.print_timings(True) @@ -384,15 +425,25 @@ def _update_model_version(self, model: Model) -> None: def flush_requests(self) -> None: for queue in self._queues: if queue.ready and queue.acquire(blocking=False): + self._perf_timer.measure_time("find_queue") try: - - self._perf_timer.measure_time("find_queue") - self._outgoing_queue.put( - InferenceBatch( - model_key=queue.model_key, requests=queue.flush() - ) + batch = InferenceBatch( + model_key=queue.model_key, requests=queue.flush(), inputs=None ) - self._perf_timer.measure_time("flush_requests") finally: + self._perf_timer.measure_time("flush_requests") queue.release() - break + fetch_results = self._worker.fetch_inputs( + batch=batch, feature_store=self._feature_store + ) + self._perf_timer.measure_time("fetch_input") + transformed_inputs = self._worker.transform_input( + batch=batch, fetch_results=fetch_results + ) + self._perf_timer.measure_time("transform_input") + batch.inputs = transformed_inputs + for request in batch.requests: + request.raw_inputs = [] + request.input_meta = [] + self._outgoing_queue.put(batch) + self._perf_timer.measure_time("put") diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index b5667293d6..d41a09a0d8 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -232,41 +232,12 @@ def _on_start(self) -> None: def 
_on_shutdown(self) -> None: self._dispatcher_process.join() - def _validate_request(self, request: InferenceRequest) -> bool: - """Ensure the request can be processed. - :param request: The request to validate - :return: True if the request is valid, False otherwise""" - if not self._feature_store: - if request.model_key: - logger.error("Unable to load model by key without feature store") - return False - - if request.input_keys: - logger.error("Unable to load inputs by key without feature store") - return False - - if request.output_keys: - logger.error("Unable to persist outputs by key without feature store") - return False - - if not request.model_key and not request.raw_model: - logger.error("Unable to continue without model bytes or feature store key") - return False - - if not request.input_keys and not request.raw_inputs: - logger.error("Unable to continue without input bytes or feature store keys") - return False - - if request.callback is None: - logger.error("No callback channel provided in request") - return False - - return True - def _on_iteration(self) -> None: """Executes calls to the machine learning worker implementation to complete the inference pipeline""" - batch = self._request_dispatcher.task_queue.get() + + batch: InferenceRequest = self._request_dispatcher.task_queue.get() + self._perf_timer.start_timings() if batch is None or 0 == len(batch.requests): return @@ -285,17 +256,11 @@ def _on_iteration(self) -> None: model_result = LoadModelResult(device.get_model(batch.model_key)) self._perf_timer.measure_time("load_model") - fetch_input_results = self._worker.fetch_inputs(batch, self._feature_store) - self._perf_timer.measure_time("fetch_input") - - transformed_input = self._worker.transform_input( - batch, fetch_input_results, self._device - ) - self._perf_timer.measure_time("transform_input") + transformed_input = batch.inputs try: execute_result = self._worker.execute( - batch, model_result, transformed_input + batch, model_result, transformed_input, device.name ) except Exception as e: for request in batch.requests: @@ -305,7 +270,7 @@ def _on_iteration(self) -> None: try: transformed_outputs = self._worker.transform_output( - batch, execute_result, self._device + batch, execute_result, device.name ) except Exception as e: for request in batch.requests: diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index 2cb79767f9..45c9caadb3 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -64,7 +64,9 @@ def load_model( raise ValueError("Unable to load model without reference object") device_to_torch = {"cpu": "cpu", "gpu": "cuda"} - device = device_to_torch[device] + for old, new in device_to_torch.items(): + device = device.replace(old, new) + buffer = io.BytesIO(initial_bytes=model_bytes) model = torch.jit.load(buffer, map_location=device) # type: ignore result = LoadModelResult(model) @@ -72,16 +74,12 @@ def load_model( @staticmethod def transform_input( - batch: InferenceBatch, fetch_results: list[FetchInputResult], device: str + batch: InferenceBatch, fetch_results: list[FetchInputResult] ) -> TransformInputResult: results: list[list[torch.Tensor]] = [] start = 0 slices: list[slice] = [] - device_to_torch = {"cpu": "cpu", "gpu": "cuda"} - for old, new in device_to_torch.items(): - device = device.replace(old, new) - for fetch_result in fetch_results: partial_result = [] if fetch_result.meta is None: @@ 
-107,10 +105,10 @@ def transform_input( result.append( torch.concatenate( [partial_result[t_idx] for partial_result in results] - ).to(device) + ) ) else: - result = [tensor.to(device) for tensor in results[0]] + result = results[0] return TransformInputResult(result, slices) # return data # note: this fails copy test! @@ -121,13 +119,18 @@ def execute( batch: InferenceBatch, load_result: LoadModelResult, transform_result: TransformInputResult, + device: str, ) -> ExecuteResult: if not load_result.model: raise SmartSimError("Model must be loaded to execute") - + device_to_torch = {"cpu": "cpu", "gpu": "cuda"} + for old, new in device_to_torch.items(): + device = device.replace(old, new) model: torch.nn.Module = load_result.model model.eval() - results = [model(tensor).detach() for tensor in transform_result.transformed] + results = [ + model(tensor.to(device)).detach() for tensor in transform_result.transformed + ] execute_result = ExecuteResult(results, transform_result.slices) return execute_result @@ -140,18 +143,13 @@ def transform_output( ) -> list[TransformOutputResult]: transformed_list: list[TransformOutputResult] = [] for result_slice in execute_result.slices: - if result_device != "cpu": - transformed = [ - item.to("cpu") for item in execute_result.predictions[result_slice] - ] - # todo: need the shape from latest schemas added here. - transformed_list.append( - TransformOutputResult(transformed, None, "c", "float32") - ) # fixme - + transformed = [ + item.to("cpu").numpy().tobytes() + for item in execute_result.predictions[result_slice] + ] + # todo: need the shape from latest schemas added here. transformed_list.append( - TransformOutputResult( - execute_result.predictions[result_slice], None, "c", "float32" - ) + TransformOutputResult(transformed, None, "c", "float32") ) # fixme + return transformed_list diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 7448fdfb79..ae0a847aea 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -67,12 +67,6 @@ def __init__( self.batch_size = batch_size -@dataclass -class InferenceBatch: - model_key: str - requests: list[InferenceRequest] - - class InferenceReply: """Internal representation of the reply to a client request for inference""" @@ -154,6 +148,13 @@ def __init__(self, result: bytes) -> None: self.model_bytes: bytes = result +@dataclass +class InferenceBatch: + model_key: str + requests: t.Optional[list[InferenceRequest]] + inputs: t.Optional[list[TransformInputResult]] + + class MachineLearningWorkerCore: """Basic functionality of ML worker that is shared across all worker types""" @@ -274,12 +275,11 @@ def load_model( @staticmethod @abstractmethod def transform_input( - batch: InferenceBatch, fetch_results: list[FetchInputResult], device: str + batch: InferenceBatch, fetch_results: list[FetchInputResult] ) -> TransformInputResult: """Given a collection of data, perform a transformation on the data :param request: The request that triggered the pipeline :param fetch_result: Raw outputs from fetching inputs out of a feature store - :param device: The device on which the transformed input must be placed :return: The transformed inputs wrapped in a InputTransformResult""" @staticmethod @@ -288,6 +288,7 @@ def execute( batch: InferenceBatch, load_result: LoadModelResult, transform_result: TransformInputResult, + device: str, ) -> ExecuteResult: """Execute an ML model on inputs transformed for use 
by the model :param request: The request that triggered the pipeline diff --git a/smartsim/_core/utils/timings.py b/smartsim/_core/utils/timings.py index 1d35570e65..0ac13662a6 100644 --- a/smartsim/_core/utils/timings.py +++ b/smartsim/_core/utils/timings.py @@ -30,15 +30,22 @@ import numpy as np +from ...log import get_logger + +logger = get_logger("PerfTimer") + class PerfTimer: - def __init__(self, filename: str = "timings", prefix: str = ""): + def __init__( + self, filename: str = "timings", prefix: str = "", debug: bool = False + ): self._start: t.Optional[float] = None self._interm: t.Optional[float] = None self._timings: OrderedDict[str, list[t.Union[float, int, str]]] = OrderedDict() self._timing_on = True self._filename = filename self._prefix = prefix + self._debug = debug def _add_label_to_timings(self, label: str) -> None: if label not in self._timings: @@ -55,30 +62,40 @@ def start_timings( ) -> None: if self._timing_on: if first_label is not None and first_value is not None: + self._log(f"{first_label}: {first_value}") self._add_label_to_timings(self._make_label(first_label)) - self._timings[self._make_label(first_label)].append(first_value) + self._timings[self._make_label(first_label)].append( + self._format_number(first_value) + ) self._start = time.perf_counter() self._interm = time.perf_counter() def end_timings(self) -> None: if self._timing_on and self._start is not None: self._add_label_to_timings(self._make_label("total_time")) - self._timings[self._make_label("total_time")].append( - self._format_number(time.perf_counter() - self._start) - ) + delta = self._format_number(time.perf_counter() - self._start) + self._timings[self._make_label("total_time")].append(delta) + self._log(f"total_time: {delta}") self._interm = None def _make_label(self, label: str) -> str: return self._prefix + label + def _get_delta(self) -> float | int: + return time.perf_counter() - self._interm + def measure_time(self, label: str) -> None: if self._timing_on and self._interm is not None: self._add_label_to_timings(self._make_label(label)) - self._timings[self._make_label(label)].append( - self._format_number(time.perf_counter() - self._interm) - ) + delta = self._format_number(self._get_delta()) + self._timings[self._make_label(label)].append(delta) + self._log(f"{label}: {delta}") self._interm = time.perf_counter() + def _log(self, msg: str) -> None: + if self._debug: + logger.info(msg) + @property def max_length(self) -> int: if len(self._timings) == 0: @@ -89,7 +106,8 @@ def print_timings(self, to_file: bool = False) -> None: print(" ".join(self._timings.keys())) value_array = np.array(list(self._timings.values()), dtype=float) value_array = np.transpose(value_array) - for i in range(value_array.shape[0]): - print(" ".join(self._format_number(value) for value in value_array[i])) + if self._debug: + for i in range(value_array.shape[0]): + print(" ".join(self._format_number(value) for value in value_array[i])) if to_file: np.save(self._prefix + self._filename + ".npy", value_array) From 1bd73883243651452da2d26430c932787382e197 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Sat, 20 Jul 2024 12:52:54 -0500 Subject: [PATCH 46/84] Fix indexing in multi-output --- ex/high_throughput_inference/mli_driver.py | 3 +- ex/high_throughput_inference/mock_app.py | 79 ++++++------------- .../mock_app_redis.py | 14 +++- ex/high_throughput_inference/redis_driver.py | 15 ++-- .../standalone_workermanager.py | 2 + .../control/requestdispatcher.py | 6 +- .../infrastructure/control/workermanager.py | 17 
++-- .../mli/infrastructure/worker/torch_worker.py | 9 ++- .../_core/mli/infrastructure/worker/worker.py | 3 +- smartsim/_core/utils/timings.py | 26 +++--- 10 files changed, 77 insertions(+), 97 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index 1d4b121365..a03f391b60 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -31,7 +31,8 @@ worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) -app_rs = exp.create_run_settings(sys.executable, exe_args = [app_script_name, "--device", device]) +app_rs: DragonRunSettings = exp.create_run_settings(sys.executable, exe_args = [app_script_name, "--device", device]) +app_rs.set_tasks_per_node(4) app = exp.create_model("app", run_settings=app_rs) app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index eef653791f..545c18b509 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -41,11 +41,11 @@ import os import time import torch -import numbers -from collections import OrderedDict +from mpi4py import MPI from smartsim._core.mli.message_handler import MessageHandler from smartsim.log import get_logger +from smartsim._core.utils.timings import PerfTimer torch.set_num_interop_threads(16) torch.set_num_threads(1) @@ -54,6 +54,8 @@ class ProtoClient: def __init__(self, timing_on: bool): + comm = MPI.COMM_WORLD + rank = comm.Get_rank() connect_to_infrastructure() ddict_str = os.environ["SS_DRG_DDICT"] self._ddict = DDict.attach(ddict_str) @@ -70,53 +72,14 @@ def __init__(self, timing_on: bool): self._start = None self._interm = None - self._timings: OrderedDict[str, list[numbers.Number]] = OrderedDict() - self._timing_on = timing_on - - def _add_label_to_timings(self, label: str): - if label not in self._timings: - self._timings[label] = [] - - @staticmethod - def _format_number(number: numbers.Number): - return f"{number:0.4e}" - - def start_timings(self, batch_size: int): - if self._timing_on: - self._add_label_to_timings("batch_size") - self._timings["batch_size"].append(batch_size) - self._start = time.perf_counter() - self._interm = time.perf_counter() - - def end_timings(self): - if self._timing_on: - self._add_label_to_timings("total_time") - self._timings["total_time"].append(self._format_number(time.perf_counter()-self._start)) - - def measure_time(self, label: str): - if self._timing_on: - self._add_label_to_timings(label) - self._timings[label].append(self._format_number(time.perf_counter()-self._interm)) - self._interm = time.perf_counter() - - def print_timings(self, to_file: bool = False): - print(" ".join(self._timings.keys())) - value_array = numpy.array([value for value in self._timings.values()], dtype=float) - value_array = numpy.transpose(value_array) - for i in range(value_array.shape[0]): - print(" ".join(self._format_number(value) for value in value_array[i])) - if to_file: - numpy.save("timings.npy", value_array) - numpy.savetxt("timings.txt", value_array) - + self._perf_timer: PerfTimer = PerfTimer(debug=False, timing_on=timing_on, prefix=f"a{rank}_") def run_model(self, model: bytes | str, batch: torch.Tensor): tensors = [batch.numpy()] - self.start_timings(batch.shape[0]) + self._perf_timer.start_timings("batch_size", batch.shape[0]) 
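The hand-rolled timing code above is replaced by the shared PerfTimer, so every run_model call produces one row of measurements keyed by the batch size. A minimal sketch of that start/measure/end pattern, assuming the PerfTimer class from smartsim/_core/utils/timings.py as extended in this series (prefix, timing_on and debug keyword arguments) is importable:

# Sketch of the PerfTimer usage pattern followed by ProtoClient.
# The prefix "a0_" stands in for the per-rank prefix f"a{rank}_".
from smartsim._core.utils.timings import PerfTimer

timer = PerfTimer(prefix="a0_", timing_on=True, debug=False)

for batch_size in (1, 2, 4):
    timer.start_timings("batch_size", batch_size)  # first column of the row
    # ... build the request, send it, wait for the reply ...
    timer.measure_time("send")
    timer.measure_time("receive")
    timer.end_timings()  # records total_time and closes the row

timer.print_timings(to_file=True)  # writes <prefix>timings.npy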
built_tensor_desc = MessageHandler.build_tensor_descriptor( "c", "float32", list(batch.shape)) - self.measure_time("build_tensor_descriptor") - built_model = None + self._perf_timer.measure_time("build_tensor_descriptor") if isinstance(model, str): model_arg = MessageHandler.build_model_key(model) else: @@ -129,22 +92,21 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): output_descriptors=[], custom_attributes=None, ) - self.measure_time("build_request") + self._perf_timer.measure_time("build_request") request_bytes = MessageHandler.serialize_request(request) - self.measure_time("serialize_request") + self._perf_timer.measure_time("serialize_request") with self._to_worker_fli.sendh(timeout=None, stream_channel=self._to_worker_ch) as to_sendh: to_sendh.send_bytes(request_bytes) for t in tensors: to_sendh.send_bytes(t.tobytes()) #TODO NOT FAST ENOUGH!!! # to_sendh.send_bytes(bytes(t.data)) - logger.info(f"Message size: {len(request_bytes)} bytes") - self.measure_time("send") + self._perf_timer.measure_time("send") with self._from_worker_ch.recvh(timeout=None) as from_recvh: resp = from_recvh.recv_bytes(timeout=None) - self.measure_time("receive") + self._perf_timer.measure_time("receive") response = MessageHandler.deserialize_response(resp) - self.measure_time("deserialize_response") + self._perf_timer.measure_time("deserialize_response") # list of data blobs? recv depending on the len(response.result.descriptors)? data_blob = from_recvh.recv_bytes(timeout=None) result = torch.from_numpy( @@ -153,14 +115,17 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): dtype=str(response.result.descriptors[0].dataType), ) ) - self.measure_time("deserialize_tensor") + self._perf_timer.measure_time("deserialize_tensor") - self.end_timings() + self._perf_timer.end_timings() return result def set_model(self, key: str, model: bytes): self._ddict[key] = model + def print_timings(self, to_file: bool): + self._perf_timer.print_timings(to_file) + class ResNetWrapper(): def __init__(self, name: str, model: str): @@ -193,12 +158,12 @@ def name(self): client = ProtoClient(timing_on=True) client.set_model(resnet.name, resnet.model) - total_iterations = 100 + TOTAL_ITERATIONS = 100 - for batch_size in [1, 2, 4, 8, 16, 32, 64, 128]: - logger.info(f"Batch size: {batch_size}") - for iteration_number in range(total_iterations + int(batch_size==1)): + for b_size in [1, 2, 4, 8, 16, 32, 64, 128]: + logger.info(f"Batch size: {b_size}") + for iteration_number in range(TOTAL_ITERATIONS + int(b_size==1)): logger.info(f"Iteration: {iteration_number}") - client.run_model(resnet.name, resnet.get_batch(batch_size)) + client.run_model(resnet.name, resnet.get_batch(b_size)) client.print_timings(to_file=True) \ No newline at end of file diff --git a/ex/high_throughput_inference/mock_app_redis.py b/ex/high_throughput_inference/mock_app_redis.py index c56b4fb8b4..c0e67f82df 100644 --- a/ex/high_throughput_inference/mock_app_redis.py +++ b/ex/high_throughput_inference/mock_app_redis.py @@ -29,6 +29,7 @@ import numpy import time import torch +from mpi4py import MPI from smartsim.log import get_logger from smartredis import Client @@ -56,6 +57,9 @@ def name(self): if __name__ == "__main__": + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + parser = argparse.ArgumentParser("Mock application") parser.add_argument("--device", default="cpu") args = parser.parse_args() @@ -73,9 +77,11 @@ def name(self): timing = [batch_size] logger.info(f"Iteration: {iteration_number}") start = time.perf_counter() - 
client.put_tensor(name="batch", data=resnet.get_batch(batch_size).numpy()) - client.run_model(name=resnet.name, inputs=["batch"], outputs=["result"]) - result = client.get_tensor(name="result") + input_name = f"batch_{rank}" + output_name = f"result_{rank}" + client.put_tensor(name=input_name, data=resnet.get_batch(batch_size).numpy()) + client.run_model(name=resnet.name, inputs=[input_name], outputs=[output_name]) + result = client.get_tensor(name=output_name) end = time.perf_counter() timing.append(end-start) timings.append(timing) @@ -83,6 +89,6 @@ def name(self): timings_np = numpy.asarray(timings) - numpy.save("timings.npy", timings_np) + numpy.save(f"timings_{rank}.npy", timings_np) for timing in timings: print(" ".join(str(t) for t in timing)) diff --git a/ex/high_throughput_inference/redis_driver.py b/ex/high_throughput_inference/redis_driver.py index ceddba4ef7..6a8b00c2a8 100644 --- a/ex/high_throughput_inference/redis_driver.py +++ b/ex/high_throughput_inference/redis_driver.py @@ -29,23 +29,24 @@ from smartsim import Experiment from smartsim.status import TERMINAL_STATUSES import time -import typing as t -device = "gpu" +DEVICE = "gpu" filedir = os.path.dirname(__file__) app_script_name = os.path.join(filedir, "mock_app_redis.py") -model_name = os.path.join(filedir, f"resnet50.{device.upper()}.pt") +model_name = os.path.join(filedir, f"resnet50.{DEVICE.upper()}.pt") -exp_path = os.path.join(filedir, "redis_ai") +exp_path = os.path.join(filedir, "redis_ai_multi") os.makedirs(exp_path, exist_ok=True) -exp = Experiment("redis_ai", launcher="slurm", exp_path=exp_path) +exp = Experiment("redis_ai_multi", launcher="slurm", exp_path=exp_path) db = exp.create_database(interface="hsn0") -app_rs = exp.create_run_settings(sys.executable, exe_args = [app_script_name, "--device", device]) +app_rs = exp.create_run_settings( + sys.executable, exe_args = [app_script_name, "--device", DEVICE] + ) app_rs.set_nodes(1) -app_rs.set_tasks(1) +app_rs.set_tasks(4) app = exp.create_model("app", run_settings=app_rs) app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index f781444d81..7ccfdc21c4 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -100,5 +100,7 @@ cooldown=10, comm_channel_type=DragonCommChannel, device = args.device, + batch_size=4, + batch_timeout=0.1, ) worker_manager.execute() diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index c930d7d42c..c45edb33f2 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -299,7 +299,7 @@ def __init__( self._outgoing_queue: DragonQueue = mp.Queue(maxsize=0) self._feature_store = feature_store self._comm_channel_type = comm_channel_type - self._perf_timer = PerfTimer(prefix="r_", debug=False) + self._perf_timer = PerfTimer(prefix="r_", debug=True) self._worker = TorchWorker() def _validate_request(self, request: InferenceRequest) -> bool: @@ -370,8 +370,8 @@ def run(self) -> None: self._perf_timer.end_timings() - if self._perf_timer.max_length == 801: - self._perf_timer.print_timings(True) + if self._perf_timer.max_length == 4*801: + self._perf_timer.print_timings(False) @property def task_queue(self) -> DragonQueue: diff --git 
a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index d41a09a0d8..159ce10478 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -180,7 +180,7 @@ def __init__( """Dispatcher used to batch requests""" self._device_manager: DeviceManager = DeviceManager([WorkerDevice("gpu")]) - self._perf_timer = PerfTimer(prefix="w_") + self._perf_timer = PerfTimer(prefix="w_", debug=False) try: mp.set_start_method("dragon") @@ -192,17 +192,17 @@ def __init__( self._dispatcher_process = self._create_local_dispatcher_process() def _create_local_dispatcher_process(self) -> dragon_process_group.ProcessGroup: + dispatcher_cpus = 2 if sys.platform != "darwin": self_affinity: list[int] = list(os.sched_getaffinity(os.getpid())) - os.sched_setaffinity(os.getpid(), self_affinity[:-8]) + os.sched_setaffinity(os.getpid(), self_affinity[:-dispatcher_cpus]) else: self_affinity: list[int] = [] global_policy = dragon_policy.Policy( placement=dragon_policy.Policy.Placement.HOST_NAME, host_name=socket.gethostname(), affinity=dragon_policy.Policy.Affinity.SPECIFIC, - cpu_affinity=self_affinity[-8:], - device=dragon_policy.Policy.Device.CPU, + cpu_affinity=self_affinity[-dispatcher_cpus:], ) options = dragon_process_desc.ProcessOptions(make_inf_channels=True) grp = dragon_process_group.ProcessGroup( @@ -212,8 +212,7 @@ def _create_local_dispatcher_process(self) -> dragon_process_group.ProcessGroup: placement=dragon_policy.Policy.Placement.HOST_NAME, host_name=socket.gethostname(), affinity=dragon_policy.Policy.Affinity.SPECIFIC, - cpu_affinity=self_affinity[-8:], - device=dragon_policy.Policy.Device.CPU, + cpu_affinity=self_affinity[-dispatcher_cpus:], ) tmp_proc = dragon_process.ProcessTemplate( target=self._request_dispatcher.run, @@ -243,7 +242,6 @@ def _on_iteration(self) -> None: return self._perf_timer.measure_time("flush_requests") - # logger.info(f"Got batch of {len(batch.requests)} requests, acquiring device") device: WorkerDevice = next( self._device_manager.get_free_device( worker=self._worker, @@ -270,7 +268,7 @@ def _on_iteration(self) -> None: try: transformed_outputs = self._worker.transform_output( - batch, execute_result, device.name + batch, execute_result ) except Exception as e: for request in batch.requests: @@ -281,6 +279,7 @@ def _on_iteration(self) -> None: self._perf_timer.measure_time("transform_output") for request, transformed_output in zip(batch.requests, transformed_outputs): + print(len(transformed_output.outputs), flush=True) reply = InferenceReply() if request.output_keys: try: @@ -321,7 +320,7 @@ def _on_iteration(self) -> None: self._perf_timer.end_timings() - if self._perf_timer.max_length == 801: + if self._perf_timer.max_length == 4*801: self._perf_timer.print_timings(True) def _can_shutdown(self) -> bool: diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index 45c9caadb3..cc70c9451c 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -27,6 +27,7 @@ import io import numpy as np +import pickle import torch from .....error import SmartSimError @@ -44,7 +45,7 @@ ) torch.set_num_threads(1) -torch.set_num_interop_threads(16) +torch.set_num_interop_threads(2) logger = get_logger(__name__) @@ -139,13 +140,13 @@ def execute( def transform_output( batch: InferenceBatch, 
execute_result: ExecuteResult, - result_device: str, ) -> list[TransformOutputResult]: transformed_list: list[TransformOutputResult] = [] for result_slice in execute_result.slices: + print(result_slice, flush=True) transformed = [ - item.to("cpu").numpy().tobytes() - for item in execute_result.predictions[result_slice] + item[result_slice].to("cpu").numpy().tobytes() + for item in execute_result.predictions ] # todo: need the shape from latest schemas added here. transformed_list.append( diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index ae0a847aea..f0074e474e 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -299,11 +299,10 @@ def execute( @staticmethod @abstractmethod def transform_output( - batch: InferenceBatch, execute_result: ExecuteResult, result_device: str + batch: InferenceBatch, execute_result: ExecuteResult ) -> t.List[TransformOutputResult]: """Given inference results, perform transformations required to transmit results to the requestor. :param request: The request that triggered the pipeline :param execute_result: The result of inference wrapped in an ExecuteResult - :param result_device: The device on which the result of inference is placed :return:""" diff --git a/smartsim/_core/utils/timings.py b/smartsim/_core/utils/timings.py index 0ac13662a6..154ebb67b8 100644 --- a/smartsim/_core/utils/timings.py +++ b/smartsim/_core/utils/timings.py @@ -37,7 +37,7 @@ class PerfTimer: def __init__( - self, filename: str = "timings", prefix: str = "", debug: bool = False + self, filename: str = "timings", prefix: str = "", timing_on: bool = True, debug: bool = False ): self._start: t.Optional[float] = None self._interm: t.Optional[float] = None @@ -62,34 +62,40 @@ def start_timings( ) -> None: if self._timing_on: if first_label is not None and first_value is not None: - self._log(f"{first_label}: {first_value}") - self._add_label_to_timings(self._make_label(first_label)) - self._timings[self._make_label(first_label)].append( - self._format_number(first_value) + mod_label = self._make_label(first_label) + value = self._format_number(first_value) + self._log(f"Started timing: {first_label}: {value}") + self._add_label_to_timings(mod_label) + self._timings[mod_label].append( + value ) self._start = time.perf_counter() self._interm = time.perf_counter() def end_timings(self) -> None: if self._timing_on and self._start is not None: - self._add_label_to_timings(self._make_label("total_time")) + mod_label = self._make_label("total_time") + self._add_label_to_timings(mod_label) delta = self._format_number(time.perf_counter() - self._start) self._timings[self._make_label("total_time")].append(delta) - self._log(f"total_time: {delta}") + self._log(f"Finished timing: {mod_label}: {delta}") self._interm = None def _make_label(self, label: str) -> str: return self._prefix + label def _get_delta(self) -> float | int: + if self._interm is None: + return 0 return time.perf_counter() - self._interm def measure_time(self, label: str) -> None: if self._timing_on and self._interm is not None: - self._add_label_to_timings(self._make_label(label)) + mod_label = self._make_label(label) + self._add_label_to_timings(mod_label) delta = self._format_number(self._get_delta()) - self._timings[self._make_label(label)].append(delta) - self._log(f"{label}: {delta}") + self._timings[mod_label].append(delta) + self._log(f"{mod_label}: {delta}") self._interm = 
time.perf_counter() def _log(self, msg: str) -> None: From d1e9639260010d706512dbb86020a3b441e45468 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Sun, 21 Jul 2024 18:24:46 -0500 Subject: [PATCH 47/84] Almost good results --- ex/high_throughput_inference/mli_driver.py | 21 ++- ex/high_throughput_inference/mock_app.py | 4 +- .../standalone_workermanager.py | 2 +- .../_core/mli/comm/channel/dragonchannel.py | 5 +- smartsim/_core/mli/comm/channel/dragonfli.py | 6 +- .../control/requestdispatcher.py | 24 ++-- .../infrastructure/control/workermanager.py | 35 +++-- .../mli/infrastructure/worker/torch_worker.py | 135 +++++++++++++----- .../_core/mli/infrastructure/worker/worker.py | 4 +- 9 files changed, 159 insertions(+), 77 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index a03f391b60..c7c5445b8a 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -1,5 +1,3 @@ - - import os import base64 import cloudpickle @@ -27,16 +25,27 @@ torch_worker_str = base64.b64encode(cloudpickle.dumps(TorchWorker)).decode("ascii") -worker_manager_rs: DragonRunSettings = exp.create_run_settings(sys.executable, [worker_manager_script_name, "--device", device, "--worker_class", torch_worker_str]) +worker_manager_rs: DragonRunSettings = exp.create_run_settings( + sys.executable, + [ + worker_manager_script_name, + "--device", + device, + "--worker_class", + torch_worker_str, + ], +) worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) -app_rs: DragonRunSettings = exp.create_run_settings(sys.executable, exe_args = [app_script_name, "--device", device]) +app_rs: DragonRunSettings = exp.create_run_settings( + sys.executable, + exe_args=[app_script_name, "--device", device], +) app_rs.set_tasks_per_node(4) app = exp.create_model("app", run_settings=app_rs) app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) - exp.generate(worker_manager, app, overwrite=True) exp.start(worker_manager, app, block=False) @@ -50,4 +59,4 @@ exp.stop(app) break -print("Exiting.") \ No newline at end of file +print("Exiting.") diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 545c18b509..e497c1fdee 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -160,10 +160,12 @@ def name(self): TOTAL_ITERATIONS = 100 - for b_size in [1, 2, 4, 8, 16, 32, 64, 128]: + for log2_bsize in range(7): + b_size: int = 2**log2_bsize logger.info(f"Batch size: {b_size}") for iteration_number in range(TOTAL_ITERATIONS + int(b_size==1)): logger.info(f"Iteration: {iteration_number}") client.run_model(resnet.name, resnet.get_batch(b_size)) + logger.info(client._perf_timer.get_last("total_time")) client.print_timings(to_file=True) \ No newline at end of file diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index 7ccfdc21c4..8c870d1b95 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -101,6 +101,6 @@ comm_channel_type=DragonCommChannel, device = args.device, batch_size=4, - batch_timeout=0.1, + batch_timeout=0.0005, # 1e-3 is the best with ResNet50 ) worker_manager.execute() diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py 
b/smartsim/_core/mli/comm/channel/dragonchannel.py index a45adaee33..1370c57452 100644 --- a/smartsim/_core/mli/comm/channel/dragonchannel.py +++ b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -24,7 +24,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import sys import typing as t import smartsim._core.mli.comm.channel.channel as cch @@ -52,6 +51,6 @@ def send(self, value: bytes) -> None: def recv(self) -> t.List[bytes]: """Receieve a message through the underlying communication channel :returns: the received message""" - with self._channel.recvh(timeout=None) as recvh: - message_bytes: bytes = recvh.recv_bytes(timeout=None) + with self._channel.recvh(timeout=0.01) as recvh: + message_bytes: bytes = recvh.recv_bytes(timeout=1) return [message_bytes] diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index 28b4c2bf3b..3d1ed3a1f6 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -62,11 +62,11 @@ def recv(self) -> t.List[bytes]: :returns: the received message""" messages = [] eot = False - with self._fli.recvh(timeout=None) as recvh: + with self._fli.recvh(timeout=0.01) as recvh: while not eot: try: - message, _ = recvh.recv_bytes(timeout=None) + message, _ = recvh.recv_bytes(timeout=1) messages.append(message) - except fli.FLIEOT as exc: + except fli.FLIEOT: eot = True return messages diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index c45edb33f2..d8ac4e2b7c 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -38,14 +38,15 @@ import time import typing as t import uuid +from concurrent.futures import Future, ThreadPoolExecutor from queue import Empty, Full, Queue -from threading import Lock +from threading import RLock from types import TracebackType from packaging.version import Version from .....error import SmartSimError -from .....log import get_logger +from .....log import ContextThread, get_logger from ....utils.timings import PerfTimer from ...comm.channel.channel import CommChannelBase from ...comm.channel.dragonchannel import DragonCommChannel @@ -161,7 +162,7 @@ def __init__(self, name: str) -> None: """The name used by the toolkit to identify this device""" self._models: dict[str, t.Any] = {} """Dictionary of model key to model for models stored on this device""" - self._lock = Lock() + self._lock = RLock() """Lock to ensure only one thread at the time accesses this device""" def acquire(self, blocking: bool = True, timeout: float = -1) -> t.Optional[bool]: @@ -190,7 +191,7 @@ def __init__(self, batch_timeout: float, batch_size: int, model_key: str) -> Non self._first_put: t.Optional[float] = None self._disposable = False self._model_key = model_key - self._flush_lock = Lock() + self._flush_lock = RLock() self._id = str(uuid.uuid4()) @property @@ -294,12 +295,12 @@ def __init__( self._model_name_to_key: dict[str, str] = {} self._batch_timeout = batch_timeout self._batch_size = batch_size - self._queue_swap_lock: t.Optional[Lock] = None + self._queue_swap_lock: t.Optional[RLock] = None self._incoming_channel = incoming_channel self._outgoing_queue: DragonQueue = mp.Queue(maxsize=0) self._feature_store = feature_store self._comm_channel_type = comm_channel_type - 
self._perf_timer = PerfTimer(prefix="r_", debug=True) + self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=False) self._worker = TorchWorker() def _validate_request(self, request: InferenceRequest) -> bool: @@ -334,7 +335,7 @@ def _validate_request(self, request: InferenceRequest) -> bool: return True def run(self) -> None: - self._queue_swap_lock = Lock() + self._queue_swap_lock = RLock() if self._incoming_channel is None: raise SmartSimError("No incoming channel for dispatcher") while True: @@ -357,11 +358,14 @@ def run(self) -> None: request = deserialize_message(request_bytes, self._comm_channel_type) if request.input_meta and tensor_bytes_list: request.raw_inputs = tensor_bytes_list + self._perf_timer.measure_time("deserialize_message") if not self._validate_request(request): continue + self._perf_timer.measure_time("validate_request") self.dispatch(request) + self._perf_timer.measure_time("dispatch") finally: self.flush_requests() @@ -370,9 +374,6 @@ def run(self) -> None: self._perf_timer.end_timings() - if self._perf_timer.max_length == 4*801: - self._perf_timer.print_timings(False) - @property def task_queue(self) -> DragonQueue: return self._outgoing_queue @@ -425,6 +426,7 @@ def _update_model_version(self, model: Model) -> None: def flush_requests(self) -> None: for queue in self._queues: if queue.ready and queue.acquire(blocking=False): + self._perf_timer.start_timings() self._perf_timer.measure_time("find_queue") try: batch = InferenceBatch( @@ -445,5 +447,7 @@ def flush_requests(self) -> None: for request in batch.requests: request.raw_inputs = [] request.input_meta = [] + self._outgoing_queue.put(batch) self._perf_timer.measure_time("put") + self._perf_timer.end_timings() diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 159ce10478..65111fe482 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -51,8 +51,8 @@ from ...infrastructure.environmentloader import EnvironmentConfigLoader from ...infrastructure.storage.featurestore import FeatureStore from ...infrastructure.worker.worker import ( + InferenceBatch, InferenceReply, - InferenceRequest, LoadModelResult, MachineLearningWorkerBase, ) @@ -178,21 +178,21 @@ def __init__( feature_store=self._feature_store, ) """Dispatcher used to batch requests""" - self._device_manager: DeviceManager = DeviceManager([WorkerDevice("gpu")]) - - self._perf_timer = PerfTimer(prefix="w_", debug=False) + self._device_manager: DeviceManager = DeviceManager( + [WorkerDevice(f"gpu:{idx}") for idx in range(4)] + ) + self._device_idx: int = 0 + self._perf_timer = PerfTimer(prefix="w_", debug=False, timing_on=False) try: mp.set_start_method("dragon") except RuntimeError: pass - # self._dispatcher_process = mp.Process( - # target=self._request_dispatcher.run, name="Dispatcher" - # ) + self._dispatcher_process = self._create_local_dispatcher_process() def _create_local_dispatcher_process(self) -> dragon_process_group.ProcessGroup: - dispatcher_cpus = 2 + dispatcher_cpus = 16 if sys.platform != "darwin": self_affinity: list[int] = list(os.sched_getaffinity(os.getpid())) os.sched_setaffinity(os.getpid(), self_affinity[:-dispatcher_cpus]) @@ -235,7 +235,7 @@ def _on_iteration(self) -> None: """Executes calls to the machine learning worker implementation to complete the inference pipeline""" - batch: InferenceRequest = self._request_dispatcher.task_queue.get() + batch: 
InferenceBatch = self._request_dispatcher.task_queue.get() self._perf_timer.start_timings() if batch is None or 0 == len(batch.requests): @@ -254,6 +254,14 @@ def _on_iteration(self) -> None: model_result = LoadModelResult(device.get_model(batch.model_key)) self._perf_timer.measure_time("load_model") + if batch.inputs is None: + for request in batch.requests: + exception_handler( + ValueError("Error batching inputs"), + request.callback, + "Error batching inputs.", + ) + return transformed_input = batch.inputs try: @@ -267,9 +275,7 @@ def _on_iteration(self) -> None: self._perf_timer.measure_time("execute") try: - transformed_outputs = self._worker.transform_output( - batch, execute_result - ) + transformed_outputs = self._worker.transform_output(batch, execute_result) except Exception as e: for request in batch.requests: exception_handler( @@ -279,7 +285,6 @@ def _on_iteration(self) -> None: self._perf_timer.measure_time("transform_output") for request, transformed_output in zip(batch.requests, transformed_outputs): - print(len(transformed_output.outputs), flush=True) reply = InferenceReply() if request.output_keys: try: @@ -320,8 +325,8 @@ def _on_iteration(self) -> None: self._perf_timer.end_timings() - if self._perf_timer.max_length == 4*801: - self._perf_timer.print_timings(True) + # if self._perf_timer.max_length == 4 * 801: + # self._perf_timer.print_timings(True) def _can_shutdown(self) -> bool: """Return true when the criteria to shut down the service are met.""" diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index cc70c9451c..52a2698467 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -25,9 +25,9 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
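The transform_input rewrite below batches requests by concatenating their raw byte payloads and rebuilding one tensor per input, while recording a slice per request so the batched result can be split again after execution. A self-contained sketch of that idea using plain numpy/torch and illustrative names (batch_raw_inputs is not part of the smartsim API):

# Batching sketch: join raw byte payloads from several requests into a single
# tensor and keep per-request slices along the first (sample) dimension.
import numpy as np
import torch


def batch_raw_inputs(payloads: list[bytes], dims: list[list[int]], dtype: str = "float32"):
    slices: list[slice] = []
    total = 0
    for shape in dims:
        # assumption: the first dimension of every payload is the sample count
        slices.append(slice(total, total + shape[0]))
        total += shape[0]
    # one join of the byte buffers instead of stacking per-request tensors
    all_bytes = b"".join(payloads)
    batched = torch.from_numpy(
        np.frombuffer(all_bytes, dtype=dtype).reshape([total, *dims[0][1:]])
    )
    return batched, slices


# Two requests with 2 and 3 samples of shape (4,) become one (5, 4) tensor
reqs = [np.random.rand(2, 4).astype("float32"), np.random.rand(3, 4).astype("float32")]
batched, slices = batch_raw_inputs([r.tobytes() for r in reqs], [list(r.shape) for r in reqs])
assert batched.shape == (5, 4)
assert [s.stop - s.start for s in slices] == [2, 3]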
import io +from concurrent.futures import Future, ThreadPoolExecutor import numpy as np -import pickle import torch from .....error import SmartSimError @@ -44,7 +44,7 @@ TransformOutputResult, ) -torch.set_num_threads(1) +torch.set_num_threads(4) torch.set_num_interop_threads(2) logger = get_logger(__name__) @@ -70,6 +70,7 @@ def load_model( buffer = io.BytesIO(initial_bytes=model_bytes) model = torch.jit.load(buffer, map_location=device) # type: ignore + model.eval() result = LoadModelResult(model) return result @@ -77,42 +78,99 @@ def load_model( def transform_input( batch: InferenceBatch, fetch_results: list[FetchInputResult] ) -> TransformInputResult: - results: list[list[torch.Tensor]] = [] - start = 0 + results: list[torch.Tensor] = [] + total_samples = 0 slices: list[slice] = [] - for fetch_result in fetch_results: - partial_result = [] - if fetch_result.meta is None: - raise ValueError("Cannot reconstruct tensor without meta information") - for idx, (item, item_meta) in enumerate( - zip(fetch_result.inputs, fetch_result.meta) + all_dims: list[list[int]] = [] + all_dtypes: list[str] = [] + if fetch_results[0].meta is None: + raise ValueError("Cannot reconstruct tensor without meta information") + # Traverse inputs to get total number of samples and compute slices + # Assumption: first dimension is samples, all tensors in the same input + # have same number of samples + # thus we only look at the first tensor for each input + for res_idx, fetch_result in enumerate(fetch_results): + if fetch_result.meta is None or any( + item_meta is None for item_meta in fetch_result.meta ): - tensor_desc: tensor_capnp.TensorDescriptor = item_meta - partial_result.append( - torch.tensor( - np.frombuffer(item, dtype=str(tensor_desc.dataType)) - ).reshape(tuple(dim for dim in tensor_desc.dimensions)) - ) - if idx == 0: - num_samples = tensor_desc.dimensions[0] - slices.append(slice(start, start + num_samples)) - start = start + num_samples - results.append(partial_result) - - result: list[torch.Tensor] = [] - if len(batch.requests) > 1: - for t_idx in range(len(results[0])): - result.append( - torch.concatenate( - [partial_result[t_idx] for partial_result in results] - ) + raise ValueError("Cannot reconstruct tensor without meta information") + first_tensor_desc: tensor_capnp.TensorDescriptor = fetch_result.meta[0] + num_samples = first_tensor_desc.dimensions[0] + slices.append(slice(total_samples, total_samples + num_samples)) + total_samples = total_samples + num_samples + + if res_idx == len(fetch_results)-1: + # For each tensor in the last input, get remaining dimensions + # Assumptions: all inputs have the same number of tensors and + # last N-1 dimensions match across inputs for corresponding tensors + # thus: resulting array will be of size (num_samples, all_other_dims) + for item_meta in fetch_result.meta: + tensor_desc: tensor_capnp.TensorDescriptor = item_meta + tensor_dims = list(tensor_desc.dimensions) + all_dims.append([total_samples, *tensor_dims[1:]]) + all_dtypes.append(str(tensor_desc.dataType)) + + for result_tensor_idx, (dims, dtype) in enumerate(zip(all_dims, all_dtypes)): + # List comprehension concatenation can be faster sometimes + all_bytes = b"".join( + [ + fetch_result.inputs[result_tensor_idx] + for fetch_result in fetch_results + ] + ) + + results.append( + torch.from_numpy( + np.frombuffer( + all_bytes, + dtype=dtype, + ).reshape(dims) ) - else: - result = results[0] - - return TransformInputResult(result, slices) - # return data # note: this fails copy test! 
+ ) + + return TransformInputResult(results, slices) + + # @staticmethod + # def _transform_input( + # batch: InferenceBatch, fetch_results: list[FetchInputResult] + # ) -> TransformInputResult: + # results: list[list[torch.Tensor]] = [] + # start = 0 + # slices: list[slice] = [] + + # for fetch_result in fetch_results: + # partial_result = [] + # if fetch_result.meta is None: + # raise ValueError("Cannot reconstruct tensor without meta information") + # for idx, (item, item_meta) in enumerate( + # zip(fetch_result.inputs, fetch_result.meta) + # ): + # tensor_desc: tensor_capnp.TensorDescriptor = item_meta + # partial_result.append( + # torch.tensor( + # np.frombuffer(item, dtype=str(tensor_desc.dataType)) + # ).reshape(tuple(dim for dim in tensor_desc.dimensions)) + # ) + # if idx == 0: + # num_samples = tensor_desc.dimensions[0] + # slices.append(slice(start, start + num_samples)) + # start = start + num_samples + # results.append(partial_result) + + # result: list[torch.Tensor] = [] + # if len(batch.requests) > 1: + # for t_idx in range(len(results[0])): + # result.append( + # torch.concatenate( + # [partial_result[t_idx] for partial_result in results] + # ) + # ) + # else: + # result = results[0] + + # return TransformInputResult(result, slices) + # return data # note: this fails copy test! # pylint: disable-next=unused-argument @staticmethod @@ -129,10 +187,14 @@ def execute( device = device.replace(old, new) model: torch.nn.Module = load_result.model model.eval() + # print([tensor.shape for tensor in transform_result.transformed]) + # torch.cuda.empty_cache() results = [ model(tensor.to(device)).detach() for tensor in transform_result.transformed ] + transform_result.transformed = [] + execute_result = ExecuteResult(results, transform_result.slices) return execute_result @@ -143,9 +205,8 @@ def transform_output( ) -> list[TransformOutputResult]: transformed_list: list[TransformOutputResult] = [] for result_slice in execute_result.slices: - print(result_slice, flush=True) transformed = [ - item[result_slice].to("cpu").numpy().tobytes() + item[result_slice].cpu().numpy().tobytes() for item in execute_result.predictions ] # todo: need the shape from latest schemas added here. 
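transform_output is the inverse of the batching step above: each recorded slice selects one request's rows from every batched prediction tensor, moves them to the CPU and serializes them to raw bytes. A minimal sketch of that unbatching step (split_outputs is an illustrative name, not part of the worker API):

# Unbatching sketch: one slice per request, applied to every output tensor of
# the batched forward pass, serialized to bytes for the reply.
import torch


def split_outputs(predictions: list[torch.Tensor], slices: list[slice]) -> list[list[bytes]]:
    per_request: list[list[bytes]] = []
    for result_slice in slices:
        per_request.append(
            [tensor[result_slice].cpu().numpy().tobytes() for tensor in predictions]
        )
    return per_request


# One output tensor for a batch of 5 samples, split back into 2 + 3 samples
preds = [torch.arange(10.0).reshape(5, 2)]
parts = split_outputs(preds, [slice(0, 2), slice(2, 5)])
assert len(parts) == 2
assert parts[0][0] == preds[0][:2].numpy().tobytes()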
@@ -153,4 +214,6 @@ def transform_output( TransformOutputResult(transformed, None, "c", "float32") ) # fixme + execute_result.predictions = [] + return transformed_list diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index f0074e474e..a7dc6811da 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -151,8 +151,8 @@ def __init__(self, result: bytes) -> None: @dataclass class InferenceBatch: model_key: str - requests: t.Optional[list[InferenceRequest]] - inputs: t.Optional[list[TransformInputResult]] + requests: list[InferenceRequest] + inputs: t.Optional[TransformInputResult] class MachineLearningWorkerCore: From 91ffaee5d802c4a72e801cd7617f4d65ab2bb1b4 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Sun, 21 Jul 2024 18:24:59 -0500 Subject: [PATCH 48/84] New timings API --- smartsim/_core/utils/timings.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/smartsim/_core/utils/timings.py b/smartsim/_core/utils/timings.py index 154ebb67b8..c8f6c71003 100644 --- a/smartsim/_core/utils/timings.py +++ b/smartsim/_core/utils/timings.py @@ -37,12 +37,16 @@ class PerfTimer: def __init__( - self, filename: str = "timings", prefix: str = "", timing_on: bool = True, debug: bool = False + self, + filename: str = "timings", + prefix: str = "", + timing_on: bool = True, + debug: bool = False, ): self._start: t.Optional[float] = None self._interm: t.Optional[float] = None self._timings: OrderedDict[str, list[t.Union[float, int, str]]] = OrderedDict() - self._timing_on = True + self._timing_on = timing_on self._filename = filename self._prefix = prefix self._debug = debug @@ -66,9 +70,7 @@ def start_timings( value = self._format_number(first_value) self._log(f"Started timing: {first_label}: {value}") self._add_label_to_timings(mod_label) - self._timings[mod_label].append( - value - ) + self._timings[mod_label].append(value) self._start = time.perf_counter() self._interm = time.perf_counter() @@ -89,6 +91,15 @@ def _get_delta(self) -> float | int: return 0 return time.perf_counter() - self._interm + def get_last(self, label: str) -> str: + mod_label = self._make_label(label) + if mod_label in self._timings: + value = self._timings[mod_label][-1] + if value: + return f"{label}: {value}" + + return "Not measured yet" + def measure_time(self, label: str) -> None: if self._timing_on and self._interm is not None: mod_label = self._make_label(label) From b9e9796f7065501cc77c8aebebcc421a9fac9f00 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 23 Jul 2024 16:51:32 -0500 Subject: [PATCH 49/84] Pre-cleanup, best results so far --- ex/high_throughput_inference/mli_driver.py | 11 +- ex/high_throughput_inference/mock_app.py | 25 ++- .../standalone_workermanager.py | 4 +- .../_core/launcher/dragon/dragonBackend.py | 5 +- .../_core/mli/comm/channel/dragonchannel.py | 4 +- smartsim/_core/mli/comm/channel/dragonfli.py | 4 +- .../control/requestdispatcher.py | 15 +- .../infrastructure/control/workermanager.py | 41 +++-- .../mli/infrastructure/worker/torch_worker.py | 155 +++++++++--------- .../_core/mli/infrastructure/worker/worker.py | 14 +- 10 files changed, 159 insertions(+), 119 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index c7c5445b8a..1d1642567c 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -19,7 +19,7 @@ 
os.environ["SMARTSIM_DRAGON_TRANSPORT"] = transport -exp_path = os.path.join(filedir, f"MLI_proto_batch_{transport.upper()}") +exp_path = os.path.join(filedir, f"MLI_proto_{transport.upper()}") os.makedirs(exp_path, exist_ok=True) exp = Experiment("MLI_proto", launcher="dragon", exp_path=exp_path) @@ -35,6 +35,11 @@ torch_worker_str, ], ) +aff = [] +for i in range(32): + aff.append(i) + # aff.append(i+64) +worker_manager_rs.set_cpu_affinity(aff) worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) @@ -42,7 +47,9 @@ sys.executable, exe_args=[app_script_name, "--device", device], ) -app_rs.set_tasks_per_node(4) +app_rs.set_tasks_per_node(1) + + app = exp.create_model("app", run_settings=app_rs) app.attach_generator_files(to_copy=[app_script_name], to_symlink=[model_name]) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index e497c1fdee..2a76fdbe9d 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -52,6 +52,8 @@ logger = get_logger("App") +CHECK_RESULTS_AND_MAKE_ALL_SLOWER = False + class ProtoClient: def __init__(self, timing_on: bool): comm = MPI.COMM_WORLD @@ -70,8 +72,6 @@ def __init__(self, timing_on: bool): self._from_worker_ch_serialized = self._from_worker_ch.serialize() self._to_worker_ch = Channel.make_process_local() - self._start = None - self._interm = None self._perf_timer: PerfTimer = PerfTimer(debug=False, timing_on=timing_on, prefix=f"a{rank}_") def run_model(self, model: bytes | str, batch: torch.Tensor): @@ -95,10 +95,13 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): self._perf_timer.measure_time("build_request") request_bytes = MessageHandler.serialize_request(request) self._perf_timer.measure_time("serialize_request") + tensor_bytes = [bytes(tensor.data) for tensor in tensors] + # tensor_bytes = [tensor.reshape(-1).view(numpy.uint8).data for tensor in tensors] + self._perf_timer.measure_time("serialize_tensor") with self._to_worker_fli.sendh(timeout=None, stream_channel=self._to_worker_ch) as to_sendh: to_sendh.send_bytes(request_bytes) - for t in tensors: - to_sendh.send_bytes(t.tobytes()) #TODO NOT FAST ENOUGH!!! + for tb in tensor_bytes: + to_sendh.send_bytes(tb) #TODO NOT FAST ENOUGH!!! 
# to_sendh.send_bytes(bytes(t.data)) self._perf_timer.measure_time("send") @@ -158,14 +161,24 @@ def name(self): client = ProtoClient(timing_on=True) client.set_model(resnet.name, resnet.model) + pt_model = torch.jit.load(io.BytesIO(initial_bytes=(resnet.model))).to("cuda:0") + TOTAL_ITERATIONS = 100 - for log2_bsize in range(7): + for log2_bsize in range(8): b_size: int = 2**log2_bsize logger.info(f"Batch size: {b_size}") for iteration_number in range(TOTAL_ITERATIONS + int(b_size==1)): logger.info(f"Iteration: {iteration_number}") - client.run_model(resnet.name, resnet.get_batch(b_size)) + batch = resnet.get_batch(b_size) + remote_result = client.run_model(resnet.name, batch) logger.info(client._perf_timer.get_last("total_time")) + if CHECK_RESULTS_AND_MAKE_ALL_SLOWER: + local_res = pt_model(batch.to("cuda:0")) + err_norm = torch.linalg.vector_norm(torch.flatten(remote_result).to("cuda:0")-torch.flatten(local_res), ord=1).cpu() + res_norm = torch.linalg.vector_norm(remote_result, ord=1).item() + local_res_norm = torch.linalg.vector_norm(local_res, ord=1).item() + logger.info(f"Avg norm of error {err_norm.item()/b_size} compared to result norm of {res_norm/b_size}:{local_res_norm/b_size}") + torch.cuda.synchronize() client.print_timings(to_file=True) \ No newline at end of file diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index 8c870d1b95..89f5eedd0d 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -100,7 +100,7 @@ cooldown=10, comm_channel_type=DragonCommChannel, device = args.device, - batch_size=4, - batch_timeout=0.0005, # 1e-3 is the best with ResNet50 + batch_size=1, + batch_timeout=0.001, # 1e-3 is the best with ResNet50 for bs>32 ) worker_manager.execute() diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py index 445538f20e..344a57bc34 100644 --- a/smartsim/_core/launcher/dragon/dragonBackend.py +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -501,10 +501,7 @@ def _start_steps(self) -> None: logger.debug(f"Step id {step_id} allocated on {hosts}") - global_policy = dragon_policy.Policy( - placement=dragon_policy.Policy.Placement.HOST_NAME, - host_name=hosts[0], - ) + global_policy = self.create_run_policy(request, hosts[0]) options = dragon_process_desc.ProcessOptions(make_inf_channels=True) grp = dragon_process_group.ProcessGroup( restart=False, pmi_enabled=request.pmi_enabled, policy=global_policy diff --git a/smartsim/_core/mli/comm/channel/dragonchannel.py b/smartsim/_core/mli/comm/channel/dragonchannel.py index 1370c57452..e09f2f628c 100644 --- a/smartsim/_core/mli/comm/channel/dragonchannel.py +++ b/smartsim/_core/mli/comm/channel/dragonchannel.py @@ -51,6 +51,6 @@ def send(self, value: bytes) -> None: def recv(self) -> t.List[bytes]: """Receieve a message through the underlying communication channel :returns: the received message""" - with self._channel.recvh(timeout=0.01) as recvh: - message_bytes: bytes = recvh.recv_bytes(timeout=1) + with self._channel.recvh(timeout=None) as recvh: + message_bytes: bytes = recvh.recv_bytes(timeout=None) return [message_bytes] diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index 3d1ed3a1f6..9f5d628d5f 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -62,10 +62,10 @@ def recv(self) -> 
t.List[bytes]: :returns: the received message""" messages = [] eot = False - with self._fli.recvh(timeout=0.01) as recvh: + with self._fli.recvh(timeout=None) as recvh: while not eot: try: - message, _ = recvh.recv_bytes(timeout=1) + message, _ = recvh.recv_bytes(timeout=None) messages.append(message) except fli.FLIEOT: eot = True diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index d8ac4e2b7c..018e094e0b 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -27,6 +27,7 @@ # pylint: disable=import-error # pylint: disable-next=unused-import import dragon +from dragon.managed_memory import MemoryAlloc, MemoryPool from dragon.mpbridge.queues import DragonQueue # pylint: enable=import-error @@ -38,7 +39,6 @@ import time import typing as t import uuid -from concurrent.futures import Future, ThreadPoolExecutor from queue import Empty, Full, Queue from threading import RLock from types import TracebackType @@ -46,7 +46,7 @@ from packaging.version import Version from .....error import SmartSimError -from .....log import ContextThread, get_logger +from .....log import get_logger from ....utils.timings import PerfTimer from ...comm.channel.channel import CommChannelBase from ...comm.channel.dragonchannel import DragonCommChannel @@ -284,6 +284,7 @@ def __init__( self, batch_timeout: float, batch_size: int, + mem_pool: MemoryPool, incoming_channel: t.Optional[CommChannelBase], comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel, feature_store: t.Optional[FeatureStore] = None, @@ -300,8 +301,9 @@ def __init__( self._outgoing_queue: DragonQueue = mp.Queue(maxsize=0) self._feature_store = feature_store self._comm_channel_type = comm_channel_type - self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=False) + self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=True) self._worker = TorchWorker() + self._mem_pool = mem_pool def _validate_request(self, request: InferenceRequest) -> bool: """Ensure the request can be processed. 
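The FLI receive handle above drains one transmission into a list of byte blobs; elsewhere in this patch the dispatcher takes the first blob as the serialized request and the remainder as raw tensor payloads. A hypothetical helper (split_message is not part of the patch) stating that convention explicitly:

import typing as t

def split_message(bytes_list: t.List[bytes]) -> t.Tuple[bytes, t.List[bytes]]:
    """Separate the serialized request from the raw tensor blobs that follow
    it in a single FLI transmission."""
    if not bytes_list:
        raise ValueError("No request data found")
    # bytes_list[0] feeds MessageHandler.deserialize_request; the tail becomes
    # request.raw_inputs when the request carries input metadata.
    return bytes_list[0], bytes_list[1:]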
@@ -374,6 +376,9 @@ def run(self) -> None: self._perf_timer.end_timings() + if self._perf_timer.max_length == 801: + self._perf_timer.print_timings(True) + @property def task_queue(self) -> DragonQueue: return self._outgoing_queue @@ -426,7 +431,6 @@ def _update_model_version(self, model: Model) -> None: def flush_requests(self) -> None: for queue in self._queues: if queue.ready and queue.acquire(blocking=False): - self._perf_timer.start_timings() self._perf_timer.measure_time("find_queue") try: batch = InferenceBatch( @@ -440,7 +444,7 @@ def flush_requests(self) -> None: ) self._perf_timer.measure_time("fetch_input") transformed_inputs = self._worker.transform_input( - batch=batch, fetch_results=fetch_results + batch=batch, fetch_results=fetch_results, mem_pool=self._mem_pool ) self._perf_timer.measure_time("transform_input") batch.inputs = transformed_inputs @@ -450,4 +454,3 @@ def flush_requests(self) -> None: self._outgoing_queue.put(batch) self._perf_timer.measure_time("put") - self._perf_timer.end_timings() diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 65111fe482..4d351f9bff 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -31,6 +31,7 @@ import dragon.infrastructure.process_desc as dragon_process_desc import dragon.native.process as dragon_process import dragon.native.process_group as dragon_process_group +from dragon.managed_memory import MemoryAlloc, MemoryPool # pylint: enable=import-error @@ -41,6 +42,7 @@ import os import socket import sys +import time import typing as t from .....log import get_logger @@ -170,19 +172,21 @@ def __init__( """Device on which workers need to run""" self._cached_models: dict[str, t.Any] = {} """Dictionary of previously loaded models""" + self._mem_pool = MemoryPool(size=1024**3, fname="wm_mempool", uid=123458) self._request_dispatcher: RequestDispatcher = RequestDispatcher( batch_timeout=batch_timeout, batch_size=batch_size, incoming_channel=self._task_queue, comm_channel_type=comm_channel_type, feature_store=self._feature_store, + mem_pool=self._mem_pool, ) """Dispatcher used to batch requests""" self._device_manager: DeviceManager = DeviceManager( - [WorkerDevice(f"gpu:{idx}") for idx in range(4)] + [WorkerDevice(f"gpu:{idx}") for idx in [3]] ) self._device_idx: int = 0 - self._perf_timer = PerfTimer(prefix="w_", debug=False, timing_on=False) + self._perf_timer = PerfTimer(prefix="w_", debug=False, timing_on=True) try: mp.set_start_method("dragon") @@ -192,17 +196,19 @@ def __init__( self._dispatcher_process = self._create_local_dispatcher_process() def _create_local_dispatcher_process(self) -> dragon_process_group.ProcessGroup: - dispatcher_cpus = 16 + wm_cpus = 0 if sys.platform != "darwin": self_affinity: list[int] = list(os.sched_getaffinity(os.getpid())) - os.sched_setaffinity(os.getpid(), self_affinity[:-dispatcher_cpus]) + wm_cpus = len(self_affinity) // 2 + os.sched_setaffinity(os.getpid(), self_affinity[:wm_cpus]) else: self_affinity: list[int] = [] + disp_affinity = self_affinity[wm_cpus:] global_policy = dragon_policy.Policy( placement=dragon_policy.Policy.Placement.HOST_NAME, host_name=socket.gethostname(), affinity=dragon_policy.Policy.Affinity.SPECIFIC, - cpu_affinity=self_affinity[-dispatcher_cpus:], + cpu_affinity=disp_affinity, ) options = dragon_process_desc.ProcessOptions(make_inf_channels=True) grp = dragon_process_group.ProcessGroup( @@ -212,7 
+218,7 @@ def _create_local_dispatcher_process(self) -> dragon_process_group.ProcessGroup: placement=dragon_policy.Policy.Placement.HOST_NAME, host_name=socket.gethostname(), affinity=dragon_policy.Policy.Affinity.SPECIFIC, - cpu_affinity=self_affinity[-dispatcher_cpus:], + cpu_affinity=disp_affinity, ) tmp_proc = dragon_process.ProcessTemplate( target=self._request_dispatcher.run, @@ -235,13 +241,21 @@ def _on_iteration(self) -> None: """Executes calls to the machine learning worker implementation to complete the inference pipeline""" - batch: InferenceBatch = self._request_dispatcher.task_queue.get() + pre_batch_time = time.perf_counter() + try: + batch: InferenceBatch = self._request_dispatcher.task_queue.get( + timeout=0.001 + ) + except Exception: + return + + self._perf_timer.start_timings( + "flush_requests", time.perf_counter() - pre_batch_time + ) - self._perf_timer.start_timings() if batch is None or 0 == len(batch.requests): return - self._perf_timer.measure_time("flush_requests") device: WorkerDevice = next( self._device_manager.get_free_device( worker=self._worker, @@ -275,14 +289,15 @@ def _on_iteration(self) -> None: self._perf_timer.measure_time("execute") try: - transformed_outputs = self._worker.transform_output(batch, execute_result) + transformed_outputs = self._worker.transform_output( + batch, execute_result, self._perf_timer + ) except Exception as e: for request in batch.requests: exception_handler( e, request.callback, "Failed while transforming the output." ) return - self._perf_timer.measure_time("transform_output") for request, transformed_output in zip(batch.requests, transformed_outputs): reply = InferenceReply() @@ -325,8 +340,8 @@ def _on_iteration(self) -> None: self._perf_timer.end_timings() - # if self._perf_timer.max_length == 4 * 801: - # self._perf_timer.print_timings(True) + if self._perf_timer.max_length == 801: + self._perf_timer.print_timings(True) def _can_shutdown(self) -> bool: """Return true when the criteria to shut down the service are met.""" diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index 52a2698467..0e8273dd56 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -25,13 +25,14 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
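The worker manager iteration above switches from a blocking get to a short-timeout poll on the dispatcher queue and charges the wait to the "flush_requests" timing. A stdlib-only sketch of that poll-and-time pattern, with the queue contents and timeout value purely illustrative:

import queue
import time

def poll_once(task_queue: queue.Queue) -> None:
    pre_batch_time = time.perf_counter()
    try:
        # Returning on an empty queue keeps the service loop responsive
        # instead of blocking the manager indefinitely.
        batch = task_queue.get(timeout=0.001)
    except queue.Empty:
        return
    waited = time.perf_counter() - pre_batch_time
    print(f"waited {waited:.6f}s for a batch of {len(batch)} requests")

q: queue.Queue = queue.Queue()
poll_once(q)                  # empty: returns after roughly 1 ms
q.put(["req-0", "req-1"])
poll_once(q)                  # prints the wait time and batch size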
import io -from concurrent.futures import Future, ThreadPoolExecutor import numpy as np import torch +from dragon.managed_memory import MemoryAlloc, MemoryPool from .....error import SmartSimError from .....log import get_logger +from ....utils.timings import PerfTimer from ...mli_schemas.tensor import tensor_capnp from .worker import ( ExecuteResult, @@ -44,8 +45,8 @@ TransformOutputResult, ) -torch.set_num_threads(4) -torch.set_num_interop_threads(2) +torch.set_num_threads(1) +torch.set_num_interop_threads(4) logger = get_logger(__name__) @@ -69,14 +70,17 @@ def load_model( device = device.replace(old, new) buffer = io.BytesIO(initial_bytes=model_bytes) - model = torch.jit.load(buffer, map_location=device) # type: ignore - model.eval() + with torch.no_grad(): + model = torch.jit.load(buffer, map_location=device) # type: ignore + model.eval() result = LoadModelResult(model) return result @staticmethod def transform_input( - batch: InferenceBatch, fetch_results: list[FetchInputResult] + batch: InferenceBatch, + fetch_results: list[FetchInputResult], + mem_pool: MemoryPool, ) -> TransformInputResult: results: list[torch.Tensor] = [] total_samples = 0 @@ -100,7 +104,7 @@ def transform_input( slices.append(slice(total_samples, total_samples + num_samples)) total_samples = total_samples + num_samples - if res_idx == len(fetch_results)-1: + if res_idx == len(fetch_results) - 1: # For each tensor in the last input, get remaining dimensions # Assumptions: all inputs have the same number of tensors and # last N-1 dimensions match across inputs for corresponding tensors @@ -112,65 +116,32 @@ def transform_input( all_dtypes.append(str(tensor_desc.dataType)) for result_tensor_idx, (dims, dtype) in enumerate(zip(all_dims, all_dtypes)): - # List comprehension concatenation can be faster sometimes - all_bytes = b"".join( - [ - fetch_result.inputs[result_tensor_idx] - for fetch_result in fetch_results - ] - ) - - results.append( - torch.from_numpy( - np.frombuffer( - all_bytes, - dtype=dtype, - ).reshape(dims) + itemsize = np.empty((1), dtype=dtype).itemsize + alloc_size = int(np.prod(dims) * itemsize) + try: + mem_alloc = mem_pool.alloc(alloc_size) + mem_view = mem_alloc.get_memview() + mem_view[:alloc_size] = b"".join( + [ + fetch_result.inputs[result_tensor_idx] + for fetch_result in fetch_results + ] ) - ) - - return TransformInputResult(results, slices) - - # @staticmethod - # def _transform_input( - # batch: InferenceBatch, fetch_results: list[FetchInputResult] - # ) -> TransformInputResult: - # results: list[list[torch.Tensor]] = [] - # start = 0 - # slices: list[slice] = [] - - # for fetch_result in fetch_results: - # partial_result = [] - # if fetch_result.meta is None: - # raise ValueError("Cannot reconstruct tensor without meta information") - # for idx, (item, item_meta) in enumerate( - # zip(fetch_result.inputs, fetch_result.meta) - # ): - # tensor_desc: tensor_capnp.TensorDescriptor = item_meta - # partial_result.append( - # torch.tensor( - # np.frombuffer(item, dtype=str(tensor_desc.dataType)) - # ).reshape(tuple(dim for dim in tensor_desc.dimensions)) - # ) - # if idx == 0: - # num_samples = tensor_desc.dimensions[0] - # slices.append(slice(start, start + num_samples)) - # start = start + num_samples - # results.append(partial_result) - - # result: list[torch.Tensor] = [] - # if len(batch.requests) > 1: - # for t_idx in range(len(results[0])): - # result.append( - # torch.concatenate( - # [partial_result[t_idx] for partial_result in results] - # ) - # ) - # else: - # result = results[0] - 
- # return TransformInputResult(result, slices) - # return data # note: this fails copy test! + except Exception as e: + print(e) + raise e + # results.append( + # torch.from_numpy( + # np.frombuffer( + # all_bytes, + # dtype=dtype, + # ).reshape(dims) + # ) + # ) + + results.append(mem_alloc.serialize()) + + return TransformInputResult(results, slices, all_dims) # pylint: disable-next=unused-argument @staticmethod @@ -185,34 +156,60 @@ def execute( device_to_torch = {"cpu": "cpu", "gpu": "cuda"} for old, new in device_to_torch.items(): device = device.replace(old, new) + + tensors = [] + mem_allocs = [] + for transformed, dims in zip( + transform_result.transformed, transform_result.dims + ): + mem_alloc = MemoryAlloc.attach(transformed) + mem_allocs.append(mem_alloc) + tensors.append( + torch.from_numpy( + np.frombuffer( + mem_alloc.get_memview()[0 : np.prod(dims) * 4], dtype=np.float32 + ).reshape(dims) + ) + ) + model: torch.nn.Module = load_result.model - model.eval() - # print([tensor.shape for tensor in transform_result.transformed]) - # torch.cuda.empty_cache() - results = [ - model(tensor.to(device)).detach() for tensor in transform_result.transformed - ] + with torch.no_grad(): + model.eval() + results = [ + model(tensor.to(device, non_blocking=True)).detach() + for tensor in tensors + ] + + torch.cuda.synchronize(3) transform_result.transformed = [] execute_result = ExecuteResult(results, transform_result.slices) + for mem_alloc in mem_allocs: + mem_alloc.free() return execute_result @staticmethod def transform_output( batch: InferenceBatch, execute_result: ExecuteResult, + perf_timer: PerfTimer, ) -> list[TransformOutputResult]: transformed_list: list[TransformOutputResult] = [] + cpu_predictions = [ + prediction.cpu() for prediction in execute_result.predictions + ] + perf_timer.measure_time("to_cpu") for result_slice in execute_result.slices: - transformed = [ - item[result_slice].cpu().numpy().tobytes() - for item in execute_result.predictions - ] - # todo: need the shape from latest schemas added here. - transformed_list.append( - TransformOutputResult(transformed, None, "c", "float32") - ) # fixme + transformed = [] + for cpu_item in cpu_predictions: + transformed.append(cpu_item[result_slice].numpy().tobytes()) + perf_timer.measure_time("serialize_tensor") + + # todo: need the shape from latest schemas added here. 
+ transformed_list.append( + TransformOutputResult(transformed, None, "c", "float32") + ) # fixme execute_result.predictions = [] diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index a7dc6811da..068e47b2fd 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -28,8 +28,11 @@ from abc import ABC, abstractmethod from dataclasses import dataclass +from dragon.managed_memory import MemoryAlloc, MemoryPool + from .....error import SmartSimError from .....log import get_logger +from ....utils.timings import PerfTimer from ...comm.channel.channel import CommChannelBase from ...infrastructure.storage.featurestore import FeatureStore from ...mli_schemas.model.model_capnp import Model @@ -95,10 +98,13 @@ def __init__(self, model: t.Any) -> None: class TransformInputResult: """A wrapper around a transformed batchinput""" - def __init__(self, result: t.Any, slices: list[slice]) -> None: + def __init__( + self, result: t.Any, slices: list[slice], dims: list[list[int]] + ) -> None: """Initialize the object""" self.transformed = result self.slices = slices + self.dims = dims class ExecuteResult: @@ -275,7 +281,9 @@ def load_model( @staticmethod @abstractmethod def transform_input( - batch: InferenceBatch, fetch_results: list[FetchInputResult] + batch: InferenceBatch, + fetch_results: list[FetchInputResult], + mem_pool: MemoryPool, ) -> TransformInputResult: """Given a collection of data, perform a transformation on the data :param request: The request that triggered the pipeline @@ -299,7 +307,7 @@ def execute( @staticmethod @abstractmethod def transform_output( - batch: InferenceBatch, execute_result: ExecuteResult + batch: InferenceBatch, execute_result: ExecuteResult, perf_timer: PerfTimer ) -> t.List[TransformOutputResult]: """Given inference results, perform transformations required to transmit results to the requestor. From 8958aa14c28cbea40cfc09f717ee5eef382316c6 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 24 Jul 2024 17:07:37 -0500 Subject: [PATCH 50/84] Make dispatcher a service and refactor --- ex/high_throughput_inference/mli_driver.py | 3 +- .../standalone_workermanager.py | 166 +++++++++++++++--- .../control/requestdispatcher.py | 97 +++++----- .../infrastructure/control/workermanager.py | 93 ++-------- .../mli/infrastructure/worker/torch_worker.py | 2 + .../_core/mli/infrastructure/worker/worker.py | 7 +- 6 files changed, 220 insertions(+), 148 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index 1d1642567c..effdc567d9 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -38,7 +38,8 @@ aff = [] for i in range(32): aff.append(i) - # aff.append(i+64) + aff.append(i+64) + worker_manager_rs.set_cpu_affinity(aff) worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index 89f5eedd0d..d26493fa1e 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -24,34 +24,120 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
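The transform_input changes in the torch worker above size one pooled allocation per batched tensor from np.prod(dims) * itemsize and copy the concatenated per-request payloads into it. A pool-free sketch of the same bookkeeping that swaps the Dragon MemoryPool for a plain bytearray (payload shapes are illustrative; no Dragon APIs are used):

import numpy as np

dims = [8, 3, 224, 224]   # batched dimensions; the first axis is the sample count
dtype = "float32"

itemsize = np.empty((1,), dtype=dtype).itemsize
alloc_size = int(np.prod(dims) * itemsize)

# Stand-in for mem_pool.alloc(alloc_size).get_memview()
buffer = bytearray(alloc_size)

# Two requests contributing 5 and 3 samples respectively.
fetched = [
    np.zeros((5, 3, 224, 224), dtype=dtype),
    np.ones((3, 3, 224, 224), dtype=dtype),
]
buffer[:alloc_size] = b"".join(arr.tobytes() for arr in fetched)

# The executing worker later rebuilds the batched tensor from the shared buffer.
batched = np.frombuffer(buffer, dtype=dtype).reshape(dims)
assert batched.shape[0] == 8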
-# isort: off + import dragon + +# pylint disable=import-error +import dragon.globalservices.pool as dragon_gs_pool +import dragon.infrastructure.policy as dragon_policy +import dragon.infrastructure.process_desc as dragon_process_desc +import dragon.native.process as dragon_process from dragon import fli from dragon.channels import Channel from dragon.data.ddict.ddict import DDict -from dragon.utils import b64decode, b64encode from dragon.globalservices.api_setup import connect_to_infrastructure +from dragon.managed_memory import MemoryPool +from dragon.utils import b64decode, b64encode +# pylint enable=import-error + +# isort: off # isort: on + import argparse import base64 -import cloudpickle -import pickle +import multiprocessing as mp import os +import pickle +import socket +import sys +import time +import typing as t + +import cloudpickle +from smartsim._core.entrypoints.service import Service +from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel -from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import DragonFeatureStore from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel -from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker +from smartsim._core.mli.infrastructure.control.requestdispatcher import ( + RequestDispatcher, +) from smartsim._core.mli.infrastructure.control.workermanager import WorkerManager from smartsim._core.mli.infrastructure.environmentloader import EnvironmentConfigLoader +from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( + DragonFeatureStore, +) +from smartsim._core.mli.infrastructure.worker.worker import MachineLearningWorkerBase -import os -import socket -pid = 0 +mp.set_start_method("dragon") + +pid = os.getpid() affinity = os.sched_getaffinity(pid) print("Entry point:", socket.gethostname(), affinity) print("CPUS:", os.cpu_count()) + +def create_request_dispatcher( + batch_size: int, + batch_timeout: float, + comm_channel_type: t.Type[CommChannelBase], + worker_type: t.Type[MachineLearningWorkerBase], + config_loader: EnvironmentConfigLoader, +) -> RequestDispatcher: + mem_pool = MemoryPool.attach(dragon_gs_pool.create(2 * 1024**3).sdesc) + + return RequestDispatcher( + batch_timeout=batch_timeout, + batch_size=batch_size, + config_loader=config_loader, + comm_channel_type=comm_channel_type, + mem_pool=mem_pool, + worker_type=worker_type, + ) + + +def create_worker_manager( + worker_type: t.Type[MachineLearningWorkerBase], + config_loader: EnvironmentConfigLoader, + device: str, + dispatcher_queue: mp.Queue, +) -> WorkerManager: + return WorkerManager( + config_loader=config_loader, + worker_type=worker_type, + as_service=True, + cooldown=10, + comm_channel_type=DragonCommChannel, + device=device, + task_queue=dispatcher_queue, + ) + + +def service_as_dragon_proc( + service: Service, cpu_affinity: list[int], gpu_affinity: list[int] +) -> dragon_process.Process: + + options = dragon_process_desc.ProcessOptions(make_inf_channels=True) + local_policy = dragon_policy.Policy( + placement=dragon_policy.Policy.Placement.HOST_NAME, + host_name=socket.gethostname(), + affinity=dragon_policy.Policy.Affinity.SPECIFIC, + cpu_affinity=cpu_affinity, + gpu_affinity=gpu_affinity, + ) + proc = dragon_process.Process( + target=service.execute, + args=[], + cwd=os.getcwd(), + policy=local_policy, + options=options, + stderr=dragon_process.Popen.PIPE, + stdout=dragon_process.Popen.STDOUT, + ) + + return proc 
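Further down, the __main__ section splits the node's cores between the worker manager and the request dispatcher before handing each service to service_as_dragon_proc. A Linux-only sketch of that partitioning using only the standard library (the 3/4 : 1/4 split mirrors the script; the core list is whatever the node reports):

import os
import sys

wm_affinity: list[int] = []
disp_affinity: list[int] = []

if sys.platform != "darwin":
    # os.sched_getaffinity is available on Linux but not on macOS.
    curr_affinity = sorted(os.sched_getaffinity(os.getpid()))
    wm_cpus = 3 * len(curr_affinity) // 4
    wm_affinity = curr_affinity[:wm_cpus]     # bulk of the cores to the worker manager
    disp_affinity = curr_affinity[wm_cpus:]   # remainder to the dispatcher

print("worker manager cores:", wm_affinity)
print("dispatcher cores:", disp_affinity)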
+ + if __name__ == "__main__": parser = argparse.ArgumentParser("Worker Manager") parser.add_argument( @@ -70,8 +156,20 @@ parser.add_argument( "--num_workers", type=int, default=1, help="Number of workers to run" ) - + parser.add_argument( + "--batch_size", + type=int, + default=1, + help="How many requests the workers will try to aggregate before processing them", + ) + parser.add_argument( + "--batch_timeout", + type=float, + default=0.001, + help="How much time (in seconds) should be waited before processing an incomplete aggregated request", + ) args = parser.parse_args() + connect_to_infrastructure() ddict_str = os.environ["SS_DRG_DDICT"] ddict = DDict.attach(ddict_str) @@ -81,9 +179,9 @@ to_worker_fli_serialized = to_worker_fli.serialize() ddict["to_worker_fli"] = to_worker_fli_serialized - torch_worker = cloudpickle.loads( - base64.b64decode(args.worker_class.encode('ascii')) - )() + arg_worker_type = cloudpickle.loads( + base64.b64decode(args.worker_class.encode("ascii")) + ) dfs = DragonFeatureStore(ddict) comm_channel = DragonFLIChannel(to_worker_fli_serialized) @@ -91,16 +189,38 @@ os.environ["SSFeatureStore"] = base64.b64encode(pickle.dumps(dfs)).decode("utf-8") os.environ["SSQueue"] = base64.b64encode(to_worker_fli_serialized).decode("utf-8") - config_loader = EnvironmentConfigLoader() + ss_config_loader = EnvironmentConfigLoader() - worker_manager = WorkerManager( - config_loader=config_loader, - worker=torch_worker, - as_service=True, - cooldown=10, + dispatcher = create_request_dispatcher( + batch_size=args.batch_size, + batch_timeout=args.batch_timeout, comm_channel_type=DragonCommChannel, - device = args.device, - batch_size=1, - batch_timeout=0.001, # 1e-3 is the best with ResNet50 for bs>32 + worker_type=arg_worker_type, + config_loader=ss_config_loader, ) - worker_manager.execute() + + worker_manager = create_worker_manager( + worker_type=arg_worker_type, + config_loader=ss_config_loader, + device=args.device, + dispatcher_queue=dispatcher.task_queue, + ) + + wm_affinity: list[int] = [] + disp_affinity: list[int] = [] + if sys.platform != "darwin": + curr_affinity: list[int] = list(os.sched_getaffinity(os.getpid())) + wm_cpus = 3 * len(curr_affinity) // 4 + disp_affinity = curr_affinity[wm_cpus:] + wm_affinity = curr_affinity[:wm_cpus] + + dispatcher_proc = service_as_dragon_proc(dispatcher, cpu_affinity=disp_affinity, gpu_affinity=[]) + worker_manager_proc = service_as_dragon_proc( + worker_manager, cpu_affinity=wm_affinity, gpu_affinity=[] + ) + + dispatcher_proc.start() + worker_manager_proc.start() + + while all(proc.is_alive for proc in [dispatcher_proc, worker_manager_proc]): + time.sleep(1) diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index 018e094e0b..b5925f70c6 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -27,7 +27,7 @@ # pylint: disable=import-error # pylint: disable-next=unused-import import dragon -from dragon.managed_memory import MemoryAlloc, MemoryPool +from dragon.managed_memory import MemoryPool from dragon.mpbridge.queues import DragonQueue # pylint: enable=import-error @@ -45,14 +45,19 @@ from packaging.version import Version +from smartsim._core.entrypoints.service import Service from .....error import SmartSimError from .....log import get_logger from ....utils.timings import PerfTimer from ...comm.channel.channel import CommChannelBase from 
...comm.channel.dragonchannel import DragonCommChannel +from ...infrastructure.environmentloader import EnvironmentConfigLoader from ...infrastructure.storage.featurestore import FeatureStore -from ...infrastructure.worker.torch_worker import TorchWorker -from ...infrastructure.worker.worker import InferenceBatch, InferenceRequest +from ...infrastructure.worker.worker import ( + InferenceBatch, + InferenceRequest, + MachineLearningWorkerBase, +) from ...message_handler import MessageHandler from ...mli_schemas.model.model_capnp import Model from ...mli_schemas.response.response_capnp import ResponseBuilder @@ -279,17 +284,17 @@ def empty(self) -> bool: return self.qsize() == 0 -class RequestDispatcher: +class RequestDispatcher(Service): def __init__( self, batch_timeout: float, batch_size: int, mem_pool: MemoryPool, - incoming_channel: t.Optional[CommChannelBase], + config_loader: EnvironmentConfigLoader, + worker_type: t.Type[MachineLearningWorkerBase], comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel, - feature_store: t.Optional[FeatureStore] = None, ) -> None: - mp.set_start_method("dragon") + super().__init__(as_service=True, cooldown=1) self._queues: list[BatchQueue] = [] self._active_queues: dict[str, BatchQueue] = {} self._model_last_version: dict[str, Version] = {} @@ -297,12 +302,16 @@ def __init__( self._batch_timeout = batch_timeout self._batch_size = batch_size self._queue_swap_lock: t.Optional[RLock] = None - self._incoming_channel = incoming_channel + self._incoming_channel = config_loader.get_queue() + """the queue the manager monitors for new tasks""" self._outgoing_queue: DragonQueue = mp.Queue(maxsize=0) - self._feature_store = feature_store + self._feature_store: t.Optional[FeatureStore] = ( + config_loader.get_feature_store() + ) + """a feature store to retrieve models from""" self._comm_channel_type = comm_channel_type self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=True) - self._worker = TorchWorker() + self._worker = worker_type() self._mem_pool = mem_pool def _validate_request(self, request: InferenceRequest) -> bool: @@ -336,48 +345,49 @@ def _validate_request(self, request: InferenceRequest) -> bool: return True - def run(self) -> None: + def _on_start(self) -> None: self._queue_swap_lock = RLock() if self._incoming_channel is None: raise SmartSimError("No incoming channel for dispatcher") - while True: - try: - bytes_list: t.List[bytes] = self._incoming_channel.recv() - except Exception: - pass - else: - if not bytes_list: - exception_handler( - ValueError("No request data found"), - None, - "No request data found.", - ) - request_bytes = bytes_list[0] - tensor_bytes_list = bytes_list[1:] - self._perf_timer.start_timings() + def _on_iteration(self) -> None: + try: + bytes_list: t.List[bytes] = self._incoming_channel.recv() + except Exception: + pass + else: + if not bytes_list: + exception_handler( + ValueError("No request data found"), + None, + "No request data found.", + ) - request = deserialize_message(request_bytes, self._comm_channel_type) - if request.input_meta and tensor_bytes_list: - request.raw_inputs = tensor_bytes_list + request_bytes = bytes_list[0] + tensor_bytes_list = bytes_list[1:] + self._perf_timer.start_timings() - self._perf_timer.measure_time("deserialize_message") - if not self._validate_request(request): - continue + request = deserialize_message(request_bytes, self._comm_channel_type) + if request.input_meta and tensor_bytes_list: + request.raw_inputs = tensor_bytes_list - 
self._perf_timer.measure_time("validate_request") - self.dispatch(request) + self._perf_timer.measure_time("deserialize_message") + if not self._validate_request(request): + return - self._perf_timer.measure_time("dispatch") - finally: - self.flush_requests() - # TODO: implement this - # self.remove_queues() + self._perf_timer.measure_time("validate_request") + self.dispatch(request) - self._perf_timer.end_timings() + self._perf_timer.measure_time("dispatch") + finally: + self.flush_requests() + # TODO: implement this + # self.remove_queues() + + self._perf_timer.end_timings() - if self._perf_timer.max_length == 801: - self._perf_timer.print_timings(True) + if self._perf_timer.max_length == 801: + self._perf_timer.print_timings(True) @property def task_queue(self) -> DragonQueue: @@ -454,3 +464,6 @@ def flush_requests(self) -> None: self._outgoing_queue.put(batch) self._perf_timer.measure_time("put") + + def _can_shutdown(self) -> bool: + return False \ No newline at end of file diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 4d351f9bff..9626506a41 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -27,11 +27,6 @@ # pylint: disable=import-error # pylint: disable-next=unused-import import dragon -import dragon.infrastructure.policy as dragon_policy -import dragon.infrastructure.process_desc as dragon_process_desc -import dragon.native.process as dragon_process -import dragon.native.process_group as dragon_process_group -from dragon.managed_memory import MemoryAlloc, MemoryPool # pylint: enable=import-error @@ -39,9 +34,6 @@ # isort: on import multiprocessing as mp -import os -import socket -import sys import time import typing as t @@ -61,7 +53,6 @@ from ...message_handler import MessageHandler from ...mli_schemas.response.response_capnp import ResponseBuilder from .devicemanager import DeviceManager, WorkerDevice -from .requestdispatcher import RequestDispatcher if t.TYPE_CHECKING: from smartsim._core.mli.mli_schemas.model.model_capnp import Model @@ -109,7 +100,6 @@ def build_reply(reply: InferenceReply) -> ResponseBuilder: custom_attributes=None, ) - def exception_handler( exc: Exception, reply_channel: t.Optional[CommChannelBase], failure_message: str ) -> None: @@ -139,13 +129,12 @@ class WorkerManager(Service): def __init__( self, config_loader: EnvironmentConfigLoader, - worker: MachineLearningWorkerBase, + worker_type: t.Type[MachineLearningWorkerBase], + task_queue: "mp.Queue[InferenceBatch]", as_service: bool = False, cooldown: int = 0, comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel, device: t.Literal["cpu", "gpu"] = "cpu", - batch_timeout: float = 0.0, - batch_size: int = 1, ) -> None: """Initialize the WorkerManager :param config_loader: Environment config loader that loads the task queue and @@ -158,84 +147,26 @@ def __init__( """ super().__init__(as_service, cooldown) - self._task_queue: t.Optional[CommChannelBase] = config_loader.get_queue() + self._task_queue = task_queue """the queue the manager monitors for new tasks""" self._feature_store: t.Optional[FeatureStore] = ( config_loader.get_feature_store() ) """a feature store to retrieve models from""" - self._worker = worker + self._worker = worker_type() """The ML Worker implementation""" self._comm_channel_type = comm_channel_type """The type of communication channel to construct for callbacks""" self._device = device """Device on 
which workers need to run""" - self._cached_models: dict[str, t.Any] = {} - """Dictionary of previously loaded models""" - self._mem_pool = MemoryPool(size=1024**3, fname="wm_mempool", uid=123458) - self._request_dispatcher: RequestDispatcher = RequestDispatcher( - batch_timeout=batch_timeout, - batch_size=batch_size, - incoming_channel=self._task_queue, - comm_channel_type=comm_channel_type, - feature_store=self._feature_store, - mem_pool=self._mem_pool, - ) - """Dispatcher used to batch requests""" - self._device_manager: DeviceManager = DeviceManager( - [WorkerDevice(f"gpu:{idx}") for idx in [3]] - ) - self._device_idx: int = 0 - self._perf_timer = PerfTimer(prefix="w_", debug=False, timing_on=True) - try: - mp.set_start_method("dragon") - except RuntimeError: - pass - - self._dispatcher_process = self._create_local_dispatcher_process() - - def _create_local_dispatcher_process(self) -> dragon_process_group.ProcessGroup: - wm_cpus = 0 - if sys.platform != "darwin": - self_affinity: list[int] = list(os.sched_getaffinity(os.getpid())) - wm_cpus = len(self_affinity) // 2 - os.sched_setaffinity(os.getpid(), self_affinity[:wm_cpus]) - else: - self_affinity: list[int] = [] - disp_affinity = self_affinity[wm_cpus:] - global_policy = dragon_policy.Policy( - placement=dragon_policy.Policy.Placement.HOST_NAME, - host_name=socket.gethostname(), - affinity=dragon_policy.Policy.Affinity.SPECIFIC, - cpu_affinity=disp_affinity, - ) - options = dragon_process_desc.ProcessOptions(make_inf_channels=True) - grp = dragon_process_group.ProcessGroup( - restart=False, pmi_enabled=True, policy=global_policy - ) - local_policy = dragon_policy.Policy( - placement=dragon_policy.Policy.Placement.HOST_NAME, - host_name=socket.gethostname(), - affinity=dragon_policy.Policy.Affinity.SPECIFIC, - cpu_affinity=disp_affinity, - ) - tmp_proc = dragon_process.ProcessTemplate( - target=self._request_dispatcher.run, - args=[], - cwd=os.getcwd(), - policy=local_policy, - options=options, - ) - grp.add_process(nproc=1, template=tmp_proc) - grp.init() - return grp + self._device_manager: t.Optional[DeviceManager] = None + self._perf_timer = PerfTimer(prefix="w_", debug=False, timing_on=True) def _on_start(self) -> None: - self._dispatcher_process.start() - - def _on_shutdown(self) -> None: - self._dispatcher_process.join() + self._device_manager = DeviceManager( + [WorkerDevice(f"gpu:{idx}") for idx in [3]] + ) def _on_iteration(self) -> None: """Executes calls to the machine learning worker implementation to complete @@ -243,9 +174,7 @@ def _on_iteration(self) -> None: pre_batch_time = time.perf_counter() try: - batch: InferenceBatch = self._request_dispatcher.task_queue.get( - timeout=0.001 - ) + batch: InferenceBatch = self._task_queue.get(timeout=0.0001) except Exception: return @@ -256,6 +185,8 @@ def _on_iteration(self) -> None: if batch is None or 0 == len(batch.requests): return + if self._device_manager is None: + raise ValueError("No Device Manager available: did you call _on_start()") device: WorkerDevice = next( self._device_manager.get_free_device( worker=self._worker, diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index 0e8273dd56..37b8b7e843 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -28,6 +28,8 @@ import numpy as np import torch + +# pylint: disable=import-error from dragon.managed_memory import MemoryAlloc, MemoryPool from .....error import 
SmartSimError diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 068e47b2fd..bc96633204 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -24,11 +24,16 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# pylint: disable=import-error +from dragon.managed_memory import MemoryPool + +# isort: off +# isort: on + import typing as t from abc import ABC, abstractmethod from dataclasses import dataclass -from dragon.managed_memory import MemoryAlloc, MemoryPool from .....error import SmartSimError from .....log import get_logger From 79eb936ba79168604499b60fd8781d1cf6fafede Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 24 Jul 2024 18:03:16 -0500 Subject: [PATCH 51/84] Fixes for batched requests --- ex/high_throughput_inference/mli_driver.py | 15 ++++++++++----- ex/high_throughput_inference/mock_app.py | 3 ++- smartsim/_core/mli/comm/channel/dragonfli.py | 2 +- .../infrastructure/control/requestdispatcher.py | 3 ++- .../mli/infrastructure/control/workermanager.py | 2 +- smartsim/_core/utils/timings.py | 6 +++++- 6 files changed, 21 insertions(+), 10 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index effdc567d9..c965a6d9a6 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -9,11 +9,12 @@ import time import typing as t -device = "gpu" +DEVICE = "gpu" +NUM_RANKS = 4 filedir = os.path.dirname(__file__) worker_manager_script_name = os.path.join(filedir, "standalone_workermanager.py") app_script_name = os.path.join(filedir, "mock_app.py") -model_name = os.path.join(filedir, f"resnet50.{device.upper()}.pt") +model_name = os.path.join(filedir, f"resnet50.{DEVICE.upper()}.pt") transport: t.Literal["hsta", "tcp"] = "hsta" @@ -30,9 +31,13 @@ [ worker_manager_script_name, "--device", - device, + DEVICE, "--worker_class", torch_worker_str, + "--batch_size", + str(NUM_RANKS), + "--batch_timeout", + str(0.001), ], ) aff = [] @@ -46,9 +51,9 @@ app_rs: DragonRunSettings = exp.create_run_settings( sys.executable, - exe_args=[app_script_name, "--device", device], + exe_args=[app_script_name, "--device", DEVICE, "--log_max_batchsize", str(7)], ) -app_rs.set_tasks_per_node(1) +app_rs.set_tasks_per_node(NUM_RANKS) app = exp.create_model("app", run_settings=app_rs) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 2a76fdbe9d..28b8a3d98a 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -154,6 +154,7 @@ def name(self): parser = argparse.ArgumentParser("Mock application") parser.add_argument("--device", default="cpu") + parser.add_argument("--log_max_batchsize", default=8, type=int) args = parser.parse_args() resnet = ResNetWrapper("resnet50", f"resnet50.{args.device.upper()}.pt") @@ -165,7 +166,7 @@ def name(self): TOTAL_ITERATIONS = 100 - for log2_bsize in range(8): + for log2_bsize in range(args.log_max_batchsize): b_size: int = 2**log2_bsize logger.info(f"Batch size: {b_size}") for iteration_number in range(TOTAL_ITERATIONS + int(b_size==1)): diff --git a/smartsim/_core/mli/comm/channel/dragonfli.py b/smartsim/_core/mli/comm/channel/dragonfli.py index 9f5d628d5f..555d9104d9 100644 --- a/smartsim/_core/mli/comm/channel/dragonfli.py +++ 
b/smartsim/_core/mli/comm/channel/dragonfli.py @@ -62,7 +62,7 @@ def recv(self) -> t.List[bytes]: :returns: the received message""" messages = [] eot = False - with self._fli.recvh(timeout=None) as recvh: + with self._fli.recvh(timeout=0.001) as recvh: while not eot: try: message, _ = recvh.recv_bytes(timeout=None) diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index b5925f70c6..9b3d8cabdb 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -253,6 +253,7 @@ def ready(self) -> bool: return False return self.full() or (self._waited_time >= self._batch_timeout) + def make_disposable(self) -> None: self._disposable = True @@ -310,7 +311,7 @@ def __init__( ) """a feature store to retrieve models from""" self._comm_channel_type = comm_channel_type - self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=True) + self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=False) self._worker = worker_type() self._mem_pool = mem_pool diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 9626506a41..0ee146ef1b 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -161,7 +161,7 @@ def __init__( """Device on which workers need to run""" self._device_manager: t.Optional[DeviceManager] = None - self._perf_timer = PerfTimer(prefix="w_", debug=False, timing_on=True) + self._perf_timer = PerfTimer(prefix="w_", debug=False, timing_on=False) def _on_start(self) -> None: self._device_manager = DeviceManager( diff --git a/smartsim/_core/utils/timings.py b/smartsim/_core/utils/timings.py index c8f6c71003..286bd4f4a8 100644 --- a/smartsim/_core/utils/timings.py +++ b/smartsim/_core/utils/timings.py @@ -121,7 +121,11 @@ def max_length(self) -> int: def print_timings(self, to_file: bool = False) -> None: print(" ".join(self._timings.keys())) - value_array = np.array(list(self._timings.values()), dtype=float) + try: + value_array = np.array(list(self._timings.values()), dtype=float) + except Exception as e: + logger.exception(e) + return value_array = np.transpose(value_array) if self._debug: for i in range(value_array.shape[0]): From 8759e9f56468b984a934e77df787959663edfd9b Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 25 Jul 2024 16:18:53 -0500 Subject: [PATCH 52/84] Pre-PR --- ex/high_throughput_inference/mli_driver.py | 14 +-- ex/high_throughput_inference/mock_app.py | 7 +- .../standalone_workermanager.py | 57 +++++++---- .../control/requestdispatcher.py | 99 ++++++++++++++----- .../infrastructure/control/workermanager.py | 23 +++-- .../mli/infrastructure/worker/torch_worker.py | 34 ++----- .../_core/mli/infrastructure/worker/worker.py | 25 +++-- 7 files changed, 162 insertions(+), 97 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index c965a6d9a6..6d852ec6c3 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -10,7 +10,8 @@ import typing as t DEVICE = "gpu" -NUM_RANKS = 4 +NUM_RANKS = 1 +NUM_WORKERS = 1 filedir = os.path.dirname(__file__) worker_manager_script_name = os.path.join(filedir, "standalone_workermanager.py") app_script_name = os.path.join(filedir, "mock_app.py") @@ -35,17 +36,18 @@ "--worker_class", 
torch_worker_str, "--batch_size", - str(NUM_RANKS), + str(NUM_RANKS//NUM_WORKERS), "--batch_timeout", - str(0.001), + str(0.002), + "--num_workers", + str(NUM_WORKERS) ], ) + aff = [] -for i in range(32): - aff.append(i) - aff.append(i+64) worker_manager_rs.set_cpu_affinity(aff) + worker_manager = exp.create_model("worker_manager", run_settings=worker_manager_rs) worker_manager.attach_generator_files(to_copy=[worker_manager_script_name]) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 28b8a3d98a..2440aa87c4 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -107,11 +107,12 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): self._perf_timer.measure_time("send") with self._from_worker_ch.recvh(timeout=None) as from_recvh: resp = from_recvh.recv_bytes(timeout=None) - self._perf_timer.measure_time("receive") + self._perf_timer.measure_time("receive_response") response = MessageHandler.deserialize_response(resp) self._perf_timer.measure_time("deserialize_response") # list of data blobs? recv depending on the len(response.result.descriptors)? - data_blob = from_recvh.recv_bytes(timeout=None) + data_blob: bytes = from_recvh.recv_bytes(timeout=None) + self._perf_timer.measure_time("receive_tensor") result = torch.from_numpy( numpy.frombuffer( data_blob, @@ -166,7 +167,7 @@ def name(self): TOTAL_ITERATIONS = 100 - for log2_bsize in range(args.log_max_batchsize): + for log2_bsize in range(args.log_max_batchsize+1): b_size: int = 2**log2_bsize logger.info(f"Batch size: {b_size}") for iteration_number in range(TOTAL_ITERATIONS + int(b_size==1)): diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index d26493fa1e..72e2bd20f0 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -125,17 +125,17 @@ def service_as_dragon_proc( cpu_affinity=cpu_affinity, gpu_affinity=gpu_affinity, ) - proc = dragon_process.Process( + return dragon_process.Process( target=service.execute, args=[], cwd=os.getcwd(), policy=local_policy, options=options, - stderr=dragon_process.Popen.PIPE, + stderr=dragon_process.Popen.STDOUT, stdout=dragon_process.Popen.STDOUT, ) - return proc + if __name__ == "__main__": @@ -199,28 +199,45 @@ def service_as_dragon_proc( config_loader=ss_config_loader, ) - worker_manager = create_worker_manager( - worker_type=arg_worker_type, - config_loader=ss_config_loader, - device=args.device, - dispatcher_queue=dispatcher.task_queue, - ) + wms = [] + worker_device = args.device + for wm_idx in range(args.num_workers): + # if args.num_workers > 0: + # worker_device = f"{args.device}:{wm_idx}" + worker_manager = create_worker_manager( + worker_type=arg_worker_type, + config_loader=ss_config_loader, + device=worker_device, + dispatcher_queue=dispatcher.task_queue, + ) + wms.append(worker_manager) wm_affinity: list[int] = [] disp_affinity: list[int] = [] - if sys.platform != "darwin": - curr_affinity: list[int] = list(os.sched_getaffinity(os.getpid())) - wm_cpus = 3 * len(curr_affinity) // 4 - disp_affinity = curr_affinity[wm_cpus:] - wm_affinity = curr_affinity[:wm_cpus] + + # This is hardcoded for a specific type of node! 
+ gpu_to_cpu_aff: dict[int, list[int]] = {} + gpu_to_cpu_aff[0] = list(range(48,64)) + list(range(112,128)) + gpu_to_cpu_aff[1] = list(range(32,48)) + list(range(96,112)) + gpu_to_cpu_aff[2] = list(range(16,32)) + list(range(80,96)) + gpu_to_cpu_aff[3] = list(range(0,16)) + list(range(64,80)) + + worker_manager_procs = [] + for worker_idx in range(args.num_workers): + wm_cpus = len(gpu_to_cpu_aff[worker_idx]) - 4 + wm_affinity = gpu_to_cpu_aff[worker_idx][:wm_cpus] + disp_affinity.extend(gpu_to_cpu_aff[worker_idx][wm_cpus:]) + worker_manager_procs.append(service_as_dragon_proc( + worker_manager, cpu_affinity=wm_affinity, gpu_affinity=[worker_idx] + )) dispatcher_proc = service_as_dragon_proc(dispatcher, cpu_affinity=disp_affinity, gpu_affinity=[]) - worker_manager_proc = service_as_dragon_proc( - worker_manager, cpu_affinity=wm_affinity, gpu_affinity=[] - ) - dispatcher_proc.start() - worker_manager_proc.start() + # TODO: use ProcessGroup and restart=True? + all_procs = [dispatcher_proc, *worker_manager_procs] + + for proc in all_procs: + proc.start() - while all(proc.is_alive for proc in [dispatcher_proc, worker_manager_proc]): + while all(proc.is_alive for proc in all_procs): time.sleep(1) diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index 9b3d8cabdb..6a6f811fc9 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -43,9 +43,8 @@ from threading import RLock from types import TracebackType -from packaging.version import Version - from smartsim._core.entrypoints.service import Service + from .....error import SmartSimError from .....log import get_logger from ....utils.timings import PerfTimer @@ -190,23 +189,46 @@ def __exit__( class BatchQueue(Queue[InferenceRequest]): def __init__(self, batch_timeout: float, batch_size: int, model_key: str) -> None: + """Queue used to store inference requests waiting to be batched and + sent to Worker Managers. + :param batch_timeout: Time in seconds that has to be waited before flushing a + non-full queue. The time of the firt item put is 0 seconds. + :param batch_size: Total capacity of the queue. + :param model_key: Key of the model which needs to be executed on the queued + requests + """ super().__init__(maxsize=batch_size) self._batch_timeout = batch_timeout + """Time in seconds that has to be waited before flushing a non-full queue. + The time of the firt item put is 0 seconds.""" self._batch_size = batch_size + """Total capacity of the queue.""" self._first_put: t.Optional[float] = None + """Time at which the first item was put on the queue""" self._disposable = False + """Whether the queue will not be used again and can be deleted. 
+ A disposable queue is always full.""" self._model_key = model_key + """Key of the model which needs to be executed on the queued requets""" self._flush_lock = RLock() + """Lock used to make sure only one process can flush the queue (unused now)""" self._id = str(uuid.uuid4()) + """Id of queue""" @property def queue_id(self) -> str: + """ID of this queue""" return self._id def acquire(self, blocking: bool = True, timeout: float = -1) -> t.Optional[bool]: + """Acquire queue lock to flush + :param blocking: whether to block on lock acquisition + :param timeout: Time to wait if blocking, before raising exception + """ return self._flush_lock.acquire(blocking=blocking, timeout=timeout) def release(self) -> None: + """Release queue lock""" self._flush_lock.release() def __enter__(self) -> None: @@ -222,6 +244,7 @@ def __exit__( @property def model_key(self) -> str: + """Key of the model which needs to be run on the queued requests""" return self._model_key def put( @@ -230,6 +253,11 @@ def put( block: bool = False, timeout: t.Optional[float] = 0.0, ) -> None: + """Put an inference request in the queue + :param item: The request + :param block: Whether to block when trying to put the item + :param timeout: Time to wait if block==True + """ if not self.acquire(blocking=False): raise Full try: @@ -249,19 +277,24 @@ def _waited_time(self) -> float: @property def ready(self) -> bool: + """True if the queue can be flushed""" if self.empty(): return False return self.full() or (self._waited_time >= self._batch_timeout) - def make_disposable(self) -> None: + """Set this queue as disposable, and never use it again after it gets flushed""" self._disposable = True @property def disposable(self) -> bool: + """Whether this queue can be used to put items or should be deleted""" return self.empty() and self._disposable def flush(self) -> list[t.Any]: + """Get all requests from queue + :return: Requests waiting to be executed + """ num_items = self.qsize() self._first_put = None items = [] @@ -275,6 +308,7 @@ def flush(self) -> list[t.Any]: return items def full(self) -> bool: + """Return True if the queue has reached its maximum capacity""" if self._disposable: return True if self._batch_size <= 0: @@ -282,6 +316,7 @@ def full(self) -> bool: return self.qsize() >= self._batch_size def empty(self) -> bool: + """Return True if the queue has 0 elements""" return self.qsize() == 0 @@ -295,25 +330,46 @@ def __init__( worker_type: t.Type[MachineLearningWorkerBase], comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel, ) -> None: - super().__init__(as_service=True, cooldown=1) + """The RquestDispatcher intercepts inference requests, stages them in + queues and batches them together before making them available to Worker + Managers. + :param batch_timeout: Time in seconds that has to be waited before flushing a + non-full queue after having put at least one item on it. + :param batch_size: Total capacity of each batch queue. 
+ :param mem_pool: Memory pool used to share batched input tensors with worker + managers + :param config_loader: Object to load configuration from environment + :param worker_type: Type of worker to instantiate to batch inputs + :param comm_channel_type: Type of channel used to get requests + """ + super().__init__(as_service=False, cooldown=1) self._queues: list[BatchQueue] = [] + """All batch queues""" self._active_queues: dict[str, BatchQueue] = {} - self._model_last_version: dict[str, Version] = {} - self._model_name_to_key: dict[str, str] = {} + """Mapping telling which queue is the recipient of requets for a given model + key""" self._batch_timeout = batch_timeout + """Time in seconds that has to be waited before flushing a non-full queue""" self._batch_size = batch_size + """Total capacity of each batch queue.""" self._queue_swap_lock: t.Optional[RLock] = None + """Lock used to swap the active queue for a key""" self._incoming_channel = config_loader.get_queue() - """the queue the manager monitors for new tasks""" + """The channel the dispatcher monitors for new tasks""" self._outgoing_queue: DragonQueue = mp.Queue(maxsize=0) + """The queue on which batched inference requests are placed""" self._feature_store: t.Optional[FeatureStore] = ( config_loader.get_feature_store() ) - """a feature store to retrieve models from""" + """A feature store to retrieve models from""" self._comm_channel_type = comm_channel_type - self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=False) + """The type of the channel used to receive requests""" self._worker = worker_type() + """The worker used to batch inputs""" self._mem_pool = mem_pool + """Memory pool used to share batched input tensors with the Worker Managers""" + self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=False) + """Performance timer""" def _validate_request(self, request: InferenceRequest) -> bool: """Ensure the request can be processed. 
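The batching rule documented above reduces to: a queue becomes ready once it is full or once batch_timeout seconds have passed since its first put, and flushing drains it and resets the clock. A simplified, stdlib-only model of that rule, with locking and model keys deliberately left out:

import time
import typing as t
from queue import Empty, Queue


class MiniBatchQueue(Queue):
    """Reduced stand-in for BatchQueue: capacity- or timeout-driven readiness."""

    def __init__(self, batch_timeout: float, batch_size: int) -> None:
        super().__init__(maxsize=batch_size)
        self._batch_timeout = batch_timeout
        self._first_put: t.Optional[float] = None

    def put(self, item: t.Any, block: bool = False, timeout: t.Optional[float] = 0.0) -> None:
        super().put(item, block=block, timeout=timeout)
        if self._first_put is None:
            self._first_put = time.time()

    @property
    def ready(self) -> bool:
        if self.empty():
            return False
        waited = 0.0 if self._first_put is None else time.time() - self._first_put
        return self.full() or waited >= self._batch_timeout

    def flush(self) -> list:
        """Drain everything currently queued and reset the timeout clock."""
        self._first_put = None
        items = []
        while True:
            try:
                items.append(self.get_nowait())
            except Empty:
                break
        return items


q = MiniBatchQueue(batch_timeout=0.002, batch_size=4)
q.put("request-1")
time.sleep(0.003)
assert q.ready          # a partial batch that has timed out is ready
print(q.flush())        # ['request-1']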
@@ -348,10 +404,12 @@ def _validate_request(self, request: InferenceRequest) -> bool: def _on_start(self) -> None: self._queue_swap_lock = RLock() + + def _on_iteration(self) -> None: + if self._incoming_channel is None: raise SmartSimError("No incoming channel for dispatcher") - def _on_iteration(self) -> None: try: bytes_list: t.List[bytes] = self._incoming_channel.recv() except Exception: @@ -392,11 +450,12 @@ def _on_iteration(self) -> None: @property def task_queue(self) -> DragonQueue: + """The queue on which batched requests are placed""" return self._outgoing_queue def _swap_queue(self, model_key: str) -> None: if self._queue_swap_lock is None: - raise SmartSimError("Queue was not locked") + raise SmartSimError("Queues were not locked") with self._queue_swap_lock: for queue in self._queues: if queue.model_key == model_key and not queue.full(): @@ -409,6 +468,9 @@ def _swap_queue(self, model_key: str) -> None: return def dispatch(self, request: InferenceRequest) -> None: + """Assign a request to a batch queue + :param request: the request to place + """ if request.raw_model is not None: logger.info("Direct inference requested, creating tmp queue") tmp_id = f"_tmp_{str(uuid.uuid4())}" @@ -429,17 +491,10 @@ def dispatch(self, request: InferenceRequest) -> None: except (Full, KeyError): self._swap_queue(request.model_key) - def _update_model_version(self, model: Model) -> None: - if not model.version: - return - if ( - model.name not in self._model_last_version - or Version(model.version) > self._model_last_version[model.name] - ): - self._model_last_version[model.name] = Version(model.version) - return - def flush_requests(self) -> None: + """Get all requests from queues which are ready to be flushed. Place all + aviable request batches in the outgoing queue. + """ for queue in self._queues: if queue.ready and queue.acquire(blocking=False): self._perf_timer.measure_time("find_queue") @@ -467,4 +522,4 @@ def flush_requests(self) -> None: self._perf_timer.measure_time("put") def _can_shutdown(self) -> bool: - return False \ No newline at end of file + return False diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 0ee146ef1b..74b36c3454 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -100,6 +100,7 @@ def build_reply(reply: InferenceReply) -> ResponseBuilder: custom_attributes=None, ) + def exception_handler( exc: Exception, reply_channel: t.Optional[CommChannelBase], failure_message: str ) -> None: @@ -139,20 +140,23 @@ def __init__( """Initialize the WorkerManager :param config_loader: Environment config loader that loads the task queue and feature store - :param workers: A worker to manage + :param worker_type: The type of worker to manage + :param task_queue: Queue from witch the batched requests have to be pulled :param as_service: Specifies run-once or run-until-complete behavior of service :param cooldown: Number of seconds to wait before shutting down after shutdown criteria are met :param comm_channel_type: The type of communication channel used for callbacks + :param device: The device on which the Worker should run. Every worker manager + is assigned one single GPU (if available), thus the device should have no index. 
""" super().__init__(as_service, cooldown) self._task_queue = task_queue - """the queue the manager monitors for new tasks""" + """The dispatcher queue the manager monitors for new tasks""" self._feature_store: t.Optional[FeatureStore] = ( config_loader.get_feature_store() ) - """a feature store to retrieve models from""" + """A feature store to retrieve models from""" self._worker = worker_type() """The ML Worker implementation""" self._comm_channel_type = comm_channel_type @@ -161,13 +165,14 @@ def __init__( """Device on which workers need to run""" self._device_manager: t.Optional[DeviceManager] = None - self._perf_timer = PerfTimer(prefix="w_", debug=False, timing_on=False) + """Object responsible for model caching and device access""" + self._perf_timer = PerfTimer(prefix="w_", debug=False, timing_on=True) + """Performance timer""" def _on_start(self) -> None: - self._device_manager = DeviceManager( - [WorkerDevice(f"gpu:{idx}") for idx in [3]] - ) + self._device_manager = DeviceManager([WorkerDevice(self._device)]) + # pylint: disable-next=too-many-statements def _on_iteration(self) -> None: """Executes calls to the machine learning worker implementation to complete the inference pipeline""" @@ -220,9 +225,7 @@ def _on_iteration(self) -> None: self._perf_timer.measure_time("execute") try: - transformed_outputs = self._worker.transform_output( - batch, execute_result, self._perf_timer - ) + transformed_outputs = self._worker.transform_output(batch, execute_result) except Exception as e: for request in batch.requests: exception_handler( diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index 37b8b7e843..6723573cfb 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -34,7 +34,6 @@ from .....error import SmartSimError from .....log import get_logger -from ....utils.timings import PerfTimer from ...mli_schemas.tensor import tensor_capnp from .worker import ( ExecuteResult, @@ -120,26 +119,14 @@ def transform_input( for result_tensor_idx, (dims, dtype) in enumerate(zip(all_dims, all_dtypes)): itemsize = np.empty((1), dtype=dtype).itemsize alloc_size = int(np.prod(dims) * itemsize) - try: - mem_alloc = mem_pool.alloc(alloc_size) - mem_view = mem_alloc.get_memview() - mem_view[:alloc_size] = b"".join( - [ - fetch_result.inputs[result_tensor_idx] - for fetch_result in fetch_results - ] - ) - except Exception as e: - print(e) - raise e - # results.append( - # torch.from_numpy( - # np.frombuffer( - # all_bytes, - # dtype=dtype, - # ).reshape(dims) - # ) - # ) + mem_alloc = mem_pool.alloc(alloc_size) + mem_view = mem_alloc.get_memview() + mem_view[:alloc_size] = b"".join( + [ + fetch_result.inputs[result_tensor_idx] + for fetch_result in fetch_results + ] + ) results.append(mem_alloc.serialize()) @@ -182,8 +169,6 @@ def execute( for tensor in tensors ] - torch.cuda.synchronize(3) - transform_result.transformed = [] execute_result = ExecuteResult(results, transform_result.slices) @@ -195,18 +180,15 @@ def execute( def transform_output( batch: InferenceBatch, execute_result: ExecuteResult, - perf_timer: PerfTimer, ) -> list[TransformOutputResult]: transformed_list: list[TransformOutputResult] = [] cpu_predictions = [ prediction.cpu() for prediction in execute_result.predictions ] - perf_timer.measure_time("to_cpu") for result_slice in execute_result.slices: transformed = [] for cpu_item in cpu_predictions: 
transformed.append(cpu_item[result_slice].numpy().tobytes()) - perf_timer.measure_time("serialize_tensor") # todo: need the shape from latest schemas added here. transformed_list.append( diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index bc96633204..01e2db6c86 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -34,10 +34,8 @@ from abc import ABC, abstractmethod from dataclasses import dataclass - from .....error import SmartSimError from .....log import get_logger -from ....utils.timings import PerfTimer from ...comm.channel.channel import CommChannelBase from ...infrastructure.storage.featurestore import FeatureStore from ...mli_schemas.model.model_capnp import Model @@ -101,15 +99,19 @@ def __init__(self, model: t.Any) -> None: class TransformInputResult: - """A wrapper around a transformed batchinput""" + """A wrapper around a transformed batch of input tensors""" def __init__( self, result: t.Any, slices: list[slice], dims: list[list[int]] ) -> None: """Initialize the object""" self.transformed = result + """List of Dragon MemoryAlloc objects on which the tensors are stored""" self.slices = slices + """Each slice represents which portion of the input tensors belongs to + which request""" self.dims = dims + """Dimension of the transformed tensors""" class ExecuteResult: @@ -174,7 +176,7 @@ def fetch_model( batch: InferenceBatch, feature_store: t.Optional[FeatureStore] ) -> FetchModelResult: """Given a resource key, retrieve the raw model from a feature store - :param batc: The batch of requests that triggered the pipeline + :param batch: The batch of requests that triggered the pipeline :param feature_store: The feature store used for persistence :return: Raw bytes of the model""" @@ -206,7 +208,7 @@ def fetch_inputs( ) -> t.List[FetchInputResult]: """Given a collection of ResourceKeys, identify the physical location and input metadata - :param request: The request that triggered the pipeline + :param batch: The batch of requests that triggered the pipeline :param feature_store: The feature store used for persistence :return: the fetched input""" fetch_results = [] @@ -290,9 +292,11 @@ def transform_input( fetch_results: list[FetchInputResult], mem_pool: MemoryPool, ) -> TransformInputResult: - """Given a collection of data, perform a transformation on the data + """Given a collection of data, perform a transformation on the data and put + the raw tensor data on a MemoryPool allocation. 
:param request: The request that triggered the pipeline :param fetch_result: Raw outputs from fetching inputs out of a feature store + :param mem_pool: The memory pool used to access batched input tensors :return: The transformed inputs wrapped in a InputTransformResult""" @staticmethod @@ -304,18 +308,19 @@ def execute( device: str, ) -> ExecuteResult: """Execute an ML model on inputs transformed for use by the model - :param request: The request that triggered the pipeline + :param batch: The batch of requests that triggered the pipeline :param load_result: The result of loading the model onto device memory :param transform_result: The result of transforming inputs for model consumption + :param device: The device on which the model will be executed :return: The result of inference wrapped in an ExecuteResult""" @staticmethod @abstractmethod def transform_output( - batch: InferenceBatch, execute_result: ExecuteResult, perf_timer: PerfTimer + batch: InferenceBatch, execute_result: ExecuteResult ) -> t.List[TransformOutputResult]: """Given inference results, perform transformations required to transmit results to the requestor. - :param request: The request that triggered the pipeline + :param batch: The batch of requests that triggered the pipeline :param execute_result: The result of inference wrapped in an ExecuteResult - :return:""" + :return: A list of transformed outputs""" From 63a0f31ecf58e03bc3cbefc7293e911c61b969a1 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 25 Jul 2024 16:27:48 -0500 Subject: [PATCH 53/84] Remove unused fake versioning function --- smartsim/_core/mli/mli_schemas/model/utils.py | 41 ------------------- 1 file changed, 41 deletions(-) delete mode 100644 smartsim/_core/mli/mli_schemas/model/utils.py diff --git a/smartsim/_core/mli/mli_schemas/model/utils.py b/smartsim/_core/mli/mli_schemas/model/utils.py deleted file mode 100644 index b16dc8f623..0000000000 --- a/smartsim/_core/mli/mli_schemas/model/utils.py +++ /dev/null @@ -1,41 +0,0 @@ -# BSD 2-Clause License -# -# Copyright (c) 2021-2024, Hewlett Packard Enterprise -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
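# --- Editor's note (not part of the patch) ----------------------------------
# For reference, the abstract methods declared in worker.py above compose into
# the inference pipeline roughly as sketched here. In this patch set the
# dispatcher runs fetch_inputs/transform_input while the worker manager runs
# the remaining steps; the driver below, its name, and its argument list are
# purely illustrative.
def run_pipeline_sketch(worker, batch, feature_store, mem_pool, device):
    fetched_model = worker.fetch_model(batch, feature_store)
    loaded = worker.load_model(batch, fetched_model, device)
    fetched_inputs = worker.fetch_inputs(batch, feature_store)
    transformed = worker.transform_input(batch, fetched_inputs, mem_pool)
    executed = worker.execute(batch, loaded, transformed, device)
    return worker.transform_output(batch, executed)
# -----------------------------------------------------------------------------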
- -import typing as t -from collections import namedtuple - -from .model_capnp import Model - -ModelInfo = namedtuple("ModelInfo", ["Name", "Version"]) - - -def make_model_key(model: Model) -> str: - return f"{model.name}_{model.version}" - - -def get_model_name_and_version(key: str) -> t.NamedTuple: - split_key = key.rsplit("_", 1) - return ModelInfo(split_key[0], split_key[1]) From 6fb3efddfb12db46aee42e485100a2ba62bfea57 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Thu, 25 Jul 2024 16:41:23 -0500 Subject: [PATCH 54/84] Fix --- .../_core/mli/infrastructure/control/requestdispatcher.py | 6 +++--- smartsim/_core/mli/infrastructure/control/workermanager.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index 6a6f811fc9..a43290bf56 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -342,7 +342,7 @@ def __init__( :param worker_type: Type of worker to instantiate to batch inputs :param comm_channel_type: Type of channel used to get requests """ - super().__init__(as_service=False, cooldown=1) + super().__init__(as_service=True, cooldown=1) self._queues: list[BatchQueue] = [] """All batch queues""" self._active_queues: dict[str, BatchQueue] = {} @@ -368,7 +368,7 @@ def __init__( """The worker used to batch inputs""" self._mem_pool = mem_pool """Memory pool used to share batched input tensors with the Worker Managers""" - self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=False) + self._perf_timer = PerfTimer(prefix="r_", debug=True, timing_on=True) """Performance timer""" def _validate_request(self, request: InferenceRequest) -> bool: @@ -413,7 +413,7 @@ def _on_iteration(self) -> None: try: bytes_list: t.List[bytes] = self._incoming_channel.recv() except Exception: - pass + self._perf_timer.start_timings() else: if not bytes_list: exception_handler( diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 74b36c3454..12a7891914 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -166,7 +166,7 @@ def __init__( self._device_manager: t.Optional[DeviceManager] = None """Object responsible for model caching and device access""" - self._perf_timer = PerfTimer(prefix="w_", debug=False, timing_on=True) + self._perf_timer = PerfTimer(prefix="w_", debug=True, timing_on=True) """Performance timer""" def _on_start(self) -> None: From a0cd4ab44af718a493a03a7b87ca14507abcb456 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 13 Aug 2024 19:22:58 +0200 Subject: [PATCH 55/84] Address review --- ex/high_throughput_inference/mock_app.py | 45 +++-- ex/high_throughput_inference/redis_driver.py | 2 +- .../standalone_workermanager.py | 61 ++---- .../mli/infrastructure/control/commons.py | 65 ++++++ .../infrastructure/control/devicemanager.py | 114 +++++------ .../control/requestdispatcher.py | 188 +++++++++--------- .../infrastructure/control/workermanager.py | 54 ++--- .../mli/infrastructure/worker/torch_worker.py | 19 +- .../_core/mli/infrastructure/worker/worker.py | 29 ++- tests/dragon/test_error_handling.py | 4 +- 10 files changed, 299 insertions(+), 282 deletions(-) create mode 100644 smartsim/_core/mli/infrastructure/control/commons.py diff --git 
a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 2440aa87c4..69ff6afeac 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -72,14 +72,14 @@ def __init__(self, timing_on: bool): self._from_worker_ch_serialized = self._from_worker_ch.serialize() self._to_worker_ch = Channel.make_process_local() - self._perf_timer: PerfTimer = PerfTimer(debug=False, timing_on=timing_on, prefix=f"a{rank}_") + self.perf_timer: PerfTimer = PerfTimer(debug=False, timing_on=timing_on, prefix=f"a{rank}_") def run_model(self, model: bytes | str, batch: torch.Tensor): tensors = [batch.numpy()] - self._perf_timer.start_timings("batch_size", batch.shape[0]) + self.perf_timer.start_timings("batch_size", batch.shape[0]) built_tensor_desc = MessageHandler.build_tensor_descriptor( "c", "float32", list(batch.shape)) - self._perf_timer.measure_time("build_tensor_descriptor") + self.perf_timer.measure_time("build_tensor_descriptor") if isinstance(model, str): model_arg = MessageHandler.build_model_key(model) else: @@ -92,43 +92,41 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): output_descriptors=[], custom_attributes=None, ) - self._perf_timer.measure_time("build_request") + self.perf_timer.measure_time("build_request") request_bytes = MessageHandler.serialize_request(request) - self._perf_timer.measure_time("serialize_request") + self.perf_timer.measure_time("serialize_request") tensor_bytes = [bytes(tensor.data) for tensor in tensors] # tensor_bytes = [tensor.reshape(-1).view(numpy.uint8).data for tensor in tensors] - self._perf_timer.measure_time("serialize_tensor") + self.perf_timer.measure_time("serialize_tensor") with self._to_worker_fli.sendh(timeout=None, stream_channel=self._to_worker_ch) as to_sendh: to_sendh.send_bytes(request_bytes) for tb in tensor_bytes: to_sendh.send_bytes(tb) #TODO NOT FAST ENOUGH!!! # to_sendh.send_bytes(bytes(t.data)) - self._perf_timer.measure_time("send") + self.perf_timer.measure_time("send") with self._from_worker_ch.recvh(timeout=None) as from_recvh: resp = from_recvh.recv_bytes(timeout=None) - self._perf_timer.measure_time("receive_response") + self.perf_timer.measure_time("receive_response") response = MessageHandler.deserialize_response(resp) - self._perf_timer.measure_time("deserialize_response") + self.perf_timer.measure_time("deserialize_response") # list of data blobs? recv depending on the len(response.result.descriptors)? 
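# Editor's sketch (not part of the patch): one way to answer the question in
# the comment above is to receive one blob per entry in
# response.result.descriptors and rebuild each tensor from its advertised
# dataType; only the single-blob path below is what the patch actually does.
#     data_blobs = [
#         from_recvh.recv_bytes(timeout=None)
#         for _ in response.result.descriptors
#     ]
#     results = [
#         torch.from_numpy(numpy.frombuffer(blob, dtype=str(desc.dataType)))
#         for blob, desc in zip(data_blobs, response.result.descriptors)
#     ]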
data_blob: bytes = from_recvh.recv_bytes(timeout=None) - self._perf_timer.measure_time("receive_tensor") + self.perf_timer.measure_time("receive_tensor") result = torch.from_numpy( numpy.frombuffer( data_blob, dtype=str(response.result.descriptors[0].dataType), ) ) - self._perf_timer.measure_time("deserialize_tensor") + self.perf_timer.measure_time("deserialize_tensor") - self._perf_timer.end_timings() + self.perf_timer.end_timings() return result def set_model(self, key: str, model: bytes): self._ddict[key] = model - def print_timings(self, to_file: bool): - self._perf_timer.print_timings(to_file) class ResNetWrapper(): @@ -154,7 +152,7 @@ def name(self): if __name__ == "__main__": parser = argparse.ArgumentParser("Mock application") - parser.add_argument("--device", default="cpu") + parser.add_argument("--device", default="cpu", type=str) parser.add_argument("--log_max_batchsize", default=8, type=int) args = parser.parse_args() @@ -163,7 +161,10 @@ def name(self): client = ProtoClient(timing_on=True) client.set_model(resnet.name, resnet.model) - pt_model = torch.jit.load(io.BytesIO(initial_bytes=(resnet.model))).to("cuda:0") + if CHECK_RESULTS_AND_MAKE_ALL_SLOWER: + # TODO: adapt to non-Nvidia devices + torch_device = args.device.replace("gpu", "cuda") + pt_model = torch.jit.load(io.BytesIO(initial_bytes=(resnet.model))).to(torch_device) TOTAL_ITERATIONS = 100 @@ -172,15 +173,15 @@ def name(self): logger.info(f"Batch size: {b_size}") for iteration_number in range(TOTAL_ITERATIONS + int(b_size==1)): logger.info(f"Iteration: {iteration_number}") - batch = resnet.get_batch(b_size) - remote_result = client.run_model(resnet.name, batch) - logger.info(client._perf_timer.get_last("total_time")) + sample_batch = resnet.get_batch(b_size) + remote_result = client.run_model(resnet.name, sample_batch) + logger.info(client.perf_timer.get_last("total_time")) if CHECK_RESULTS_AND_MAKE_ALL_SLOWER: - local_res = pt_model(batch.to("cuda:0")) - err_norm = torch.linalg.vector_norm(torch.flatten(remote_result).to("cuda:0")-torch.flatten(local_res), ord=1).cpu() + local_res = pt_model(sample_batch.to(torch_device)) + err_norm = torch.linalg.vector_norm(torch.flatten(remote_result).to(torch_device)-torch.flatten(local_res), ord=1).cpu() res_norm = torch.linalg.vector_norm(remote_result, ord=1).item() local_res_norm = torch.linalg.vector_norm(local_res, ord=1).item() logger.info(f"Avg norm of error {err_norm.item()/b_size} compared to result norm of {res_norm/b_size}:{local_res_norm/b_size}") torch.cuda.synchronize() - client.print_timings(to_file=True) \ No newline at end of file + client.perf_timer.print_timings(to_file=True) \ No newline at end of file diff --git a/ex/high_throughput_inference/redis_driver.py b/ex/high_throughput_inference/redis_driver.py index 6a8b00c2a8..ff57725d40 100644 --- a/ex/high_throughput_inference/redis_driver.py +++ b/ex/high_throughput_inference/redis_driver.py @@ -33,7 +33,7 @@ DEVICE = "gpu" filedir = os.path.dirname(__file__) app_script_name = os.path.join(filedir, "mock_app_redis.py") -model_name = os.path.join(filedir, f"resnet50.{DEVICE.upper()}.pt") +model_name = os.path.join(filedir, f"resnet50.{DEVICE}.pt") exp_path = os.path.join(filedir, "redis_ai_multi") diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index 72e2bd20f0..a17039d0fd 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -28,7 +28,6 @@ import 
dragon # pylint disable=import-error -import dragon.globalservices.pool as dragon_gs_pool import dragon.infrastructure.policy as dragon_policy import dragon.infrastructure.process_desc as dragon_process_desc import dragon.native.process as dragon_process @@ -69,31 +68,16 @@ ) from smartsim._core.mli.infrastructure.worker.worker import MachineLearningWorkerBase +from smartsim.log import get_logger + +logger = get_logger("Worker Manager Entry Point") + mp.set_start_method("dragon") pid = os.getpid() affinity = os.sched_getaffinity(pid) -print("Entry point:", socket.gethostname(), affinity) -print("CPUS:", os.cpu_count()) - - -def create_request_dispatcher( - batch_size: int, - batch_timeout: float, - comm_channel_type: t.Type[CommChannelBase], - worker_type: t.Type[MachineLearningWorkerBase], - config_loader: EnvironmentConfigLoader, -) -> RequestDispatcher: - mem_pool = MemoryPool.attach(dragon_gs_pool.create(2 * 1024**3).sdesc) - - return RequestDispatcher( - batch_timeout=batch_timeout, - batch_size=batch_size, - config_loader=config_loader, - comm_channel_type=comm_channel_type, - mem_pool=mem_pool, - worker_type=worker_type, - ) +logger.log(f"Entry point: {socket.gethostname()}, {affinity}") +logger.log(f"CPUS: {os.cpu_count()}") def create_worker_manager( @@ -102,15 +86,7 @@ def create_worker_manager( device: str, dispatcher_queue: mp.Queue, ) -> WorkerManager: - return WorkerManager( - config_loader=config_loader, - worker_type=worker_type, - as_service=True, - cooldown=10, - comm_channel_type=DragonCommChannel, - device=device, - task_queue=dispatcher_queue, - ) + return def service_as_dragon_proc( @@ -191,31 +167,36 @@ def service_as_dragon_proc( ss_config_loader = EnvironmentConfigLoader() - dispatcher = create_request_dispatcher( - batch_size=args.batch_size, + dispatcher = RequestDispatcher( batch_timeout=args.batch_timeout, + batch_size=args.batch_size, + config_loader=ss_config_loader, comm_channel_type=DragonCommChannel, worker_type=arg_worker_type, - config_loader=ss_config_loader, ) wms = [] worker_device = args.device for wm_idx in range(args.num_workers): - # if args.num_workers > 0: - # worker_device = f"{args.device}:{wm_idx}" - worker_manager = create_worker_manager( - worker_type=arg_worker_type, + + worker_manager = WorkerManager( config_loader=ss_config_loader, + worker_type=arg_worker_type, + as_service=True, + cooldown=10, + comm_channel_type=DragonCommChannel, device=worker_device, - dispatcher_queue=dispatcher.task_queue, + task_queue=dispatcher.task_queue, ) + wms.append(worker_manager) wm_affinity: list[int] = [] disp_affinity: list[int] = [] - # This is hardcoded for a specific type of node! + # This is hardcoded for a specific type of node: + # the GPU-to-CPU mapping is taken from the nvidia-smi tool + # TODO can this be computed on the fly? gpu_to_cpu_aff: dict[int, list[int]] = {} gpu_to_cpu_aff[0] = list(range(48,64)) + list(range(112,128)) gpu_to_cpu_aff[1] = list(range(32,48)) + list(range(96,112)) diff --git a/smartsim/_core/mli/infrastructure/control/commons.py b/smartsim/_core/mli/infrastructure/control/commons.py new file mode 100644 index 0000000000..4c67fb47b0 --- /dev/null +++ b/smartsim/_core/mli/infrastructure/control/commons.py @@ -0,0 +1,65 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. 
Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import typing as t + +from .....log import get_logger +from ...comm.channel.channel import CommChannelBase +from ...message_handler import MessageHandler +from ...mli_schemas.response.response_capnp import ResponseBuilder + + +logger = get_logger(__file__) + + +def build_failure_reply(status: "Status", message: str) -> ResponseBuilder: + return MessageHandler.build_response( + status=status, + message=message, + result=[], + custom_attributes=None, + ) + +def exception_handler( + exc: Exception, reply_channel: t.Optional[CommChannelBase], failure_message: str +) -> None: + """ + Logs exceptions and sends a failure response. + + :param exc: The exception to be logged + :param reply_channel: The channel used to send replies + :param failure_message: Failure message to log and send back + """ + logger.exception( + f"{failure_message}\n" + f"Exception type: {type(exc).__name__}\n" + f"Exception message: {str(exc)}" + ) + serialized_resp = MessageHandler.serialize_response( + build_failure_reply("fail", failure_message) + ) + if reply_channel: + reply_channel.send(serialized_resp) \ No newline at end of file diff --git a/smartsim/_core/mli/infrastructure/control/devicemanager.py b/smartsim/_core/mli/infrastructure/control/devicemanager.py index 14b83a5044..9a56dd3ba5 100644 --- a/smartsim/_core/mli/infrastructure/control/devicemanager.py +++ b/smartsim/_core/mli/infrastructure/control/devicemanager.py @@ -25,12 +25,10 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
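# --- Editor's note (not part of the patch) ----------------------------------
# Typical call pattern for the shared exception_handler introduced in
# commons.py above: wrap a pipeline step and report the failure on the
# request's callback channel when one is available (passing None suppresses
# the reply). The names worker, batch, feature_store and request are assumed
# to be in scope.
#     try:
#         fetch_results = worker.fetch_inputs(batch=batch, feature_store=feature_store)
#     except Exception as exc:
#         exception_handler(exc, request.callback, "Error fetching input.")
# -----------------------------------------------------------------------------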
import typing as t -from threading import RLock -from types import TracebackType from ...infrastructure.storage.featurestore import FeatureStore from ..worker.worker import MachineLearningWorkerBase -from .requestdispatcher import InferenceBatch +from .requestdispatcher import RequestBatch class WorkerDevice: @@ -40,91 +38,83 @@ def __init__(self, name: str) -> None: """ self._name = name """The name used by the toolkit to identify this device""" - self._lock = RLock() - """Lock to ensure only one thread at the time accesses this device""" self._models: dict[str, t.Any] = {} + """Dict of keys to models which are loaded on this device""" - def acquire(self, blocking: bool = True, timeout: float = -1) -> t.Optional[bool]: - return self._lock.acquire(blocking=blocking, timeout=timeout) - - def release(self) -> None: - self._lock.release() - - def __enter__(self) -> None: - self.acquire() @property def name(self) -> str: + """The identifier of the device represented by this object""" return self._name def add_model(self, key: str, model: t.Any) -> None: + """Add a reference to a model loaded on this device and assign it a key + + :param key: The key under which the model is saved + :param model: The model which is added + """ self._models[key] = model def remove_model(self, key: str) -> None: + """Remove the reference to a model loaded on this device + + :param key: The key of the model to remove + """ self._models.pop(key) def get_model(self, key: str) -> t.Any: + """Get the model corresponding to a given key + + :param key: the model key + """ return self._models[key] def __contains__(self, key: str) -> bool: return key in self._models - def __exit__( - self, - exc_type: t.Optional[t.Type[BaseException]], - exc_val: t.Optional[BaseException], - exc_tb: t.Optional[TracebackType], - ) -> None: - self.release() - class DeviceManager: - def __init__(self, devices: list[WorkerDevice]): - self._devices = devices - """Dictionary of model key to devices on which it is loaded""" + def __init__(self, device: WorkerDevice): + self._device = device + """Device managed by this object""" + + def _load_model_on_device(self, + worker: MachineLearningWorkerBase, + batch: RequestBatch, + feature_store: t.Optional[FeatureStore], + ) -> None: + model_bytes = worker.fetch_model(batch, feature_store) + loaded_model = worker.load_model( + batch, model_bytes, self._device.name + ) + self._device.add_model(batch.model_key, loaded_model.model) - def get_free_device( + def get_device( self, worker: MachineLearningWorkerBase, - batch: InferenceBatch, + batch: RequestBatch, feature_store: t.Optional[FeatureStore], ) -> t.Generator[WorkerDevice, None, None]: - return_device = None - sample_request = batch.requests[0] - direct_inference = sample_request.raw_model is not None - while return_device is None: - loaded_devices = [] - if not direct_inference: - # Look up devices to see if any of them already has a copy of the model - for device in self._devices: - if batch.model_key in device: - loaded_devices.append(device) - - # If a pre-loaded model is found on a device, try using that device - for device in loaded_devices: - if device.acquire(blocking=False): - return_device = device - break - - # If the model is not loaded on a free device, - # load it on another device (if available) - if return_device is None: - for candidate_device in self._devices: - if ( - candidate_device not in loaded_devices - and candidate_device.acquire(blocking=False) - ): - model_bytes = worker.fetch_model(batch, feature_store) - loaded_model = 
worker.load_model( - batch, model_bytes, candidate_device.name - ) - candidate_device.add_model(batch.model_key, loaded_model.model) - - return_device = candidate_device + """Get the device managed by this object + + the model needed to run the batch of requests is + guaranteed to be available on the model + + :param worker: The worker that wants to access the device + :param batch: The batch of requests + :param feature_store: The feature store on which part of the + data needed by the request may be stored + :return: A generator yielding the device + """ + model_in_request = batch.has_raw_model + + # Load model if not already loaded, or + # because it is sent with the request + if model_in_request or not batch.model_key in self._device: + self._load_model_on_device(worker, batch, feature_store) try: - yield return_device + yield self._device finally: - if direct_inference: - return_device.remove_model(batch.model_key) - return_device.release() + if model_in_request: + self._device.remove_model(batch.model_key) diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index a43290bf56..d050a646c3 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -29,6 +29,7 @@ import dragon from dragon.managed_memory import MemoryPool from dragon.mpbridge.queues import DragonQueue +import dragon.globalservices.pool as dragon_gs_pool # pylint: enable=import-error @@ -53,14 +54,14 @@ from ...infrastructure.environmentloader import EnvironmentConfigLoader from ...infrastructure.storage.featurestore import FeatureStore from ...infrastructure.worker.worker import ( - InferenceBatch, + RequestBatch, InferenceRequest, MachineLearningWorkerBase, ) from ...message_handler import MessageHandler from ...mli_schemas.model.model_capnp import Model -from ...mli_schemas.response.response_capnp import ResponseBuilder from ...mli_schemas.tensor.tensor_capnp import TensorDescriptor +from .commons import exception_handler if t.TYPE_CHECKING: from smartsim._core.mli.mli_schemas.response.response_capnp import Status @@ -73,18 +74,10 @@ def deserialize_message( channel_type: t.Type[CommChannelBase], ) -> InferenceRequest: """Deserialize a message from a byte stream into an InferenceRequest - :param data_blob: The byte stream to deserialize""" - # todo: consider moving to XxxCore and only making - # workers implement the inputs and model conversion? - - # alternatively, consider passing the capnproto models - # to this method instead of the data_blob... - - # something is definitely wrong here... client shouldn't have to touch - # callback (or batch size) + :param data_blob: The byte stream to deserialize + :param channel_type: The channel used to send the response""" request = MessageHandler.deserialize_request(data_blob) - # return request model_key: t.Optional[str] = None model_bytes: t.Optional[Model] = None @@ -126,37 +119,6 @@ def deserialize_message( return inference_request -def build_failure_reply(status: "Status", message: str) -> ResponseBuilder: - return MessageHandler.build_response( - status=status, - message=message, - result=[], - custom_attributes=None, - ) - - -def exception_handler( - exc: Exception, reply_channel: t.Optional[CommChannelBase], failure_message: str -) -> None: - """ - Logs exceptions and sends a failure response. 
- - :param exc: The exception to be logged - :param reply_channel: The channel used to send replies - :param failure_message: Failure message to log and send back - """ - logger.exception( - f"{failure_message}\n" - f"Exception type: {type(exc).__name__}\n" - f"Exception message: {str(exc)}" - ) - serialized_resp = MessageHandler.serialize_response( - build_failure_reply("fail", failure_message) - ) - if reply_channel: - reply_channel.send(serialized_resp) - - class WorkerDevice: def __init__(self, name: str) -> None: """Wrapper around a device to keep track of loaded Models and availability @@ -192,7 +154,7 @@ def __init__(self, batch_timeout: float, batch_size: int, model_key: str) -> Non """Queue used to store inference requests waiting to be batched and sent to Worker Managers. :param batch_timeout: Time in seconds that has to be waited before flushing a - non-full queue. The time of the firt item put is 0 seconds. + non-full queue. The time of the first item put is 0 seconds. :param batch_size: Total capacity of the queue. :param model_key: Key of the model which needs to be executed on the queued requests @@ -200,7 +162,7 @@ def __init__(self, batch_timeout: float, batch_size: int, model_key: str) -> Non super().__init__(maxsize=batch_size) self._batch_timeout = batch_timeout """Time in seconds that has to be waited before flushing a non-full queue. - The time of the firt item put is 0 seconds.""" + The time of the first item put is 0 seconds.""" self._batch_size = batch_size """Total capacity of the queue.""" self._first_put: t.Optional[float] = None @@ -212,13 +174,13 @@ def __init__(self, batch_timeout: float, batch_size: int, model_key: str) -> Non """Key of the model which needs to be executed on the queued requets""" self._flush_lock = RLock() """Lock used to make sure only one process can flush the queue (unused now)""" - self._id = str(uuid.uuid4()) - """Id of queue""" + self._uid = str(uuid.uuid4()) + """Unique ID of queue""" @property - def queue_id(self) -> str: + def uid(self) -> str: """ID of this queue""" - return self._id + return self._uid def acquire(self, blocking: bool = True, timeout: float = -1) -> t.Optional[bool]: """Acquire queue lock to flush @@ -232,6 +194,7 @@ def release(self) -> None: self._flush_lock.release() def __enter__(self) -> None: + """Method to use the Queue as a Context Manager""" self.acquire() def __exit__( @@ -240,6 +203,7 @@ def __exit__( exc_val: t.Optional[BaseException], exc_tb: t.Optional[TracebackType], ) -> None: + """Method to release the Queue as a Context Manager""" self.release() @property @@ -256,7 +220,7 @@ def put( """Put an inference request in the queue :param item: The request :param block: Whether to block when trying to put the item - :param timeout: Time to wait if block==True + :param timeout: Time (in seconds) to wait if block==True """ if not self.acquire(blocking=False): raise Full @@ -270,8 +234,8 @@ def put( self.release() @property - def _waited_time(self) -> float: - if self._first_put is None: + def _elapsed_time(self) -> float: + if self.empty(): return 0 return time.time() - self._first_put @@ -280,15 +244,15 @@ def ready(self) -> bool: """True if the queue can be flushed""" if self.empty(): return False - return self.full() or (self._waited_time >= self._batch_timeout) + return self.full() or (self._elapsed_time >= self._batch_timeout) def make_disposable(self) -> None: """Set this queue as disposable, and never use it again after it gets flushed""" self._disposable = True @property - def disposable(self) -> 
bool: - """Whether this queue can be used to put items or should be deleted""" + def can_be_removed(self) -> bool: + """Whether this queue can be deleted and garbafe collected""" return self.empty() and self._disposable def flush(self) -> list[t.Any]: @@ -298,7 +262,6 @@ def flush(self) -> list[t.Any]: num_items = self.qsize() self._first_put = None items = [] - # Avoid (unlikely) race condition error for _ in range(num_items): try: items.append(self.get()) @@ -325,28 +288,27 @@ def __init__( self, batch_timeout: float, batch_size: int, - mem_pool: MemoryPool, config_loader: EnvironmentConfigLoader, worker_type: t.Type[MachineLearningWorkerBase], comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel, ) -> None: - """The RquestDispatcher intercepts inference requests, stages them in + """The RequestDispatcher intercepts inference requests, stages them in queues and batches them together before making them available to Worker Managers. - :param batch_timeout: Time in seconds that has to be waited before flushing a - non-full queue after having put at least one item on it. + :param batch_timeout: Maximum elapsed time before flushing a complete or incomplete batch :param batch_size: Total capacity of each batch queue. :param mem_pool: Memory pool used to share batched input tensors with worker managers :param config_loader: Object to load configuration from environment :param worker_type: Type of worker to instantiate to batch inputs :param comm_channel_type: Type of channel used to get requests + :raises SmartSimError: If config_loaded.get_queue() does not return a channel """ super().__init__(as_service=True, cooldown=1) - self._queues: list[BatchQueue] = [] - """All batch queues""" + self._queues: dict[str, list[BatchQueue]] = [] + """Dict of all batch queues available for a given model key""" self._active_queues: dict[str, BatchQueue] = {} - """Mapping telling which queue is the recipient of requets for a given model + """Mapping telling which queue is the recipient of requests for a given model key""" self._batch_timeout = batch_timeout """Time in seconds that has to be waited before flushing a non-full queue""" @@ -354,7 +316,10 @@ def __init__( """Total capacity of each batch queue.""" self._queue_swap_lock: t.Optional[RLock] = None """Lock used to swap the active queue for a key""" - self._incoming_channel = config_loader.get_queue() + incoming_channel = config_loader.get_queue() + if incoming_channel is None: + raise SmartSimError("No incoming channel for dispatcher") + self._incoming_channel = incoming_channel """The channel the dispatcher monitors for new tasks""" self._outgoing_queue: DragonQueue = mp.Queue(maxsize=0) """The queue on which batched inference requests are placed""" @@ -366,7 +331,7 @@ def __init__( """The type of the channel used to receive requests""" self._worker = worker_type() """The worker used to batch inputs""" - self._mem_pool = mem_pool + self._mem_pool = MemoryPool.attach(dragon_gs_pool.create(2 * 1024**3).sdesc) """Memory pool used to share batched input tensors with the Worker Managers""" self._perf_timer = PerfTimer(prefix="r_", debug=True, timing_on=True) """Performance timer""" @@ -407,9 +372,6 @@ def _on_start(self) -> None: def _on_iteration(self) -> None: - if self._incoming_channel is None: - raise SmartSimError("No incoming channel for dispatcher") - try: bytes_list: t.List[bytes] = self._incoming_channel.recv() except Exception: @@ -454,16 +416,28 @@ def task_queue(self) -> DragonQueue: return self._outgoing_queue def _swap_queue(self, 
model_key: str) -> None: + """Get an empty queue or create a new one + + and make it the active one for a given model. + + :param model_key: The key of the model for which the + queue has to be swapped + :raises SmartSimError: If the queue is not locked. + """ if self._queue_swap_lock is None: raise SmartSimError("Queues were not locked") with self._queue_swap_lock: - for queue in self._queues: - if queue.model_key == model_key and not queue.full(): - self._active_queues[model_key] = queue - return + for queue_list in self._queues[model_key]: + for queue in queue_list: + if not queue.full(): + self._active_queues[model_key] = queue + return new_queue = BatchQueue(self._batch_timeout, self._batch_size, model_key) - self._queues.append(new_queue) + if model_key in self._queues: + self._queues[model_key].append(new_queue) + else: + self._queues[model_key] = [new_queue] self._active_queues[model_key] = new_queue return @@ -493,33 +467,49 @@ def dispatch(self, request: InferenceRequest) -> None: def flush_requests(self) -> None: """Get all requests from queues which are ready to be flushed. Place all - aviable request batches in the outgoing queue. + avaliable request batches in the outgoing queue. """ - for queue in self._queues: - if queue.ready and queue.acquire(blocking=False): - self._perf_timer.measure_time("find_queue") - try: - batch = InferenceBatch( - model_key=queue.model_key, requests=queue.flush(), inputs=None - ) - finally: - self._perf_timer.measure_time("flush_requests") - queue.release() - fetch_results = self._worker.fetch_inputs( - batch=batch, feature_store=self._feature_store - ) - self._perf_timer.measure_time("fetch_input") - transformed_inputs = self._worker.transform_input( - batch=batch, fetch_results=fetch_results, mem_pool=self._mem_pool - ) - self._perf_timer.measure_time("transform_input") - batch.inputs = transformed_inputs - for request in batch.requests: - request.raw_inputs = [] - request.input_meta = [] - - self._outgoing_queue.put(batch) - self._perf_timer.measure_time("put") + for queue_list in self._queues: + for queue in queue_list: + if queue.ready and queue.acquire(blocking=False): + self._perf_timer.measure_time("find_queue") + try: + batch = RequestBatch( + model_key=queue.model_key, requests=queue.flush(), inputs=None + ) + finally: + self._perf_timer.measure_time("flush_requests") + queue.release() + try: + fetch_results = self._worker.fetch_inputs( + batch=batch, feature_store=self._feature_store + ) + except Exception as exc: + exception_handler( + exc, + None, + "Error fetching input.", + ) + self._perf_timer.measure_time("fetch_input") + try: + transformed_inputs = self._worker.transform_input( + batch=batch, fetch_results=fetch_results, mem_pool=self._mem_pool + ) + except Exception as exc: + exception_handler( + exc, + None, + "Error Transforming input.", + ) + + self._perf_timer.measure_time("transform_input") + batch.inputs = transformed_inputs + for request in batch.requests: + request.raw_inputs = [] + request.input_meta = [] + + self._outgoing_queue.put(batch) + self._perf_timer.measure_time("put") def _can_shutdown(self) -> bool: return False diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 12a7891914..d0d1ca81fb 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -27,12 +27,13 @@ # pylint: disable=import-error # pylint: disable-next=unused-import import dragon - 
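# Editor's note (not part of the patch): the bare `import dragon` above is
# presumably kept, despite being unused, so that the Dragon runtime is set up
# before any Dragon-backed pieces (such as the dragon multiprocessing queues)
# are used in this module -- hence the unused-import pylint suppression.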
# pylint: enable=import-error # isort: off # isort: on +from queue import Empty + import multiprocessing as mp import time import typing as t @@ -45,13 +46,14 @@ from ...infrastructure.environmentloader import EnvironmentConfigLoader from ...infrastructure.storage.featurestore import FeatureStore from ...infrastructure.worker.worker import ( - InferenceBatch, + RequestBatch, InferenceReply, LoadModelResult, MachineLearningWorkerBase, ) from ...message_handler import MessageHandler from ...mli_schemas.response.response_capnp import ResponseBuilder +from .commons import build_failure_reply, exception_handler from .devicemanager import DeviceManager, WorkerDevice if t.TYPE_CHECKING: @@ -62,14 +64,6 @@ logger = get_logger(__name__) -def build_failure_reply(status: "Status", message: str) -> ResponseBuilder: - return MessageHandler.build_response( - status=status, - message=message, - result=[], - custom_attributes=None, - ) - def prepare_outputs(reply: InferenceReply) -> t.List[t.Any]: prepared_outputs: t.List[t.Any] = [] @@ -100,29 +94,6 @@ def build_reply(reply: InferenceReply) -> ResponseBuilder: custom_attributes=None, ) - -def exception_handler( - exc: Exception, reply_channel: t.Optional[CommChannelBase], failure_message: str -) -> None: - """ - Logs exceptions and sends a failure response. - - :param exc: The exception to be logged - :param reply_channel: The channel used to send replies - :param failure_message: Failure message to log and send back - """ - logger.exception( - f"{failure_message}\n" - f"Exception type: {type(exc).__name__}\n" - f"Exception message: {str(exc)}" - ) - serialized_resp = MessageHandler.serialize_response( - build_failure_reply("fail", failure_message) - ) - if reply_channel: - reply_channel.send(serialized_resp) - - class WorkerManager(Service): """An implementation of a service managing distribution of tasks to machine learning workers""" @@ -131,7 +102,7 @@ def __init__( self, config_loader: EnvironmentConfigLoader, worker_type: t.Type[MachineLearningWorkerBase], - task_queue: "mp.Queue[InferenceBatch]", + dispatcher_queue: "mp.Queue[InferenceBatch]", as_service: bool = False, cooldown: int = 0, comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel, @@ -141,7 +112,7 @@ def __init__( :param config_loader: Environment config loader that loads the task queue and feature store :param worker_type: The type of worker to manage - :param task_queue: Queue from witch the batched requests have to be pulled + :param dispatcher_queue: Queue from which the batched requests have to be pulled :param as_service: Specifies run-once or run-until-complete behavior of service :param cooldown: Number of seconds to wait before shutting down after shutdown criteria are met @@ -151,7 +122,7 @@ def __init__( """ super().__init__(as_service, cooldown) - self._task_queue = task_queue + self._dispatcher_queue = dispatcher_queue """The dispatcher queue the manager monitors for new tasks""" self._feature_store: t.Optional[FeatureStore] = ( config_loader.get_feature_store() @@ -179,8 +150,8 @@ def _on_iteration(self) -> None: pre_batch_time = time.perf_counter() try: - batch: InferenceBatch = self._task_queue.get(timeout=0.0001) - except Exception: + batch: RequestBatch = self._dispatcher_queue.get(timeout=0.0001) + except Empty: return self._perf_timer.start_timings( @@ -188,12 +159,17 @@ def _on_iteration(self) -> None: ) if batch is None or 0 == len(batch.requests): + exception_handler( + ValueError("An empty batch was received"), + None, + "Error batching inputs, the batch 
was empty.", + ) return if self._device_manager is None: raise ValueError("No Device Manager available: did you call _on_start()") device: WorkerDevice = next( - self._device_manager.get_free_device( + self._device_manager.get_device( worker=self._worker, batch=batch, feature_store=self._feature_store, diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index 6723573cfb..392e7e051e 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -39,7 +39,7 @@ ExecuteResult, FetchInputResult, FetchModelResult, - InferenceBatch, + RequestBatch, LoadModelResult, MachineLearningWorkerBase, TransformInputResult, @@ -56,13 +56,12 @@ class TorchWorker(MachineLearningWorkerBase): @staticmethod def load_model( - batch: InferenceBatch, fetch_result: FetchModelResult, device: str + batch: RequestBatch, fetch_result: FetchModelResult, device: str ) -> LoadModelResult: - request = batch.requests[0] if fetch_result.model_bytes: model_bytes = fetch_result.model_bytes - elif request.raw_model and request.raw_model.data: - model_bytes = request.raw_model.data + elif batch.raw_model and batch.raw_model.data: + model_bytes = batch.raw_model.data else: raise ValueError("Unable to load model without reference object") @@ -79,7 +78,7 @@ def load_model( @staticmethod def transform_input( - batch: InferenceBatch, + batch: RequestBatch, fetch_results: list[FetchInputResult], mem_pool: MemoryPool, ) -> TransformInputResult: @@ -135,7 +134,7 @@ def transform_input( # pylint: disable-next=unused-argument @staticmethod def execute( - batch: InferenceBatch, + batch: RequestBatch, load_result: LoadModelResult, transform_result: TransformInputResult, device: str, @@ -165,8 +164,8 @@ def execute( with torch.no_grad(): model.eval() results = [ - model(tensor.to(device, non_blocking=True)).detach() - for tensor in tensors + model(*[tensor.to(device, non_blocking=True).detach() + for tensor in tensors]) ] transform_result.transformed = [] @@ -178,7 +177,7 @@ def execute( @staticmethod def transform_output( - batch: InferenceBatch, + batch: RequestBatch, execute_result: ExecuteResult, ) -> list[TransformOutputResult]: transformed_list: list[TransformOutputResult] = [] diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 01e2db6c86..0565146968 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -162,18 +162,33 @@ def __init__(self, result: bytes) -> None: @dataclass -class InferenceBatch: +class RequestBatch: + """A batch of aggregated inference requests + """ model_key: str requests: list[InferenceRequest] inputs: t.Optional[TransformInputResult] + @property + def has_valid_requests(self) -> bool: + return len(self.requests) > 0 + + @property + def has_raw_nodel(self) -> bool: + return self.raw_model is not None + + @property + def raw_model(self) -> t.Optional[t.Any]: + if self.has_valid_requests: + return self.requests[0].raw_model + return None class MachineLearningWorkerCore: """Basic functionality of ML worker that is shared across all worker types""" @staticmethod def fetch_model( - batch: InferenceBatch, feature_store: t.Optional[FeatureStore] + batch: RequestBatch, feature_store: t.Optional[FeatureStore] ) -> FetchModelResult: """Given a resource key, retrieve the raw model from a feature store :param batch: The batch of requests 
that triggered the pipeline @@ -204,7 +219,7 @@ def fetch_model( @staticmethod def fetch_inputs( - batch: InferenceBatch, feature_store: t.Optional[FeatureStore] + batch: RequestBatch, feature_store: t.Optional[FeatureStore] ) -> t.List[FetchInputResult]: """Given a collection of ResourceKeys, identify the physical location and input metadata @@ -276,7 +291,7 @@ class MachineLearningWorkerBase(MachineLearningWorkerCore, ABC): @staticmethod @abstractmethod def load_model( - batch: InferenceBatch, fetch_result: FetchModelResult, device: str + batch: RequestBatch, fetch_result: FetchModelResult, device: str ) -> LoadModelResult: """Given a loaded MachineLearningModel, ensure it is loaded into device memory @@ -288,7 +303,7 @@ def load_model( @staticmethod @abstractmethod def transform_input( - batch: InferenceBatch, + batch: RequestBatch, fetch_results: list[FetchInputResult], mem_pool: MemoryPool, ) -> TransformInputResult: @@ -302,7 +317,7 @@ def transform_input( @staticmethod @abstractmethod def execute( - batch: InferenceBatch, + batch: RequestBatch, load_result: LoadModelResult, transform_result: TransformInputResult, device: str, @@ -317,7 +332,7 @@ def execute( @staticmethod @abstractmethod def transform_output( - batch: InferenceBatch, execute_result: ExecuteResult + batch: RequestBatch, execute_result: ExecuteResult ) -> t.List[TransformOutputResult]: """Given inference results, perform transformations required to transmit results to the requestor. diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 151bdd2fcc..c178426b4f 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -90,7 +90,7 @@ def setup_worker_manager_model_bytes(test_dir, monkeypatch: pytest.MonkeyPatch): test_dir, model, [tensor_key], [tensor_key], [], None ) ser_request = MessageHandler.serialize_request(request) - worker_manager._task_queue.send(ser_request) + worker_manager._dispatcher_queue.send(ser_request) return worker_manager, integrated_worker @@ -122,7 +122,7 @@ def setup_worker_manager_model_key(test_dir, monkeypatch: pytest.MonkeyPatch): test_dir, model_key, [tensor_key], [tensor_key], [], None ) ser_request = MessageHandler.serialize_request(request) - worker_manager._task_queue.send(ser_request) + worker_manager._dispatcher_queue.send(ser_request) return worker_manager, integrated_worker From af8b639f1187f73b295f96fb102ef20598fd47e5 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 14 Aug 2024 11:10:55 -0500 Subject: [PATCH 56/84] Static checker passes --- .../mli/infrastructure/control/commons.py | 5 +- .../infrastructure/control/devicemanager.py | 22 +- .../control/requestdispatcher.py | 253 ++++++++++-------- .../infrastructure/control/workermanager.py | 145 ++-------- .../mli/infrastructure/worker/torch_worker.py | 10 +- .../_core/mli/infrastructure/worker/worker.py | 55 ++-- 6 files changed, 202 insertions(+), 288 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/commons.py b/smartsim/_core/mli/infrastructure/control/commons.py index 4c67fb47b0..a40ae014aa 100644 --- a/smartsim/_core/mli/infrastructure/control/commons.py +++ b/smartsim/_core/mli/infrastructure/control/commons.py @@ -31,6 +31,8 @@ from ...message_handler import MessageHandler from ...mli_schemas.response.response_capnp import ResponseBuilder +if t.TYPE_CHECKING: + from smartsim._core.mli.mli_schemas.response.response_capnp import Status logger = get_logger(__file__) @@ -43,6 +45,7 @@ def build_failure_reply(status: "Status", message: 
str) -> ResponseBuilder: custom_attributes=None, ) + def exception_handler( exc: Exception, reply_channel: t.Optional[CommChannelBase], failure_message: str ) -> None: @@ -62,4 +65,4 @@ def exception_handler( build_failure_reply("fail", failure_message) ) if reply_channel: - reply_channel.send(serialized_resp) \ No newline at end of file + reply_channel.send(serialized_resp) diff --git a/smartsim/_core/mli/infrastructure/control/devicemanager.py b/smartsim/_core/mli/infrastructure/control/devicemanager.py index 9a56dd3ba5..c3dfcc0261 100644 --- a/smartsim/_core/mli/infrastructure/control/devicemanager.py +++ b/smartsim/_core/mli/infrastructure/control/devicemanager.py @@ -41,7 +41,6 @@ def __init__(self, name: str) -> None: self._models: dict[str, t.Any] = {} """Dict of keys to models which are loaded on this device""" - @property def name(self) -> str: """The identifier of the device represented by this object""" @@ -78,22 +77,21 @@ def __init__(self, device: WorkerDevice): self._device = device """Device managed by this object""" - def _load_model_on_device(self, + def _load_model_on_device( + self, worker: MachineLearningWorkerBase, batch: RequestBatch, - feature_store: t.Optional[FeatureStore], + feature_stores: dict[str, FeatureStore], ) -> None: - model_bytes = worker.fetch_model(batch, feature_store) - loaded_model = worker.load_model( - batch, model_bytes, self._device.name - ) - self._device.add_model(batch.model_key, loaded_model.model) + model_bytes = worker.fetch_model(batch, feature_stores) + loaded_model = worker.load_model(batch, model_bytes, self._device.name) + self._device.add_model(batch.model_key.key, loaded_model.model) def get_device( self, worker: MachineLearningWorkerBase, batch: RequestBatch, - feature_store: t.Optional[FeatureStore], + feature_stores: dict[str, FeatureStore], ) -> t.Generator[WorkerDevice, None, None]: """Get the device managed by this object @@ -110,11 +108,11 @@ def get_device( # Load model if not already loaded, or # because it is sent with the request - if model_in_request or not batch.model_key in self._device: - self._load_model_on_device(worker, batch, feature_store) + if model_in_request or not batch.model_key.key in self._device: + self._load_model_on_device(worker, batch, feature_stores) try: yield self._device finally: if model_in_request: - self._device.remove_model(batch.model_key) + self._device.remove_model(batch.model_key.key) diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index d050a646c3..3c1105b501 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -27,9 +27,9 @@ # pylint: disable=import-error # pylint: disable-next=unused-import import dragon +import dragon.globalservices.pool as dragon_gs_pool from dragon.managed_memory import MemoryPool from dragon.mpbridge.queues import DragonQueue -import dragon.globalservices.pool as dragon_gs_pool # pylint: enable=import-error @@ -49,18 +49,13 @@ from .....error import SmartSimError from .....log import get_logger from ....utils.timings import PerfTimer -from ...comm.channel.channel import CommChannelBase -from ...comm.channel.dragonchannel import DragonCommChannel from ...infrastructure.environmentloader import EnvironmentConfigLoader -from ...infrastructure.storage.featurestore import FeatureStore +from ...infrastructure.storage.featurestore import FeatureStore, FeatureStoreKey from 
...infrastructure.worker.worker import ( - RequestBatch, InferenceRequest, MachineLearningWorkerBase, + RequestBatch, ) -from ...message_handler import MessageHandler -from ...mli_schemas.model.model_capnp import Model -from ...mli_schemas.tensor.tensor_capnp import TensorDescriptor from .commons import exception_handler if t.TYPE_CHECKING: @@ -69,56 +64,6 @@ logger = get_logger("Request Dispatcher") -def deserialize_message( - data_blob: bytes, - channel_type: t.Type[CommChannelBase], -) -> InferenceRequest: - """Deserialize a message from a byte stream into an InferenceRequest - :param data_blob: The byte stream to deserialize - :param channel_type: The channel used to send the response""" - - request = MessageHandler.deserialize_request(data_blob) - model_key: t.Optional[str] = None - model_bytes: t.Optional[Model] = None - - if request.model.which() == "key": - model_key = request.model.key.key - elif request.model.which() == "data": - model_bytes = request.model.data - - callback_key = request.replyChannel.descriptor - - # todo: shouldn't this be `CommChannel.find` instead of `DragonCommChannel` - comm_channel = channel_type(callback_key) - - input_keys: t.Optional[t.List[str]] = None - input_bytes: t.Optional[t.List[bytes]] = None - - output_keys: t.Optional[t.List[str]] = None - - input_meta: t.Optional[t.List[TensorDescriptor]] = None - - if request.input.which() == "keys": - input_keys = [input_key.key for input_key in request.input.keys] - elif request.input.which() == "descriptors": - input_meta = request.input.descriptors # type: ignore - - if request.output: - output_keys = [tensor_key.key for tensor_key in request.output] - - inference_request = InferenceRequest( - model_key=model_key, - callback=comm_channel, - raw_inputs=input_bytes, - input_keys=input_keys, - input_meta=input_meta, - output_keys=output_keys, - raw_model=model_bytes, - batch_size=0, - ) - return inference_request - - class WorkerDevice: def __init__(self, name: str) -> None: """Wrapper around a device to keep track of loaded Models and availability @@ -150,7 +95,9 @@ def __exit__( class BatchQueue(Queue[InferenceRequest]): - def __init__(self, batch_timeout: float, batch_size: int, model_key: str) -> None: + def __init__( + self, batch_timeout: float, batch_size: int, model_key: FeatureStoreKey + ) -> None: """Queue used to store inference requests waiting to be batched and sent to Worker Managers. :param batch_timeout: Time in seconds that has to be waited before flushing a @@ -170,7 +117,7 @@ def __init__(self, batch_timeout: float, batch_size: int, model_key: str) -> Non self._disposable = False """Whether the queue will not be used again and can be deleted. 
A disposable queue is always full.""" - self._model_key = model_key + self._model_key: FeatureStoreKey = model_key """Key of the model which needs to be executed on the queued requets""" self._flush_lock = RLock() """Lock used to make sure only one process can flush the queue (unused now)""" @@ -207,7 +154,7 @@ def __exit__( self.release() @property - def model_key(self) -> str: + def model_key(self) -> FeatureStoreKey: """Key of the model which needs to be run on the queued requests""" return self._model_key @@ -235,7 +182,7 @@ def put( @property def _elapsed_time(self) -> float: - if self.empty(): + if self.empty() or self._first_put is None: return 0 return time.time() - self._first_put @@ -290,22 +237,21 @@ def __init__( batch_size: int, config_loader: EnvironmentConfigLoader, worker_type: t.Type[MachineLearningWorkerBase], - comm_channel_type: t.Type[CommChannelBase] = DragonCommChannel, ) -> None: """The RequestDispatcher intercepts inference requests, stages them in queues and batches them together before making them available to Worker Managers. - :param batch_timeout: Maximum elapsed time before flushing a complete or incomplete batch + :param batch_timeout: Maximum elapsed time before flushing a complete or + incomplete batch :param batch_size: Total capacity of each batch queue. :param mem_pool: Memory pool used to share batched input tensors with worker managers :param config_loader: Object to load configuration from environment :param worker_type: Type of worker to instantiate to batch inputs - :param comm_channel_type: Type of channel used to get requests :raises SmartSimError: If config_loaded.get_queue() does not return a channel """ super().__init__(as_service=True, cooldown=1) - self._queues: dict[str, list[BatchQueue]] = [] + self._queues: dict[str, list[BatchQueue]] = {} """Dict of all batch queues available for a given model key""" self._active_queues: dict[str, BatchQueue] = {} """Mapping telling which queue is the recipient of requests for a given model @@ -323,12 +269,15 @@ def __init__( """The channel the dispatcher monitors for new tasks""" self._outgoing_queue: DragonQueue = mp.Queue(maxsize=0) """The queue on which batched inference requests are placed""" - self._feature_store: t.Optional[FeatureStore] = ( - config_loader.get_feature_store() - ) - """A feature store to retrieve models from""" - self._comm_channel_type = comm_channel_type - """The type of the channel used to receive requests""" + self._feature_stores: t.Dict[str, FeatureStore] = {} + """A collection of attached feature stores""" + self._featurestore_factory = config_loader._featurestore_factory + """A factory method to create a desired feature store client type""" + self._backbone: t.Optional[FeatureStore] = config_loader.get_backbone() + """A standalone, system-created feature store used to share internal + information among MLI components""" + self._callback_factory = config_loader._callback_factory + """The type of communication channel to construct for callbacks""" self._worker = worker_type() """The worker used to batch inputs""" self._mem_pool = MemoryPool.attach(dragon_gs_pool.create(2 * 1024**3).sdesc) @@ -336,37 +285,91 @@ def __init__( self._perf_timer = PerfTimer(prefix="r_", debug=True, timing_on=True) """Performance timer""" - def _validate_request(self, request: InferenceRequest) -> bool: - """Ensure the request can be processed. 
+ def _check_feature_stores(self, request: InferenceRequest) -> bool: + """Ensures that all feature stores required by the request are available + :param request: The request to validate - :return: True if the request is valid, False otherwise""" - if not self._feature_store: - if request.model_key: - logger.error("Unable to load model by key without feature store") - return False - - if request.input_keys: - logger.error("Unable to load inputs by key without feature store") - return False - - if request.output_keys: - logger.error("Unable to persist outputs by key without feature store") - return False - - if not request.model_key and not request.raw_model: - logger.error("Unable to continue without model bytes or feature store key") - return False + :returns: False if feature store validation fails for the request, True + otherwise + """ + # collect all feature stores required by the request + fs_model: t.Set[str] = set() + if request.model_key: + fs_model = {request.model_key.descriptor} + fs_inputs = {key.descriptor for key in request.input_keys} + fs_outputs = {key.descriptor for key in request.output_keys} - if not request.input_keys and not request.raw_inputs: - logger.error("Unable to continue without input bytes or feature store keys") - return False + # identify which feature stores are requested and unknown + fs_desired = fs_model.union(fs_inputs).union(fs_outputs) + fs_actual = {item.descriptor for item in self._feature_stores.values()} + fs_missing = fs_desired - fs_actual - if request.callback is None: - logger.error("No callback channel provided in request") + if self._featurestore_factory is None: + logger.error("No feature store factory configured") return False + # create the feature stores we need to service request + if fs_missing: + logger.debug(f"Adding feature store(s): {fs_missing}") + for descriptor in fs_missing: + feature_store = self._featurestore_factory(descriptor) + self._feature_stores[descriptor] = feature_store + return True + # pylint: disable-next=no-self-use + def _check_model(self, request: InferenceRequest) -> bool: + """Ensure that a model is available for the request + + :param request: The request to validate + :returns: False if model validation fails for the request, True otherwise + """ + if request.model_key or request.raw_model: + return True + + logger.error("Unable to continue without model bytes or feature store key") + return False + + # pylint: disable-next=no-self-use + def _check_inputs(self, request: InferenceRequest) -> bool: + """Ensure that inputs are available for the request + + :param request: The request to validate + :returns: False if input validation fails for the request, True otherwise + """ + if request.input_keys or request.raw_inputs: + return True + + logger.error("Unable to continue without input bytes or feature store keys") + return False + + # pylint: disable-next=no-self-use + def _check_callback(self, request: InferenceRequest) -> bool: + """Ensure that a callback channel is available for the request + + :param request: The request to validate + :returns: False if callback validation fails for the request, True otherwise + """ + if request.callback is not None: + return True + + logger.error("No callback channel provided in request") + return False + + def _validate_request(self, request: InferenceRequest) -> bool: + """Ensure the request can be processed + + :param request: The request to validate + :return: False if the request fails any validation checks, True otherwise""" + checks = [ + 
self._check_feature_stores(request), + self._check_model(request), + self._check_inputs(request), + self._check_callback(request), + ] + + return all(checks) + def _on_start(self) -> None: self._queue_swap_lock = RLock() @@ -388,18 +391,25 @@ def _on_iteration(self) -> None: tensor_bytes_list = bytes_list[1:] self._perf_timer.start_timings() - request = deserialize_message(request_bytes, self._comm_channel_type) + request = self._worker.deserialize_message( + request_bytes, self._callback_factory + ) if request.input_meta and tensor_bytes_list: request.raw_inputs = tensor_bytes_list self._perf_timer.measure_time("deserialize_message") - if not self._validate_request(request): - return - self._perf_timer.measure_time("validate_request") - self.dispatch(request) - - self._perf_timer.measure_time("dispatch") + if not self._validate_request(request): + exception_handler( + ValueError("Error validating the request"), + request.callback, + "Error validating the request.", + ) + self._perf_timer.measure_time("validate_request") + else: + self._perf_timer.measure_time("validate_request") + self.dispatch(request) + self._perf_timer.measure_time("dispatch") finally: self.flush_requests() # TODO: implement this @@ -415,7 +425,7 @@ def task_queue(self) -> DragonQueue: """The queue on which batched requests are placed""" return self._outgoing_queue - def _swap_queue(self, model_key: str) -> None: + def _swap_queue(self, model_key: FeatureStoreKey) -> None: """Get an empty queue or create a new one and make it the active one for a given model. @@ -427,18 +437,17 @@ def _swap_queue(self, model_key: str) -> None: if self._queue_swap_lock is None: raise SmartSimError("Queues were not locked") with self._queue_swap_lock: - for queue_list in self._queues[model_key]: - for queue in queue_list: - if not queue.full(): - self._active_queues[model_key] = queue - return + for queue in self._queues[model_key.key]: + if not queue.full(): + self._active_queues[model_key.key] = queue + return new_queue = BatchQueue(self._batch_timeout, self._batch_size, model_key) if model_key in self._queues: - self._queues[model_key].append(new_queue) + self._queues[model_key.key].append(new_queue) else: - self._queues[model_key] = [new_queue] - self._active_queues[model_key] = new_queue + self._queues[model_key.key] = [new_queue] + self._active_queues[model_key.key] = new_queue return def dispatch(self, request: InferenceRequest) -> None: @@ -449,7 +458,9 @@ def dispatch(self, request: InferenceRequest) -> None: logger.info("Direct inference requested, creating tmp queue") tmp_id = f"_tmp_{str(uuid.uuid4())}" tmp_queue: BatchQueue = BatchQueue( - batch_timeout=0, batch_size=1, model_key=tmp_id + batch_timeout=0, + batch_size=1, + model_key=FeatureStoreKey(key=tmp_id, descriptor="TMP"), ) self._active_queues[tmp_id] = tmp_queue tmp_queue.put_nowait(request) @@ -460,7 +471,7 @@ def dispatch(self, request: InferenceRequest) -> None: success = False while not success: try: - self._active_queues[request.model_key].put_nowait(request) + self._active_queues[request.model_key.key].put_nowait(request) success = True except (Full, KeyError): self._swap_queue(request.model_key) @@ -469,20 +480,22 @@ def flush_requests(self) -> None: """Get all requests from queues which are ready to be flushed. Place all avaliable request batches in the outgoing queue. 
""" - for queue_list in self._queues: + for queue_list in self._queues.values(): for queue in queue_list: if queue.ready and queue.acquire(blocking=False): self._perf_timer.measure_time("find_queue") try: batch = RequestBatch( - model_key=queue.model_key, requests=queue.flush(), inputs=None + requests=queue.flush(), + inputs=None, + model_key=queue.model_key, ) finally: self._perf_timer.measure_time("flush_requests") queue.release() try: fetch_results = self._worker.fetch_inputs( - batch=batch, feature_store=self._feature_store + batch=batch, feature_stores=self._feature_stores ) except Exception as exc: exception_handler( @@ -490,10 +503,13 @@ def flush_requests(self) -> None: None, "Error fetching input.", ) + continue self._perf_timer.measure_time("fetch_input") try: transformed_inputs = self._worker.transform_input( - batch=batch, fetch_results=fetch_results, mem_pool=self._mem_pool + batch=batch, + fetch_results=fetch_results, + mem_pool=self._mem_pool, ) except Exception as exc: exception_handler( @@ -501,6 +517,7 @@ def flush_requests(self) -> None: None, "Error Transforming input.", ) + continue self._perf_timer.measure_time("transform_input") batch.inputs = transformed_inputs diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index fe0312e7ae..2459747ec0 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -27,33 +27,30 @@ # pylint: disable=import-error # pylint: disable-next=unused-import import dragon + # pylint: enable=import-error # isort: off # isort: on -from queue import Empty - import multiprocessing as mp import time import typing as t +from queue import Empty from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore from .....log import get_logger from ....entrypoints.service import Service from ....utils.timings import PerfTimer -from ...comm.channel.channel import CommChannelBase -from ...comm.channel.dragonchannel import DragonCommChannel from ...infrastructure.environmentloader import EnvironmentConfigLoader from ...infrastructure.worker.worker import ( - RequestBatch, InferenceReply, LoadModelResult, MachineLearningWorkerBase, + RequestBatch, ) from ...message_handler import MessageHandler -from ...mli_schemas.response.response_capnp import ResponseBuilder from .commons import build_failure_reply, exception_handler from .devicemanager import DeviceManager, WorkerDevice @@ -63,36 +60,6 @@ logger = get_logger(__name__) - -def prepare_outputs(reply: InferenceReply) -> t.List[t.Any]: - prepared_outputs: t.List[t.Any] = [] - if reply.output_keys: - for key in reply.output_keys: - if not key: - continue - msg_key = MessageHandler.build_tensor_key(key) - prepared_outputs.append(msg_key) - elif reply.outputs: - for _ in reply.outputs: - msg_tensor_desc = MessageHandler.build_tensor_descriptor( - "c", - "float32", - [1], - ) - prepared_outputs.append(msg_tensor_desc) - return prepared_outputs - - -def build_reply(reply: InferenceReply) -> ResponseBuilder: - results = prepare_outputs(reply) - - return MessageHandler.build_response( - status=reply.status_enum, - message=reply.message, - result=results, - custom_attributes=None, - ) - class WorkerManager(Service): """An implementation of a service managing distribution of tasks to machine learning workers""" @@ -101,7 +68,7 @@ def __init__( self, config_loader: EnvironmentConfigLoader, worker_type: t.Type[MachineLearningWorkerBase], - 
dispatcher_queue: "mp.Queue[InferenceBatch]", + dispatcher_queue: "mp.Queue[RequestBatch]", as_service: bool = False, cooldown: int = 0, device: t.Literal["cpu", "gpu"] = "cpu", @@ -123,10 +90,6 @@ def __init__( self._dispatcher_queue = dispatcher_queue """The dispatcher queue the manager monitors for new tasks""" - self._feature_store: t.Optional[FeatureStore] = ( - config_loader.get_feature_store() - ) - """A feature store to retrieve models from""" self._worker = worker_type() """The ML Worker implementation""" self._callback_factory = config_loader._callback_factory @@ -148,87 +111,8 @@ def __init__( self._perf_timer = PerfTimer(prefix="w_", debug=True, timing_on=True) """Performance timer""" - def _check_feature_stores(self, request: InferenceRequest) -> bool: - """Ensures that all feature stores required by the request are available - - :param request: The request to validate - :returns: False if feature store validation fails for the request, True otherwise - """ - # collect all feature stores required by the request - fs_model: t.Set[str] = set() - if request.model_key: - fs_model = {request.model_key.descriptor} - fs_inputs = {key.descriptor for key in request.input_keys} - fs_outputs = {key.descriptor for key in request.output_keys} - - # identify which feature stores are requested and unknown - fs_desired = fs_model.union(fs_inputs).union(fs_outputs) - fs_actual = {item.descriptor for item in self._feature_stores.values()} - fs_missing = fs_desired - fs_actual - - if self._featurestore_factory is None: - logger.error("No feature store factory configured") - return False - - # create the feature stores we need to service request - if fs_missing: - logger.debug(f"Adding feature store(s): {fs_missing}") - for descriptor in fs_missing: - feature_store = self._featurestore_factory(descriptor) - self._feature_stores[descriptor] = feature_store - - return True - - def _check_model(self, request: InferenceRequest) -> bool: - """Ensure that a model is available for the request - - :param request: The request to validate - :returns: False if model validation fails for the request, True otherwise - """ - if request.model_key or request.raw_model: - return True - - logger.error("Unable to continue without model bytes or feature store key") - return False - - def _check_inputs(self, request: InferenceRequest) -> bool: - """Ensure that inputs are available for the request - - :param request: The request to validate - :returns: False if input validation fails for the request, True otherwise - """ - if request.input_keys or request.raw_inputs: - return True - - logger.error("Unable to continue without input bytes or feature store keys") - return False - - def _check_callback(self, request: InferenceRequest) -> bool: - """Ensure that a callback channel is available for the request - - :param request: The request to validate - :returns: False if callback validation fails for the request, True otherwise - """ - if request.callback is not None: - return True - - logger.error("No callback channel provided in request") - return False - - def _validate_request(self, request: InferenceRequest) -> bool: - """Ensure the request can be processed - - :param request: The request to validate - :return: False if the request fails any validation checks, True otherwise""" - checks = [ - self._check_feature_stores(request), - self._check_model(request), - self._check_inputs(request), - self._check_callback(request), - ] - - return all(checks) - + # remove this when we are done with time measurements + # pylint: 
disable-next=too-many-statements def _on_iteration(self) -> None: """Executes calls to the machine learning worker implementation to complete @@ -258,12 +142,12 @@ def _on_iteration(self) -> None: self._device_manager.get_device( worker=self._worker, batch=batch, - feature_store=self._feature_store, + feature_stores=self._feature_stores, ) ) self._perf_timer.measure_time("fetch_model") - model_result = LoadModelResult(device.get_model(batch.model_key)) + model_result = LoadModelResult(device.get_model(batch.model_key.key)) self._perf_timer.measure_time("load_model") if batch.inputs is None: @@ -302,7 +186,7 @@ def _on_iteration(self) -> None: reply.output_keys = self._worker.place_output( request, transformed_output, - self._feature_store, + self._feature_stores, ) except Exception as e: exception_handler( @@ -313,12 +197,19 @@ def _on_iteration(self) -> None: reply.outputs = transformed_output.outputs self._perf_timer.measure_time("assign_output") - if reply.outputs is None: + if reply.outputs is None or not reply.outputs: response = build_failure_reply("fail", "Outputs not found.") else: reply.status_enum = "complete" reply.message = "Success" - response = build_reply(reply) + + results = self._worker.prepare_outputs(reply) + response = MessageHandler.build_response( + status=reply.status_enum, + message=reply.message, + result=results, + custom_attributes=None, + ) self._perf_timer.measure_time("build_reply") diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index 392e7e051e..7ea09b9af9 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -39,9 +39,9 @@ ExecuteResult, FetchInputResult, FetchModelResult, - RequestBatch, LoadModelResult, MachineLearningWorkerBase, + RequestBatch, TransformInputResult, TransformOutputResult, ) @@ -164,8 +164,12 @@ def execute( with torch.no_grad(): model.eval() results = [ - model(*[tensor.to(device, non_blocking=True).detach() - for tensor in tensors]) + model( + *[ + tensor.to(device, non_blocking=True).detach() + for tensor in tensors + ] + ) ] transform_result.transformed = [] diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 6e5aafca3d..374f35b594 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -171,18 +171,18 @@ def __init__(self, result: bytes) -> None: @dataclass class RequestBatch: - """A batch of aggregated inference requests - """ - model_key: str + """A batch of aggregated inference requests""" + requests: list[InferenceRequest] inputs: t.Optional[TransformInputResult] + model_key: FeatureStoreKey @property def has_valid_requests(self) -> bool: return len(self.requests) > 0 @property - def has_raw_nodel(self) -> bool: + def has_raw_model(self) -> bool: return self.raw_model is not None @property @@ -191,6 +191,7 @@ def raw_model(self) -> t.Optional[t.Any]: return self.requests[0].raw_model return None + class MachineLearningWorkerCore: """Basic functionality of ML worker that is shared across all worker types""" @@ -279,27 +280,26 @@ def fetch_model( :return: Raw bytes of the model""" # All requests in the same batch share the model - sample_request = batch.requests[0] - if sample_request.raw_model: - return FetchModelResult(sample_request.raw_model.data) + if batch.raw_model: + return FetchModelResult(batch.raw_model.data) if not 
feature_stores: raise ValueError("Feature store is required for model retrieval") - if not sample_request.model_key: + if batch.model_key is None: raise SmartSimError( "Key must be provided to retrieve model from feature store" ) - key, fsd = request.model_key.key, request.model_key.descriptor + key, fsd = batch.model_key.key, batch.model_key.descriptor try: feature_store = feature_stores[fsd] - raw_bytes: bytes = t.cast(bytes, feature_store[sample_key]) + raw_bytes: bytes = t.cast(bytes, feature_store[key]) return FetchModelResult(raw_bytes) except FileNotFoundError as ex: logger.exception(ex) - raise SmartSimError(f"Model could not be retrieved with key {sample_key}") from ex + raise SmartSimError(f"Model could not be retrieved with key {key}") from ex @staticmethod def fetch_inputs( @@ -321,22 +321,23 @@ def fetch_inputs( if not feature_stores: raise ValueError("No input and no feature store provided") - if request.input_keys: - data: t.List[bytes] = [] - - for fs_key in request.input_keys: - try: - feature_store = feature_stores[fs_key.descriptor] - tensor_bytes = t.cast(bytes, feature_store[fs_key.key]) - data.append(tensor_bytes) - except KeyError as ex: - logger.exception(ex) - raise SmartSimError( - f"Model could not be retrieved with key {fs_key.key}" - ) from ex - return FetchInputResult( - data, meta=None - ) # fixme: need to get both tensor and descriptor + if request.input_keys: + data: t.List[bytes] = [] + + for fs_key in request.input_keys: + try: + feature_store = feature_stores[fs_key.descriptor] + tensor_bytes = t.cast(bytes, feature_store[fs_key.key]) + data.append(tensor_bytes) + except KeyError as ex: + logger.exception(ex) + raise SmartSimError( + f"Model could not be retrieved with key {fs_key.key}" + ) from ex + fetch_results.append( + FetchInputResult(data, meta=None) + ) # fixme: need to get both tensor and descriptor + continue raise ValueError("No input source") From e4a9db0f498f417db106b621aced006b1702f7f8 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 14 Aug 2024 16:55:08 -0500 Subject: [PATCH 57/84] Working version, still slow --- ex/high_throughput_inference/mli_driver.py | 2 +- ex/high_throughput_inference/mock_app.py | 3 +- .../standalone_workermanager.py | 22 ++--- .../infrastructure/control/devicemanager.py | 4 + .../control/requestdispatcher.py | 18 ++-- .../infrastructure/control/workermanager.py | 92 ++++++++++++++++--- .../_core/mli/infrastructure/worker/worker.py | 16 ++++ smartsim/_core/utils/timings.py | 4 + 8 files changed, 123 insertions(+), 38 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index 8d31d7610f..8f25540078 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -16,7 +16,7 @@ filedir = os.path.dirname(__file__) worker_manager_script_name = os.path.join(filedir, "standalone_workermanager.py") app_script_name = os.path.join(filedir, "mock_app.py") -model_name = os.path.join(filedir, f"resnet50.{DEVICE.upper()}.pt") +model_name = os.path.join(filedir, f"resnet50.{DEVICE}.pt") transport: t.Literal["hsta", "tcp"] = "hsta" diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 4033ad960b..77daafd5c5 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -154,7 +154,6 @@ def model(self): def name(self): return self._name - if __name__ == "__main__": parser = argparse.ArgumentParser("Mock application") @@ -162,7 +161,7 @@ 
def name(self): parser.add_argument("--log_max_batchsize", default=8, type=int) args = parser.parse_args() - resnet = ResNetWrapper("resnet50", f"resnet50.{args.device.upper()}.pt") + resnet = ResNetWrapper("resnet50", f"resnet50.{args.device}.pt") client = ProtoClient(timing_on=True) client.set_model(resnet.name, resnet.model) diff --git a/ex/high_throughput_inference/standalone_workermanager.py b/ex/high_throughput_inference/standalone_workermanager.py index b30945fef3..952cf2dc5c 100644 --- a/ex/high_throughput_inference/standalone_workermanager.py +++ b/ex/high_throughput_inference/standalone_workermanager.py @@ -81,18 +81,10 @@ pid = os.getpid() affinity = os.sched_getaffinity(pid) -logger.log(f"Entry point: {socket.gethostname()}, {affinity}") -logger.log(f"CPUS: {os.cpu_count()}") +logger.info(f"Entry point: {socket.gethostname()}, {affinity}") +logger.info(f"CPUS: {os.cpu_count()}") -def create_worker_manager( - worker_type: t.Type[MachineLearningWorkerBase], - config_loader: EnvironmentConfigLoader, - device: str, - dispatcher_queue: mp.Queue, -) -> WorkerManager: - return - def service_as_dragon_proc( service: Service, cpu_affinity: list[int], gpu_affinity: list[int] @@ -102,7 +94,6 @@ def service_as_dragon_proc( local_policy = dragon_policy.Policy( placement=dragon_policy.Policy.Placement.HOST_NAME, host_name=socket.gethostname(), - affinity=dragon_policy.Policy.Affinity.SPECIFIC, cpu_affinity=cpu_affinity, gpu_affinity=gpu_affinity, ) @@ -179,8 +170,7 @@ def service_as_dragon_proc( dispatcher = RequestDispatcher( batch_timeout=args.batch_timeout, batch_size=args.batch_size, - config_loader=ss_config_loader, - comm_channel_type=DragonCommChannel, + config_loader=config_loader, worker_type=arg_worker_type, ) @@ -189,13 +179,12 @@ def service_as_dragon_proc( for wm_idx in range(args.num_workers): worker_manager = WorkerManager( - config_loader=ss_config_loader, + config_loader=config_loader, worker_type=arg_worker_type, as_service=True, cooldown=10, - comm_channel_type=DragonCommChannel, device=worker_device, - task_queue=dispatcher.task_queue, + dispatcher_queue=dispatcher.task_queue, ) wms.append(worker_manager) @@ -226,6 +215,7 @@ def service_as_dragon_proc( # TODO: use ProcessGroup and restart=True? 
all_procs = [dispatcher_proc, *worker_manager_procs] + print(f"Dispatcher proc: {dispatcher_proc}") for proc in all_procs: proc.start() diff --git a/smartsim/_core/mli/infrastructure/control/devicemanager.py b/smartsim/_core/mli/infrastructure/control/devicemanager.py index c3dfcc0261..382eca6b13 100644 --- a/smartsim/_core/mli/infrastructure/control/devicemanager.py +++ b/smartsim/_core/mli/infrastructure/control/devicemanager.py @@ -29,6 +29,9 @@ from ...infrastructure.storage.featurestore import FeatureStore from ..worker.worker import MachineLearningWorkerBase from .requestdispatcher import RequestBatch +from .....log import get_logger + +logger = get_logger(__name__) class WorkerDevice: @@ -83,6 +86,7 @@ def _load_model_on_device( batch: RequestBatch, feature_stores: dict[str, FeatureStore], ) -> None: + model_bytes = worker.fetch_model(batch, feature_stores) loaded_model = worker.load_model(batch, model_bytes, self._device.name) self._device.add_model(batch.model_key.key, loaded_model.model) diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index 3c1105b501..151c04496d 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -282,7 +282,7 @@ def __init__( """The worker used to batch inputs""" self._mem_pool = MemoryPool.attach(dragon_gs_pool.create(2 * 1024**3).sdesc) """Memory pool used to share batched input tensors with the Worker Managers""" - self._perf_timer = PerfTimer(prefix="r_", debug=True, timing_on=True) + self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=True) """Performance timer""" def _check_feature_stores(self, request: InferenceRequest) -> bool: @@ -376,9 +376,10 @@ def _on_start(self) -> None: def _on_iteration(self) -> None: try: + self._perf_timer.set_active(True) bytes_list: t.List[bytes] = self._incoming_channel.recv() except Exception: - self._perf_timer.start_timings() + self._perf_timer.set_active(False) else: if not bytes_list: exception_handler( @@ -437,13 +438,14 @@ def _swap_queue(self, model_key: FeatureStoreKey) -> None: if self._queue_swap_lock is None: raise SmartSimError("Queues were not locked") with self._queue_swap_lock: - for queue in self._queues[model_key.key]: - if not queue.full(): - self._active_queues[model_key.key] = queue - return + if model_key.key in self._queues: + for queue in self._queues[model_key.key]: + if not queue.full(): + self._active_queues[model_key.key] = queue + return new_queue = BatchQueue(self._batch_timeout, self._batch_size, model_key) - if model_key in self._queues: + if model_key.key in self._queues: self._queues[model_key.key].append(new_queue) else: self._queues[model_key.key] = [new_queue] @@ -455,7 +457,7 @@ def dispatch(self, request: InferenceRequest) -> None: :param request: the request to place """ if request.raw_model is not None: - logger.info("Direct inference requested, creating tmp queue") + logger.debug("Direct inference requested, creating tmp queue") tmp_id = f"_tmp_{str(uuid.uuid4())}" tmp_queue: BatchQueue = BatchQueue( batch_timeout=0, diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 2459747ec0..fa508b3230 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -105,12 +105,56 @@ def __init__( self._backbone: 
t.Optional[FeatureStore] = config_loader.get_backbone() """A standalone, system-created feature store used to share internal information among MLI components""" - self._device_manager: t.Optional[DeviceManager] = None """Object responsible for model caching and device access""" - self._perf_timer = PerfTimer(prefix="w_", debug=True, timing_on=True) + self._perf_timer = PerfTimer(prefix="w_", debug=False, timing_on=True) """Performance timer""" + def _on_start(self) -> None: + self._device_manager = DeviceManager(WorkerDevice(self._device)) + + def _check_feature_stores(self, batch: RequestBatch) -> bool: + """Ensures that all feature stores required by the request are available + + :param batch: The batch of requests to validate + :returns: False if feature store validation fails for the batch, True otherwise + """ + # collect all feature stores required by the request + fs_model: t.Set[str] = set() + if batch.model_key: + fs_model = {batch.model_key.descriptor} + fs_inputs = {key.descriptor for key in batch.input_keys} + fs_outputs = {key.descriptor for key in batch.output_keys} + + # identify which feature stores are requested and unknown + fs_desired = fs_model.union(fs_inputs).union(fs_outputs) + fs_actual = {item.descriptor for item in self._feature_stores.values()} + fs_missing = fs_desired - fs_actual + + if self._featurestore_factory is None: + logger.error("No feature store factory configured") + return False + + # create the feature stores we need to service request + if fs_missing: + logger.debug(f"Adding feature store(s): {fs_missing}") + for descriptor in fs_missing: + feature_store = self._featurestore_factory(descriptor) + self._feature_stores[descriptor] = feature_store + + return True + + def _validate_batch(self, batch: RequestBatch) -> bool: + """Ensure the request can be processed + + :param batch: The batch of requests to validate + :return: False if the request fails any validation checks, True otherwise""" + + if batch is None or len(batch.requests)==0: + return False + + return self._check_feature_stores(batch) + # remove this when we are done with time measurements # pylint: disable-next=too-many-statements def _on_iteration(self) -> None: @@ -128,7 +172,7 @@ def _on_iteration(self) -> None: "flush_requests", time.perf_counter() - pre_batch_time ) - if batch is None or 0 == len(batch.requests): + if not self._validate_batch(batch): exception_handler( ValueError("An empty batch was received"), None, @@ -136,18 +180,44 @@ def _on_iteration(self) -> None: ) return + if self._device_manager is None: - raise ValueError("No Device Manager available: did you call _on_start()") - device: WorkerDevice = next( - self._device_manager.get_device( - worker=self._worker, - batch=batch, - feature_stores=self._feature_stores, + for request in batch.requests: + exception_handler( + ValueError("No Device Manager available: did you call _on_start()"), + request.callback, + "Error acquiring device manager" + ) + return + + try: + device: WorkerDevice = next( + self._device_manager.get_device( + worker=self._worker, + batch=batch, + feature_stores=self._feature_stores, + ) ) - ) + except Exception as exc: + for request in batch.requests: + exception_handler( + exc, + request.callback, + "Error loading model on device or getting device" + ) + return self._perf_timer.measure_time("fetch_model") - model_result = LoadModelResult(device.get_model(batch.model_key.key)) + try: + model_result = LoadModelResult(device.get_model(batch.model_key.key)) + except Exception as exc: + for request in 
batch.requests: + exception_handler( + exc, + request.callback, + "Error getting model from device" + ) + return self._perf_timer.measure_time("load_model") if batch.inputs is None: diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 374f35b594..b3d47b13c7 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -191,6 +191,22 @@ def raw_model(self) -> t.Optional[t.Any]: return self.requests[0].raw_model return None + @property + def input_keys(self) -> t.List[FeatureStoreKey]: + keys = [] + for request in self.requests: + keys.extend(request.input_keys) + + return keys + + @property + def output_keys(self) -> t.List[FeatureStoreKey]: + keys = [] + for request in self.requests: + keys.extend(request.output_keys) + + return keys + class MachineLearningWorkerCore: """Basic functionality of ML worker that is shared across all worker types""" diff --git a/smartsim/_core/utils/timings.py b/smartsim/_core/utils/timings.py index 286bd4f4a8..79e51e4f8c 100644 --- a/smartsim/_core/utils/timings.py +++ b/smartsim/_core/utils/timings.py @@ -132,3 +132,7 @@ def print_timings(self, to_file: bool = False) -> None: print(" ".join(self._format_number(value) for value in value_array[i])) if to_file: np.save(self._prefix + self._filename + ".npy", value_array) + + def set_active(self, active: bool = True) -> None: + """Set whether the timer will record time""" + self._timing_on = active From 0c0637c657d6d158878ed34445296df43d398fde Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 14 Aug 2024 17:39:40 -0500 Subject: [PATCH 58/84] Last fixes --- .../_core/mli/infrastructure/control/requestdispatcher.py | 4 +--- smartsim/_core/utils/timings.py | 4 ++++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index 151c04496d..056dc73f0e 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -374,7 +374,6 @@ def _on_start(self) -> None: self._queue_swap_lock = RLock() def _on_iteration(self) -> None: - try: self._perf_timer.set_active(True) bytes_list: t.List[bytes] = self._incoming_channel.recv() @@ -418,7 +417,7 @@ def _on_iteration(self) -> None: self._perf_timer.end_timings() - if self._perf_timer.max_length == 801: + if self._perf_timer.max_length == 801 and self._perf_timer.is_active: self._perf_timer.print_timings(True) @property @@ -430,7 +429,6 @@ def _swap_queue(self, model_key: FeatureStoreKey) -> None: """Get an empty queue or create a new one and make it the active one for a given model. - :param model_key: The key of the model for which the queue has to be swapped :raises SmartSimError: If the queue is not locked. 
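For context on the timing changes in this patch and the `is_active` helper added to `timings.py` below: the dispatcher now flips the timer on only when a message was actually received, so idle polls do not pollute the statistics. One caveat worth noting: because `is_active` is defined as a regular method, the truthiness check `self._perf_timer.is_active` in the dispatcher above is always true unless it is called (or the method is made a property). The snippet below is an illustrative stand-in only, not the real `PerfTimer`; the names `GatedTimer` and `poll_once` are invented for the example, and it uses a property to avoid that pitfall.

import time
import typing as t


class GatedTimer:
    """Minimal stand-in for a perf timer that can be switched off.

    When inactive, every call is a no-op, so an idle polling loop
    records nothing.
    """

    def __init__(self) -> None:
        self._active = True
        self._timings: dict[str, list[float]] = {}
        self._last = 0.0

    def set_active(self, active: bool = True) -> None:
        """Enable or disable recording."""
        self._active = active

    @property
    def is_active(self) -> bool:
        """True if the timer is currently recording."""
        return self._active

    def start_timings(self) -> None:
        if self._active:
            self._last = time.perf_counter()

    def measure_time(self, label: str) -> None:
        """Record the elapsed time since the previous measurement."""
        if not self._active:
            return
        now = time.perf_counter()
        self._timings.setdefault(label, []).append(now - self._last)
        self._last = now


def poll_once(channel: t.Any, timer: GatedTimer) -> None:
    """Illustrative polling step mirroring the dispatcher's _on_iteration."""
    try:
        timer.set_active(True)
        payload = channel.recv()  # hypothetical channel with a recv() method
    except Exception:
        # nothing arrived: disable the timer so this iteration is not recorded
        timer.set_active(False)
    else:
        timer.start_timings()
        # ... deserialize, validate, dispatch the payload ...
        timer.measure_time("dispatch")
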
diff --git a/smartsim/_core/utils/timings.py b/smartsim/_core/utils/timings.py index 79e51e4f8c..2bf266e5a9 100644 --- a/smartsim/_core/utils/timings.py +++ b/smartsim/_core/utils/timings.py @@ -136,3 +136,7 @@ def print_timings(self, to_file: bool = False) -> None: def set_active(self, active: bool = True) -> None: """Set whether the timer will record time""" self._timing_on = active + + def is_active(self) -> bool: + """Returns true if the timer will record time""" + return self._timing_on From 7dbeded8663def673087c1f7ec3a07c58bf6e734 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Sat, 17 Aug 2024 10:25:32 -0500 Subject: [PATCH 59/84] Fixing tests --- .../infrastructure/control/devicemanager.py | 6 +- .../control/requestdispatcher.py | 22 +++++- .../infrastructure/control/workermanager.py | 10 +-- tests/dragon/test_error_handling.py | 67 ++++++++++++++----- 4 files changed, 77 insertions(+), 28 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/devicemanager.py b/smartsim/_core/mli/infrastructure/control/devicemanager.py index 382eca6b13..a42efa1d9d 100644 --- a/smartsim/_core/mli/infrastructure/control/devicemanager.py +++ b/smartsim/_core/mli/infrastructure/control/devicemanager.py @@ -113,8 +113,10 @@ def get_device( # Load model if not already loaded, or # because it is sent with the request if model_in_request or not batch.model_key.key in self._device: - self._load_model_on_device(worker, batch, feature_stores) - + try: + self._load_model_on_device(worker, batch, feature_stores) + except Exception as exc: + raise exc try: yield self._device finally: diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index 056dc73f0e..c59951204f 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -63,6 +63,8 @@ logger = get_logger("Request Dispatcher") +# Placeholder +ModelIdentifier = FeatureStoreKey class WorkerDevice: def __init__(self, name: str) -> None: @@ -77,12 +79,23 @@ def __init__(self, name: str) -> None: """Lock to ensure only one thread at the time accesses this device""" def acquire(self, blocking: bool = True, timeout: float = -1) -> t.Optional[bool]: + """Acquire and lock this device to prevent other threads + + from acquiring it concurrently. + :param blocking: If set to True, the call will block + for the time specified by ``timeout`` until the lock + can be acquired + :param timeout: Time (in seconds) to wait to acquire lock. + Ignored if ``blocking`` is set to False. + """ return self._lock.acquire(blocking=blocking, timeout=timeout) def release(self) -> None: + """Release device to allow other threads to acquire it""" self._lock.release() def __enter__(self) -> None: + """Locked context creator for this device""" self.acquire() def __exit__( @@ -91,12 +104,13 @@ def __exit__( exc_val: t.Optional[BaseException], exc_tb: t.Optional[TracebackType], ) -> None: + """Locked context destructor for this device""" self.release() class BatchQueue(Queue[InferenceRequest]): def __init__( - self, batch_timeout: float, batch_size: int, model_key: FeatureStoreKey + self, batch_timeout: float, batch_size: int, model_key: ModelIdentifier ) -> None: """Queue used to store inference requests waiting to be batched and sent to Worker Managers. 
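As background for the `ModelIdentifier`-keyed queues in this file: the dispatcher keeps one active `BatchQueue` per model and swaps in a fresh queue whenever the active one fills up, which is why it tracks both `_queues` (all queues per key) and `_active_queues` (the current recipient per key). The snippet below is a simplified, self-contained sketch of that pattern, not the production code; a plain `queue.Queue` stands in for `BatchQueue` and a plain string stands in for the model key.

import queue
import typing as t

BATCH_SIZE = 8


class PerModelQueues:
    """Simplified sketch of the active-queue-per-model pattern used by
    RequestDispatcher: requests for the same model are grouped, and a
    full queue is replaced by a fresh one instead of blocking."""

    def __init__(self) -> None:
        self._queues: dict[str, list[queue.Queue]] = {}
        self._active: dict[str, queue.Queue] = {}

    def _swap_queue(self, model_key: str) -> None:
        # reuse any existing queue for this model that still has room
        for candidate in self._queues.get(model_key, []):
            if not candidate.full():
                self._active[model_key] = candidate
                return
        # otherwise create a new queue and make it the active one
        new_queue: queue.Queue = queue.Queue(maxsize=BATCH_SIZE)
        self._queues.setdefault(model_key, []).append(new_queue)
        self._active[model_key] = new_queue

    def dispatch(self, model_key: str, request: t.Any) -> None:
        """Place a request on the active queue for its model, swapping
        queues when none exists yet or the current one is full."""
        while True:
            try:
                self._active[model_key].put_nowait(request)
                return
            except (queue.Full, KeyError):
                # no active queue yet, or it filled up: swap and retry
                self._swap_queue(model_key)


# usage: requests for the same model end up on the same (non-full) queue
qs = PerModelQueues()
qs.dispatch("resnet50", {"inputs": b"..."})
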
@@ -154,7 +168,7 @@ def __exit__( self.release() @property - def model_key(self) -> FeatureStoreKey: + def model_key(self) -> ModelIdentifier: """Key of the model which needs to be run on the queued requests""" return self._model_key @@ -168,6 +182,7 @@ def put( :param item: The request :param block: Whether to block when trying to put the item :param timeout: Time (in seconds) to wait if block==True + :raises Full: If an item cannot be put on the queue """ if not self.acquire(blocking=False): raise Full @@ -182,6 +197,7 @@ def put( @property def _elapsed_time(self) -> float: + """Time elapsed since the first item was put on this queue""" if self.empty() or self._first_put is None: return 0 return time.time() - self._first_put @@ -199,7 +215,7 @@ def make_disposable(self) -> None: @property def can_be_removed(self) -> bool: - """Whether this queue can be deleted and garbafe collected""" + """Whether this queue can be deleted and garbage collected""" return self.empty() and self._disposable def flush(self) -> list[t.Any]: diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index fa508b3230..80145fb8e1 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -107,7 +107,7 @@ def __init__( information among MLI components""" self._device_manager: t.Optional[DeviceManager] = None """Object responsible for model caching and device access""" - self._perf_timer = PerfTimer(prefix="w_", debug=False, timing_on=True) + self._perf_timer = PerfTimer(prefix="w_", debug=True, timing_on=True) """Performance timer""" def _on_start(self) -> None: @@ -166,6 +166,7 @@ def _on_iteration(self) -> None: try: batch: RequestBatch = self._dispatcher_queue.get(timeout=0.0001) except Empty: + logger.info("Empty queue") return self._perf_timer.start_timings( @@ -174,17 +175,16 @@ def _on_iteration(self) -> None: if not self._validate_batch(batch): exception_handler( - ValueError("An empty batch was received"), + ValueError("An invalid batch was received"), None, - "Error batching inputs, the batch was empty.", + "Error batching inputs, the batch was invalid.", ) return - if self._device_manager is None: for request in batch.requests: exception_handler( - ValueError("No Device Manager available: did you call _on_start()"), + ValueError("No Device Manager available: did you call _on_start()?"), request.callback, "Error acquiring device manager" ) diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index bacffba933..345ebba5e9 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -40,17 +40,22 @@ WorkerManager, exception_handler, ) +from smartsim._core.mli.infrastructure.control.requestdispatcher import ( + RequestDispatcher, +) from smartsim._core.mli.infrastructure.environmentloader import EnvironmentConfigLoader from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( DragonFeatureStore, ) -from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore, FeatureStoreKey from smartsim._core.mli.infrastructure.worker.worker import ( ExecuteResult, FetchInputResult, FetchModelResult, InferenceReply, + InferenceRequest, LoadModelResult, + RequestBatch, TransformInputResult, TransformOutputResult, ) @@ -85,35 +90,56 @@ def setup_worker_manager_model_bytes( 
backbone_descriptor: str, app_feature_store: FeatureStore, ): - integrated_worker = IntegratedTorchWorker() + integrated_worker_type = IntegratedTorchWorker chan = Channel.make_process_local() queue = FLInterface(main_ch=chan) + wrapped_queue = DragonFLIChannel(queue.serialize()) + monkeypatch.setenv("SS_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize())) # Put backbone descriptor into env var for the `EnvironmentConfigLoader` monkeypatch.setenv("SS_INFRA_BACKBONE", backbone_descriptor) + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ) + + dispatcher = RequestDispatcher( + batch_timeout=0, + batch_size=1, + config_loader=config_loader, + worker_type=integrated_worker_type, + ) + worker_manager = WorkerManager( - EnvironmentConfigLoader( - featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_descriptor, - ), - integrated_worker, + config_loader=config_loader, + worker_type=integrated_worker_type, + dispatcher_queue=dispatcher.task_queue, as_service=False, cooldown=3, ) - tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) - output_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) - model = MessageHandler.build_model(b"model", "model name", "v 0.0.1") - request = MessageHandler.build_request( - test_dir, model, [tensor_key], [output_key], [], None + tensor_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) + output_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) + + request = InferenceRequest(model_key= None, callback = None, raw_inputs= None, input_keys=[tensor_key], input_meta = None, output_keys=[output_key], raw_model=b'model', batch_size=0) + + model_id = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) + + request_batch = RequestBatch( + [request], + TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]]), + model_id, ) - ser_request = MessageHandler.serialize_request(request) - worker_manager._dispatcher_queue.send(ser_request) - return worker_manager, integrated_worker + dispatcher.task_queue.put(request_batch) + + # + # wrapped_queue.send(ser_request) + + return worker_manager, integrated_worker_type @pytest.fixture @@ -147,7 +173,8 @@ def setup_worker_manager_model_key( model_key = MessageHandler.build_model_key( "model key", app_feature_store.descriptor ) - request = MessageHandler.build_request( + + MessageHandler.build_request( test_dir, model_key, [tensor_key], [output_key], [], None ) ser_request = MessageHandler.serialize_request(request) @@ -223,7 +250,10 @@ def test_pipeline_stage_errors_handled( error_message: str, ): """Ensures that the worker manager does not crash after a failure in various pipeline stages""" - worker_manager, integrated_worker = request.getfixturevalue(setup_worker_manager) + worker_manager, integrated_worker_type = request.getfixturevalue( + setup_worker_manager + ) + integrated_worker = worker_manager._worker mock_reply_fn = mock_pipeline_stage(monkeypatch, integrated_worker, stage) if stage not in ["fetch_model"]: @@ -279,6 +309,7 @@ def test_pipeline_stage_errors_handled( ), ) + worker_manager._on_start() worker_manager._on_iteration() mock_reply_fn.assert_called_once() From 0eadc63f1881ca63c212ba7dda735c904d0cef2c Mon Sep 17 00:00:00 2001 From: Al Rigazzi 
Date: Sat, 17 Aug 2024 10:26:25 -0500 Subject: [PATCH 60/84] MLI driver multi-client --- ex/high_throughput_inference/mli_driver.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/ex/high_throughput_inference/mli_driver.py b/ex/high_throughput_inference/mli_driver.py index 8f25540078..807a70b219 100644 --- a/ex/high_throughput_inference/mli_driver.py +++ b/ex/high_throughput_inference/mli_driver.py @@ -1,4 +1,3 @@ -import argparse import os import base64 import cloudpickle @@ -11,7 +10,7 @@ import typing as t DEVICE = "gpu" -NUM_RANKS = 1 +NUM_RANKS = 4 NUM_WORKERS = 1 filedir = os.path.dirname(__file__) worker_manager_script_name = os.path.join(filedir, "standalone_workermanager.py") @@ -39,7 +38,7 @@ "--batch_size", str(NUM_RANKS//NUM_WORKERS), "--batch_timeout", - str(0.002), + str(0.00), "--num_workers", str(NUM_WORKERS) ], @@ -54,7 +53,7 @@ app_rs: DragonRunSettings = exp.create_run_settings( sys.executable, - exe_args=[app_script_name, "--device", DEVICE, "--log_max_batchsize", str(7)], + exe_args=[app_script_name, "--device", DEVICE, "--log_max_batchsize", str(6)], ) app_rs.set_tasks_per_node(NUM_RANKS) From 8e178d938db00303f8288d5d7dfc3a375432aac7 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 20 Aug 2024 12:10:02 -0500 Subject: [PATCH 61/84] Fixed broken test --- .../infrastructure/control/devicemanager.py | 2 +- .../control/requestdispatcher.py | 1 + .../infrastructure/control/workermanager.py | 16 +- tests/dragon/test_error_handling.py | 139 +++++++++--------- 4 files changed, 82 insertions(+), 76 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/devicemanager.py b/smartsim/_core/mli/infrastructure/control/devicemanager.py index a42efa1d9d..09fab32f95 100644 --- a/smartsim/_core/mli/infrastructure/control/devicemanager.py +++ b/smartsim/_core/mli/infrastructure/control/devicemanager.py @@ -26,10 +26,10 @@ import typing as t +from .....log import get_logger from ...infrastructure.storage.featurestore import FeatureStore from ..worker.worker import MachineLearningWorkerBase from .requestdispatcher import RequestBatch -from .....log import get_logger logger = get_logger(__name__) diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index c59951204f..f4e02dfc02 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -66,6 +66,7 @@ # Placeholder ModelIdentifier = FeatureStoreKey + class WorkerDevice: def __init__(self, name: str) -> None: """Wrapper around a device to keep track of loaded Models and availability diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 80145fb8e1..c89ed211ee 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -150,7 +150,7 @@ def _validate_batch(self, batch: RequestBatch) -> bool: :param batch: The batch of requests to validate :return: False if the request fails any validation checks, True otherwise""" - if batch is None or len(batch.requests)==0: + if batch is None or len(batch.requests) == 0: return False return self._check_feature_stores(batch) @@ -184,9 +184,11 @@ def _on_iteration(self) -> None: if self._device_manager is None: for request in batch.requests: exception_handler( - ValueError("No Device Manager available: did you call _on_start()?"), + 
ValueError( + "No Device Manager available: did you call _on_start()?" + ), request.callback, - "Error acquiring device manager" + "Error acquiring device manager", ) return @@ -203,7 +205,7 @@ def _on_iteration(self) -> None: exception_handler( exc, request.callback, - "Error loading model on device or getting device" + "Error loading model on device or getting device.", ) return self._perf_timer.measure_time("fetch_model") @@ -213,9 +215,7 @@ def _on_iteration(self) -> None: except Exception as exc: for request in batch.requests: exception_handler( - exc, - request.callback, - "Error getting model from device" + exc, request.callback, "Error getting model from device." ) return self._perf_timer.measure_time("load_model") @@ -236,7 +236,7 @@ def _on_iteration(self) -> None: ) except Exception as e: for request in batch.requests: - exception_handler(e, request.callback, "Error executing worker.") + exception_handler(e, request.callback, "Failed while executing.") return self._perf_timer.measure_time("execute") diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 345ebba5e9..17cd344c1e 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -24,30 +24,37 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from unittest.mock import MagicMock +from unittest.mock import MagicMock, patch import pytest dragon = pytest.importorskip("dragon") +import multiprocessing as mp + import dragon.utils as du from dragon.channels import Channel from dragon.data.ddict.ddict import DDict from dragon.fli import FLInterface +from dragon.mpbridge.queues import DragonQueue from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel +from smartsim._core.mli.infrastructure.control.devicemanager import WorkerDevice +from smartsim._core.mli.infrastructure.control.requestdispatcher import ( + RequestDispatcher, +) from smartsim._core.mli.infrastructure.control.workermanager import ( WorkerManager, exception_handler, ) -from smartsim._core.mli.infrastructure.control.requestdispatcher import ( - RequestDispatcher, -) from smartsim._core.mli.infrastructure.environmentloader import EnvironmentConfigLoader from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( DragonFeatureStore, ) -from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore, FeatureStoreKey +from smartsim._core.mli.infrastructure.storage.featurestore import ( + FeatureStore, + FeatureStoreKey, +) from smartsim._core.mli.infrastructure.worker.worker import ( ExecuteResult, FetchInputResult, @@ -106,17 +113,12 @@ def setup_worker_manager_model_bytes( queue_factory=DragonFLIChannel.from_descriptor, ) - dispatcher = RequestDispatcher( - batch_timeout=0, - batch_size=1, - config_loader=config_loader, - worker_type=integrated_worker_type, - ) + dispatcher_task_queue = mp.Queue(maxsize=0) worker_manager = WorkerManager( config_loader=config_loader, worker_type=integrated_worker_type, - dispatcher_queue=dispatcher.task_queue, + dispatcher_queue=dispatcher_task_queue, as_service=False, cooldown=3, ) @@ -124,7 +126,16 @@ def setup_worker_manager_model_bytes( tensor_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) output_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) - request = InferenceRequest(model_key= None, callback = None, raw_inputs= None, input_keys=[tensor_key], input_meta = 
None, output_keys=[output_key], raw_model=b'model', batch_size=0) + request = InferenceRequest( + model_key=None, + callback=None, + raw_inputs=None, + input_keys=[tensor_key], + input_meta=None, + output_keys=[output_key], + raw_model=b"model", + batch_size=0, + ) model_id = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) @@ -134,11 +145,7 @@ def setup_worker_manager_model_bytes( model_id, ) - dispatcher.task_queue.put(request_batch) - - # - # wrapped_queue.send(ser_request) - + dispatcher_task_queue.put(request_batch) return worker_manager, integrated_worker_type @@ -149,7 +156,7 @@ def setup_worker_manager_model_key( backbone_descriptor: str, app_feature_store: FeatureStore, ): - integrated_worker = IntegratedTorchWorker() + integrated_worker_type = IntegratedTorchWorker chan = Channel.make_process_local() queue = FLInterface(main_ch=chan) @@ -157,30 +164,46 @@ def setup_worker_manager_model_key( # Put backbone descriptor into env var for the `EnvironmentConfigLoader` monkeypatch.setenv("SS_INFRA_BACKBONE", backbone_descriptor) + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ) + + dispatcher_task_queue = mp.Queue(maxsize=0) + worker_manager = WorkerManager( - EnvironmentConfigLoader( - featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=FileSystemCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_descriptor, - ), - integrated_worker, + config_loader=config_loader, + worker_type=integrated_worker_type, + dispatcher_queue=dispatcher_task_queue, as_service=False, cooldown=3, ) - tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) - output_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) - model_key = MessageHandler.build_model_key( - "model key", app_feature_store.descriptor + tensor_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) + output_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) + model_key = FeatureStoreKey( + key="model key", descriptor=app_feature_store.descriptor ) - MessageHandler.build_request( - test_dir, model_key, [tensor_key], [output_key], [], None + request = InferenceRequest( + model_key=model_key, + callback=None, + raw_inputs=None, + input_keys=[tensor_key], + input_meta=None, + output_keys=[output_key], + raw_model=b"model", + batch_size=0, + ) + request_batch = RequestBatch( + [request], + TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]]), + model_key=model_key, ) - ser_request = MessageHandler.serialize_request(request) - worker_manager._dispatcher_queue.send(ser_request) - return worker_manager, integrated_worker + dispatcher_task_queue.put(request_batch) + return worker_manager, integrated_worker_type def mock_pipeline_stage(monkeypatch: pytest.MonkeyPatch, integrated_worker, stage): @@ -190,7 +213,7 @@ def mock_stage(*args, **kwargs): monkeypatch.setattr(integrated_worker, stage, mock_stage) mock_reply_fn = MagicMock() monkeypatch.setattr( - "smartsim._core.mli.infrastructure.control.workermanager.build_failure_reply", + "smartsim._core.mli.infrastructure.control.commons.build_failure_reply", mock_reply_fn, ) @@ -216,21 +239,15 @@ def mock_exception_handler(exc, reply_channel, failure_message): "stage, error_message", [ pytest.param( - "fetch_model", "Failed while fetching the model.", id="fetch model" + "fetch_model", + 
"Error loading model on device or getting device.", + id="fetch model", ), pytest.param( "load_model", - "Failed while loading model from feature store.", + "Error loading model on device or getting device.", id="load model", ), - pytest.param( - "fetch_inputs", "Failed while fetching the inputs.", id="fetch inputs" - ), - pytest.param( - "transform_input", - "Failed while transforming the input.", - id="transform inputs", - ), pytest.param("execute", "Failed while executing.", id="execute"), pytest.param( "transform_output", @@ -242,7 +259,7 @@ def mock_exception_handler(exc, reply_channel, failure_message): ), ], ) -def test_pipeline_stage_errors_handled( +def test_wm_pipeline_stage_errors_handled( request, setup_worker_manager, monkeypatch: pytest.MonkeyPatch, @@ -254,6 +271,9 @@ def test_pipeline_stage_errors_handled( setup_worker_manager ) integrated_worker = worker_manager._worker + + worker_manager._on_start() + device = worker_manager._device_manager._device mock_reply_fn = mock_pipeline_stage(monkeypatch, integrated_worker, stage) if stage not in ["fetch_model"]: @@ -262,42 +282,28 @@ def test_pipeline_stage_errors_handled( "fetch_model", MagicMock(return_value=FetchModelResult(b"result_bytes")), ) - if stage not in ["fetch_model", "load_model"]: monkeypatch.setattr( integrated_worker, "load_model", MagicMock(return_value=LoadModelResult(b"result_bytes")), ) - if stage not in ["fetch_model", "load_model", "fetch_inputs"]: monkeypatch.setattr( - integrated_worker, - "fetch_inputs", - MagicMock(return_value=FetchInputResult([b"result_bytes"], None)), - ) - if stage not in ["fetch_model", "load_model", "fetch_inputs", "transform_input"]: - monkeypatch.setattr( - integrated_worker, - "transform_input", - MagicMock(return_value=TransformInputResult(b"result_bytes")), + device, + "get_model", + MagicMock(return_value=b"result_bytes"), ) if stage not in [ "fetch_model", - "load_model", - "fetch_inputs", - "transform_input", "execute", ]: monkeypatch.setattr( integrated_worker, "execute", - MagicMock(return_value=ExecuteResult(b"result_bytes")), + MagicMock(return_value=ExecuteResult(b"result_bytes", [slice(0, 1)])), ) if stage not in [ "fetch_model", - "load_model", - "fetch_inputs", - "transform_input", "execute", "transform_output", ]: @@ -305,11 +311,10 @@ def test_pipeline_stage_errors_handled( integrated_worker, "transform_output", MagicMock( - return_value=TransformOutputResult(b"result", [], "c", "float32") + return_value=[TransformOutputResult(b"result", [], "c", "float32")] ), ) - worker_manager._on_start() worker_manager._on_iteration() mock_reply_fn.assert_called_once() @@ -323,7 +328,7 @@ def test_exception_handling_helper(monkeypatch: pytest.MonkeyPatch): mock_reply_fn = MagicMock() monkeypatch.setattr( - "smartsim._core.mli.infrastructure.control.workermanager.build_failure_reply", + "smartsim._core.mli.infrastructure.control.commons.build_failure_reply", mock_reply_fn, ) From 5fb822494872c51b96aa3890cb6d0a84e12260c2 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 20 Aug 2024 12:21:15 -0500 Subject: [PATCH 62/84] MyPy --- .../infrastructure/control/workermanager.py | 23 +++++++++---------- smartsim/_core/utils/timings.py | 1 + 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index c89ed211ee..fad470c80c 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ 
b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -121,7 +121,7 @@ def _check_feature_stores(self, batch: RequestBatch) -> bool: """ # collect all feature stores required by the request fs_model: t.Set[str] = set() - if batch.model_key: + if batch.model_key.key: fs_model = {batch.model_key.descriptor} fs_inputs = {key.descriptor for key in batch.input_keys} fs_outputs = {key.descriptor for key in batch.output_keys} @@ -181,18 +181,17 @@ def _on_iteration(self) -> None: ) return - if self._device_manager is None: - for request in batch.requests: - exception_handler( - ValueError( - "No Device Manager available: did you call _on_start()?" - ), - request.callback, - "Error acquiring device manager", - ) - return - try: + if self._device_manager is None: + for request in batch.requests: + exception_handler( + ValueError( + "No Device Manager available: did you call _on_start()?" + ), + request.callback, + "Error acquiring device manager", + ) + return device: WorkerDevice = next( self._device_manager.get_device( worker=self._worker, diff --git a/smartsim/_core/utils/timings.py b/smartsim/_core/utils/timings.py index 2bf266e5a9..34595c8586 100644 --- a/smartsim/_core/utils/timings.py +++ b/smartsim/_core/utils/timings.py @@ -137,6 +137,7 @@ def set_active(self, active: bool = True) -> None: """Set whether the timer will record time""" self._timing_on = active + @property def is_active(self) -> bool: """Returns true if the timer will record time""" return self._timing_on From b6ea732bc236dbd1ac441aa589b33c051b00e66b Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Wed, 21 Aug 2024 15:43:22 -0500 Subject: [PATCH 63/84] Fix WM test and add dispatcher error handling --- .../control/requestdispatcher.py | 4 +- .../infrastructure/control/workermanager.py | 1 - tests/dragon/test_error_handling.py | 94 ++++++++++++++++++- tests/dragon/test_worker_manager.py | 9 +- 4 files changed, 102 insertions(+), 6 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index f4e02dfc02..6fb4b7d084 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -299,7 +299,7 @@ def __init__( """The worker used to batch inputs""" self._mem_pool = MemoryPool.attach(dragon_gs_pool.create(2 * 1024**3).sdesc) """Memory pool used to share batched input tensors with the Worker Managers""" - self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=True) + self._perf_timer = PerfTimer(prefix="r_", debug=True, timing_on=True) """Performance timer""" def _check_feature_stores(self, request: InferenceRequest) -> bool: @@ -480,6 +480,7 @@ def dispatch(self, request: InferenceRequest) -> None: model_key=FeatureStoreKey(key=tmp_id, descriptor="TMP"), ) self._active_queues[tmp_id] = tmp_queue + self._queues[tmp_id] = [tmp_queue] tmp_queue.put_nowait(request) tmp_queue.make_disposable() return @@ -497,6 +498,7 @@ def flush_requests(self) -> None: """Get all requests from queues which are ready to be flushed. Place all avaliable request batches in the outgoing queue. 
""" + print(self._queues.items()) for queue_list in self._queues.values(): for queue in queue_list: if queue.ready and queue.acquire(blocking=False): diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index fad470c80c..3949476b6b 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -166,7 +166,6 @@ def _on_iteration(self) -> None: try: batch: RequestBatch = self._dispatcher_queue.get(timeout=0.0001) except Empty: - logger.info("Empty queue") return self._perf_timer.start_timings( diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 17cd344c1e..1d9391212d 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -24,7 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock import pytest @@ -206,6 +206,48 @@ def setup_worker_manager_model_key( return worker_manager, integrated_worker_type +@pytest.fixture +def setup_request_dispatcher_model_bytes( + test_dir, + monkeypatch: pytest.MonkeyPatch, + backbone_descriptor: str, + app_feature_store: FeatureStore, +): + integrated_worker_type = IntegratedTorchWorker + + chan = Channel.make_process_local() + queue = FLInterface(main_ch=chan) + monkeypatch.setenv("SS_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize())) + # Put backbone descriptor into env var for the `EnvironmentConfigLoader` + monkeypatch.setenv("SS_INFRA_BACKBONE", backbone_descriptor) + + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ) + + request_dispatcher = RequestDispatcher( + batch_timeout=0, + batch_size=0, + config_loader=config_loader, + worker_type=integrated_worker_type, + ) + request_dispatcher._on_start() + + tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) + output_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) + model = MessageHandler.build_model(b"model", "model name", "v 0.0.1") + request = MessageHandler.build_request( + test_dir, model, [tensor_key], [output_key], [], None + ) + ser_request = MessageHandler.serialize_request(request) + + request_dispatcher._incoming_channel.send(ser_request) + + return request_dispatcher, integrated_worker_type + + def mock_pipeline_stage(monkeypatch: pytest.MonkeyPatch, integrated_worker, stage): def mock_stage(*args, **kwargs): raise ValueError(f"Simulated error in {stage}") @@ -321,6 +363,56 @@ def test_wm_pipeline_stage_errors_handled( mock_reply_fn.assert_called_with("fail", error_message) +@pytest.mark.parametrize( + "setup_request_dispatcher", + [ + pytest.param("setup_request_dispatcher_model_bytes"), + # pytest.param("setup_worker_manager_model_key"), + ], +) +@pytest.mark.parametrize( + "stage, error_message", + [ + pytest.param( + "fetch_inputs", + "Error fetching input.", + id="fetch input", + ), + pytest.param( + "transform_input", + "Error Transforming input.", + id="transform input", + ), + ], +) +def test_dispatcher_pipeline_stage_errors_handled( + request, + setup_request_dispatcher, + monkeypatch: pytest.MonkeyPatch, + stage: str, + error_message: str, +): + """Ensures 
that the request dispatcher does not crash after a failure in various pipeline stages""" + request_dispatcher, integrated_worker_type = request.getfixturevalue( + setup_request_dispatcher + ) + integrated_worker = request_dispatcher._worker + + mock_reply_fn = mock_pipeline_stage(monkeypatch, integrated_worker, stage) + + if stage not in ["fetch_inputs"]: + monkeypatch.setattr( + integrated_worker, + "fetch_inputs", + MagicMock(return_value=[FetchInputResult(result=[b"result"], meta=None)]), + ) + + request_dispatcher._on_iteration() + + mock_reply_fn.assert_called_once() + mock_reply_fn.assert_called_with("fail", error_message) + + def test_exception_handling_helper(monkeypatch: pytest.MonkeyPatch): """Ensures that the worker manager does not crash after a failure in the execute pipeline stage""" diff --git a/tests/dragon/test_worker_manager.py b/tests/dragon/test_worker_manager.py index 864e14993c..52e7a84d51 100644 --- a/tests/dragon/test_worker_manager.py +++ b/tests/dragon/test_worker_manager.py @@ -26,7 +26,6 @@ import io import logging -import multiprocessing as mp import pathlib import time @@ -40,6 +39,9 @@ import dragon.channels as dch from dragon import fli +from dragon.mpbridge.queues import DragonQueue + +import multiprocessing as mp from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel @@ -174,14 +176,15 @@ def test_worker_manager(prepare_environment: pathlib.Path) -> None: callback_factory=FileSystemCommChannel.from_descriptor, queue_factory=DragonFLIChannel.from_descriptor, ) - integrated_worker = TorchWorker() + integrated_worker_type = TorchWorker worker_manager = WorkerManager( config_loader, - integrated_worker, + integrated_worker_type, as_service=True, cooldown=5, device="cpu", + dispatcher_queue=mp.Queue(maxsize=0) ) worker_queue = config_loader.get_queue() From 67242ecf7969ff4d4423fbfc9469d05e41549295 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Mon, 26 Aug 2024 06:08:41 -0500 Subject: [PATCH 64/84] Add RequestDispatcher tests --- .../control/requestdispatcher.py | 130 ++---- .../mli/infrastructure/worker/torch_worker.py | 13 +- .../_core/mli/infrastructure/worker/worker.py | 8 +- tests/dragon/test_request_dispatcher.py | 395 ++++++++++++++++++ tests/dragon/test_worker_manager.py | 5 +- 5 files changed, 448 insertions(+), 103 deletions(-) create mode 100644 tests/dragon/test_request_dispatcher.py diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index 6fb4b7d084..20786fdf9a 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -41,7 +41,6 @@ import typing as t import uuid from queue import Empty, Full, Queue -from threading import RLock from types import TracebackType from smartsim._core.entrypoints.service import Service @@ -76,37 +75,6 @@ def __init__(self, name: str) -> None: """The name used by the toolkit to identify this device""" self._models: dict[str, t.Any] = {} """Dictionary of model key to model for models stored on this device""" - self._lock = RLock() - """Lock to ensure only one thread at the time accesses this device""" - - def acquire(self, blocking: bool = True, timeout: float = -1) -> t.Optional[bool]: - """Acquire and lock this device to prevent other threads - - from acquiring it concurrently. 
- :param blocking: If set to True, the call will block - for the time specified by ``timeout`` until the lock - can be acquired - :param timeout: Time (in seconds) to wait to acquire lock. - Ignored if ``blocking`` is set to False. - """ - return self._lock.acquire(blocking=blocking, timeout=timeout) - - def release(self) -> None: - """Release device to allow other threads to acquire it""" - self._lock.release() - - def __enter__(self) -> None: - """Locked context creator for this device""" - self.acquire() - - def __exit__( - self, - exc_type: t.Optional[t.Type[BaseException]], - exc_val: t.Optional[BaseException], - exc_tb: t.Optional[TracebackType], - ) -> None: - """Locked context destructor for this device""" - self.release() class BatchQueue(Queue[InferenceRequest]): @@ -134,8 +102,6 @@ def __init__( A disposable queue is always full.""" self._model_key: FeatureStoreKey = model_key """Key of the model which needs to be executed on the queued requets""" - self._flush_lock = RLock() - """Lock used to make sure only one process can flush the queue (unused now)""" self._uid = str(uuid.uuid4()) """Unique ID of queue""" @@ -144,30 +110,6 @@ def uid(self) -> str: """ID of this queue""" return self._uid - def acquire(self, blocking: bool = True, timeout: float = -1) -> t.Optional[bool]: - """Acquire queue lock to flush - :param blocking: whether to block on lock acquisition - :param timeout: Time to wait if blocking, before raising exception - """ - return self._flush_lock.acquire(blocking=blocking, timeout=timeout) - - def release(self) -> None: - """Release queue lock""" - self._flush_lock.release() - - def __enter__(self) -> None: - """Method to use the Queue as a Context Manager""" - self.acquire() - - def __exit__( - self, - exc_type: t.Optional[t.Type[BaseException]], - exc_val: t.Optional[BaseException], - exc_tb: t.Optional[TracebackType], - ) -> None: - """Method to release the Queue as a Context Manager""" - self.release() - @property def model_key(self) -> ModelIdentifier: """Key of the model which needs to be run on the queued requests""" @@ -185,16 +127,11 @@ def put( :param timeout: Time (in seconds) to wait if block==True :raises Full: If an item cannot be put on the queue """ - if not self.acquire(blocking=False): + if self.full(): raise Full - try: - if self.full(): - raise Full - if self._first_put is None: - self._first_put = time.time() - super().put(item, block=block, timeout=timeout) - finally: - self.release() + if self._first_put is None: + self._first_put = time.time() + super().put(item, block=block, timeout=timeout) @property def _elapsed_time(self) -> float: @@ -208,7 +145,12 @@ def ready(self) -> bool: """True if the queue can be flushed""" if self.empty(): return False - return self.full() or (self._elapsed_time >= self._batch_timeout) + + timed_out = ( + self._batch_timeout > 0 and self._elapsed_time >= self._batch_timeout + ) + logger.debug(f"Is full: {self.full()} or has timed out: {timed_out}") + return self.full() or timed_out def make_disposable(self) -> None: """Set this queue as disposable, and never use it again after it gets flushed""" @@ -277,8 +219,6 @@ def __init__( """Time in seconds that has to be waited before flushing a non-full queue""" self._batch_size = batch_size """Total capacity of each batch queue.""" - self._queue_swap_lock: t.Optional[RLock] = None - """Lock used to swap the active queue for a key""" incoming_channel = config_loader.get_queue() if incoming_channel is None: raise SmartSimError("No incoming channel for dispatcher") @@ 
-299,7 +239,7 @@ def __init__( """The worker used to batch inputs""" self._mem_pool = MemoryPool.attach(dragon_gs_pool.create(2 * 1024**3).sdesc) """Memory pool used to share batched input tensors with the Worker Managers""" - self._perf_timer = PerfTimer(prefix="r_", debug=True, timing_on=True) + self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=True) """Performance timer""" def _check_feature_stores(self, request: InferenceRequest) -> bool: @@ -387,9 +327,6 @@ def _validate_request(self, request: InferenceRequest) -> bool: return all(checks) - def _on_start(self) -> None: - self._queue_swap_lock = RLock() - def _on_iteration(self) -> None: try: self._perf_timer.set_active(True) @@ -448,24 +385,21 @@ def _swap_queue(self, model_key: FeatureStoreKey) -> None: and make it the active one for a given model. :param model_key: The key of the model for which the queue has to be swapped - :raises SmartSimError: If the queue is not locked. """ - if self._queue_swap_lock is None: - raise SmartSimError("Queues were not locked") - with self._queue_swap_lock: - if model_key.key in self._queues: - for queue in self._queues[model_key.key]: - if not queue.full(): - self._active_queues[model_key.key] = queue - return - - new_queue = BatchQueue(self._batch_timeout, self._batch_size, model_key) - if model_key.key in self._queues: - self._queues[model_key.key].append(new_queue) - else: - self._queues[model_key.key] = [new_queue] - self._active_queues[model_key.key] = new_queue - return + + if model_key.key in self._queues: + for queue in self._queues[model_key.key]: + if not queue.full(): + self._active_queues[model_key.key] = queue + return + + new_queue = BatchQueue(self._batch_timeout, self._batch_size, model_key) + if model_key.key in self._queues: + self._queues[model_key.key].append(new_queue) + else: + self._queues[model_key.key] = [new_queue] + self._active_queues[model_key.key] = new_queue + return def dispatch(self, request: InferenceRequest) -> None: """Assign a request to a batch queue @@ -498,10 +432,9 @@ def flush_requests(self) -> None: """Get all requests from queues which are ready to be flushed. Place all avaliable request batches in the outgoing queue. 
""" - print(self._queues.items()) for queue_list in self._queues.values(): for queue in queue_list: - if queue.ready and queue.acquire(blocking=False): + if queue.ready: self._perf_timer.measure_time("find_queue") try: batch = RequestBatch( @@ -511,7 +444,6 @@ def flush_requests(self) -> None: ) finally: self._perf_timer.measure_time("flush_requests") - queue.release() try: fetch_results = self._worker.fetch_inputs( batch=batch, feature_stores=self._feature_stores @@ -544,7 +476,15 @@ def flush_requests(self) -> None: request.raw_inputs = [] request.input_meta = [] - self._outgoing_queue.put(batch) + try: + self._outgoing_queue.put(batch) + except Exception as exc: + exception_handler( + exc, + None, + "Error placing batch on task queue.", + ) + continue self._perf_timer.measure_time("put") def _can_shutdown(self) -> bool: diff --git a/smartsim/_core/mli/infrastructure/worker/torch_worker.py b/smartsim/_core/mli/infrastructure/worker/torch_worker.py index 7ea09b9af9..0639d59696 100644 --- a/smartsim/_core/mli/infrastructure/worker/torch_worker.py +++ b/smartsim/_core/mli/infrastructure/worker/torch_worker.py @@ -46,6 +46,9 @@ TransformOutputResult, ) +# pylint: enable=import-error + + torch.set_num_threads(1) torch.set_num_interop_threads(4) logger = get_logger(__name__) @@ -129,7 +132,7 @@ def transform_input( results.append(mem_alloc.serialize()) - return TransformInputResult(results, slices, all_dims) + return TransformInputResult(results, slices, all_dims, all_dtypes) # pylint: disable-next=unused-argument @staticmethod @@ -147,15 +150,17 @@ def execute( tensors = [] mem_allocs = [] - for transformed, dims in zip( - transform_result.transformed, transform_result.dims + for transformed, dims, dtype in zip( + transform_result.transformed, transform_result.dims, transform_result.dtypes ): mem_alloc = MemoryAlloc.attach(transformed) mem_allocs.append(mem_alloc) + itemsize = np.empty((1), dtype=dtype).itemsize tensors.append( torch.from_numpy( np.frombuffer( - mem_alloc.get_memview()[0 : np.prod(dims) * 4], dtype=np.float32 + mem_alloc.get_memview()[0 : np.prod(dims) * itemsize], + dtype=dtype, ).reshape(dims) ) ) diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index b3d47b13c7..30d41c0285 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -110,7 +110,11 @@ class TransformInputResult: """A wrapper around a transformed batch of input tensors""" def __init__( - self, result: t.Any, slices: list[slice], dims: list[list[int]] + self, + result: t.Any, + slices: list[slice], + dims: list[list[int]], + dtypes: list[str], ) -> None: """Initialize the object""" self.transformed = result @@ -120,6 +124,8 @@ def __init__( which request""" self.dims = dims """Dimension of the transformed tensors""" + self.dtypes = dtypes + """Data type of transformed tensors""" class ExecuteResult: diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py new file mode 100644 index 0000000000..8bed9fc16d --- /dev/null +++ b/tests/dragon/test_request_dispatcher.py @@ -0,0 +1,395 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. 
Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import io +import logging +import pathlib +import socket +import time +import typing as t +from queue import Empty + +import numpy as np +import pytest + +torch = pytest.importorskip("torch") +dragon = pytest.importorskip("dragon") + +import base64 +import multiprocessing as mp + +try: + mp.set_start_method("dragon") +except Exception: + pass + +import os + +import dragon.channels as dch +import dragon.infrastructure.policy as dragon_policy +import dragon.infrastructure.process_desc as dragon_process_desc +import dragon.native.process as dragon_process +from dragon import fli +from dragon.channels import Channel +from dragon.data.ddict.ddict import DDict +from dragon.managed_memory import MemoryAlloc, MemoryPool +from dragon.mpbridge.queues import DragonQueue + +from smartsim._core.entrypoints.service import Service +from smartsim._core.mli.comm.channel.channel import CommChannelBase +from smartsim._core.mli.comm.channel.dragonchannel import DragonCommChannel +from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel +from smartsim._core.mli.infrastructure.control.requestdispatcher import ( + RequestBatch, + RequestDispatcher, +) +from smartsim._core.mli.infrastructure.control.workermanager import ( + EnvironmentConfigLoader, +) +from smartsim._core.mli.infrastructure.storage.dragonfeaturestore import ( + DragonFeatureStore, +) +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore +from smartsim._core.mli.infrastructure.worker.torch_worker import TorchWorker +from smartsim._core.mli.message_handler import MessageHandler +from smartsim.log import get_logger + +from .featurestore import FileSystemFeatureStore +from .utils.channel import FileSystemCommChannel + +logger = get_logger(__name__) +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon + + +def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: + """Create a simple torch model and persist to disk for + testing purposes. 
+ + TODO: remove once unit tests are in place""" + # test_path = pathlib.Path(work_dir) + if not model_path.parent.exists(): + model_path.parent.mkdir(parents=True, exist_ok=True) + + model_path.unlink(missing_ok=True) + # model_path = test_path / "basic.pt" + + model = torch.nn.Linear(2, 1) + torch.save(model, model_path) + + return model_path + + +def mock_messages( + request_dispatcher_queue: DragonFLIChannel, + feature_store: FeatureStore, + feature_store_root_dir: pathlib.Path, + comm_channel_root_dir: pathlib.Path, +) -> None: + """Mock event producer for triggering the inference pipeline""" + feature_store_root_dir.mkdir(parents=True, exist_ok=True) + comm_channel_root_dir.mkdir(parents=True, exist_ok=True) + + model_path = persist_model_file(feature_store_root_dir.parent / "model_original.pt") + model_bytes = model_path.read_bytes() + model_key = str(feature_store_root_dir / "model_fs.pt") + + feature_store[model_key] = model_bytes + + for iteration_number in range(2): + time.sleep(1) + # 1. for demo, ignore upstream and just put stuff into downstream + # 2. for demo, only one downstream but we'd normally have to filter + # msg content and send to the correct downstream (worker) queue + # timestamp = time.time_ns() + # mock_channel = test_path / f"brainstorm-{timestamp}.txt" + # mock_channel.touch() + + # thread - just look for key (wait for keys) + # call checkpoint, try to get non-persistent key, it blocks + # working set size > 1 has side-effects + # only incurs cost when working set size has been exceeded + + channel_key = Channel.make_process_local().serialize() + callback_channel = DragonCommChannel(channel_key) + + input_path = feature_store_root_dir / f"{iteration_number}/input.pt" + output_path = feature_store_root_dir / f"{iteration_number}/output.pt" + + input_key = str(input_path) + output_key = str(output_path) + + tensor = ( + (iteration_number + 1) * torch.ones((1, 2), dtype=torch.float32) + ).numpy() + fsd = feature_store.descriptor + + tensor_desc = MessageHandler.build_tensor_descriptor( + "c", "float32", list(tensor.shape) + ) + + message_tensor_output_key = MessageHandler.build_tensor_key(output_key, fsd) + message_tensor_input_key = MessageHandler.build_tensor_key(input_key, fsd) + message_model_key = MessageHandler.build_model_key(model_key, fsd) + + request = MessageHandler.build_request( + reply_channel=callback_channel.descriptor, + model=message_model_key, + inputs=[tensor_desc], + outputs=[message_tensor_output_key], + output_descriptors=[], + custom_attributes=None, + ) + request_bytes = MessageHandler.serialize_request(request) + with request_dispatcher_queue._fli.sendh( + timeout=None, stream_channel=request_dispatcher_queue._channel + ) as sendh: + sendh.send_bytes(request_bytes) + sendh.send_bytes(tensor.tobytes()) + + +@pytest.fixture +def prepare_environment(test_dir: str) -> pathlib.Path: + """Cleanup prior outputs to run demo repeatedly""" + path = pathlib.Path(f"{test_dir}/workermanager.log") + logging.basicConfig(filename=path.absolute(), level=logging.DEBUG) + return path + + +def service_as_dragon_proc( + service: Service, cpu_affinity: list[int], gpu_affinity: list[int] +) -> dragon_process.Process: + + options = dragon_process_desc.ProcessOptions(make_inf_channels=True) + local_policy = dragon_policy.Policy( + placement=dragon_policy.Policy.Placement.HOST_NAME, + host_name=socket.gethostname(), + cpu_affinity=cpu_affinity, + gpu_affinity=gpu_affinity, + ) + return dragon_process.Process( + target=service.execute, + args=[], + 
cwd=os.getcwd(), + policy=local_policy, + options=options, + stderr=dragon_process.Popen.STDOUT, + stdout=dragon_process.Popen.STDOUT, + ) + + +def test_request_dispatcher_batching(prepare_environment: pathlib.Path) -> None: + """Test dispatcher's batching of requests""" + + test_path = prepare_environment + fs_path = test_path / "feature_store" + comm_path = test_path / "comm_store" + + to_worker_channel = dch.Channel.make_process_local() + to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) + to_worker_fli_serialized = to_worker_fli.serialize() + + # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader + # or test environment may be unable to send messages w/queue + descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8") + os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor + + ddict = DDict(1, 1) + dd_descriptor = ddict.serialize() + + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=DragonCommChannel, + queue_factory=DragonFLIChannel.from_descriptor, + ) + integrated_worker_type = TorchWorker + + request_dispatcher = RequestDispatcher( + batch_timeout=0, + batch_size=2, + config_loader=config_loader, + worker_type=integrated_worker_type, + ) + + worker_queue = config_loader.get_queue() + if worker_queue is None: + logger.warn( + "FLI input queue not loaded correctly from config_loader: " + f"{config_loader._queue_descriptor}" + ) + + # create a mock client application to populate the request queue + msg_pump = mp.Process( + target=mock_messages, + args=( + worker_queue, + DragonFeatureStore(ddict), + fs_path, + comm_path, + ), + ) + msg_pump.start() + + # create a process to execute commands + process = service_as_dragon_proc(request_dispatcher, [], []) + process.start() + + batch: RequestBatch = request_dispatcher.task_queue.get(timeout=None) + + try: + + assert batch.has_valid_requests + tensors = [] + mem_allocs = [] + + transform_result = batch.inputs + for transformed, dims, dtype in zip( + transform_result.transformed, transform_result.dims, transform_result.dtypes + ): + mem_alloc = MemoryAlloc.attach(transformed) + mem_allocs.append(mem_alloc) + itemsize = np.empty((1), dtype=dtype).itemsize + tensors.append( + torch.from_numpy( + np.frombuffer( + mem_alloc.get_memview()[0 : np.prod(dims) * itemsize], + dtype=dtype, + ).reshape(dims) + ) + ) + + assert len(batch.requests) == 2 + assert len(tensors) == 1 + assert tensors[0].shape == torch.Size([2, 2]) + model_key = str(fs_path / "model_fs.pt") + assert batch.model_key.key == model_key + + for tensor in tensors: + for sample_idx in range(tensor.shape[0]): + tensor_in = tensor[sample_idx] + tensor_out = (sample_idx + 1) * torch.ones((2,), dtype=torch.float32) + assert torch.equal(tensor_in, tensor_out) + + except Exception as exc: + raise exc + finally: + for mem_alloc in mem_allocs: + mem_alloc.free() + + process.join(timeout=5) + process.kill() + msg_pump.kill() + + +def test_request_dispatcher_queues(prepare_environment: pathlib.Path) -> None: + """Test the request dispatcher internal queues""" + + test_path = prepare_environment + fs_path = test_path / "feature_store" + comm_path = test_path / "comm_store" + + to_worker_channel = dch.Channel.make_process_local() + to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) + to_worker_fli_serialized = to_worker_fli.serialize() + + # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader + # or test environment 
may be unable to send messages w/queue + descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8") + os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor + + ddict = DDict(1, 1) + dd_descriptor = ddict.serialize() + + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=DragonCommChannel, + queue_factory=DragonFLIChannel.from_descriptor, + ) + integrated_worker_type = TorchWorker + + request_dispatcher = RequestDispatcher( + batch_timeout=0, + batch_size=2, + config_loader=config_loader, + worker_type=integrated_worker_type, + ) + + worker_queue = config_loader.get_queue() + if worker_queue is None: + logger.warn( + "FLI input queue not loaded correctly from config_loader: " + f"{config_loader._queue_descriptor}" + ) + + request_dispatcher._on_start() + + # create a mock client application to populate the request queue + msg_pump = mp.Process( + target=mock_messages, + args=( + worker_queue, + DragonFeatureStore(ddict), + fs_path, + comm_path, + ), + ) + msg_pump.start() + + batch: t.Optional[RequestBatch] = None + for attempts in range(10): + try: + request_dispatcher._on_iteration() + batch = request_dispatcher.task_queue.get(timeout=1) + break + except Empty as exc: + continue + + try: + assert batch is not None + assert batch.has_valid_requests + mem_allocs = [] + + transform_result = batch.inputs + for transformed in transform_result.transformed: + mem_alloc = MemoryAlloc.attach(transformed) + mem_allocs.append(mem_alloc) + + assert len(batch.requests) == 2 + model_key = str(fs_path / "model_fs.pt") + assert batch.model_key.key == model_key + assert model_key in request_dispatcher._queues + assert model_key in request_dispatcher._active_queues + assert len(request_dispatcher._queues[model_key]) == 1 + assert request_dispatcher._queues[model_key][0].empty() + assert request_dispatcher._queues[model_key][0].model_key.key == model_key + + except Exception as exc: + raise exc + finally: + for mem_alloc in mem_allocs: + mem_alloc.free() + + msg_pump.kill() diff --git a/tests/dragon/test_worker_manager.py b/tests/dragon/test_worker_manager.py index fcbcc20b77..ac466491d7 100644 --- a/tests/dragon/test_worker_manager.py +++ b/tests/dragon/test_worker_manager.py @@ -35,14 +35,13 @@ dragon = pytest.importorskip("dragon") import base64 +import multiprocessing as mp import os import dragon.channels as dch from dragon import fli from dragon.mpbridge.queues import DragonQueue -import multiprocessing as mp - from smartsim._core.mli.comm.channel.channel import CommChannelBase from smartsim._core.mli.comm.channel.dragonfli import DragonFLIChannel from smartsim._core.mli.infrastructure.control.workermanager import ( @@ -184,7 +183,7 @@ def test_worker_manager(prepare_environment: pathlib.Path) -> None: as_service=True, cooldown=5, device="cpu", - dispatcher_queue=mp.Queue(maxsize=0) + dispatcher_queue=mp.Queue(maxsize=0), ) worker_queue = config_loader.get_queue() From 4a5185bcf8c75d7fa116432644d5bc1a0258b1f3 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Mon, 26 Aug 2024 11:17:16 -0500 Subject: [PATCH 65/84] Added tests for device manager --- .../infrastructure/control/devicemanager.py | 43 +++-- .../control/requestdispatcher.py | 40 +++-- .../infrastructure/control/workermanager.py | 153 +++++++++-------- .../_core/mli/infrastructure/worker/worker.py | 1 - tests/dragon/test_error_handling.py | 4 +- tests/dragon/test_request_dispatcher.py | 130 +++++++------- tests/mli/test_device_manager.py | 162 ++++++++++++++++++ 7 
files changed, 366 insertions(+), 167 deletions(-) create mode 100644 tests/mli/test_device_manager.py diff --git a/smartsim/_core/mli/infrastructure/control/devicemanager.py b/smartsim/_core/mli/infrastructure/control/devicemanager.py index 09fab32f95..49f8403b8c 100644 --- a/smartsim/_core/mli/infrastructure/control/devicemanager.py +++ b/smartsim/_core/mli/infrastructure/control/devicemanager.py @@ -24,12 +24,12 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +from contextlib import contextmanager import typing as t from .....log import get_logger from ...infrastructure.storage.featurestore import FeatureStore -from ..worker.worker import MachineLearningWorkerBase -from .requestdispatcher import RequestBatch +from ..worker.worker import MachineLearningWorkerBase, RequestBatch logger = get_logger(__name__) @@ -68,15 +68,33 @@ def get_model(self, key: str) -> t.Any: """Get the model corresponding to a given key :param key: the model key + :returns: the model for the given key """ return self._models[key] def __contains__(self, key: str) -> bool: + """Check if model with a given key is available on the device + + :param key: the key of the model to check for existence + :returns: whether the model is available on the device + """ return key in self._models + @contextmanager + def get(self, key_to_remove: t.Optional[str]): + yield self + if key_to_remove is not None: + self.remove_model(key_to_remove) class DeviceManager: def __init__(self, device: WorkerDevice): + """An object to manage devices such as GPUs and CPUs. + + The main goal of the ``DeviceManager`` is to ensure that + the managed device is ready to be used by a worker to + run a given model + :param device: The managed device + """ self._device = device """Device managed by this object""" @@ -86,6 +104,14 @@ def _load_model_on_device( batch: RequestBatch, feature_stores: dict[str, FeatureStore], ) -> None: + """Load the model needed to execute on a batch on the managed device. + + The model is loaded by the worker. 
+ + :param worker: the worker that loads the model + :param batch: the batch for which the model is needed + :param feature_stores: feature stores where the model could be stored + """ model_bytes = worker.fetch_model(batch, feature_stores) loaded_model = worker.load_model(batch, model_bytes, self._device.name) @@ -113,12 +139,7 @@ def get_device( # Load model if not already loaded, or # because it is sent with the request if model_in_request or not batch.model_key.key in self._device: - try: - self._load_model_on_device(worker, batch, feature_stores) - except Exception as exc: - raise exc - try: - yield self._device - finally: - if model_in_request: - self._device.remove_model(batch.model_key.key) + self._load_model_on_device(worker, batch, feature_stores) + + key_to_remove = batch.model_key.key if model_in_request else None + return self._device.get(key_to_remove) diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index 20786fdf9a..0016c18a9b 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -41,7 +41,6 @@ import typing as t import uuid from queue import Empty, Full, Queue -from types import TracebackType from smartsim._core.entrypoints.service import Service @@ -65,18 +64,6 @@ # Placeholder ModelIdentifier = FeatureStoreKey - -class WorkerDevice: - def __init__(self, name: str) -> None: - """Wrapper around a device to keep track of loaded Models and availability - :param name: name used by the toolkit to identify this device, e.g. ``cuda:0`` - """ - self._name = name - """The name used by the toolkit to identify this device""" - self._models: dict[str, t.Any] = {} - """Dictionary of model key to model for models stored on this device""" - - class BatchQueue(Queue[InferenceRequest]): def __init__( self, batch_timeout: float, batch_size: int, model_key: ModelIdentifier @@ -366,14 +353,37 @@ def _on_iteration(self) -> None: self._perf_timer.measure_time("dispatch") finally: self.flush_requests() - # TODO: implement this - # self.remove_queues() + self.remove_queues() self._perf_timer.end_timings() if self._perf_timer.max_length == 801 and self._perf_timer.is_active: self._perf_timer.print_timings(True) + def remove_queues(self) -> None: + """Remove references to queues that can be removed + and allow them to be garbage collected""" + queue_lists_to_remove = [] + for key, queues in self._queues.items(): + queues_to_remove = [] + for queue in queues: + if queue.can_be_removed: + queues_to_remove.append(queue) + + for queue_to_remove in queues_to_remove: + queues.remove(queue_to_remove) + if ( + key in self._active_queues + and self._active_queues[key] == queue_to_remove + ): + del self._active_queues[key] + + if len(queues) == 0: + queue_lists_to_remove.append(key) + + for key in queue_lists_to_remove: + del self._queues[key] + @property def task_queue(self) -> DragonQueue: """The queue on which batched requests are placed""" diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 3949476b6b..8256ce4f55 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -191,13 +191,12 @@ def _on_iteration(self) -> None: "Error acquiring device manager", ) return - device: WorkerDevice = next( - self._device_manager.get_device( + device_cm = 
self._device_manager.get_device( worker=self._worker, batch=batch, feature_stores=self._feature_stores, ) - ) + except Exception as exc: for request in batch.requests: exception_handler( @@ -208,90 +207,92 @@ def _on_iteration(self) -> None: return self._perf_timer.measure_time("fetch_model") - try: - model_result = LoadModelResult(device.get_model(batch.model_key.key)) - except Exception as exc: - for request in batch.requests: - exception_handler( - exc, request.callback, "Error getting model from device." - ) - return - self._perf_timer.measure_time("load_model") + with device_cm as device: - if batch.inputs is None: - for request in batch.requests: - exception_handler( - ValueError("Error batching inputs"), - request.callback, - "Error batching inputs.", - ) - return - transformed_input = batch.inputs + try: + model_result = LoadModelResult(device.get_model(batch.model_key.key)) + except Exception as exc: + for request in batch.requests: + exception_handler( + exc, request.callback, "Error getting model from device." + ) + return + self._perf_timer.measure_time("load_model") - try: - execute_result = self._worker.execute( - batch, model_result, transformed_input, device.name - ) - except Exception as e: - for request in batch.requests: - exception_handler(e, request.callback, "Failed while executing.") - return - self._perf_timer.measure_time("execute") + if batch.inputs is None: + for request in batch.requests: + exception_handler( + ValueError("Error batching inputs"), + request.callback, + "Error batching inputs.", + ) + return + transformed_input = batch.inputs - try: - transformed_outputs = self._worker.transform_output(batch, execute_result) - except Exception as e: - for request in batch.requests: - exception_handler( - e, request.callback, "Failed while transforming the output." + try: + execute_result = self._worker.execute( + batch, model_result, transformed_input, device.name ) - return + except Exception as e: + for request in batch.requests: + exception_handler(e, request.callback, "Failed while executing.") + return + self._perf_timer.measure_time("execute") - for request, transformed_output in zip(batch.requests, transformed_outputs): - reply = InferenceReply() - if request.output_keys: - try: - reply.output_keys = self._worker.place_output( - request, - transformed_output, - self._feature_stores, - ) - except Exception as e: + try: + transformed_outputs = self._worker.transform_output(batch, execute_result) + except Exception as e: + for request in batch.requests: exception_handler( - e, request.callback, "Failed while placing the output." + e, request.callback, "Failed while transforming the output." + ) + return + + for request, transformed_output in zip(batch.requests, transformed_outputs): + reply = InferenceReply() + if request.output_keys: + try: + reply.output_keys = self._worker.place_output( + request, + transformed_output, + self._feature_stores, + ) + except Exception as e: + exception_handler( + e, request.callback, "Failed while placing the output." 
+ ) + continue + else: + reply.outputs = transformed_output.outputs + self._perf_timer.measure_time("assign_output") + + if reply.outputs is None or not reply.outputs: + response = build_failure_reply("fail", "Outputs not found.") + else: + reply.status_enum = "complete" + reply.message = "Success" + + results = self._worker.prepare_outputs(reply) + response = MessageHandler.build_response( + status=reply.status_enum, + message=reply.message, + result=results, + custom_attributes=None, ) - continue - else: - reply.outputs = transformed_output.outputs - self._perf_timer.measure_time("assign_output") - - if reply.outputs is None or not reply.outputs: - response = build_failure_reply("fail", "Outputs not found.") - else: - reply.status_enum = "complete" - reply.message = "Success" - - results = self._worker.prepare_outputs(reply) - response = MessageHandler.build_response( - status=reply.status_enum, - message=reply.message, - result=results, - custom_attributes=None, - ) - self._perf_timer.measure_time("build_reply") + self._perf_timer.measure_time("build_reply") - serialized_resp = MessageHandler.serialize_response(response) + serialized_resp = MessageHandler.serialize_response(response) - self._perf_timer.measure_time("serialize_resp") + self._perf_timer.measure_time("serialize_resp") - if request.callback: - request.callback.send(serialized_resp) - if reply.outputs: - # send tensor data after response - for output in reply.outputs: - request.callback.send(output) - self._perf_timer.measure_time("send") + if request.callback: + request.callback.send(serialized_resp) + if reply.outputs: + # send tensor data after response + for output in reply.outputs: + request.callback.send(output) + self._perf_timer.measure_time("send") self._perf_timer.end_timings() diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 30d41c0285..008b6202be 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -406,7 +406,6 @@ def load_model( device memory :param request: The request that triggered the pipeline :param device: The device on which the model must be placed - :param device: The device on which the model must be placed :return: ModelLoadResult wrapping the model loaded for the request""" @staticmethod diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 21a5758311..113f7ccba0 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -141,7 +141,7 @@ def setup_worker_manager_model_bytes( request_batch = RequestBatch( [request], - TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]]), + TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]], ["float32"]), model_id, ) @@ -200,7 +200,7 @@ def setup_worker_manager_model_key( ) request_batch = RequestBatch( [request], - TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]]), + TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]], ["float32"]), model_key=model_key, ) diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index 8bed9fc16d..d1e97a8b5b 100644 --- a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -120,18 +120,7 @@ def mock_messages( feature_store[model_key] = model_bytes for iteration_number in range(2): - time.sleep(1) - # 1. for demo, ignore upstream and just put stuff into downstream - # 2. 
for demo, only one downstream but we'd normally have to filter - # msg content and send to the correct downstream (worker) queue - # timestamp = time.time_ns() - # mock_channel = test_path / f"brainstorm-{timestamp}.txt" - # mock_channel.touch() - - # thread - just look for key (wait for keys) - # call checkpoint, try to get non-persistent key, it blocks - # working set size > 1 has side-effects - # only incurs cost when working set size has been exceeded + time.sleep(0.1) channel_key = Channel.make_process_local().serialize() callback_channel = DragonCommChannel(channel_key) @@ -156,7 +145,7 @@ def mock_messages( message_model_key = MessageHandler.build_model_key(model_key, fsd) request = MessageHandler.build_request( - reply_channel=callback_channel.descriptor, + reply_channel=base64.b64encode(callback_channel.descriptor).decode("utf-8"), model=message_model_key, inputs=[tensor_desc], outputs=[message_tensor_output_key], @@ -218,11 +207,10 @@ def test_request_dispatcher_batching(prepare_environment: pathlib.Path) -> None: os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor ddict = DDict(1, 1) - dd_descriptor = ddict.serialize() config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=DragonCommChannel, + callback_factory=DragonCommChannel.from_descriptor, queue_factory=DragonFLIChannel.from_descriptor, ) integrated_worker_type = TorchWorker @@ -260,7 +248,6 @@ def test_request_dispatcher_batching(prepare_environment: pathlib.Path) -> None: batch: RequestBatch = request_dispatcher.task_queue.get(timeout=None) try: - assert batch.has_valid_requests tensors = [] mem_allocs = [] @@ -305,7 +292,11 @@ def test_request_dispatcher_batching(prepare_environment: pathlib.Path) -> None: def test_request_dispatcher_queues(prepare_environment: pathlib.Path) -> None: - """Test the request dispatcher internal queues""" + """Test the request dispatcher internal queues + + This also includes setting a queue to disposable, checking that it is no + longer referenced and that it is re-created when needed. 
+ """ test_path = prepare_environment fs_path = test_path / "feature_store" @@ -321,11 +312,10 @@ def test_request_dispatcher_queues(prepare_environment: pathlib.Path) -> None: os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor ddict = DDict(1, 1) - dd_descriptor = ddict.serialize() config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=DragonCommChannel, + callback_factory=DragonCommChannel.from_descriptor, queue_factory=DragonFLIChannel.from_descriptor, ) integrated_worker_type = TorchWorker @@ -346,50 +336,66 @@ def test_request_dispatcher_queues(prepare_environment: pathlib.Path) -> None: request_dispatcher._on_start() - # create a mock client application to populate the request queue - msg_pump = mp.Process( - target=mock_messages, - args=( - worker_queue, - DragonFeatureStore(ddict), - fs_path, - comm_path, - ), - ) - msg_pump.start() + model_key = str(fs_path / "model_fs.pt") - batch: t.Optional[RequestBatch] = None - for attempts in range(10): - try: - request_dispatcher._on_iteration() - batch = request_dispatcher.task_queue.get(timeout=1) - break - except Empty as exc: - continue - - try: - assert batch is not None - assert batch.has_valid_requests + for iteration in range(2): + batch: t.Optional[RequestBatch] = None mem_allocs = [] - transform_result = batch.inputs - for transformed in transform_result.transformed: - mem_alloc = MemoryAlloc.attach(transformed) - mem_allocs.append(mem_alloc) - - assert len(batch.requests) == 2 - model_key = str(fs_path / "model_fs.pt") - assert batch.model_key.key == model_key - assert model_key in request_dispatcher._queues - assert model_key in request_dispatcher._active_queues - assert len(request_dispatcher._queues[model_key]) == 1 - assert request_dispatcher._queues[model_key][0].empty() - assert request_dispatcher._queues[model_key][0].model_key.key == model_key - - except Exception as exc: - raise exc - finally: - for mem_alloc in mem_allocs: - mem_alloc.free() + # create a mock client application to populate the request queue + msg_pump = mp.Process( + target=mock_messages, + args=( + worker_queue, + DragonFeatureStore(ddict), + fs_path, + comm_path, + ), + ) + msg_pump.start() + + for attempts in range(15): + try: + request_dispatcher._on_iteration() + batch = request_dispatcher.task_queue.get(timeout=1) + break + except Empty: + logger.info("Empty queue") + continue + except Exception as exc: + logger.info(f"Failed at iteration #{iteration}") + raise exc - msg_pump.kill() + try: + assert batch is not None + assert batch.has_valid_requests + + transform_result = batch.inputs + for transformed in transform_result.transformed: + mem_alloc = MemoryAlloc.attach(transformed) + mem_allocs.append(mem_alloc) + + assert len(batch.requests) == 2 + assert batch.model_key.key == model_key + assert model_key in request_dispatcher._queues + assert model_key in request_dispatcher._active_queues + assert len(request_dispatcher._queues[model_key]) == 1 + assert request_dispatcher._queues[model_key][0].empty() + assert request_dispatcher._queues[model_key][0].model_key.key == model_key + + except Exception as exc: + logger.log(f"Failed at iteration #{iteration}") + raise exc + finally: + for mem_alloc in mem_allocs: + mem_alloc.free() + + msg_pump.kill() + + request_dispatcher._active_queues[model_key].make_disposable() + assert request_dispatcher._active_queues[model_key].can_be_removed + + request_dispatcher._on_iteration() + + assert model_key not in request_dispatcher._active_queues + 
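        # A rough sketch, assuming the attribute and method names used in this
        # patch, of the cleanup that makes the assertions around this point hold:
        # remove_queues(), called from _on_iteration(), drops every BatchQueue
        # whose can_be_removed flag is set and deletes a model's bookkeeping once
        # its queue list is empty, e.g.:
        #
        #     for key, queues in list(dispatcher._queues.items()):
        #         queues[:] = [q for q in queues if not q.can_be_removed]
        #         if not queues:
        #             del dispatcher._queues[key]
        #             dispatcher._active_queues.pop(key, None)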
assert model_key not in request_dispatcher._queues diff --git a/tests/mli/test_device_manager.py b/tests/mli/test_device_manager.py new file mode 100644 index 0000000000..12fe2578af --- /dev/null +++ b/tests/mli/test_device_manager.py @@ -0,0 +1,162 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import typing as t + + +from smartsim._core.mli.infrastructure.control.devicemanager import DeviceManager, WorkerDevice +from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore, FeatureStoreKey +from smartsim._core.mli.infrastructure.worker.worker import MachineLearningWorkerBase, ExecuteResult, FetchInputResult, FetchModelResult, InferenceRequest, LoadModelResult, RequestBatch, TransformInputResult, TransformOutputResult + +class MockWorker(MachineLearningWorkerBase): + @staticmethod + def fetch_model( + batch: RequestBatch, feature_stores: t.Dict[str, FeatureStore] + ) -> FetchModelResult: + if batch.has_raw_model: + return FetchModelResult(batch.raw_model) + return FetchModelResult(b'fetched_model') + + @staticmethod + def load_model( + batch: RequestBatch, fetch_result: FetchModelResult, device: str + ) -> LoadModelResult: + return LoadModelResult(fetch_result.model_bytes) + + @staticmethod + def transform_input( + batch: RequestBatch, + fetch_results: list[FetchInputResult], + mem_pool: "MemoryPool", + ) -> TransformInputResult: + return TransformInputResult(b'result', [slice(0,1)], [[1,2]], ["float32"]) + + @staticmethod + def execute( + batch: RequestBatch, + load_result: LoadModelResult, + transform_result: TransformInputResult, + device: str, + ) -> ExecuteResult: + return ExecuteResult(b'result', [slice(0,1)]) + + @staticmethod + def transform_output( + batch: RequestBatch, execute_result: ExecuteResult + ) -> t.List[TransformOutputResult]: + return [TransformOutputResult(b'result', None, "c", "float32")] + + +def test_worker_device(): + worker_device = WorkerDevice("gpu:0") + assert worker_device.name == "gpu:0" + + model_key = "my_model_key" + model = b"the model" + + worker_device.add_model(model_key, model) + + assert model_key in worker_device + assert worker_device.get_model(model_key) == model + 
worker_device.remove_model(model_key) + + assert model_key not in worker_device + + +def test_device_manager_model_in_request(): + + worker_device = WorkerDevice("gpu:0") + device_manager = DeviceManager(worker_device) + + worker = MockWorker() + + tensor_key = FeatureStoreKey(key="key", descriptor="desc") + output_key = FeatureStoreKey(key="key", descriptor="desc") + model_key = FeatureStoreKey( + key="model key", descriptor="desc" + ) + + request = InferenceRequest( + model_key=model_key, + callback=None, + raw_inputs=None, + input_keys=[tensor_key], + input_meta=None, + output_keys=[output_key], + raw_model=b"raw model", + batch_size=0, + ) + + request_batch = RequestBatch( + [request], + TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]], ["float32"]), + model_key=model_key, + ) + + with device_manager.get_device(worker=worker, batch=request_batch, feature_stores={}) as returned_device: + + assert returned_device == worker_device + assert worker_device.get_model(model_key.key) == b"raw model" + + assert model_key.key not in worker_device + + +def test_device_manager_model_key(): + + worker_device = WorkerDevice("gpu:0") + device_manager = DeviceManager(worker_device) + + worker = MockWorker() + + tensor_key = FeatureStoreKey(key="key", descriptor="desc") + output_key = FeatureStoreKey(key="key", descriptor="desc") + model_key = FeatureStoreKey( + key="model key", descriptor="desc" + ) + + request = InferenceRequest( + model_key=model_key, + callback=None, + raw_inputs=None, + input_keys=[tensor_key], + input_meta=None, + output_keys=[output_key], + raw_model=None, + batch_size=0, + ) + + request_batch = RequestBatch( + [request], + TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]], ["float32"]), + model_key=model_key, + ) + + with device_manager.get_device(worker=worker, batch=request_batch, feature_stores={}) as returned_device: + + assert returned_device == worker_device + assert worker_device.get_model(model_key.key) == b"fetched_model" + + assert model_key.key in worker_device \ No newline at end of file From 9d0ba309d1c61d6a3bb86c7c1fa90084e13ac5fa Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Mon, 26 Aug 2024 13:00:11 -0500 Subject: [PATCH 66/84] Fix tests --- tests/dragon/test_request_dispatcher.py | 167 +++++++----------------- tests/dragon/test_worker_manager.py | 6 + 2 files changed, 50 insertions(+), 123 deletions(-) diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index d1e97a8b5b..f47ef46d7a 100644 --- a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -95,7 +95,6 @@ def persist_model_file(model_path: pathlib.Path) -> pathlib.Path: model_path.parent.mkdir(parents=True, exist_ok=True) model_path.unlink(missing_ok=True) - # model_path = test_path / "basic.pt" model = torch.nn.Linear(2, 1) torch.save(model, model_path) @@ -110,6 +109,7 @@ def mock_messages( comm_channel_root_dir: pathlib.Path, ) -> None: """Mock event producer for triggering the inference pipeline""" + logger.info("Mocking messages") feature_store_root_dir.mkdir(parents=True, exist_ok=True) comm_channel_root_dir.mkdir(parents=True, exist_ok=True) @@ -117,10 +117,11 @@ def mock_messages( model_bytes = model_path.read_bytes() model_key = str(feature_store_root_dir / "model_fs.pt") + logger.info("Putting model on FS") feature_store[model_key] = model_bytes for iteration_number in range(2): - time.sleep(0.1) + logger.info(f"Message #{iteration_number}") channel_key = 
Channel.make_process_local().serialize() callback_channel = DragonCommChannel(channel_key) @@ -158,6 +159,7 @@ def mock_messages( ) as sendh: sendh.send_bytes(request_bytes) sendh.send_bytes(tensor.tobytes()) + time.sleep(1) @pytest.fixture @@ -189,9 +191,12 @@ def service_as_dragon_proc( stdout=dragon_process.Popen.STDOUT, ) +def test_request_dispatcher(prepare_environment: pathlib.Path) -> None: + """Test the request dispatcher batching and queueing system -def test_request_dispatcher_batching(prepare_environment: pathlib.Path) -> None: - """Test dispatcher's batching of requests""" + This also includes setting a queue to disposable, checking that it is no + longer referenced by the dispatcher. + """ test_path = prepare_environment fs_path = test_path / "feature_store" @@ -206,7 +211,8 @@ def test_request_dispatcher_batching(prepare_environment: pathlib.Path) -> None: descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8") os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor - ddict = DDict(1, 1) + ddict = DDict(1, 1, 2*1024**2) + dragon_fs = DragonFeatureStore(ddict) config_loader = EnvironmentConfigLoader( featurestore_factory=DragonFeatureStore.from_descriptor, @@ -229,28 +235,43 @@ def test_request_dispatcher_batching(prepare_environment: pathlib.Path) -> None: f"{config_loader._queue_descriptor}" ) + request_dispatcher._on_start() + + batch: t.Optional[RequestBatch] = None + mem_allocs = [] + tensors = [] + fs_path = test_path / f"feature_store" + comm_path = test_path / f"comm_store" + model_key = str(fs_path / "model_fs.pt") + # create a mock client application to populate the request queue msg_pump = mp.Process( target=mock_messages, args=( worker_queue, - DragonFeatureStore(ddict), + dragon_fs, fs_path, comm_path, ), ) + msg_pump.start() - # create a process to execute commands - process = service_as_dragon_proc(request_dispatcher, [], []) - process.start() + time.sleep(1) - batch: RequestBatch = request_dispatcher.task_queue.get(timeout=None) + for attempts in range(15): + try: + request_dispatcher._on_iteration() + batch = request_dispatcher.task_queue.get(timeout=1) + break + except Empty: + continue + except Exception as exc: + raise exc try: + assert batch is not None assert batch.has_valid_requests - tensors = [] - mem_allocs = [] transform_result = batch.inputs for transformed, dims, dtype in zip( @@ -269,10 +290,14 @@ def test_request_dispatcher_batching(prepare_environment: pathlib.Path) -> None: ) assert len(batch.requests) == 2 + assert batch.model_key.key == model_key + assert model_key in request_dispatcher._queues + assert model_key in request_dispatcher._active_queues + assert len(request_dispatcher._queues[model_key]) == 1 + assert request_dispatcher._queues[model_key][0].empty() + assert request_dispatcher._queues[model_key][0].model_key.key == model_key assert len(tensors) == 1 assert tensors[0].shape == torch.Size([2, 2]) - model_key = str(fs_path / "model_fs.pt") - assert batch.model_key.key == model_key for tensor in tensors: for sample_idx in range(tensor.shape[0]): @@ -286,116 +311,12 @@ def test_request_dispatcher_batching(prepare_environment: pathlib.Path) -> None: for mem_alloc in mem_allocs: mem_alloc.free() - process.join(timeout=5) - process.kill() msg_pump.kill() + request_dispatcher._active_queues[model_key].make_disposable() + assert request_dispatcher._active_queues[model_key].can_be_removed -def test_request_dispatcher_queues(prepare_environment: pathlib.Path) -> None: - """Test the request dispatcher internal queues - - This also 
includes setting a queue to disposable, checking that it is no - longer referenced and that it is re-created when needed. - """ - - test_path = prepare_environment - fs_path = test_path / "feature_store" - comm_path = test_path / "comm_store" - - to_worker_channel = dch.Channel.make_process_local() - to_worker_fli = fli.FLInterface(main_ch=to_worker_channel, manager_ch=None) - to_worker_fli_serialized = to_worker_fli.serialize() - - # NOTE: env vars should be set prior to instantiating EnvironmentConfigLoader - # or test environment may be unable to send messages w/queue - descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8") - os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor - - ddict = DDict(1, 1) - - config_loader = EnvironmentConfigLoader( - featurestore_factory=DragonFeatureStore.from_descriptor, - callback_factory=DragonCommChannel.from_descriptor, - queue_factory=DragonFLIChannel.from_descriptor, - ) - integrated_worker_type = TorchWorker - - request_dispatcher = RequestDispatcher( - batch_timeout=0, - batch_size=2, - config_loader=config_loader, - worker_type=integrated_worker_type, - ) - - worker_queue = config_loader.get_queue() - if worker_queue is None: - logger.warn( - "FLI input queue not loaded correctly from config_loader: " - f"{config_loader._queue_descriptor}" - ) - - request_dispatcher._on_start() - - model_key = str(fs_path / "model_fs.pt") - - for iteration in range(2): - batch: t.Optional[RequestBatch] = None - mem_allocs = [] - - # create a mock client application to populate the request queue - msg_pump = mp.Process( - target=mock_messages, - args=( - worker_queue, - DragonFeatureStore(ddict), - fs_path, - comm_path, - ), - ) - msg_pump.start() - - for attempts in range(15): - try: - request_dispatcher._on_iteration() - batch = request_dispatcher.task_queue.get(timeout=1) - break - except Empty: - logger.info("Empty queue") - continue - except Exception as exc: - logger.info(f"Failed at iteration #{iteration}") - raise exc - - try: - assert batch is not None - assert batch.has_valid_requests - - transform_result = batch.inputs - for transformed in transform_result.transformed: - mem_alloc = MemoryAlloc.attach(transformed) - mem_allocs.append(mem_alloc) - - assert len(batch.requests) == 2 - assert batch.model_key.key == model_key - assert model_key in request_dispatcher._queues - assert model_key in request_dispatcher._active_queues - assert len(request_dispatcher._queues[model_key]) == 1 - assert request_dispatcher._queues[model_key][0].empty() - assert request_dispatcher._queues[model_key][0].model_key.key == model_key - - except Exception as exc: - logger.log(f"Failed at iteration #{iteration}") - raise exc - finally: - for mem_alloc in mem_allocs: - mem_alloc.free() - - msg_pump.kill() - - request_dispatcher._active_queues[model_key].make_disposable() - assert request_dispatcher._active_queues[model_key].can_be_removed - - request_dispatcher._on_iteration() + request_dispatcher._on_iteration() - assert model_key not in request_dispatcher._active_queues - assert model_key not in request_dispatcher._queues + assert model_key not in request_dispatcher._active_queues + assert model_key not in request_dispatcher._queues diff --git a/tests/dragon/test_worker_manager.py b/tests/dragon/test_worker_manager.py index ac466491d7..a334164257 100644 --- a/tests/dragon/test_worker_manager.py +++ b/tests/dragon/test_worker_manager.py @@ -36,6 +36,12 @@ import base64 import multiprocessing as mp + +try: + mp.set_start_method("dragon") +except Exception: + 
pass + import os import dragon.channels as dch From 99da3558d080018497c4e90e96c6854dfc8b67e4 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Mon, 26 Aug 2024 13:08:35 -0500 Subject: [PATCH 67/84] Style and type --- .../infrastructure/control/devicemanager.py | 7 +-- .../control/requestdispatcher.py | 1 + .../infrastructure/control/workermanager.py | 12 +++-- tests/dragon/test_request_dispatcher.py | 3 +- tests/mli/test_device_manager.py | 48 ++++++++++++------- 5 files changed, 46 insertions(+), 25 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/devicemanager.py b/smartsim/_core/mli/infrastructure/control/devicemanager.py index 49f8403b8c..37256581db 100644 --- a/smartsim/_core/mli/infrastructure/control/devicemanager.py +++ b/smartsim/_core/mli/infrastructure/control/devicemanager.py @@ -24,8 +24,8 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from contextlib import contextmanager import typing as t +from contextlib import contextmanager, _GeneratorContextManager from .....log import get_logger from ...infrastructure.storage.featurestore import FeatureStore @@ -81,11 +81,12 @@ def __contains__(self, key: str) -> bool: return key in self._models @contextmanager - def get(self, key_to_remove: t.Optional[str]): + def get(self, key_to_remove: t.Optional[str]) -> t.Iterator[t.Self]: yield self if key_to_remove is not None: self.remove_model(key_to_remove) + class DeviceManager: def __init__(self, device: WorkerDevice): """An object to manage devices such as GPUs and CPUs. @@ -122,7 +123,7 @@ def get_device( worker: MachineLearningWorkerBase, batch: RequestBatch, feature_stores: dict[str, FeatureStore], - ) -> t.Generator[WorkerDevice, None, None]: + ) -> _GeneratorContextManager[WorkerDevice]: """Get the device managed by this object the model needed to run the batch of requests is diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index 0016c18a9b..a4de00a9f0 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -64,6 +64,7 @@ # Placeholder ModelIdentifier = FeatureStoreKey + class BatchQueue(Queue[InferenceRequest]): def __init__( self, batch_timeout: float, batch_size: int, model_key: ModelIdentifier diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 8256ce4f55..e2ce19dd6d 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -192,10 +192,10 @@ def _on_iteration(self) -> None: ) return device_cm = self._device_manager.get_device( - worker=self._worker, - batch=batch, - feature_stores=self._feature_stores, - ) + worker=self._worker, + batch=batch, + feature_stores=self._feature_stores, + ) except Exception as exc: for request in batch.requests: @@ -240,7 +240,9 @@ def _on_iteration(self) -> None: self._perf_timer.measure_time("execute") try: - transformed_outputs = self._worker.transform_output(batch, execute_result) + transformed_outputs = self._worker.transform_output( + batch, execute_result + ) except Exception as e: for request in batch.requests: exception_handler( diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index f47ef46d7a..768467c245 100644 --- 
a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -191,6 +191,7 @@ def service_as_dragon_proc( stdout=dragon_process.Popen.STDOUT, ) + def test_request_dispatcher(prepare_environment: pathlib.Path) -> None: """Test the request dispatcher batching and queueing system @@ -211,7 +212,7 @@ def test_request_dispatcher(prepare_environment: pathlib.Path) -> None: descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8") os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor - ddict = DDict(1, 1, 2*1024**2) + ddict = DDict(1, 1, 2 * 1024**2) dragon_fs = DragonFeatureStore(ddict) config_loader = EnvironmentConfigLoader( diff --git a/tests/mli/test_device_manager.py b/tests/mli/test_device_manager.py index 12fe2578af..1c8b9172da 100644 --- a/tests/mli/test_device_manager.py +++ b/tests/mli/test_device_manager.py @@ -26,10 +26,26 @@ import typing as t +from smartsim._core.mli.infrastructure.control.devicemanager import ( + DeviceManager, + WorkerDevice, +) +from smartsim._core.mli.infrastructure.storage.featurestore import ( + FeatureStore, + FeatureStoreKey, +) +from smartsim._core.mli.infrastructure.worker.worker import ( + ExecuteResult, + FetchInputResult, + FetchModelResult, + InferenceRequest, + LoadModelResult, + MachineLearningWorkerBase, + RequestBatch, + TransformInputResult, + TransformOutputResult, +) -from smartsim._core.mli.infrastructure.control.devicemanager import DeviceManager, WorkerDevice -from smartsim._core.mli.infrastructure.storage.featurestore import FeatureStore, FeatureStoreKey -from smartsim._core.mli.infrastructure.worker.worker import MachineLearningWorkerBase, ExecuteResult, FetchInputResult, FetchModelResult, InferenceRequest, LoadModelResult, RequestBatch, TransformInputResult, TransformOutputResult class MockWorker(MachineLearningWorkerBase): @staticmethod @@ -38,7 +54,7 @@ def fetch_model( ) -> FetchModelResult: if batch.has_raw_model: return FetchModelResult(batch.raw_model) - return FetchModelResult(b'fetched_model') + return FetchModelResult(b"fetched_model") @staticmethod def load_model( @@ -52,7 +68,7 @@ def transform_input( fetch_results: list[FetchInputResult], mem_pool: "MemoryPool", ) -> TransformInputResult: - return TransformInputResult(b'result', [slice(0,1)], [[1,2]], ["float32"]) + return TransformInputResult(b"result", [slice(0, 1)], [[1, 2]], ["float32"]) @staticmethod def execute( @@ -61,13 +77,13 @@ def execute( transform_result: TransformInputResult, device: str, ) -> ExecuteResult: - return ExecuteResult(b'result', [slice(0,1)]) + return ExecuteResult(b"result", [slice(0, 1)]) @staticmethod def transform_output( batch: RequestBatch, execute_result: ExecuteResult ) -> t.List[TransformOutputResult]: - return [TransformOutputResult(b'result', None, "c", "float32")] + return [TransformOutputResult(b"result", None, "c", "float32")] def test_worker_device(): @@ -95,9 +111,7 @@ def test_device_manager_model_in_request(): tensor_key = FeatureStoreKey(key="key", descriptor="desc") output_key = FeatureStoreKey(key="key", descriptor="desc") - model_key = FeatureStoreKey( - key="model key", descriptor="desc" - ) + model_key = FeatureStoreKey(key="model key", descriptor="desc") request = InferenceRequest( model_key=model_key, @@ -116,7 +130,9 @@ def test_device_manager_model_in_request(): model_key=model_key, ) - with device_manager.get_device(worker=worker, batch=request_batch, feature_stores={}) as returned_device: + with device_manager.get_device( + worker=worker, batch=request_batch, feature_stores={} + ) 
as returned_device: assert returned_device == worker_device assert worker_device.get_model(model_key.key) == b"raw model" @@ -133,9 +149,7 @@ def test_device_manager_model_key(): tensor_key = FeatureStoreKey(key="key", descriptor="desc") output_key = FeatureStoreKey(key="key", descriptor="desc") - model_key = FeatureStoreKey( - key="model key", descriptor="desc" - ) + model_key = FeatureStoreKey(key="model key", descriptor="desc") request = InferenceRequest( model_key=model_key, @@ -154,9 +168,11 @@ def test_device_manager_model_key(): model_key=model_key, ) - with device_manager.get_device(worker=worker, batch=request_batch, feature_stores={}) as returned_device: + with device_manager.get_device( + worker=worker, batch=request_batch, feature_stores={} + ) as returned_device: assert returned_device == worker_device assert worker_device.get_model(model_key.key) == b"fetched_model" - assert model_key.key in worker_device \ No newline at end of file + assert model_key.key in worker_device From c3646d7b477b0aa20448d3acdf25aba9ae343049 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Mon, 26 Aug 2024 13:15:58 -0500 Subject: [PATCH 68/84] Fix mock app --- ex/high_throughput_inference/mock_app.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 26045f9020..ea72b3dc16 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -100,14 +100,11 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): self.perf_timer.measure_time("build_request") request_bytes = MessageHandler.serialize_request(request) self.perf_timer.measure_time("serialize_request") - tensor_bytes = [bytes(tensor.data) for tensor in tensors] - # tensor_bytes = [tensor.reshape(-1).view(numpy.uint8).data for tensor in tensors] self.perf_timer.measure_time("serialize_tensor") with self._to_worker_fli.sendh(timeout=None, stream_channel=self._to_worker_ch) as to_sendh: to_sendh.send_bytes(request_bytes) - for tb in tensor_bytes: - to_sendh.send_bytes(tb) #TODO NOT FAST ENOUGH!!! - # to_sendh.send_bytes(bytes(t.data)) + for tensor in tensors: + to_sendh.send_bytes(tensor.tobytes()) #TODO NOT FAST ENOUGH!!! 
self.perf_timer.measure_time("send") with self._from_worker_ch.recvh(timeout=None) as from_recvh: From c54e8802dace9768625ce89aa6322280bd5148c1 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Mon, 26 Aug 2024 13:28:06 -0500 Subject: [PATCH 69/84] Small change to app --- ex/high_throughput_inference/mock_app.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index ea72b3dc16..aaa1ee86ca 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -54,6 +54,7 @@ torch.set_num_threads(1) logger = get_logger("App") +logger.info("Started app") CHECK_RESULTS_AND_MAKE_ALL_SLOWER = False From 093d70621efdb007707463dcd000fb8bd2a52d8a Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Mon, 26 Aug 2024 16:53:56 -0500 Subject: [PATCH 70/84] Small change to app --- smartsim/_core/mli/infrastructure/control/workermanager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index e2ce19dd6d..7f6eb8edbf 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -107,7 +107,7 @@ def __init__( information among MLI components""" self._device_manager: t.Optional[DeviceManager] = None """Object responsible for model caching and device access""" - self._perf_timer = PerfTimer(prefix="w_", debug=True, timing_on=True) + self._perf_timer = PerfTimer(prefix="w_", debug=False, timing_on=True) """Performance timer""" def _on_start(self) -> None: From d9de5c13f6a91bbc26a80c482525c6d687900fba Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 27 Aug 2024 13:05:27 -0500 Subject: [PATCH 71/84] Last fixes! 
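The diffs below rename BatchQueue.model_key to model_id, move the error-handling helpers into error_handling.py, and finish the docstrings on the batching and device-management paths. For reference, here is a condensed, illustrative sketch of the flush policy that BatchQueue implements: a queue is released either when it holds batch_size requests or once batch_timeout has elapsed since its first put. Names such as SketchBatchQueue are placeholders for illustration, not the shipped API.

    import time
    import typing as t
    from queue import Queue


    class SketchBatchQueue(Queue):
        """Queue that is ready when full or when its timeout expires."""

        def __init__(self, batch_timeout: float, batch_size: int) -> None:
            super().__init__(maxsize=batch_size)
            self._batch_timeout = batch_timeout
            self._batch_size = batch_size
            self._first_put: t.Optional[float] = None

        def put(
            self, item: t.Any, block: bool = False, timeout: t.Optional[float] = None
        ) -> None:
            # Enqueue first; start the timeout clock only once an item is stored
            super().put(item, block=block, timeout=timeout)
            if self._first_put is None:
                self._first_put = time.time()

        @property
        def ready(self) -> bool:
            if self.empty():
                return False
            elapsed = time.time() - self._first_put if self._first_put else 0.0
            return self.qsize() >= self._batch_size or elapsed >= self._batch_timeout

        def flush(self) -> list:
            # Drain everything queued so far and reset the timeout clock
            items = []
            while not self.empty():
                items.append(self.get_nowait())
            self._first_put = None
            return items


    q = SketchBatchQueue(batch_timeout=0.0, batch_size=2)
    q.put("req-1")
    assert q.ready  # a timeout of 0 releases a partial batch immediately

Similarly, a condensed sketch of the context-manager pattern behind WorkerDevice.get and DeviceManager.get_device in these diffs: the device caches loaded models by key, and a model that was shipped inline with a request is evicted again when the context exits, which is what the device manager tests assert. Again, the class name here is illustrative only.

    import typing as t
    from contextlib import contextmanager


    class SketchWorkerDevice:
        def __init__(self, name: str) -> None:
            self.name = name
            self._models: dict[str, t.Any] = {}

        def add_model(self, key: str, model: t.Any) -> None:
            self._models[key] = model

        def remove_model(self, key: str) -> None:
            del self._models[key]

        def __contains__(self, key: str) -> bool:
            return key in self._models

        @contextmanager
        def get(
            self, key_to_remove: t.Optional[str]
        ) -> t.Iterator["SketchWorkerDevice"]:
            # Hand the device to the caller; evict a request-scoped model afterwards
            yield self
            if key_to_remove is not None:
                self.remove_model(key_to_remove)


    device = SketchWorkerDevice("gpu:0")
    device.add_model("model-sent-with-request", b"raw bytes")
    with device.get("model-sent-with-request") as dev:
        assert "model-sent-with-request" in dev
    assert "model-sent-with-request" not in device
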
--- doc/changelog.md | 1 + .../mock_app_redis.py | 16 +- .../infrastructure/control/devicemanager.py | 6 +- .../control/{commons.py => error_handling.py} | 2 + .../control/requestdispatcher.py | 64 +++---- .../infrastructure/control/workermanager.py | 48 ++--- .../_core/mli/infrastructure/worker/worker.py | 41 ++++- tests/dragon/test_error_handling.py | 59 ++++++- tests/dragon/test_request_dispatcher.py | 164 +++++++++--------- 9 files changed, 243 insertions(+), 158 deletions(-) rename smartsim/_core/mli/infrastructure/control/{commons.py => error_handling.py} (96%) diff --git a/doc/changelog.md b/doc/changelog.md index 964e62b49d..ac09ecf604 100644 --- a/doc/changelog.md +++ b/doc/changelog.md @@ -13,6 +13,7 @@ Jump to: Description +- Add RequestDispatcher and the possibility of batching inference requests - Enable hostname selection for dragon tasks - Remove pydantic dependency from MLI code - Update MLI environment variables using new naming convention diff --git a/ex/high_throughput_inference/mock_app_redis.py b/ex/high_throughput_inference/mock_app_redis.py index c0e67f82df..8978bcea23 100644 --- a/ex/high_throughput_inference/mock_app_redis.py +++ b/ex/high_throughput_inference/mock_app_redis.py @@ -31,6 +31,7 @@ import torch from mpi4py import MPI from smartsim.log import get_logger +from smartsim._core.utils.timings import PerfTimer from smartredis import Client logger = get_logger("App") @@ -69,26 +70,21 @@ def name(self): client = Client(cluster=False, address=None) client.set_model(resnet.name, resnet.model, backend='TORCH', device=args.device.upper()) + perf_timer: PerfTimer = PerfTimer(debug=False, timing_on=timing_on, prefix=f"redis{rank}_") + total_iterations = 100 timings=[] for batch_size in [1, 2, 4, 8, 16, 32, 64, 128]: logger.info(f"Batch size: {batch_size}") for iteration_number in range(total_iterations + int(batch_size==1)): - timing = [batch_size] + perf_timer.start_timings("batch_size", batch_size) logger.info(f"Iteration: {iteration_number}") - start = time.perf_counter() input_name = f"batch_{rank}" output_name = f"result_{rank}" client.put_tensor(name=input_name, data=resnet.get_batch(batch_size).numpy()) client.run_model(name=resnet.name, inputs=[input_name], outputs=[output_name]) result = client.get_tensor(name=output_name) - end = time.perf_counter() - timing.append(end-start) - timings.append(timing) - + perf_timer.end_timings() - timings_np = numpy.asarray(timings) - numpy.save(f"timings_{rank}.npy", timings_np) - for timing in timings: - print(" ".join(str(t) for t in timing)) + perf_timer.print_timings(True) diff --git a/smartsim/_core/mli/infrastructure/control/devicemanager.py b/smartsim/_core/mli/infrastructure/control/devicemanager.py index 37256581db..d716d756e4 100644 --- a/smartsim/_core/mli/infrastructure/control/devicemanager.py +++ b/smartsim/_core/mli/infrastructure/control/devicemanager.py @@ -116,7 +116,7 @@ def _load_model_on_device( model_bytes = worker.fetch_model(batch, feature_stores) loaded_model = worker.load_model(batch, model_bytes, self._device.name) - self._device.add_model(batch.model_key.key, loaded_model.model) + self._device.add_model(batch.model_id.key, loaded_model.model) def get_device( self, @@ -139,8 +139,8 @@ def get_device( # Load model if not already loaded, or # because it is sent with the request - if model_in_request or not batch.model_key.key in self._device: + if model_in_request or not batch.model_id.key in self._device: self._load_model_on_device(worker, batch, feature_stores) - key_to_remove = 
batch.model_key.key if model_in_request else None + key_to_remove = batch.model_id.key if model_in_request else None return self._device.get(key_to_remove) diff --git a/smartsim/_core/mli/infrastructure/control/commons.py b/smartsim/_core/mli/infrastructure/control/error_handling.py similarity index 96% rename from smartsim/_core/mli/infrastructure/control/commons.py rename to smartsim/_core/mli/infrastructure/control/error_handling.py index a40ae014aa..e2c5bcd9e1 100644 --- a/smartsim/_core/mli/infrastructure/control/commons.py +++ b/smartsim/_core/mli/infrastructure/control/error_handling.py @@ -66,3 +66,5 @@ def exception_handler( ) if reply_channel: reply_channel.send(serialized_resp) + else: + logger.warning("Unable to notify client of error without reply_channel") diff --git a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py index a4de00a9f0..d56912a8f0 100644 --- a/smartsim/_core/mli/infrastructure/control/requestdispatcher.py +++ b/smartsim/_core/mli/infrastructure/control/requestdispatcher.py @@ -48,33 +48,31 @@ from .....log import get_logger from ....utils.timings import PerfTimer from ...infrastructure.environmentloader import EnvironmentConfigLoader -from ...infrastructure.storage.featurestore import FeatureStore, FeatureStoreKey +from ...infrastructure.storage.featurestore import FeatureStore from ...infrastructure.worker.worker import ( InferenceRequest, MachineLearningWorkerBase, + ModelIdentifier, RequestBatch, ) -from .commons import exception_handler +from .error_handling import exception_handler if t.TYPE_CHECKING: from smartsim._core.mli.mli_schemas.response.response_capnp import Status logger = get_logger("Request Dispatcher") -# Placeholder -ModelIdentifier = FeatureStoreKey - class BatchQueue(Queue[InferenceRequest]): def __init__( - self, batch_timeout: float, batch_size: int, model_key: ModelIdentifier + self, batch_timeout: float, batch_size: int, model_id: ModelIdentifier ) -> None: """Queue used to store inference requests waiting to be batched and sent to Worker Managers. :param batch_timeout: Time in seconds that has to be waited before flushing a non-full queue. The time of the first item put is 0 seconds. :param batch_size: Total capacity of the queue. - :param model_key: Key of the model which needs to be executed on the queued + :param model_id: Key of the model which needs to be executed on the queued requests """ super().__init__(maxsize=batch_size) @@ -88,8 +86,8 @@ def __init__( self._disposable = False """Whether the queue will not be used again and can be deleted. 
A disposable queue is always full.""" - self._model_key: FeatureStoreKey = model_key - """Key of the model which needs to be executed on the queued requets""" + self._model_id: ModelIdentifier = model_id + """Key of the model which needs to be executed on the queued requests""" self._uid = str(uuid.uuid4()) """Unique ID of queue""" @@ -99,9 +97,9 @@ def uid(self) -> str: return self._uid @property - def model_key(self) -> ModelIdentifier: + def model_id(self) -> ModelIdentifier: """Key of the model which needs to be run on the queued requests""" - return self._model_key + return self._model_id def put( self, @@ -115,11 +113,9 @@ def put( :param timeout: Time (in seconds) to wait if block==True :raises Full: If an item cannot be put on the queue """ - if self.full(): - raise Full + super().put(item, block=block, timeout=timeout) if self._first_put is None: self._first_put = time.time() - super().put(item, block=block, timeout=timeout) @property def _elapsed_time(self) -> float: @@ -168,8 +164,6 @@ def full(self) -> bool: """Return True if the queue has reached its maximum capacity""" if self._disposable: return True - if self._batch_size <= 0: - return False return self.qsize() >= self._batch_size def empty(self) -> bool: @@ -184,6 +178,7 @@ def __init__( batch_size: int, config_loader: EnvironmentConfigLoader, worker_type: t.Type[MachineLearningWorkerBase], + mem_pool_size: int = 2 * 1024**3, ) -> None: """The RequestDispatcher intercepts inference requests, stages them in queues and batches them together before making them available to Worker @@ -195,11 +190,12 @@ def __init__( managers :param config_loader: Object to load configuration from environment :param worker_type: Type of worker to instantiate to batch inputs + :param mem_pool_size: Size of the memory pool used to allocate tensors :raises SmartSimError: If config_loaded.get_queue() does not return a channel """ super().__init__(as_service=True, cooldown=1) self._queues: dict[str, list[BatchQueue]] = {} - """Dict of all batch queues available for a given model key""" + """Dict of all batch queues available for a given model id""" self._active_queues: dict[str, BatchQueue] = {} """Mapping telling which queue is the recipient of requests for a given model key""" @@ -225,7 +221,7 @@ def __init__( """The type of communication channel to construct for callbacks""" self._worker = worker_type() """The worker used to batch inputs""" - self._mem_pool = MemoryPool.attach(dragon_gs_pool.create(2 * 1024**3).sdesc) + self._mem_pool = MemoryPool.attach(dragon_gs_pool.create(mem_pool_size).sdesc) """Memory pool used to share batched input tensors with the Worker Managers""" self._perf_timer = PerfTimer(prefix="r_", debug=False, timing_on=True) """Performance timer""" @@ -316,6 +312,9 @@ def _validate_request(self, request: InferenceRequest) -> bool: return all(checks) def _on_iteration(self) -> None: + """This method is executed repeatedly until ``Service`` shutdown + conditions are satisfied and cooldown is elapsed. + """ try: self._perf_timer.set_active(True) bytes_list: t.List[bytes] = self._incoming_channel.recv() @@ -390,26 +389,25 @@ def task_queue(self) -> DragonQueue: """The queue on which batched requests are placed""" return self._outgoing_queue - def _swap_queue(self, model_key: FeatureStoreKey) -> None: + def _swap_queue(self, model_id: ModelIdentifier) -> None: """Get an empty queue or create a new one and make it the active one for a given model. 
- :param model_key: The key of the model for which the + :param model_id: The id of the model for which the queue has to be swapped """ - - if model_key.key in self._queues: - for queue in self._queues[model_key.key]: + if model_id.key in self._queues: + for queue in self._queues[model_id.key]: if not queue.full(): - self._active_queues[model_key.key] = queue + self._active_queues[model_id.key] = queue return - new_queue = BatchQueue(self._batch_timeout, self._batch_size, model_key) - if model_key.key in self._queues: - self._queues[model_key.key].append(new_queue) + new_queue = BatchQueue(self._batch_timeout, self._batch_size, model_id) + if model_id.key in self._queues: + self._queues[model_id.key].append(new_queue) else: - self._queues[model_key.key] = [new_queue] - self._active_queues[model_key.key] = new_queue + self._queues[model_id.key] = [new_queue] + self._active_queues[model_id.key] = new_queue return def dispatch(self, request: InferenceRequest) -> None: @@ -422,7 +420,7 @@ def dispatch(self, request: InferenceRequest) -> None: tmp_queue: BatchQueue = BatchQueue( batch_timeout=0, batch_size=1, - model_key=FeatureStoreKey(key=tmp_id, descriptor="TMP"), + model_id=ModelIdentifier(key=tmp_id, descriptor="TMP"), ) self._active_queues[tmp_id] = tmp_queue self._queues[tmp_id] = [tmp_queue] @@ -451,7 +449,7 @@ def flush_requests(self) -> None: batch = RequestBatch( requests=queue.flush(), inputs=None, - model_key=queue.model_key, + model_id=queue.model_id, ) finally: self._perf_timer.measure_time("flush_requests") @@ -499,4 +497,8 @@ def flush_requests(self) -> None: self._perf_timer.measure_time("put") def _can_shutdown(self) -> bool: + """Whether the Service can be shut down""" return False + + def __del__(self) -> None: + self._mem_pool.destroy() diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index 7f6eb8edbf..da65412d23 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -51,7 +51,7 @@ RequestBatch, ) from ...message_handler import MessageHandler -from .commons import build_failure_reply, exception_handler +from .error_handling import build_failure_reply, exception_handler from .devicemanager import DeviceManager, WorkerDevice if t.TYPE_CHECKING: @@ -75,21 +75,20 @@ def __init__( ) -> None: """Initialize the WorkerManager - :param config_loader: Environment config loader that loads the task queue and - feature store + :param config_loader: Environment config loader for loading queues + and feature stores :param worker_type: The type of worker to manage - :param dispatcher_queue: Queue from which the batched requests have to be pulled + :param dispatcher_queue: Queue from which the batched requests are pulled :param as_service: Specifies run-once or run-until-complete behavior of service :param cooldown: Number of seconds to wait before shutting down after shutdown criteria are met - :param comm_channel_type: The type of communication channel used for callbacks :param device: The device on which the Worker should run. Every worker manager is assigned one single GPU (if available), thus the device should have no index. 
""" super().__init__(as_service, cooldown) self._dispatcher_queue = dispatcher_queue - """The dispatcher queue the manager monitors for new tasks""" + """The Dispatcher queue that the WorkerManager monitors for new batches""" self._worker = worker_type() """The ML Worker implementation""" self._callback_factory = config_loader._callback_factory @@ -111,6 +110,8 @@ def __init__( """Performance timer""" def _on_start(self) -> None: + """Called on initial entry into Service `execute` event loop before + `_on_iteration` is invoked.""" self._device_manager = DeviceManager(WorkerDevice(self._device)) def _check_feature_stores(self, batch: RequestBatch) -> bool: @@ -121,8 +122,8 @@ def _check_feature_stores(self, batch: RequestBatch) -> bool: """ # collect all feature stores required by the request fs_model: t.Set[str] = set() - if batch.model_key.key: - fs_model = {batch.model_key.descriptor} + if batch.model_id.key: + fs_model = {batch.model_id.descriptor} fs_inputs = {key.descriptor for key in batch.input_keys} fs_outputs = {key.descriptor for key in batch.output_keys} @@ -180,23 +181,30 @@ def _on_iteration(self) -> None: ) return + if self._device_manager is None: + for request in batch.requests: + msg = "No Device Manager found. WorkerManager._on_start() " + "must be called after initialization. If possible, " + "you should use `WorkerManager.execute()` instead of " + "directly calling `_on_iteration()`." + try: + self._dispatcher_queue.put(batch) + except Exception: + msg += "\nThe batch could not be put back in the queue " + "and will not be processed." + exception_handler( + RuntimeError(msg), + request.callback, + "Error acquiring device manager", + ) + return + try: - if self._device_manager is None: - for request in batch.requests: - exception_handler( - ValueError( - "No Device Manager available: did you call _on_start()?" - ), - request.callback, - "Error acquiring device manager", - ) - return device_cm = self._device_manager.get_device( worker=self._worker, batch=batch, feature_stores=self._feature_stores, ) - except Exception as exc: for request in batch.requests: exception_handler( @@ -210,7 +218,7 @@ def _on_iteration(self) -> None: with device_cm as device: try: - model_result = LoadModelResult(device.get_model(batch.model_key.key)) + model_result = LoadModelResult(device.get_model(batch.model_id.key)) except Exception as exc: for request in batch.requests: exception_handler( diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 008b6202be..6ce3323407 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -47,6 +47,8 @@ logger = get_logger(__name__) +# Placeholder +ModelIdentifier = FeatureStoreKey class InferenceRequest: """Internal representation of an inference request from a client""" @@ -181,24 +183,38 @@ class RequestBatch: requests: list[InferenceRequest] inputs: t.Optional[TransformInputResult] - model_key: FeatureStoreKey + model_id: ModelIdentifier @property def has_valid_requests(self) -> bool: + """Returns whether the batch contains at least one request. 
+ + :return: True if at least one request is available + """ return len(self.requests) > 0 @property def has_raw_model(self) -> bool: + """Returns whether the batch has a raw model + + :return: True if the batch has a raw model + """ return self.raw_model is not None @property def raw_model(self) -> t.Optional[t.Any]: + """Returns the raw model to use to execute for this batch + if it is available. + :return: A model if available, otherwise None""" if self.has_valid_requests: return self.requests[0].raw_model return None @property def input_keys(self) -> t.List[FeatureStoreKey]: + """All input keys available in this batch's requests + + :return: All input keys belonging to requests in this batch""" keys = [] for request in self.requests: keys.extend(request.input_keys) @@ -207,6 +223,9 @@ def input_keys(self) -> t.List[FeatureStoreKey]: @property def output_keys(self) -> t.List[FeatureStoreKey]: + """All output keys available in this batch's requests + + :return: All output keys belonging to requests in this batch""" keys = [] for request in self.requests: keys.extend(request.output_keys) @@ -299,7 +318,11 @@ def fetch_model( """Given a resource key, retrieve the raw model from a feature store :param batch: The batch of requests that triggered the pipeline :param feature_stores: Available feature stores used for persistence - :return: Raw bytes of the model""" + :return: Raw bytes of the model + :raises SmartSimError: if neither a key or a model are provided or the + model cannot be retrieved from the feature store + :raises ValueError: if a feature store is not available and a raw + model is not provided""" # All requests in the same batch share the model if batch.raw_model: @@ -308,12 +331,12 @@ def fetch_model( if not feature_stores: raise ValueError("Feature store is required for model retrieval") - if batch.model_key is None: + if batch.model_id is None: raise SmartSimError( "Key must be provided to retrieve model from feature store" ) - key, fsd = batch.model_key.key, batch.model_key.descriptor + key, fsd = batch.model_id.key, batch.model_id.descriptor try: feature_store = feature_stores[fsd] @@ -331,7 +354,9 @@ def fetch_inputs( and input metadata :param batch: The batch of requests that triggered the pipeline :param feature_stores: Available feature stores used for persistence - :return: the fetched input""" + :return: the fetched input + :raises ValueError: If neither an input key or an input tensor are provided + :raises SmartSimError: If a tensor for a given key cannot be retrieved""" fetch_results = [] for request in batch.requests: if request.raw_inputs: @@ -354,7 +379,7 @@ def fetch_inputs( except KeyError as ex: logger.exception(ex) raise SmartSimError( - f"Model could not be retrieved with key {fs_key.key}" + f"Tensor could not be retrieved with key {fs_key.key}" ) from ex fetch_results.append( FetchInputResult(data, meta=None) @@ -376,7 +401,9 @@ def place_output( :param request: The request that triggered the pipeline :param execute_result: Results from inference :param feature_stores: Available feature stores used for persistence - :return: A collection of keys that were placed in the feature store""" + :return: A collection of keys that were placed in the feature store + :raises ValueError: If a feature store is not provided + """ if not feature_stores: raise ValueError("Feature store is required for output persistence") diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 113f7ccba0..9544768447 100644 --- 
a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -142,7 +142,7 @@ def setup_worker_manager_model_bytes( request_batch = RequestBatch( [request], TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]], ["float32"]), - model_id, + model_id=model_id, ) dispatcher_task_queue.put(request_batch) @@ -184,12 +184,12 @@ def setup_worker_manager_model_key( tensor_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) output_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) - model_key = FeatureStoreKey( + model_id = FeatureStoreKey( key="model key", descriptor=app_feature_store.descriptor ) request = InferenceRequest( - model_key=model_key, + model_key=model_id, callback=None, raw_inputs=None, input_keys=[tensor_key], @@ -201,7 +201,7 @@ def setup_worker_manager_model_key( request_batch = RequestBatch( [request], TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]], ["float32"]), - model_key=model_key, + model_id=model_id, ) dispatcher_task_queue.put(request_batch) @@ -252,6 +252,51 @@ def setup_request_dispatcher_model_bytes( return request_dispatcher, integrated_worker_type +@pytest.fixture +def setup_request_dispatcher_model_key( + test_dir, + monkeypatch: pytest.MonkeyPatch, + backbone_descriptor: str, + app_feature_store: FeatureStore, +): + integrated_worker_type = IntegratedTorchWorker + + chan = Channel.make_process_local() + queue = FLInterface(main_ch=chan) + monkeypatch.setenv( + "_SMARTSIM_REQUEST_QUEUE", du.B64.bytes_to_str(queue.serialize()) + ) + # Put backbone descriptor into env var for the `EnvironmentConfigLoader` + monkeypatch.setenv("_SMARTSIM_INFRA_BACKBONE", backbone_descriptor) + + config_loader = EnvironmentConfigLoader( + featurestore_factory=DragonFeatureStore.from_descriptor, + callback_factory=FileSystemCommChannel.from_descriptor, + queue_factory=DragonFLIChannel.from_descriptor, + ) + + request_dispatcher = RequestDispatcher( + batch_timeout=0, + batch_size=0, + config_loader=config_loader, + worker_type=integrated_worker_type, + ) + request_dispatcher._on_start() + + tensor_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) + output_key = MessageHandler.build_tensor_key("key", app_feature_store.descriptor) + model_key = MessageHandler.build_model_key( + key="model key", feature_store_descriptor=app_feature_store.descriptor + ) + request = MessageHandler.build_request( + test_dir, model_key, [tensor_key], [output_key], [], None + ) + ser_request = MessageHandler.serialize_request(request) + + request_dispatcher._incoming_channel.send(ser_request) + + return request_dispatcher, integrated_worker_type + def mock_pipeline_stage(monkeypatch: pytest.MonkeyPatch, integrated_worker, stage): def mock_stage(*args, **kwargs): raise ValueError(f"Simulated error in {stage}") @@ -259,7 +304,7 @@ def mock_stage(*args, **kwargs): monkeypatch.setattr(integrated_worker, stage, mock_stage) mock_reply_fn = MagicMock() monkeypatch.setattr( - "smartsim._core.mli.infrastructure.control.commons.build_failure_reply", + "smartsim._core.mli.infrastructure.control.error_handling.build_failure_reply", mock_reply_fn, ) @@ -371,7 +416,7 @@ def test_wm_pipeline_stage_errors_handled( "setup_request_dispatcher", [ pytest.param("setup_request_dispatcher_model_bytes"), - # pytest.param("setup_worker_manager_model_key"), + pytest.param("setup_request_dispatcher_model_key"), ], ) @pytest.mark.parametrize( @@ -424,7 +469,7 @@ def test_exception_handling_helper(monkeypatch: pytest.MonkeyPatch): 
mock_reply_fn = MagicMock() monkeypatch.setattr( - "smartsim._core.mli.infrastructure.control.commons.build_failure_reply", + "smartsim._core.mli.infrastructure.control.error_handling.build_failure_reply", mock_reply_fn, ) diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index 768467c245..8ccd55f634 100644 --- a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -24,6 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import gc import io import logging import pathlib @@ -109,7 +110,6 @@ def mock_messages( comm_channel_root_dir: pathlib.Path, ) -> None: """Mock event producer for triggering the inference pipeline""" - logger.info("Mocking messages") feature_store_root_dir.mkdir(parents=True, exist_ok=True) comm_channel_root_dir.mkdir(parents=True, exist_ok=True) @@ -117,11 +117,9 @@ def mock_messages( model_bytes = model_path.read_bytes() model_key = str(feature_store_root_dir / "model_fs.pt") - logger.info("Putting model on FS") feature_store[model_key] = model_bytes for iteration_number in range(2): - logger.info(f"Message #{iteration_number}") channel_key = Channel.make_process_local().serialize() callback_channel = DragonCommChannel(channel_key) @@ -212,7 +210,7 @@ def test_request_dispatcher(prepare_environment: pathlib.Path) -> None: descriptor = base64.b64encode(to_worker_fli_serialized).decode("utf-8") os.environ["_SMARTSIM_REQUEST_QUEUE"] = descriptor - ddict = DDict(1, 1, 2 * 1024**2) + ddict = DDict(1, 2, 4 * 1024**2) dragon_fs = DragonFeatureStore(ddict) config_loader = EnvironmentConfigLoader( @@ -227,6 +225,7 @@ def test_request_dispatcher(prepare_environment: pathlib.Path) -> None: batch_size=2, config_loader=config_loader, worker_type=integrated_worker_type, + mem_pool_size=2*1024**2, ) worker_queue = config_loader.get_queue() @@ -238,86 +237,91 @@ def test_request_dispatcher(prepare_environment: pathlib.Path) -> None: request_dispatcher._on_start() - batch: t.Optional[RequestBatch] = None - mem_allocs = [] - tensors = [] - fs_path = test_path / f"feature_store" - comm_path = test_path / f"comm_store" - model_key = str(fs_path / "model_fs.pt") - - # create a mock client application to populate the request queue - msg_pump = mp.Process( - target=mock_messages, - args=( - worker_queue, - dragon_fs, - fs_path, - comm_path, - ), - ) + for _ in range(2): + batch: t.Optional[RequestBatch] = None + mem_allocs = [] + tensors = [] + fs_path = test_path / f"feature_store" + comm_path = test_path / f"comm_store" + model_key = str(fs_path / "model_fs.pt") + + # create a mock client application to populate the request queue + msg_pump = mp.Process( + target=mock_messages, + args=( + worker_queue, + dragon_fs, + fs_path, + comm_path, + ), + ) + + msg_pump.start() - msg_pump.start() + time.sleep(1) - time.sleep(1) + for attempts in range(15): + try: + request_dispatcher._on_iteration() + batch = request_dispatcher.task_queue.get(timeout=1) + break + except Empty: + continue + except Exception as exc: + raise exc - for attempts in range(15): try: - request_dispatcher._on_iteration() - batch = request_dispatcher.task_queue.get(timeout=1) - break - except Empty: - continue + assert batch is not None + assert batch.has_valid_requests + + transform_result = batch.inputs + for transformed, dims, dtype in zip( + transform_result.transformed, transform_result.dims, transform_result.dtypes + ): + mem_alloc = 
MemoryAlloc.attach(transformed) + mem_allocs.append(mem_alloc) + itemsize = np.empty((1), dtype=dtype).itemsize + tensors.append( + torch.from_numpy( + np.frombuffer( + mem_alloc.get_memview()[0 : np.prod(dims) * itemsize], + dtype=dtype, + ).reshape(dims) + ) + ) + + assert len(batch.requests) == 2 + assert batch.model_id.key == model_key + assert model_key in request_dispatcher._queues + assert model_key in request_dispatcher._active_queues + assert len(request_dispatcher._queues[model_key]) == 1 + assert request_dispatcher._queues[model_key][0].empty() + assert request_dispatcher._queues[model_key][0].model_id.key == model_key + assert len(tensors) == 1 + assert tensors[0].shape == torch.Size([2, 2]) + + for tensor in tensors: + for sample_idx in range(tensor.shape[0]): + tensor_in = tensor[sample_idx] + tensor_out = (sample_idx + 1) * torch.ones((2,), dtype=torch.float32) + assert torch.equal(tensor_in, tensor_out) + except Exception as exc: raise exc + finally: + for mem_alloc in mem_allocs: + mem_alloc.free() - try: - assert batch is not None - assert batch.has_valid_requests - - transform_result = batch.inputs - for transformed, dims, dtype in zip( - transform_result.transformed, transform_result.dims, transform_result.dtypes - ): - mem_alloc = MemoryAlloc.attach(transformed) - mem_allocs.append(mem_alloc) - itemsize = np.empty((1), dtype=dtype).itemsize - tensors.append( - torch.from_numpy( - np.frombuffer( - mem_alloc.get_memview()[0 : np.prod(dims) * itemsize], - dtype=dtype, - ).reshape(dims) - ) - ) - - assert len(batch.requests) == 2 - assert batch.model_key.key == model_key - assert model_key in request_dispatcher._queues - assert model_key in request_dispatcher._active_queues - assert len(request_dispatcher._queues[model_key]) == 1 - assert request_dispatcher._queues[model_key][0].empty() - assert request_dispatcher._queues[model_key][0].model_key.key == model_key - assert len(tensors) == 1 - assert tensors[0].shape == torch.Size([2, 2]) - - for tensor in tensors: - for sample_idx in range(tensor.shape[0]): - tensor_in = tensor[sample_idx] - tensor_out = (sample_idx + 1) * torch.ones((2,), dtype=torch.float32) - assert torch.equal(tensor_in, tensor_out) - - except Exception as exc: - raise exc - finally: - for mem_alloc in mem_allocs: - mem_alloc.free() - - msg_pump.kill() - - request_dispatcher._active_queues[model_key].make_disposable() - assert request_dispatcher._active_queues[model_key].can_be_removed - - request_dispatcher._on_iteration() - - assert model_key not in request_dispatcher._active_queues - assert model_key not in request_dispatcher._queues + msg_pump.kill() + + request_dispatcher._active_queues[model_key].make_disposable() + assert request_dispatcher._active_queues[model_key].can_be_removed + + request_dispatcher._on_iteration() + + assert model_key not in request_dispatcher._active_queues + assert model_key not in request_dispatcher._queues + + # Try to remove the dispatcher and free the memory + del request_dispatcher + gc.collect() \ No newline at end of file From eb03f0835c7820f091dd7f9cf3530a324e7ec119 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 27 Aug 2024 13:19:22 -0500 Subject: [PATCH 72/84] Avoid using t.Self --- smartsim/_core/mli/infrastructure/control/devicemanager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smartsim/_core/mli/infrastructure/control/devicemanager.py b/smartsim/_core/mli/infrastructure/control/devicemanager.py index d716d756e4..74d278c9a9 100644 --- 
a/smartsim/_core/mli/infrastructure/control/devicemanager.py +++ b/smartsim/_core/mli/infrastructure/control/devicemanager.py @@ -81,7 +81,7 @@ def __contains__(self, key: str) -> bool: return key in self._models @contextmanager - def get(self, key_to_remove: t.Optional[str]) -> t.Iterator[t.Self]: + def get(self, key_to_remove: t.Optional[str]) -> t.Iterator["WorkerDevice"]: yield self if key_to_remove is not None: self.remove_model(key_to_remove) From 1e1b8c910a7f46752ebeea61d6838b7dacaba50c Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 27 Aug 2024 13:31:00 -0500 Subject: [PATCH 73/84] Remove unused timing --- ex/high_throughput_inference/mock_app.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index aaa1ee86ca..0e43caf6a7 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -101,7 +101,6 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): self.perf_timer.measure_time("build_request") request_bytes = MessageHandler.serialize_request(request) self.perf_timer.measure_time("serialize_request") - self.perf_timer.measure_time("serialize_tensor") with self._to_worker_fli.sendh(timeout=None, stream_channel=self._to_worker_ch) as to_sendh: to_sendh.send_bytes(request_bytes) for tensor in tensors: From be0b8e0ea675e16e9cfc723ff7f06bd1f9d2a31f Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 27 Aug 2024 13:32:33 -0500 Subject: [PATCH 74/84] Split timing for request and tensors --- ex/high_throughput_inference/mock_app.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ex/high_throughput_inference/mock_app.py b/ex/high_throughput_inference/mock_app.py index 0e43caf6a7..517d18fb2f 100644 --- a/ex/high_throughput_inference/mock_app.py +++ b/ex/high_throughput_inference/mock_app.py @@ -103,10 +103,10 @@ def run_model(self, model: bytes | str, batch: torch.Tensor): self.perf_timer.measure_time("serialize_request") with self._to_worker_fli.sendh(timeout=None, stream_channel=self._to_worker_ch) as to_sendh: to_sendh.send_bytes(request_bytes) + self.perf_timer.measure_time("send_request") for tensor in tensors: to_sendh.send_bytes(tensor.tobytes()) #TODO NOT FAST ENOUGH!!! 
- - self.perf_timer.measure_time("send") + self.perf_timer.measure_time("send_tensors") with self._from_worker_ch.recvh(timeout=None) as from_recvh: resp = from_recvh.recv_bytes(timeout=None) self.perf_timer.measure_time("receive_response") From bc11d92b84bc63c244a6137780c765db2a11d42c Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 27 Aug 2024 14:00:24 -0500 Subject: [PATCH 75/84] Pin watchdog to <5 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 512da78de9..709913eda8 100644 --- a/setup.py +++ b/setup.py @@ -177,7 +177,7 @@ class BuildError(Exception): "filelock>=3.4.2", "protobuf~=3.20", "jinja2>=3.1.2", - "watchdog>=4.0.0", + "watchdog>=4.0.0,<5", "pycapnp==2.0.0", "pydantic==1.10.14", "pyzmq>=25.1.2", From b04f4c155fff42d805aac2b98e9050ab55a2d388 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 27 Aug 2024 14:07:08 -0500 Subject: [PATCH 76/84] Style --- smartsim/_core/mli/infrastructure/control/devicemanager.py | 2 +- smartsim/_core/mli/infrastructure/control/workermanager.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/control/devicemanager.py b/smartsim/_core/mli/infrastructure/control/devicemanager.py index 74d278c9a9..3570bd51ed 100644 --- a/smartsim/_core/mli/infrastructure/control/devicemanager.py +++ b/smartsim/_core/mli/infrastructure/control/devicemanager.py @@ -25,7 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import typing as t -from contextlib import contextmanager, _GeneratorContextManager +from contextlib import _GeneratorContextManager, contextmanager from .....log import get_logger from ...infrastructure.storage.featurestore import FeatureStore diff --git a/smartsim/_core/mli/infrastructure/control/workermanager.py b/smartsim/_core/mli/infrastructure/control/workermanager.py index da65412d23..54a245b813 100644 --- a/smartsim/_core/mli/infrastructure/control/workermanager.py +++ b/smartsim/_core/mli/infrastructure/control/workermanager.py @@ -51,8 +51,8 @@ RequestBatch, ) from ...message_handler import MessageHandler -from .error_handling import build_failure_reply, exception_handler from .devicemanager import DeviceManager, WorkerDevice +from .error_handling import build_failure_reply, exception_handler if t.TYPE_CHECKING: from smartsim._core.mli.mli_schemas.response.response_capnp import Status From 47088f09debf3e4643ce7b15cc5c83106e3a4b4e Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 27 Aug 2024 14:08:12 -0500 Subject: [PATCH 77/84] Other styling fixes --- smartsim/_core/mli/infrastructure/worker/worker.py | 1 + tests/dragon/test_error_handling.py | 5 ++--- tests/dragon/test_request_dispatcher.py | 12 ++++++++---- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/smartsim/_core/mli/infrastructure/worker/worker.py b/smartsim/_core/mli/infrastructure/worker/worker.py index 6ce3323407..25e4dc49f7 100644 --- a/smartsim/_core/mli/infrastructure/worker/worker.py +++ b/smartsim/_core/mli/infrastructure/worker/worker.py @@ -50,6 +50,7 @@ # Placeholder ModelIdentifier = FeatureStoreKey + class InferenceRequest: """Internal representation of an inference request from a client""" diff --git a/tests/dragon/test_error_handling.py b/tests/dragon/test_error_handling.py index 9544768447..b20424866a 100644 --- a/tests/dragon/test_error_handling.py +++ b/tests/dragon/test_error_handling.py @@ -184,9 +184,7 @@ def setup_worker_manager_model_key( tensor_key = FeatureStoreKey(key="key", 
descriptor=app_feature_store.descriptor) output_key = FeatureStoreKey(key="key", descriptor=app_feature_store.descriptor) - model_id = FeatureStoreKey( - key="model key", descriptor=app_feature_store.descriptor - ) + model_id = FeatureStoreKey(key="model key", descriptor=app_feature_store.descriptor) request = InferenceRequest( model_key=model_id, @@ -297,6 +295,7 @@ def setup_request_dispatcher_model_key( return request_dispatcher, integrated_worker_type + def mock_pipeline_stage(monkeypatch: pytest.MonkeyPatch, integrated_worker, stage): def mock_stage(*args, **kwargs): raise ValueError(f"Simulated error in {stage}") diff --git a/tests/dragon/test_request_dispatcher.py b/tests/dragon/test_request_dispatcher.py index 8ccd55f634..c8d97dd7ed 100644 --- a/tests/dragon/test_request_dispatcher.py +++ b/tests/dragon/test_request_dispatcher.py @@ -225,7 +225,7 @@ def test_request_dispatcher(prepare_environment: pathlib.Path) -> None: batch_size=2, config_loader=config_loader, worker_type=integrated_worker_type, - mem_pool_size=2*1024**2, + mem_pool_size=2 * 1024**2, ) worker_queue = config_loader.get_queue() @@ -276,7 +276,9 @@ def test_request_dispatcher(prepare_environment: pathlib.Path) -> None: transform_result = batch.inputs for transformed, dims, dtype in zip( - transform_result.transformed, transform_result.dims, transform_result.dtypes + transform_result.transformed, + transform_result.dims, + transform_result.dtypes, ): mem_alloc = MemoryAlloc.attach(transformed) mem_allocs.append(mem_alloc) @@ -303,7 +305,9 @@ def test_request_dispatcher(prepare_environment: pathlib.Path) -> None: for tensor in tensors: for sample_idx in range(tensor.shape[0]): tensor_in = tensor[sample_idx] - tensor_out = (sample_idx + 1) * torch.ones((2,), dtype=torch.float32) + tensor_out = (sample_idx + 1) * torch.ones( + (2,), dtype=torch.float32 + ) assert torch.equal(tensor_in, tensor_out) except Exception as exc: @@ -324,4 +328,4 @@ def test_request_dispatcher(prepare_environment: pathlib.Path) -> None: # Try to remove the dispatcher and free the memory del request_dispatcher - gc.collect() \ No newline at end of file + gc.collect() From 0609eec4437680bb3fc810558b767df1c13ce006 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 27 Aug 2024 14:20:30 -0500 Subject: [PATCH 78/84] Move tests that require dragon.MemoryPool --- tests/{mli => dragon}/test_core_machine_learning_worker.py | 4 ++-- tests/{mli => dragon}/test_device_manager.py | 2 ++ tests/{mli => dragon}/test_torch_worker.py | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) rename tests/{mli => dragon}/test_core_machine_learning_worker.py (99%) rename tests/{mli => dragon}/test_device_manager.py (98%) rename tests/{mli => dragon}/test_torch_worker.py (98%) diff --git a/tests/mli/test_core_machine_learning_worker.py b/tests/dragon/test_core_machine_learning_worker.py similarity index 99% rename from tests/mli/test_core_machine_learning_worker.py rename to tests/dragon/test_core_machine_learning_worker.py index 7ef4ab259b..145fe5b2cd 100644 --- a/tests/mli/test_core_machine_learning_worker.py +++ b/tests/dragon/test_core_machine_learning_worker.py @@ -42,8 +42,8 @@ from .featurestore import FileSystemFeatureStore, MemoryFeatureStore -# The tests in this file belong to the group_a group -pytestmark = pytest.mark.group_b +# The tests in this file belong to the group_dragon group +pytestmark = pytest.mark.group_dragon # retrieved from pytest fixtures is_dragon = ( diff --git a/tests/mli/test_device_manager.py b/tests/dragon/test_device_manager.py 
similarity index 98% rename from tests/mli/test_device_manager.py rename to tests/dragon/test_device_manager.py index 1c8b9172da..b89f286c86 100644 --- a/tests/mli/test_device_manager.py +++ b/tests/dragon/test_device_manager.py @@ -46,6 +46,8 @@ TransformOutputResult, ) +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon class MockWorker(MachineLearningWorkerBase): @staticmethod diff --git a/tests/mli/test_torch_worker.py b/tests/dragon/test_torch_worker.py similarity index 98% rename from tests/mli/test_torch_worker.py rename to tests/dragon/test_torch_worker.py index 1e8bba7e33..4ff4fb9e55 100644 --- a/tests/mli/test_torch_worker.py +++ b/tests/dragon/test_torch_worker.py @@ -45,8 +45,8 @@ from smartsim.log import get_logger logger = get_logger(__name__) -# The tests in this file belong to the group_a group -pytestmark = pytest.mark.group_a +# The tests in this file belong to the group_dragon group +pytestmark = pytest.mark.group_dragon # simple MNIST in PyTorch From 275e102963339d473c865109565aaa127f6a09b7 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 27 Aug 2024 17:51:27 -0500 Subject: [PATCH 79/84] Update tests --- .../test_core_machine_learning_worker.py | 94 +++++++++++-------- tests/dragon/test_device_manager.py | 5 +- tests/dragon/test_torch_worker.py | 83 +++++++++++----- 3 files changed, 116 insertions(+), 66 deletions(-) diff --git a/tests/dragon/test_core_machine_learning_worker.py b/tests/dragon/test_core_machine_learning_worker.py index 145fe5b2cd..5b6056e5b7 100644 --- a/tests/dragon/test_core_machine_learning_worker.py +++ b/tests/dragon/test_core_machine_learning_worker.py @@ -35,6 +35,7 @@ from smartsim._core.mli.infrastructure.worker.worker import ( InferenceRequest, MachineLearningWorkerCore, + RequestBatch, TransformInputResult, TransformOutputResult, ) @@ -42,8 +43,8 @@ from .featurestore import FileSystemFeatureStore, MemoryFeatureStore -# The tests in this file belong to the group_dragon group -pytestmark = pytest.mark.group_dragon +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon # retrieved from pytest fixtures is_dragon = ( @@ -94,9 +95,11 @@ def test_fetch_model_disk(persist_torch_model: pathlib.Path, test_dir: str) -> N fsd = feature_store.descriptor feature_store[str(persist_torch_model)] = persist_torch_model.read_bytes() - request = InferenceRequest(model_key=FeatureStoreKey(key=key, descriptor=fsd)) + model_key = FeatureStoreKey(key=key, descriptor=fsd) + request = InferenceRequest(model_key=model_key) + batch = RequestBatch([request], None, model_key) - fetch_result = worker.fetch_model(request, {fsd: feature_store}) + fetch_result = worker.fetch_model(batch, {fsd: feature_store}) assert fetch_result.model_bytes assert fetch_result.model_bytes == persist_torch_model.read_bytes() @@ -110,10 +113,12 @@ def test_fetch_model_disk_missing() -> None: key = "/path/that/doesnt/exist" - request = InferenceRequest(model_key=FeatureStoreKey(key=key, descriptor=fsd)) + model_key = FeatureStoreKey(key=key, descriptor=fsd) + request = InferenceRequest(model_key=model_key) + batch = RequestBatch([request], None, model_key) with pytest.raises(sse.SmartSimError) as ex: - worker.fetch_model(request, {fsd: feature_store}) + worker.fetch_model(batch, {fsd: feature_store}) # ensure the error message includes key-identifying information assert key in ex.value.args[0] @@ -133,10 +138,11 @@ def test_fetch_model_feature_store(persist_torch_model: pathlib.Path) -> None: fsd = feature_store.descriptor 
feature_store[key] = persist_torch_model.read_bytes() - request = InferenceRequest( - model_key=FeatureStoreKey(key=key, descriptor=feature_store.descriptor) - ) - fetch_result = worker.fetch_model(request, {fsd: feature_store}) + model_key = FeatureStoreKey(key=key, descriptor=feature_store.descriptor) + request = InferenceRequest(model_key=model_key) + batch = RequestBatch([request], None, model_key) + + fetch_result = worker.fetch_model(batch, {fsd: feature_store}) assert fetch_result.model_bytes assert fetch_result.model_bytes == persist_torch_model.read_bytes() @@ -150,13 +156,13 @@ def test_fetch_model_feature_store_missing() -> None: feature_store = MemoryFeatureStore() fsd = feature_store.descriptor - request = InferenceRequest( - model_key=FeatureStoreKey(key=key, descriptor=feature_store.descriptor) - ) + model_key = FeatureStoreKey(key=key, descriptor=feature_store.descriptor) + request = InferenceRequest(model_key=model_key) + batch = RequestBatch([request], None, model_key) # todo: consider that raising this exception shows impl. replace... with pytest.raises(sse.SmartSimError) as ex: - worker.fetch_model(request, {fsd: feature_store}) + worker.fetch_model(batch, {fsd: feature_store}) # ensure the error message includes key-identifying information assert key in ex.value.args[0] @@ -173,11 +179,11 @@ def test_fetch_model_memory(persist_torch_model: pathlib.Path) -> None: fsd = feature_store.descriptor feature_store[key] = persist_torch_model.read_bytes() - request = InferenceRequest( - model_key=FeatureStoreKey(key=key, descriptor=feature_store.descriptor) - ) + model_key = FeatureStoreKey(key=key, descriptor=feature_store.descriptor) + request = InferenceRequest(model_key=model_key) + batch = RequestBatch([request], None, model_key) - fetch_result = worker.fetch_model(request, {fsd: feature_store}) + fetch_result = worker.fetch_model(batch, {fsd: feature_store}) assert fetch_result.model_bytes assert fetch_result.model_bytes == persist_torch_model.read_bytes() @@ -193,12 +199,16 @@ def test_fetch_input_disk(persist_torch_tensor: pathlib.Path) -> None: request = InferenceRequest( input_keys=[FeatureStoreKey(key=tensor_name, descriptor=fsd)] ) + + model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + batch = RequestBatch([request], None, model_key) + worker = MachineLearningWorkerCore feature_store[tensor_name] = persist_torch_tensor.read_bytes() - fetch_result = worker.fetch_inputs(request, {fsd: feature_store}) - assert fetch_result.inputs is not None + fetch_result = worker.fetch_inputs(batch, {fsd: feature_store}) + assert fetch_result[0].inputs is not None def test_fetch_input_disk_missing() -> None: @@ -212,8 +222,11 @@ def test_fetch_input_disk_missing() -> None: request = InferenceRequest(input_keys=[FeatureStoreKey(key=key, descriptor=fsd)]) + model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + batch = RequestBatch([request], None, model_key) + with pytest.raises(sse.SmartSimError) as ex: - worker.fetch_inputs(request, {fsd: feature_store}) + worker.fetch_inputs(batch, {fsd: feature_store}) # ensure the error message includes key-identifying information assert key[0] in ex.value.args[0] @@ -236,9 +249,12 @@ def test_fetch_input_feature_store(persist_torch_tensor: pathlib.Path) -> None: # put model bytes into the feature store feature_store[tensor_name] = persist_torch_tensor.read_bytes() - fetch_result = worker.fetch_inputs(request, {fsd: feature_store}) - assert fetch_result.inputs - assert list(fetch_result.inputs)[0][:10] == 
persist_torch_tensor.read_bytes()[:10] + model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + batch = RequestBatch([request], None, model_key) + + fetch_result = worker.fetch_inputs(batch, {fsd: feature_store}) + assert fetch_result[0].inputs + assert list(fetch_result[0].inputs)[0][:10] == persist_torch_tensor.read_bytes()[:10] @pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") @@ -269,9 +285,12 @@ def test_fetch_multi_input_feature_store(persist_torch_tensor: pathlib.Path) -> ] ) - fetch_result = worker.fetch_inputs(request, {fsd: feature_store}) + model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + batch = RequestBatch([request], None, model_key) - raw_bytes = list(fetch_result.inputs) + fetch_result = worker.fetch_inputs(batch, {fsd: feature_store}) + + raw_bytes = list(fetch_result[0].inputs) assert raw_bytes assert raw_bytes[0][:10] == persist_torch_tensor.read_bytes()[:10] assert raw_bytes[1][:10] == body2[:10] @@ -288,8 +307,11 @@ def test_fetch_input_feature_store_missing() -> None: fsd = feature_store.descriptor request = InferenceRequest(input_keys=[FeatureStoreKey(key=key, descriptor=fsd)]) + model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + batch = RequestBatch([request], None, model_key) + with pytest.raises(sse.SmartSimError) as ex: - worker.fetch_inputs(request, {fsd: feature_store}) + worker.fetch_inputs(batch, {fsd: feature_store}) # ensure the error message includes key-identifying information assert key in ex.value.args[0] @@ -307,21 +329,11 @@ def test_fetch_input_memory(persist_torch_tensor: pathlib.Path) -> None: feature_store[key] = persist_torch_tensor.read_bytes() request = InferenceRequest(input_keys=[FeatureStoreKey(key=key, descriptor=fsd)]) - fetch_result = worker.fetch_inputs(request, {fsd: feature_store}) - assert fetch_result.inputs is not None - - -def test_batch_requests() -> None: - """Verify batch requests handles an empty data set gracefully""" - worker = MachineLearningWorkerCore - result = TransformInputResult([]) - - request = InferenceRequest(batch_size=10) + model_key = FeatureStoreKey(key="test-model", descriptor=fsd) + batch = RequestBatch([request], None, model_key) - with pytest.raises(NotImplementedError): - # NOTE: we expect this to fail since it's not yet implemented. - # TODO: once implemented, replace this expectation of failure... - worker.batch_requests(request, result) + fetch_result = worker.fetch_inputs(batch, {fsd: feature_store}) + assert fetch_result[0].inputs is not None def test_place_outputs() -> None: diff --git a/tests/dragon/test_device_manager.py b/tests/dragon/test_device_manager.py index b89f286c86..71ea844ed8 100644 --- a/tests/dragon/test_device_manager.py +++ b/tests/dragon/test_device_manager.py @@ -24,6 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import pytest import typing as t from smartsim._core.mli.infrastructure.control.devicemanager import ( @@ -129,7 +130,7 @@ def test_device_manager_model_in_request(): request_batch = RequestBatch( [request], TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]], ["float32"]), - model_key=model_key, + model_id=model_key, ) with device_manager.get_device( @@ -167,7 +168,7 @@ def test_device_manager_model_key(): request_batch = RequestBatch( [request], TransformInputResult(b"transformed", [slice(0, 1)], [[1, 2]], ["float32"]), - model_key=model_key, + model_id=model_key, ) with device_manager.get_device( diff --git a/tests/dragon/test_torch_worker.py b/tests/dragon/test_torch_worker.py index 4ff4fb9e55..2d10af623d 100644 --- a/tests/dragon/test_torch_worker.py +++ b/tests/dragon/test_torch_worker.py @@ -26,8 +26,15 @@ import io +import numpy as np import pytest import torch +import typing as t + +dragon = pytest.importorskip("dragon") +import dragon.globalservices.pool as dragon_gs_pool +from dragon.managed_memory import MemoryPool, MemoryAlloc + from torch import nn from torch.nn import functional as F @@ -39,14 +46,15 @@ FetchModelResult, InferenceRequest, LoadModelResult, + RequestBatch, TransformInputResult, ) from smartsim._core.mli.message_handler import MessageHandler from smartsim.log import get_logger logger = get_logger(__name__) -# The tests in this file belong to the group_dragon group -pytestmark = pytest.mark.group_dragon +# The tests in this file belong to the dragon group +pytestmark = pytest.mark.dragon # simple MNIST in PyTorch @@ -60,7 +68,7 @@ def __init__(self): self.fc1 = nn.Linear(9216, 128) self.fc2 = nn.Linear(128, 10) - def forward(self, x): + def forward(self, x, y): x = self.conv1(x) x = F.relu(x) x = self.conv2(x) @@ -86,7 +94,7 @@ def get_batch() -> torch.Tensor: def create_torch_model(): n = Net() example_forward_input = get_batch() - module = torch.jit.trace(n, example_forward_input) + module = torch.jit.trace(n, [example_forward_input, example_forward_input]) model_buffer = io.BytesIO() torch.jit.save(module, model_buffer) return model_buffer.getvalue() @@ -112,18 +120,23 @@ def get_request() -> InferenceRequest: batch_size=0, ) +def get_request_batch_from_request(request: InferenceRequest, inputs: t.Optional[TransformInputResult] = None) -> RequestBatch: + + return RequestBatch([request], inputs, request.model_key) sample_request: InferenceRequest = get_request() +sample_request_batch: RequestBatch = get_request_batch_from_request(sample_request) worker = TorchWorker() def test_load_model(mlutils) -> None: fetch_model_result = FetchModelResult(sample_request.raw_model) load_model_result = worker.load_model( - sample_request, fetch_model_result, mlutils.get_test_device().lower() + sample_request_batch, fetch_model_result, mlutils.get_test_device().lower() ) assert load_model_result.model( + get_batch().to(torch_device[mlutils.get_test_device().lower()]), get_batch().to(torch_device[mlutils.get_test_device().lower()]) ).shape == torch.Size((20, 10)) @@ -133,44 +146,68 @@ def test_transform_input(mlutils) -> None: sample_request.raw_inputs, sample_request.input_meta ) + mem_pool = MemoryPool.attach(dragon_gs_pool.create(1024**2).sdesc) + transform_input_result = worker.transform_input( - sample_request, fetch_input_result, mlutils.get_test_device().lower() + sample_request_batch, [fetch_input_result], mem_pool ) - assert all( - transformed.shape == get_batch().shape - for transformed in transform_input_result.transformed - ) + batch = get_batch().numpy() 
+ assert transform_input_result.slices[0] == slice(0, batch.shape[0]) + + for tensor_index in range(2): + assert torch.Size(transform_input_result.dims[tensor_index]) == batch.shape + assert transform_input_result.dtypes[tensor_index] == str(batch.dtype) + mem_alloc = MemoryAlloc.attach(transform_input_result.transformed[tensor_index]) + itemsize = batch.itemsize + tensor = torch.from_numpy( + np.frombuffer( + mem_alloc.get_memview()[0 : np.prod(transform_input_result.dims[tensor_index]) * itemsize], + dtype=transform_input_result.dtypes[tensor_index], + ).reshape(transform_input_result.dims[tensor_index]) + ) + + assert torch.equal(tensor, torch.from_numpy(sample_request.raw_inputs[tensor_index])) + + mem_pool.destroy() def test_execute(mlutils) -> None: load_model_result = LoadModelResult( Net().to(torch_device[mlutils.get_test_device().lower()]) ) - transform_result = TransformInputResult( - [ - get_batch().to(torch_device[mlutils.get_test_device().lower()]) - for _ in range(2) - ] + fetch_input_result = FetchInputResult( + sample_request.raw_inputs, sample_request.input_meta + ) + + request_batch = get_request_batch_from_request(sample_request, fetch_input_result) + + mem_pool = MemoryPool.attach(dragon_gs_pool.create(1024**2).sdesc) + + transform_result = worker.transform_input( + request_batch, [fetch_input_result], mem_pool ) - execute_result = worker.execute(sample_request, load_model_result, transform_result) + execute_result = worker.execute(request_batch, load_model_result, transform_result, mlutils.get_test_device().lower()) assert all( result.shape == torch.Size((20, 10)) for result in execute_result.predictions ) + mem_pool.destroy() + def test_transform_output(mlutils): - execute_result = ExecuteResult([torch.rand((20, 10)) for _ in range(2)]) + tensors = [torch.rand((20, 10)) for _ in range(2)] + execute_result = ExecuteResult(tensors, [slice(0, 20)]) transformed_output = worker.transform_output( - sample_request, execute_result, torch_device[mlutils.get_test_device().lower()] + sample_request_batch, execute_result ) - assert transformed_output.outputs == [ - item.numpy().tobytes() for item in execute_result.predictions + assert transformed_output[0].outputs == [ + item.numpy().tobytes() for item in tensors ] - assert transformed_output.shape == None - assert transformed_output.order == "c" - assert transformed_output.dtype == "float32" + assert transformed_output[0].shape == None + assert transformed_output[0].order == "c" + assert transformed_output[0].dtype == "float32" From b220d99e1180d36c354852b40b8b9e0f52cc4580 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 27 Aug 2024 17:58:11 -0500 Subject: [PATCH 80/84] Style --- .../test_core_machine_learning_worker.py | 4 +- tests/dragon/test_device_manager.py | 4 +- tests/dragon/test_torch_worker.py | 46 +++++++++++-------- 3 files changed, 33 insertions(+), 21 deletions(-) diff --git a/tests/dragon/test_core_machine_learning_worker.py b/tests/dragon/test_core_machine_learning_worker.py index 5b6056e5b7..d576997ea9 100644 --- a/tests/dragon/test_core_machine_learning_worker.py +++ b/tests/dragon/test_core_machine_learning_worker.py @@ -254,7 +254,9 @@ def test_fetch_input_feature_store(persist_torch_tensor: pathlib.Path) -> None: fetch_result = worker.fetch_inputs(batch, {fsd: feature_store}) assert fetch_result[0].inputs - assert list(fetch_result[0].inputs)[0][:10] == persist_torch_tensor.read_bytes()[:10] + assert ( + list(fetch_result[0].inputs)[0][:10] == persist_torch_tensor.read_bytes()[:10] + ) 
@pytest.mark.skipif(not torch_available, reason="Torch backend is not installed") diff --git a/tests/dragon/test_device_manager.py b/tests/dragon/test_device_manager.py index 71ea844ed8..fccb9b42f9 100644 --- a/tests/dragon/test_device_manager.py +++ b/tests/dragon/test_device_manager.py @@ -24,9 +24,10 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import pytest import typing as t +import pytest + from smartsim._core.mli.infrastructure.control.devicemanager import ( DeviceManager, WorkerDevice, @@ -50,6 +51,7 @@ # The tests in this file belong to the dragon group pytestmark = pytest.mark.dragon + class MockWorker(MachineLearningWorkerBase): @staticmethod def fetch_model( diff --git a/tests/dragon/test_torch_worker.py b/tests/dragon/test_torch_worker.py index 2d10af623d..88e800240f 100644 --- a/tests/dragon/test_torch_worker.py +++ b/tests/dragon/test_torch_worker.py @@ -25,16 +25,15 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import io +import typing as t import numpy as np import pytest import torch -import typing as t dragon = pytest.importorskip("dragon") import dragon.globalservices.pool as dragon_gs_pool -from dragon.managed_memory import MemoryPool, MemoryAlloc - +from dragon.managed_memory import MemoryAlloc, MemoryPool from torch import nn from torch.nn import functional as F @@ -120,10 +119,14 @@ def get_request() -> InferenceRequest: batch_size=0, ) -def get_request_batch_from_request(request: InferenceRequest, inputs: t.Optional[TransformInputResult] = None) -> RequestBatch: + +def get_request_batch_from_request( + request: InferenceRequest, inputs: t.Optional[TransformInputResult] = None +) -> RequestBatch: return RequestBatch([request], inputs, request.model_key) + sample_request: InferenceRequest = get_request() sample_request_batch: RequestBatch = get_request_batch_from_request(sample_request) worker = TorchWorker() @@ -137,7 +140,7 @@ def test_load_model(mlutils) -> None: assert load_model_result.model( get_batch().to(torch_device[mlutils.get_test_device().lower()]), - get_batch().to(torch_device[mlutils.get_test_device().lower()]) + get_batch().to(torch_device[mlutils.get_test_device().lower()]), ).shape == torch.Size((20, 10)) @@ -161,13 +164,17 @@ def test_transform_input(mlutils) -> None: mem_alloc = MemoryAlloc.attach(transform_input_result.transformed[tensor_index]) itemsize = batch.itemsize tensor = torch.from_numpy( - np.frombuffer( - mem_alloc.get_memview()[0 : np.prod(transform_input_result.dims[tensor_index]) * itemsize], - dtype=transform_input_result.dtypes[tensor_index], - ).reshape(transform_input_result.dims[tensor_index]) - ) - - assert torch.equal(tensor, torch.from_numpy(sample_request.raw_inputs[tensor_index])) + np.frombuffer( + mem_alloc.get_memview()[ + 0 : np.prod(transform_input_result.dims[tensor_index]) * itemsize + ], + dtype=transform_input_result.dtypes[tensor_index], + ).reshape(transform_input_result.dims[tensor_index]) + ) + + assert torch.equal( + tensor, torch.from_numpy(sample_request.raw_inputs[tensor_index]) + ) mem_pool.destroy() @@ -188,7 +195,12 @@ def test_execute(mlutils) -> None: request_batch, [fetch_input_result], mem_pool ) - execute_result = worker.execute(request_batch, load_model_result, transform_result, mlutils.get_test_device().lower()) + execute_result = worker.execute( + request_batch, + load_model_result, + transform_result, + mlutils.get_test_device().lower(), + ) assert all( 
result.shape == torch.Size((20, 10)) for result in execute_result.predictions @@ -201,13 +213,9 @@ def test_transform_output(mlutils): tensors = [torch.rand((20, 10)) for _ in range(2)] execute_result = ExecuteResult(tensors, [slice(0, 20)]) - transformed_output = worker.transform_output( - sample_request_batch, execute_result - ) + transformed_output = worker.transform_output(sample_request_batch, execute_result) - assert transformed_output[0].outputs == [ - item.numpy().tobytes() for item in tensors - ] + assert transformed_output[0].outputs == [item.numpy().tobytes() for item in tensors] assert transformed_output[0].shape == None assert transformed_output[0].order == "c" assert transformed_output[0].dtype == "float32" From d3ab796004cb1f83f07b4c6f136a78b6956c3f82 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 27 Aug 2024 18:05:39 -0500 Subject: [PATCH 81/84] Import or skip dragon --- tests/dragon/test_core_machine_learning_worker.py | 2 ++ tests/dragon/test_device_manager.py | 1 + 2 files changed, 3 insertions(+) diff --git a/tests/dragon/test_core_machine_learning_worker.py b/tests/dragon/test_core_machine_learning_worker.py index d576997ea9..940c76c8a1 100644 --- a/tests/dragon/test_core_machine_learning_worker.py +++ b/tests/dragon/test_core_machine_learning_worker.py @@ -27,6 +27,8 @@ import pathlib import time +dragon = pytest.importorskip("dragon") + import pytest import torch diff --git a/tests/dragon/test_device_manager.py b/tests/dragon/test_device_manager.py index fccb9b42f9..2b7fa1f549 100644 --- a/tests/dragon/test_device_manager.py +++ b/tests/dragon/test_device_manager.py @@ -27,6 +27,7 @@ import typing as t import pytest +dragon = pytest.importorskip("dragon") from smartsim._core.mli.infrastructure.control.devicemanager import ( DeviceManager, From 14e627e8e91c3e63d664027bc255ec5edd319219 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 27 Aug 2024 18:12:46 -0500 Subject: [PATCH 82/84] Isort --- tests/dragon/test_device_manager.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/dragon/test_device_manager.py b/tests/dragon/test_device_manager.py index 2b7fa1f549..8edeb60fbb 100644 --- a/tests/dragon/test_device_manager.py +++ b/tests/dragon/test_device_manager.py @@ -27,6 +27,7 @@ import typing as t import pytest + dragon = pytest.importorskip("dragon") from smartsim._core.mli.infrastructure.control.devicemanager import ( From bbe97ff8899dbdf5573df19707e321fa09140192 Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 27 Aug 2024 18:20:14 -0500 Subject: [PATCH 83/84] Fix pytest import --- tests/dragon/test_core_machine_learning_worker.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/dragon/test_core_machine_learning_worker.py b/tests/dragon/test_core_machine_learning_worker.py index 940c76c8a1..231a971241 100644 --- a/tests/dragon/test_core_machine_learning_worker.py +++ b/tests/dragon/test_core_machine_learning_worker.py @@ -27,9 +27,10 @@ import pathlib import time +import pytest + dragon = pytest.importorskip("dragon") -import pytest import torch import smartsim.error as sse From eea793e95f5971250da98f758d1c6ee247d3782c Mon Sep 17 00:00:00 2001 From: Al Rigazzi Date: Tue, 27 Aug 2024 18:33:20 -0500 Subject: [PATCH 84/84] Adapt syntax for python 3.9 --- smartsim/_core/utils/timings.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/smartsim/_core/utils/timings.py b/smartsim/_core/utils/timings.py index 34595c8586..a61a243220 100644 --- a/smartsim/_core/utils/timings.py +++ 
b/smartsim/_core/utils/timings.py @@ -56,13 +56,13 @@ def _add_label_to_timings(self, label: str) -> None: self._timings[label] = [] @staticmethod - def _format_number(number: float | int) -> str: + def _format_number(number: t.Union[float, int]) -> str: return f"{number:0.4e}" def start_timings( self, first_label: t.Optional[str] = None, - first_value: t.Optional[float | int] = None, + first_value: t.Optional[t.Union[float, int]] = None, ) -> None: if self._timing_on: if first_label is not None and first_value is not None: @@ -86,7 +86,7 @@ def end_timings(self) -> None: def _make_label(self, label: str) -> str: return self._prefix + label - def _get_delta(self) -> float | int: + def _get_delta(self) -> t.Union[float, int]: if self._interm is None: return 0 return time.perf_counter() - self._interm
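The closing patch trades PEP 604 unions for `typing.Union` so that the annotations in `smartsim/_core/utils/timings.py` still evaluate under Python 3.9. A minimal, self-contained illustration of the failure mode and the portable spelling (the `fmt` helper below is hypothetical, not part of the patch series):

    import typing as t

    # Without `from __future__ import annotations`, annotations are evaluated
    # at definition time. On Python 3.9 the expression `float | int` raises
    # "TypeError: unsupported operand type(s) for |" because builtin types only
    # gained `__or__` support in Python 3.10:
    #
    #     def fmt(number: float | int) -> str:  # fails to import on 3.9
    #         return f"{number:0.4e}"

    def fmt(number: t.Union[float, int]) -> str:
        # Portable spelling that evaluates on every supported interpreter.
        return f"{number:0.4e}"

    print(fmt(3.14159))  # -> 3.1416e+00

Deferring evaluation with `from __future__ import annotations` would also let the `|` syntax import on 3.9, but `t.Union` additionally keeps the hints usable for runtime introspection on older interpreters.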
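For the reworked dragon tests, the read-back pattern repeated in `test_torch_worker.py` and `test_request_dispatcher.py` is: attach to the serialized allocation produced by `transform_input`, wrap its memoryview with NumPy using the recorded dims and dtypes, and rebuild a torch tensor. A condensed sketch of that pattern, assuming a running Dragon environment; the `tensor_from_alloc` helper is illustrative and not part of the patches:

    import typing as t

    import numpy as np
    import torch
    from dragon.managed_memory import MemoryAlloc


    def tensor_from_alloc(descriptor: bytes, dims: t.List[int], dtype: str) -> torch.Tensor:
        # Attach to the allocation the worker wrote into the shared memory pool.
        mem_alloc = MemoryAlloc.attach(descriptor)
        try:
            itemsize = np.empty((1,), dtype=dtype).itemsize
            nbytes = int(np.prod(dims)) * itemsize
            # NumPy view over the shared buffer, reshaped to the dims recorded
            # in the TransformInputResult.
            view = np.frombuffer(mem_alloc.get_memview()[0:nbytes], dtype=dtype).reshape(dims)
            # Copy out so the allocation can be freed before the tensor is used.
            return torch.from_numpy(view.copy())
        finally:
            mem_alloc.free()

The tests instead keep the allocations alive until after the assertions and free them in a `finally` block; the copy here simply makes the sketch safe to use outside that lifetime.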